Getting rid of one more non-auto View, comms overlap in Laplace operator

Fixing Laplace flopcount Minor cleanup
Laplace benchmark added
2026-02-10 08:53:27 +00:00 · 2024-02-25 22:37:48 -05:00 · 2024-02-13 12:06:08 -05:00 · 2024-02-12 21:23:36 -05:00 · 2024-02-12 21:10:21 -05:00 · 2024-02-08 17:13:10 -05:00
109 changed files with 2997 additions and 6425 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,3 @@
 # Doxygen stuff
 html/*
 latex/*
 # Compiled Object files #
 #########################
 *.slo
--- a/Grid/Grid_Eigen_Dense.h
+++ b/Grid/Grid_Eigen_Dense.h
@@ -34,7 +34,7 @@
 #pragma push_macro("__SYCL_DEVICE_ONLY__")
 #undef __SYCL_DEVICE_ONLY__
 #define EIGEN_DONT_VECTORIZE
-#undef EIGEN_USE_SYCL
+//#undef EIGEN_USE_SYCL
 #define __SYCL__REDEFINE__
 #endif
--- a/Grid/algorithms/LinearOperator.h
+++ b/Grid/algorithms/LinearOperator.h
@@ -460,6 +460,53 @@ class NonHermitianSchurDiagTwoOperator : public NonHermitianSchurOperatorBase<Fi
  }
 };
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Quadratic polynomial in a matrix, exposed through the LinearOperatorBase interface.
 // HermOp applies the matrix twice and combines M*M*in, M*in and in using the
 // coefficients a2, a1 and a0 respectively
 // (assuming axpy(r,a,x,y) yields r = a*x + y -- TODO confirm against the axpy definition).
 // The single-argument constructor selects (a0,a1,a2) = (0,0,1).
 // Every entry point other than HermOp / HermOpAndNorm trips assert(0) before doing any work.
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 template<class Matrix,class Field>
 class QuadLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;
 public:
  RealD a0,a1,a2;
  // Default coefficients: constant and linear terms off, quadratic term on.
  QuadLinearOperator(Matrix &Mat)
    : _Mat(Mat), a0(0.), a1(0.), a2(1.)
  {}
  // Caller supplied coefficients.
  QuadLinearOperator(Matrix &Mat, RealD _a0,RealD _a1,RealD _a2)
    : _Mat(Mat), a0(_a0), a1(_a1), a2(_a2)
  {}
  // Multigrid coarsening hooks: not supported for this operator, each one asserts.
  void OpDiag (const Field &in, Field &out) {
    assert(0);
    _Mat.Mdiag(in,out);
  }
  void OpDir  (const Field &in, Field &out,int dir,int disp) {
    assert(0);
    _Mat.Mdir(in,out,dir,disp);
  }
  void OpDirAll  (const Field &in, std::vector<Field> &out){
    assert(0);
    _Mat.MdirAll(in,out);
  }
  // out = a2 * M*M*in  +  a1 * M*in  +  a0 * in
  void HermOp (const Field &in, Field &out){
    Field Min(in.Grid());        // holds M*in, reused for the linear term
    _Mat.M(in,Min);
    _Mat.M(Min,out);             // out = M*M*in
    out *= a2;
    axpy(out, a1, Min, out);     // add the linear term
    axpy(out, a0, in, out);      // add the constant term
  }
  // Unsupported: asserts.
  void AdjOp     (const Field &in, Field &out){
    assert(0);
    _Mat.M(in,out);
  }
  // Applies HermOp, then reports n1 = Re <in|out> and n2 = |out|^2.
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
    HermOp(in,out);
    ComplexD in_dot_out = innerProduct(in,out);
    n1 = real(in_dot_out);
    n2 = norm2(out);
  }
  // Unsupported: asserts.
  void Op(const Field &in, Field &out){
    assert(0);
    _Mat.M(in,out);
  }
 };
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Left  handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) psi = eta  -->  ( 1 - Moo^-1 Moe Mee^-1 Meo ) psi = Moo^-1 eta
 // Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta  -->  ( 1 - Moe Mee^-1 Meo Moo^-1) phi=eta ; psi = Moo^-1 phi
--- a/Grid/algorithms/approx/Forecast.h
+++ b/Grid/algorithms/approx/Forecast.h
@@ -36,11 +36,12 @@ NAMESPACE_BEGIN(Grid);
 // Abstract base class.
 // Takes a matrix (Mat), a source (phi), and a vector of Fields (chi)
 // and returns a forecasted solution to the system D*psi = phi (psi).
-template<class Matrix, class Field>
+// Changing to operator
 template<class LinearOperatorBase, class Field>
 class Forecast
 {
 public:
-  virtual Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& chi) = 0;
+  virtual Field operator()(LinearOperatorBase &Mat, const Field& phi, const std::vector<Field>& chi) = 0;
 };
 // Implementation of Brower et al.'s chronological inverter (arXiv:hep-lat/9509012),
@@ -54,13 +55,13 @@ public:
  Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& prev_solns)
  {
    int degree = prev_solns.size();
    std::cout << GridLogMessage << "ChronoForecast: degree= " << degree << std::endl;
    Field chi(phi); // forecasted solution
    // Trivial cases
    if(degree == 0){ chi = Zero(); return chi; }
    else if(degree == 1){ return prev_solns[0]; }
    //    RealD dot;
    ComplexD xp;
    Field r(phi); // residual
    Field Mv(phi);
@@ -83,8 +84,9 @@ public:
    // Perform sparse matrix multiplication and construct rhs
    for(int i=0; i<degree; i++){
      b[i] = innerProduct(v[i],phi);
-      Mat.M(v[i],Mv);
+//      Mat.M(v[i],Mv);
-      Mat.Mdag(Mv,MdagMv[i]);
+//      Mat.Mdag(Mv,MdagMv[i]);
      Mat.HermOp(v[i],MdagMv[i]);
      G[i][i] = innerProduct(v[i],MdagMv[i]);
    }
--- a/Grid/algorithms/approx/Zolotarev.cc
+++ b/Grid/algorithms/approx/Zolotarev.cc
@@ -293,7 +293,7 @@ static void sncndnFK(INTERNAL_PRECISION u, INTERNAL_PRECISION k,
 * Set type = 0 for the Zolotarev approximation, which is zero at x = 0, and
 * type = 1 for the approximation which is infinite at x = 0. */
-zolotarev_data* zolotarev(ZOLO_PRECISION epsilon, int n, int type) {
+zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
  INTERNAL_PRECISION A, c, cp, kp, ksq, sn, cn, dn, Kp, Kj, z, z0, t, M, F,
    l, invlambda, xi, xisq, *tv, s, opl;
  int m, czero, ts;
@@ -375,12 +375,12 @@ zolotarev_data* zolotarev(ZOLO_PRECISION epsilon, int n, int type) {
  construct_partfrac(d);
  construct_contfrac(d);
-  /* Converting everything to ZOLO_PRECISION for external use only */
+  /* Converting everything to PRECISION for external use only */
  zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
-  zd -> A = (ZOLO_PRECISION) d -> A;
+  zd -> A = (PRECISION) d -> A;
-  zd -> Delta = (ZOLO_PRECISION) d -> Delta;
+  zd -> Delta = (PRECISION) d -> Delta;
-  zd -> epsilon = (ZOLO_PRECISION) d -> epsilon;
+  zd -> epsilon = (PRECISION) d -> epsilon;
  zd -> n = d -> n;
  zd -> type = d -> type;
  zd -> dn = d -> dn;
@@ -390,24 +390,24 @@ zolotarev_data* zolotarev(ZOLO_PRECISION epsilon, int n, int type) {
  zd -> deg_num = d -> deg_num;
  zd -> deg_denom = d -> deg_denom;
-  zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION));
+  zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
-  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m];
+  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
  free(d -> a);
-  zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION));
+  zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
-  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m];
+  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
  free(d -> ap);
-  zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION));
+  zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
-  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m];
+  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
  free(d -> alpha);
-  zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION));
+  zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
-  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m];
+  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
  free(d -> beta);
-  zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION));
+  zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
-  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m];
+  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
  free(d -> gamma);
  free(d);
@@ -426,7 +426,7 @@ void zolotarev_free(zolotarev_data *zdata)
 }
-zolotarev_data* higham(ZOLO_PRECISION epsilon, int n) {
+zolotarev_data* higham(PRECISION epsilon, int n) {
  INTERNAL_PRECISION A, M, c, cp, z, z0, t, epssq;
  int m, czero;
  zolotarev_data *zd;
@@ -481,9 +481,9 @@ zolotarev_data* higham(ZOLO_PRECISION epsilon, int n) {
  /* Converting everything to PRECISION for external use only */
  zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
-  zd -> A = (ZOLO_PRECISION) d -> A;
+  zd -> A = (PRECISION) d -> A;
-  zd -> Delta = (ZOLO_PRECISION) d -> Delta;
+  zd -> Delta = (PRECISION) d -> Delta;
-  zd -> epsilon = (ZOLO_PRECISION) d -> epsilon;
+  zd -> epsilon = (PRECISION) d -> epsilon;
  zd -> n = d -> n;
  zd -> type = d -> type;
  zd -> dn = d -> dn;
@@ -493,24 +493,24 @@ zolotarev_data* higham(ZOLO_PRECISION epsilon, int n) {
  zd -> deg_num = d -> deg_num;
  zd -> deg_denom = d -> deg_denom;
-  zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION));
+  zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
-  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m];
+  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
  free(d -> a);
-  zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION));
+  zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
-  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m];
+  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
  free(d -> ap);
-  zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION));
+  zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
-  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m];
+  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
  free(d -> alpha);
-  zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION));
+  zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
-  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m];
+  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
  free(d -> beta);
-  zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION));
+  zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
-  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m];
+  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
  free(d -> gamma);
  free(d);
@@ -523,17 +523,17 @@ NAMESPACE_END(Grid);
 #ifdef TEST
 #undef ZERO
-#define ZERO ((ZOLO_PRECISION) 0)
+#define ZERO ((PRECISION) 0)
 #undef ONE
-#define ONE ((ZOLO_PRECISION) 1)
+#define ONE ((PRECISION) 1)
 #undef TWO
-#define TWO ((ZOLO_PRECISION) 2)
+#define TWO ((PRECISION) 2)
 /* Evaluate the rational approximation R(x) using the factored form */
-static ZOLO_PRECISION zolotarev_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
+static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) {
  int m;
-  ZOLO_PRECISION R;
+  PRECISION R;
  if (rdata -> type == 0) {
    R = rdata -> A * x;
@@ -551,9 +551,9 @@ static ZOLO_PRECISION zolotarev_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
 /* Evaluate the rational approximation R(x) using the partial fraction form */
-static ZOLO_PRECISION zolotarev_partfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
+static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) {
  int m;
-  ZOLO_PRECISION R = rdata -> alpha[rdata -> da - 1];
+  PRECISION R = rdata -> alpha[rdata -> da - 1];
  for (m = 0; m < rdata -> dd; m++)
    R += rdata -> alpha[m] / (x * x - rdata -> ap[m]);
  if (rdata -> type == 1) R += rdata -> alpha[rdata -> dd] / (x * x);
@@ -568,18 +568,18 @@ static ZOLO_PRECISION zolotarev_partfrac_eval(ZOLO_PRECISION x, zolotarev_data*
 * non-signalling overflow this will work correctly since 1/(1/0) = 1/INF = 0,
 * but with signalling overflow you will get an error message. */
-static ZOLO_PRECISION zolotarev_contfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
+static PRECISION zolotarev_contfrac_eval(PRECISION x, zolotarev_data* rdata) {
  int m;
-  ZOLO_PRECISION R = rdata -> beta[0] * x;
+  PRECISION R = rdata -> beta[0] * x;
  for (m = 1; m < rdata -> db; m++) R = rdata -> beta[m] * x + ONE / R;
  return R;
 }    
 /* Evaluate the rational approximation R(x) using Cayley form */
-static ZOLO_PRECISION zolotarev_cayley_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
+static PRECISION zolotarev_cayley_eval(PRECISION x, zolotarev_data* rdata) {
  int m;
-  ZOLO_PRECISION T;
+  PRECISION T;
  T = rdata -> type == 0 ? ONE : -ONE;
  for (m = 0; m < rdata -> n; m++)
@@ -607,7 +607,7 @@ int main(int argc, char** argv) {
  int m, n, plotpts = 5000, type = 0;
  float eps, x, ypferr, ycferr, ycaylerr, maxypferr, maxycferr, maxycaylerr;
  zolotarev_data *rdata;
-  ZOLO_PRECISION y;
+  PRECISION y;
  FILE *plot_function, *plot_error, 
    *plot_partfrac, *plot_contfrac, *plot_cayley;
@@ -626,13 +626,13 @@ int main(int argc, char** argv) {
  }
  rdata = type == 2 
-    ? higham((ZOLO_PRECISION) eps, n) 
+    ? higham((PRECISION) eps, n) 
-    : zolotarev((ZOLO_PRECISION) eps, n, type);
+    : zolotarev((PRECISION) eps, n, type);
  printf("Zolotarev Test: R(epsilon = %g, n = %d, type = %d)\n\t" 
 	 STRINGIFY(VERSION) "\n\t" STRINGIFY(HVERSION)
 	 "\n\tINTERNAL_PRECISION = " STRINGIFY(INTERNAL_PRECISION)
-	 "\tZOLO_PRECISION = " STRINGIFY(ZOLO_PRECISION)
+	 "\tPRECISION = " STRINGIFY(PRECISION)
 	 "\n\n\tRational approximation of degree (%d,%d), %s at x = 0\n"
 	 "\tDelta = %g (maximum error)\n\n"
 	 "\tA = %g (overall factor)\n",
@@ -681,15 +681,15 @@ int main(int argc, char** argv) {
    x = 2.4 * (float) m / plotpts - 1.2;
    if (rdata -> type == 0 || fabs(x) * (float) plotpts > 1.0) {
      /* skip x = 0 for type 1, as R(0) is singular */
-      y = zolotarev_eval((ZOLO_PRECISION) x, rdata);
+      y = zolotarev_eval((PRECISION) x, rdata);
      fprintf(plot_function, "%g %g\n", x, (float) y);
      fprintf(plot_error, "%g %g\n",
 	      x, (float)((y - ((x > 0.0 ? ONE : -ONE))) / rdata -> Delta));
-      ypferr = (float)((zolotarev_partfrac_eval((ZOLO_PRECISION) x, rdata) - y)
+      ypferr = (float)((zolotarev_partfrac_eval((PRECISION) x, rdata) - y)
 		       / rdata -> Delta);
-      ycferr = (float)((zolotarev_contfrac_eval((ZOLO_PRECISION) x, rdata) - y)
+      ycferr = (float)((zolotarev_contfrac_eval((PRECISION) x, rdata) - y)
 		       / rdata -> Delta);
-      ycaylerr = (float)((zolotarev_cayley_eval((ZOLO_PRECISION) x, rdata) - y)
+      ycaylerr = (float)((zolotarev_cayley_eval((PRECISION) x, rdata) - y)
 		       / rdata -> Delta);
      if (fabs(x) < 1.0 && fabs(x) > rdata -> epsilon) {
 	maxypferr = MAX(maxypferr, fabs(ypferr));
--- a/Grid/algorithms/approx/Zolotarev.h
+++ b/Grid/algorithms/approx/Zolotarev.h
@@ -9,10 +9,10 @@ NAMESPACE_BEGIN(Approx);
 #define HVERSION Header Time-stamp: <14-OCT-2004 09:26:51.00 adk@MISSCONTRARY>
 #ifndef ZOLOTAREV_INTERNAL
-#ifndef ZOLO_PRECISION
+#ifndef PRECISION
-#define ZOLO_PRECISION double
+#define PRECISION double
 #endif
-#define ZPRECISION ZOLO_PRECISION
+#define ZPRECISION PRECISION
 #define ZOLOTAREV_DATA zolotarev_data
 #endif
@@ -77,8 +77,8 @@ typedef struct {
 * zolotarev_data structure. The arguments must satisfy the constraints that
 * epsilon > 0, n > 0, and type = 0 or 1. */
-ZOLOTAREV_DATA* higham(ZOLO_PRECISION epsilon, int n) ;
+ZOLOTAREV_DATA* higham(PRECISION epsilon, int n) ;
-ZOLOTAREV_DATA* zolotarev(ZOLO_PRECISION epsilon, int n, int type);
+ZOLOTAREV_DATA* zolotarev(PRECISION epsilon, int n, int type);
 void zolotarev_free(zolotarev_data *zdata);
 #endif
@@ -86,4 +86,3 @@ void zolotarev_free(zolotarev_data *zdata);
 NAMESPACE_END(Approx);
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/algorithms/blas/BatchedBlas.cc
+++ b/Grid/algorithms/blas/BatchedBlas.cc
@@ -1,34 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: BatchedBlas.cc
    Copyright (C) 2023
 Author: Peter Boyle <pboyle@bnl.gov>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/GridCore.h>
 #include <Grid/algorithms/blas/BatchedBlas.h>
 NAMESPACE_BEGIN(Grid);
 gridblasHandle_t GridBLAS::gridblasHandle;
 int              GridBLAS::gridblasInit;
 NAMESPACE_END(Grid);
--- a/Grid/algorithms/blas/BatchedBlas.h
+++ b/Grid/algorithms/blas/BatchedBlas.h
@@ -1,727 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: BatchedBlas.h
    Copyright (C) 2023
 Author: Peter Boyle <pboyle@bnl.gov>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 #ifdef GRID_HIP
 #include <hipblas/hipblas.h>
 #endif
 #ifdef GRID_CUDA
 #include <cublas_v2.h>
 #endif
 #ifdef GRID_SYCL
 #include <oneapi/mkl.hpp>
 #endif
 #if 0
 #define GRID_ONE_MKL
 #endif
 #ifdef GRID_ONE_MKL
 #include <oneapi/mkl.hpp>
 #endif
 ///////////////////////////////////////////////////////////////////////	  
 // Need to rearrange lattice data to be in the right format for a
 // batched multiply. Might as well make these static, dense packed
 ///////////////////////////////////////////////////////////////////////
 NAMESPACE_BEGIN(Grid);
 #ifdef GRID_HIP
  typedef hipblasHandle_t gridblasHandle_t;
 #endif
 #ifdef GRID_CUDA
  typedef cublasHandle_t gridblasHandle_t;
 #endif
 #ifdef GRID_SYCL
  typedef cl::sycl::queue *gridblasHandle_t;
 #endif
 #ifdef GRID_ONE_MKL
  typedef cl::sycl::queue *gridblasHandle_t;
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
  typedef int32_t gridblasHandle_t;
 #endif
 enum GridBLASOperation_t { GridBLAS_OP_N, GridBLAS_OP_T, GridBLAS_OP_C } ;
 class GridBLAS {
 public:
  static gridblasHandle_t gridblasHandle;
  static int            gridblasInit;
  // One-time set-up of the shared BLAS handle for whichever backend is compiled in.
  // Guarded by gridblasInit, so repeated calls after the first are no-ops.
  // The status returned by the vendor handle-creation calls is asserted, in the
  // same way the gemmBatched members assert on the status of each BLAS call.
  static void Init(void)
  {
    if ( ! gridblasInit ) {
 #ifdef GRID_CUDA
      std::cout << "cublasCreate"<<std::endl;
      auto create_err = cublasCreate(&gridblasHandle);
      assert(create_err==CUBLAS_STATUS_SUCCESS);
      // alpha/beta are handed to cuBLAS as device pointers by gemmBatched
      auto mode_err = cublasSetPointerMode(gridblasHandle, CUBLAS_POINTER_MODE_DEVICE);
      assert(mode_err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_HIP
      std::cout << "hipblasCreate"<<std::endl;
      auto create_err = hipblasCreate(&gridblasHandle);
      assert(create_err==HIPBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
      // Reuse the existing accelerator queue as the handle
      gridblasHandle = theGridAccelerator;
 #endif
 #ifdef GRID_ONE_MKL
      // Dedicated queue on a CPU device
      cl::sycl::cpu_selector selector;
      cl::sycl::device selectedDevice { selector };
      gridblasHandle =new sycl::queue (selectedDevice);
 #endif
      gridblasInit=1;
    }
  }
  // Constructing any GridBLAS object runs Init(), which sets the handle up only once
  GridBLAS()
  {
    Init();
  }
  // Nothing to release here
  ~GridBLAS() {}
  /////////////////////////////////////////////////////////////////////////////////////
  // BLAS GEMM conventions:
  /////////////////////////////////////////////////////////////////////////////////////
  // - C = alpha A * B + beta C
  // Dimensions:
  // - C_m.n
  // - A_m.k
  // - B_k.n
  // - Flops = 8 M N K
  // - Bytes = 2*sizeof(word) * (MN+MK+KN)
  // Example: M=K=60, N=12
  // Flop/Byte = 8 . 60.60.12 / (60.12+60.60+60.12)/16 = 4 so expect about 4 TF/s on a GCD
  /////////////////////////////////////////////////////////////////////////////////////
  // Issue the compiled-in backend's synchronisation call.
  // HIP/CUDA: device-wide synchronise, status asserted.
  // SYCL: accelerator_barrier().  oneMKL: wait on the handle's queue.
  // With no backend compiled in this does nothing.
  void synchronise(void)
  {
 #ifdef GRID_HIP
    auto hip_status = hipDeviceSynchronize();
    assert(hip_status==hipSuccess);
 #endif
 #ifdef GRID_CUDA
    auto cuda_status = cudaDeviceSynchronize();
    assert(cuda_status==cudaSuccess);
 #endif
 #ifdef GRID_SYCL
    accelerator_barrier();
 #endif
 #ifdef GRID_ONE_MKL
    gridblasHandle->wait();
 #endif
  }
  void gemmBatched(int m,int n, int k,
 		   ComplexD alpha,
 		   deviceVector<ComplexD*> &Amk,  // pointer list to matrices
 		   deviceVector<ComplexD*> &Bkn,
 		   ComplexD beta,
 		   deviceVector<ComplexD*> &Cmn)
  {
    gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
 		m,n,k,
 		alpha,
 		Amk,
 		Bkn,
 		beta,
 		Cmn);
  }
  void gemmBatched(int m,int n, int k,
 		   ComplexF alpha,
 		   deviceVector<ComplexF*> &Amk,  // pointer list to matrices
 		   deviceVector<ComplexF*> &Bkn,
 		   ComplexF beta,
 		   deviceVector<ComplexF*> &Cmn)
  {
    gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
 		m,n,k,
 		alpha,
 		Amk,
 		Bkn,
 		beta,
 		Cmn);
  }
  void gemmBatched(int m,int n, int k,
 		   RealD alpha,
 		   deviceVector<RealD*> &Amk,  // pointer list to matrices
 		   deviceVector<RealD*> &Bkn,
 		   RealD beta,
 		   deviceVector<RealD*> &Cmn)
  {
    gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
 		m,n,k,
 		alpha,
 		Amk,
 		Bkn,
 		beta,
 		Cmn);
  }
  void gemmBatched(int m,int n, int k,
 		   RealF alpha,
 		   deviceVector<RealF*> &Amk,  // pointer list to matrices
 		   deviceVector<RealF*> &Bkn,
 		   RealF beta,
 		   deviceVector<RealF*> &Cmn)
  {
    gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
 		m,n,k,
 		alpha,
 		Amk,
 		Bkn,
 		beta,
 		Cmn);
  }
  ///////////////////////////////////////////////////////////////////////////
  // Double precision complex batched GEMM
  //   Cmn[p] = alpha * op(Amk[p]) * op(Bkn[p]) + beta * Cmn[p]    for every batch entry p
  // op() is the identity (OP_N), transpose (OP_T) or conjugate transpose (OP_C).
  // Matrices are column major and dense packed; an operand that is not OP_N is
  // stored with the swapped shape (A as k x m, B as n x k), hence lda/ldb below.
  // The three pointer lists must have the same length (asserted).
  // NOTE: under GRID_SYCL no multiply is built, only a compile time warning.
  ///////////////////////////////////////////////////////////////////////////
  void gemmBatched(GridBLASOperation_t OpA,
                   GridBLASOperation_t OpB,
                   int m,int n, int k,
                   ComplexD alpha,
                   deviceVector<ComplexD*> &Amk,  // pointer list to matrices
                   deviceVector<ComplexD*> &Bkn,
                   ComplexD beta,
                   deviceVector<ComplexD*> &Cmn)
  {
    RealD t2=usecond();
    int32_t batchCount = Amk.size();
    assert(Bkn.size()==batchCount);
    assert(Cmn.size()==batchCount);
    int lda = m; // m x k column major
    int ldb = k; // k x n column major
    int ldc = m; // m x n column major
    if(OpA!=GridBLAS_OP_N)
      lda = k;   // A stored as k x m
    if(OpB!=GridBLAS_OP_N)
      ldb = n;   // B stored as n x k
    static deviceVector<ComplexD> alpha_p(1);
    static deviceVector<ComplexD> beta_p(1);
    // can prestore the 1 and the zero on device
    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
    RealD t0=usecond();
    //    std::cout << "ZgemmBatched mnk  "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
 #ifdef GRID_HIP
    // Default to OP_N so the vendor enums are never left uninitialised
    hipblasOperation_t hOpA = HIPBLAS_OP_N;
    hipblasOperation_t hOpB = HIPBLAS_OP_N;
    if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
    if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
    auto err = hipblasZgemmBatched(gridblasHandle,
                                   hOpA,
                                   hOpB,
                                   m,n,k,
                                   (hipblasDoubleComplex *) &alpha_p[0],
                                   (hipblasDoubleComplex **)&Amk[0], lda,
                                   (hipblasDoubleComplex **)&Bkn[0], ldb,
                                   (hipblasDoubleComplex *) &beta_p[0],
                                   (hipblasDoubleComplex **)&Cmn[0], ldc,
                                   batchCount);
    //	 std::cout << " hipblas return code " <<(int)err<<std::endl;
    assert(err==HIPBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_CUDA
    // Default to OP_N so the vendor enums are never left uninitialised
    cublasOperation_t hOpA = CUBLAS_OP_N;
    cublasOperation_t hOpB = CUBLAS_OP_N;
    if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
    if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
    if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
    if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
    auto err = cublasZgemmBatched(gridblasHandle,
                                  hOpA,
                                  hOpB,
                                  m,n,k,
                                  (cuDoubleComplex *) &alpha_p[0],
                                  (cuDoubleComplex **)&Amk[0], lda,
                                  (cuDoubleComplex **)&Bkn[0], ldb,
                                  (cuDoubleComplex *) &beta_p[0],
                                  (cuDoubleComplex **)&Cmn[0], ldc,
                                  batchCount);
    assert(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
    //MKL’s cblas_<T>gemm_batch & OneAPI
 #warning "oneMKL implementation not built "
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
    // Default/reference implementation, run on the host.
    // Honours OpA/OpB: a transposed operand is read with swapped indices
    // and OP_C additionally conjugates the element.
    for (int p = 0; p < batchCount; ++p) {
      for (int mm = 0; mm < m; ++mm) {
        for (int nn = 0; nn < n; ++nn) {
          ComplexD c_mn(0.0);
          for (int kk = 0; kk < k; ++kk) {
            ComplexD a_mk = (OpA==GridBLAS_OP_N) ? Amk[p][mm + kk*lda] : Amk[p][kk + mm*lda];
            ComplexD b_kn = (OpB==GridBLAS_OP_N) ? Bkn[p][kk + nn*ldb] : Bkn[p][nn + kk*ldb];
            if (OpA==GridBLAS_OP_C) a_mk = conj(a_mk);
            if (OpB==GridBLAS_OP_C) b_kn = conj(b_kn);
            c_mn += a_mk * b_kn;
          }
          Cmn[p][mm + nn*ldc] =  (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
        }
      }
    }
 #endif
    //    synchronise();
     // Timing / throughput figures feed the (disabled) log lines below
     RealD t1=usecond();
     RealD flops = 8.0*m*n*k*batchCount;
     RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount;
     //     std::cout <<GridLogMessage<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
     //     std::cout <<GridLogMessage<< " batched Blas zGemm call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
     //     std::cout <<GridLogMessage<< " batched Blas zGemm call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
  }
  ///////////////////////////////////////////////////////////////////////////
  // Single precision complex batched GEMM
  //   Cmn[p] = alpha * op(Amk[p]) * op(Bkn[p]) + beta * Cmn[p]    for every batch entry p
  // op() is the identity (OP_N), transpose (OP_T) or conjugate transpose (OP_C).
  // Matrices are column major and dense packed; an operand that is not OP_N is
  // stored with the swapped shape (A as k x m, B as n x k), hence lda/ldb below.
  // The three pointer lists must have the same length (asserted).
  // NOTE: under GRID_SYCL no multiply is built, only a compile time warning.
  ///////////////////////////////////////////////////////////////////////////
  void gemmBatched(GridBLASOperation_t OpA,
                   GridBLASOperation_t OpB,
                   int m,int n, int k,
                   ComplexF alpha,
                   deviceVector<ComplexF*> &Amk,  // pointer list to matrices
                   deviceVector<ComplexF*> &Bkn,
                   ComplexF beta,
                   deviceVector<ComplexF*> &Cmn)
  {
    RealD t2=usecond();
    int32_t batchCount = Amk.size();
    int lda = m; // m x k column major
    int ldb = k; // k x n column major
    int ldc = m; // m x n column major
    if(OpA!=GridBLAS_OP_N)
      lda = k;   // A stored as k x m
    if(OpB!=GridBLAS_OP_N)
      ldb = n;   // B stored as n x k
    static deviceVector<ComplexF> alpha_p(1);
    static deviceVector<ComplexF> beta_p(1);
    // can prestore the 1 and the zero on device
    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexF));
    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF));
    RealD t0=usecond();
    assert(Bkn.size()==batchCount);
    assert(Cmn.size()==batchCount);
 #ifdef GRID_HIP
    // Default to OP_N so the vendor enums are never left uninitialised
    hipblasOperation_t hOpA = HIPBLAS_OP_N;
    hipblasOperation_t hOpB = HIPBLAS_OP_N;
    if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
    if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
    auto err = hipblasCgemmBatched(gridblasHandle,
                                   hOpA,
                                   hOpB,
                                   m,n,k,
                                   (hipblasComplex *) &alpha_p[0],
                                   (hipblasComplex **)&Amk[0], lda,
                                   (hipblasComplex **)&Bkn[0], ldb,
                                   (hipblasComplex *) &beta_p[0],
                                   (hipblasComplex **)&Cmn[0], ldc,
                                   batchCount);
    assert(err==HIPBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_CUDA
    // Default to OP_N so the vendor enums are never left uninitialised
    cublasOperation_t hOpA = CUBLAS_OP_N;
    cublasOperation_t hOpB = CUBLAS_OP_N;
    if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
    if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
    if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
    if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
    auto err = cublasCgemmBatched(gridblasHandle,
                                  hOpA,
                                  hOpB,
                                  m,n,k,
                                  (cuComplex *) &alpha_p[0],
                                  (cuComplex **)&Amk[0], lda,
                                  (cuComplex **)&Bkn[0], ldb,
                                  (cuComplex *) &beta_p[0],
                                  (cuComplex **)&Cmn[0], ldc,
                                  batchCount);
    assert(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
    //MKL’s cblas_<T>gemm_batch & OneAPI
 #warning "oneMKL implementation not built "
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
    // Default/reference implementation, run on the host.
    // Honours OpA/OpB: a transposed operand is read with swapped indices
    // and OP_C additionally conjugates the element.
    for (int p = 0; p < batchCount; ++p) {
      for (int mm = 0; mm < m; ++mm) {
        for (int nn = 0; nn < n; ++nn) {
          ComplexF c_mn(0.0);
          for (int kk = 0; kk < k; ++kk) {
            ComplexF a_mk = (OpA==GridBLAS_OP_N) ? Amk[p][mm + kk*lda] : Amk[p][kk + mm*lda];
            ComplexF b_kn = (OpB==GridBLAS_OP_N) ? Bkn[p][kk + nn*ldb] : Bkn[p][nn + kk*ldb];
            if (OpA==GridBLAS_OP_C) a_mk = conj(a_mk);
            if (OpB==GridBLAS_OP_C) b_kn = conj(b_kn);
            c_mn += a_mk * b_kn;
          }
          Cmn[p][mm + nn*ldc] =  (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
        }
      }
    }
 #endif
     // Timing / throughput figures, currently not reported anywhere
     RealD t1=usecond();
     RealD flops = 8.0*m*n*k*batchCount;
     RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n)*batchCount;
  }
  ///////////////////////////////////////////////////////////////////////////
  // Single precision real GEMM
  //   Cmn[p] = alpha * op(Amk[p]) * op(Bkn[p]) + beta * Cmn[p]    for every batch entry p
  // op() is the identity (OP_N) or transpose (OP_T / OP_C; for real data the
  // host reference path treats OP_C as a plain transpose).
  // Matrices are column major and dense packed; an operand that is not OP_N is
  // stored with the swapped shape (A as k x m, B as n x k), hence lda/ldb below.
  // The three pointer lists must have the same length (asserted).
  // NOTE: under GRID_SYCL no multiply is built, only a compile time warning.
  ///////////////////////////////////////////////////////////////////////////
  void gemmBatched(GridBLASOperation_t OpA,
                   GridBLASOperation_t OpB,
                   int m,int n, int k,
                   RealF alpha,
                   deviceVector<RealF*> &Amk,  // pointer list to matrices
                   deviceVector<RealF*> &Bkn,
                   RealF beta,
                   deviceVector<RealF*> &Cmn)
  {
    RealD t2=usecond();
    int32_t batchCount = Amk.size();
    int lda = m; // m x k column major
    int ldb = k; // k x n column major
    int ldc = m; // m x n column major
    if(OpA!=GridBLAS_OP_N)
      lda = k;   // A stored as k x m
    if(OpB!=GridBLAS_OP_N)
      ldb = n;   // B stored as n x k
    static deviceVector<RealF> alpha_p(1);
    static deviceVector<RealF> beta_p(1);
    // can prestore the 1 and the zero on device
    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealF));
    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealF));
    RealD t0=usecond();
    assert(Bkn.size()==batchCount);
    assert(Cmn.size()==batchCount);
 #ifdef GRID_HIP
    // Default to OP_N so the vendor enums are never left uninitialised
    hipblasOperation_t hOpA = HIPBLAS_OP_N;
    hipblasOperation_t hOpB = HIPBLAS_OP_N;
    if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
    if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
    auto err = hipblasSgemmBatched(gridblasHandle,
                                   hOpA,
                                   hOpB,
                                   m,n,k,
                                   (float *) &alpha_p[0],
                                   (float **)&Amk[0], lda,
                                   (float **)&Bkn[0], ldb,
                                   (float *) &beta_p[0],
                                   (float **)&Cmn[0], ldc,
                                   batchCount);
    assert(err==HIPBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_CUDA
    // Default to OP_N so the vendor enums are never left uninitialised
    cublasOperation_t hOpA = CUBLAS_OP_N;
    cublasOperation_t hOpB = CUBLAS_OP_N;
    if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
    if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
    if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
    if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
    auto err = cublasSgemmBatched(gridblasHandle,
                                  hOpA,
                                  hOpB,
                                  m,n,k,
                                  (float *) &alpha_p[0],
                                  (float **)&Amk[0], lda,
                                  (float **)&Bkn[0], ldb,
                                  (float *) &beta_p[0],
                                  (float **)&Cmn[0], ldc,
                                  batchCount);
    assert(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
    //MKL’s cblas_<T>gemm_batch & OneAPI
 #warning "oneMKL implementation not built "
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
    // Default/reference implementation, run on the host.
    // Honours OpA/OpB: a transposed operand is read with swapped indices.
    for (int p = 0; p < batchCount; ++p) {
      for (int mm = 0; mm < m; ++mm) {
        for (int nn = 0; nn < n; ++nn) {
          RealD c_mn(0.0); // accumulate the dot product in double precision
          for (int kk = 0; kk < k; ++kk) {
            RealF a_mk = (OpA==GridBLAS_OP_N) ? Amk[p][mm + kk*lda] : Amk[p][kk + mm*lda];
            RealF b_kn = (OpB==GridBLAS_OP_N) ? Bkn[p][kk + nn*ldb] : Bkn[p][nn + kk*ldb];
            c_mn += a_mk * b_kn;
          }
          Cmn[p][mm + nn*ldc] =  (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
        }
      }
    }
 #endif
     // Timing / throughput figures, currently not reported anywhere
     RealD t1=usecond();
     RealD flops = 2.0*m*n*k*batchCount;
     RealD bytes = 1.0*sizeof(RealF)*(m*k+k*n+m*n)*batchCount;
  }
  ///////////////////////////////////////////////////////////////////////////
  // Double precision real GEMM
  ///////////////////////////////////////////////////////////////////////////
  // Batched double precision real GEMM on column-major matrices:
  //   Cmn[p] = alpha * op(Amk[p]) * op(Bkn[p]) + beta * Cmn[p]   for every p in the batch.
  //
  //   OpA, OpB : operation applied to A and B (GridBLAS_OP_N / _T / _C).
  //   m, n, k  : op(A) is m x k, op(B) is k x n, C is m x n.
  //   Amk, Bkn, Cmn : one matrix pointer per batch entry; the three lists must
  //                   have the same length (asserted below).
  //
  // Leading dimensions are derived from the shapes (no padding): a transposed
  // operand is stored as k x m (A) or n x k (B).
  void gemmBatched(GridBLASOperation_t OpA,
                   GridBLASOperation_t OpB,
                   int m,int n, int k,
                   RealD alpha,
                   deviceVector<RealD*> &Amk,  // pointer list to matrices
                   deviceVector<RealD*> &Bkn,
                   RealD beta,
                   deviceVector<RealD*> &Cmn)
  {
    // NOTE(review): t0/t1/t2, flops and bytes are computed but never reported
    // here; kept to mirror the other precisions' bookkeeping.
    RealD t2=usecond();
    int32_t batchCount = Amk.size();
    int lda = m; // m x k column major
    int ldb = k; // k x n column major
    int ldc = m; // m x n column major
    if(OpA!=GridBLAS_OP_N)
      lda = k;   // A stored as k x m when transposed
    if(OpB!=GridBLAS_OP_N)
      ldb = n;   // B stored as n x k when transposed
    static deviceVector<RealD> alpha_p(1);
    static deviceVector<RealD> beta_p(1);
    // can prestore the 1 and the zero on device
    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealD));
    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealD));
    RealD t0=usecond();
    assert(Bkn.size()==batchCount);
    assert(Cmn.size()==batchCount);
 #ifdef GRID_HIP
    // Default to "no transpose" so the handles are never left uninitialised.
    hipblasOperation_t hOpA = HIPBLAS_OP_N;
    hipblasOperation_t hOpB = HIPBLAS_OP_N;
    if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
    if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
    if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
    if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
    // Pass the requested operations: lda/ldb above already assume them.
    auto err = hipblasDgemmBatched(gridblasHandle,
                                   hOpA,
                                   hOpB,
                                   m,n,k,
                                   (double *) &alpha_p[0],
                                   (double **)&Amk[0], lda,
                                   (double **)&Bkn[0], ldb,
                                   (double *) &beta_p[0],
                                   (double **)&Cmn[0], ldc,
                                   batchCount);
    assert(err==HIPBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_CUDA
    // Default to "no transpose" so the handles are never left uninitialised.
    cublasOperation_t hOpA = CUBLAS_OP_N;
    cublasOperation_t hOpB = CUBLAS_OP_N;
    if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
    if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
    if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
    if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
    if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
    if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
    auto err = cublasDgemmBatched(gridblasHandle,
                                  hOpA,
                                  hOpB,
                                  m,n,k,
                                  (double *) &alpha_p[0],
                                  (double **)&Amk[0], lda,
                                  (double **)&Bkn[0], ldb,
                                  (double *) &beta_p[0],
                                  (double **)&Cmn[0], ldc,
                                  batchCount);
    assert(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
    /*
      int64_t m64=m;
      int64_t n64=n;
      int64_t k64=k;
      int64_t batchCount64=batchCount;
      oneapi::mkl::blas::column_major::gemm_batch(*theGridAccelerator,
      onemkl::transpose::N,
      onemkl::transpose::N,
      &m64,&n64,&k64,
      (double *) &alpha_p[0],
      (double **)&Amk[0], lda,
      (double **)&Bkn[0], ldb,
      (double *) &beta_p[0],
      (double **)&Cmn[0], ldc,
      1,&batchCount64);
     */
    //MKL’s cblas_<T>gemm_batch & OneAPI
 #warning "oneMKL implementation not built "
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
    // Default/reference implementation.
    // For real data the conjugate transpose is the plain transpose, so any
    // operation other than OP_N indexes the operand transposed.
    const bool transA = (OpA != GridBLAS_OP_N);
    const bool transB = (OpB != GridBLAS_OP_N);
    for (int p = 0; p < batchCount; ++p) {
      for (int mm = 0; mm < m; ++mm) {
        for (int nn = 0; nn < n; ++nn) {
          RealD c_mn(0.0);
          for (int kk = 0; kk < k; ++kk) {
            // op(A)(mm,kk) and op(B)(kk,nn) in column-major storage
            RealD a_mk = transA ? Amk[p][kk + mm*lda] : Amk[p][mm + kk*lda];
            RealD b_kn = transB ? Bkn[p][nn + kk*ldb] : Bkn[p][kk + nn*ldb];
            c_mn += a_mk * b_kn;
          }
          Cmn[p][mm + nn*ldc] =  (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
        }
      }
    }
 #endif
     RealD t1=usecond();
     RealD flops = 2.0*m*n*k*batchCount;
     RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
  }
  ////////////////////////////////////////////////////////////////////////////////////////////////
  // Strided case used by benchmark, but generally unused in Grid
  // Keep a code example in double complex, but don't generate the single and real variants for now
  ////////////////////////////////////////////////////////////////////////////////////////////////
  // Strided batched double complex GEMM, no transposition:
  //   C_p = alpha * A_p * B_p + beta * C_p   for p = 0 .. batchCount-1
  // All matrices are column major and densely packed; batch entry p of each
  // operand starts p*sda / p*sdb / p*sdc elements after the base pointer.
  //
  //   m, n, k       : A_p is m x k, B_p is k x n, C_p is m x n
  //   Amk, Bkn, Cmn : base pointers of the contiguous batches
  //   batchCount    : number of matrix products to perform
  void gemmStridedBatched(int m,int n, int k,
                          ComplexD alpha,
                          ComplexD* Amk,  // base pointer of the strided A batch
                          ComplexD* Bkn,
                          ComplexD beta,
                          ComplexD* Cmn,
                          int batchCount)
  {
    // Column-major storage; both operands are passed with OP_N below
    int lda = m; // m x k column major
    int ldb = k; // k x n column major
    int ldc = m; // m x n column major
    int sda = m*k; // elements between consecutive A matrices
    int sdb = k*n; // elements between consecutive B matrices
    int sdc = m*n; // elements between consecutive C matrices
    deviceVector<ComplexD> alpha_p(1);
    deviceVector<ComplexD> beta_p(1);
    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
    //    std::cout << "blasZgemmStridedBatched mnk  "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
    //    std::cout << "blasZgemmStridedBatched ld   "<<lda<<","<<ldb<<","<<ldc<<std::endl;
    //    std::cout << "blasZgemmStridedBatched sd   "<<sda<<","<<sdb<<","<<sdc<<std::endl;
 #ifdef GRID_HIP
    auto err = hipblasZgemmStridedBatched(gridblasHandle,
                                          HIPBLAS_OP_N,
                                          HIPBLAS_OP_N,
                                          m,n,k,
                                          (hipblasDoubleComplex *) &alpha_p[0],
                                          (hipblasDoubleComplex *) Amk, lda, sda,
                                          (hipblasDoubleComplex *) Bkn, ldb, sdb,
                                          (hipblasDoubleComplex *) &beta_p[0],
                                          (hipblasDoubleComplex *) Cmn, ldc, sdc,
                                          batchCount);
    assert(err==HIPBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_CUDA
    // Check the status like the HIP branch does instead of dropping it.
    auto err = cublasZgemmStridedBatched(gridblasHandle,
                                         CUBLAS_OP_N,
                                         CUBLAS_OP_N,
                                         m,n,k,
                                         (cuDoubleComplex *) &alpha_p[0],
                                         (cuDoubleComplex *) Amk, lda, sda,
                                         (cuDoubleComplex *) Bkn, ldb, sdb,
                                         (cuDoubleComplex *) &beta_p[0],
                                         (cuDoubleComplex *) Cmn, ldc, sdc,
                                         batchCount);
    assert(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #if defined(GRID_SYCL) || defined(GRID_ONE_MKL)
    oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
                                                oneapi::mkl::transpose::N,
                                                oneapi::mkl::transpose::N,
                                                m,n,k,
                                                alpha,
                                                (const ComplexD *)Amk,lda,sda,
                                                (const ComplexD *)Bkn,ldb,sdb,
                                                beta,
                                                (ComplexD *)Cmn,ldc,sdc,
                                                batchCount);
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
     // Default/reference implementation: plain triple loop per batch entry
     for (int p = 0; p < batchCount; ++p) {
       for (int mm = 0; mm < m; ++mm) {
         for (int nn = 0; nn < n; ++nn) {
           ComplexD c_mn(0.0);
           for (int kk = 0; kk < k; ++kk)
             c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
           Cmn[mm + nn*ldc + p*sdc] =  (alpha)*c_mn + (beta)*Cmn[mm + nn*ldc + p*sdc];
         }
       }
     }
 #endif
  }
  double benchmark(int M, int N, int K, int BATCH)
  {
    int32_t N_A = M*K*BATCH;
    int32_t N_B = K*N*BATCH;
    int32_t N_C = M*N*BATCH;
    deviceVector<ComplexD> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD));
    deviceVector<ComplexD> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD));
    deviceVector<ComplexD> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD));
    ComplexD alpha(1.0);
    ComplexD beta (1.0);
    RealD flops = 8.0*M*N*K*BATCH;
    int ncall=10;
    RealD t0 = usecond();
    for(int i=0;i<ncall;i++){
      gemmStridedBatched(M,N,K,
 			 alpha,
 			 &A[0], // m x k 
 			 &B[0], // k x n
 			 beta, 
 			 &C[0], // m x n
 			 BATCH);
    }
    synchronise();
    RealD t1 = usecond();
    RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K)*BATCH;
    flops = 8.0*M*N*K*BATCH*ncall;
    flops = flops/(t1-t0)/1.e3;
    return flops; // Returns gigaflops
  }
 };
 NAMESPACE_END(Grid);
--- a/Grid/allocator/AlignedAllocator.h
+++ b/Grid/allocator/AlignedAllocator.h
@@ -176,7 +176,6 @@ template<class T> using cshiftAllocator = std::allocator<T>;
 template<class T> using Vector        = std::vector<T,uvmAllocator<T> >;           
 template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;           
 template<class T> using commVector = std::vector<T,devAllocator<T> >;
 template<class T> using deviceVector  = std::vector<T,devAllocator<T> >;
 template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;
 NAMESPACE_END(Grid);
--- a/Grid/cshift/Cshift_common.h
+++ b/Grid/cshift/Cshift_common.h
@@ -29,27 +29,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
-extern std::vector<std::pair<int,int> > Cshift_table; 
+extern Vector<std::pair<int,int> > Cshift_table; 
 extern commVector<std::pair<int,int> > Cshift_table_device; 
 // Returns the pointer to the (source,destination) index table that the
 // cshift gather/scatter/copy loops should read.
 //  - With ACCELERATOR_CSHIFT: Cshift_table_device is resized to match
 //    Cshift_table, the whole host table is copied into it, and the device
 //    table's pointer is returned.
 //  - Otherwise: the host table is used directly (identity map).
 inline std::pair<int,int> *MapCshiftTable(void)
 {
 #ifdef ACCELERATOR_CSHIFT    
   // GPU version
   const uint64_t entries = Cshift_table.size();
   if ( entries != Cshift_table_device.size() ) {
     Cshift_table_device.resize(entries);
   }
   auto host_p   = &Cshift_table[0];
   auto device_p = &Cshift_table_device[0];
   acceleratorCopyToDevice((void *)host_p,
                           (void *)device_p,
                           sizeof(Cshift_table[0])*entries);
   return device_p;
 #else 
   // CPU version uses the identity map
   return &Cshift_table[0];
 #endif
 }
 ///////////////////////////////////////////////////////////////////
 // Gather for when there is no need to SIMD split 
 ///////////////////////////////////////////////////////////////////
@@ -93,7 +74,7 @@ Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dim
  }
  {
    auto buffer_p = & buffer[0];
-    auto table = MapCshiftTable();
+    auto table = &Cshift_table[0];
 #ifdef ACCELERATOR_CSHIFT    
    autoView(rhs_v , rhs, AcceleratorRead);
    accelerator_for(i,ent,vobj::Nsimd(),{
@@ -244,7 +225,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<
  {
    auto buffer_p = & buffer[0];
-    auto table = MapCshiftTable();
+    auto table = &Cshift_table[0];
 #ifdef ACCELERATOR_CSHIFT    
    autoView( rhs_v, rhs, AcceleratorWrite);
    accelerator_for(i,ent,vobj::Nsimd(),{
@@ -316,6 +297,30 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
  }
 }
 #if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
 // Integer division rounded up: returns a / b, plus one when the division
 // leaves a remainder. For non-negative a and positive b this is ceil(a/b).
 template <typename T>
 T iDivUp(T a, T b)
 {
   T quotient = a / b;
   if (a % b != 0) {
     quotient = quotient + 1; // partial chunk needs one more unit
   }
   return quotient;
 }
 // Kernel filling a table of index pairs, one pair per thread.
 // Thread idx (flattened from block and thread index) maps to row idx / e2
 // and column idx % e2 of an e1 x e2 plane; with o = row*stride + column
 // it stores lo + o and ro + o in two consecutive slots of `vector`.
 // Threads with idx >= e1*e2 do nothing.
 template <typename T>
 __global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride)
 {
   const int tid = blockIdx.x*blockDim.x + threadIdx.x;
   if (tid >= e1*e2) return; // past the end of the plane
   const int row    = tid / e2;
   const int col    = tid % e2;
   const int offset = row*stride + col;
   T* entry = vector + 2*tid; // two T values per table entry
   entry[0] = lo + offset;
   entry[1] = ro + offset;
 }
 #endif
 //////////////////////////////////////////////////////
 // local to node block strided copies
 //////////////////////////////////////////////////////
@@ -340,12 +345,20 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
  int ent=0;
  if(cbmask == 0x3 ){
 #if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
    ent = e1*e2;
    dim3 blockSize(acceleratorThreads());
    dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
    populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
    accelerator_barrier();
 #else
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
        int o =n*stride+b;
 	Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
      }
    }
 #endif
  } else { 
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
@@ -359,7 +372,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
  }
  {
-    auto table = MapCshiftTable();
+    auto table = &Cshift_table[0];
 #ifdef ACCELERATOR_CSHIFT    
    autoView(rhs_v , rhs, AcceleratorRead);
    autoView(lhs_v , lhs, AcceleratorWrite);
@@ -396,11 +409,19 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
  int ent=0;
  if ( cbmask == 0x3 ) {
 #if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
    ent = e1*e2;
    dim3 blockSize(acceleratorThreads());
    dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
    populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
    accelerator_barrier();
 #else
    for(int n=0;n<e1;n++){
    for(int b=0;b<e2;b++){
      int o  =n*stride;
      Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
    }}
 #endif
  } else {
    for(int n=0;n<e1;n++){
    for(int b=0;b<e2;b++){
@@ -411,7 +432,7 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
  }
  {
-    auto table = MapCshiftTable();
+    auto table = &Cshift_table[0];
 #ifdef ACCELERATOR_CSHIFT    
    autoView( rhs_v, rhs, AcceleratorRead);
    autoView( lhs_v, lhs, AcceleratorWrite);
--- a/Grid/cshift/Cshift_mpi.h
+++ b/Grid/cshift/Cshift_mpi.h
@@ -52,8 +52,7 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
  int comm_dim        = rhs.Grid()->_processors[dimension] >1 ;
  int splice_dim      = rhs.Grid()->_simd_layout[dimension]>1 && (comm_dim);
-  RealD t1,t0;
+
  t0=usecond();
  if ( !comm_dim ) {
    //std::cout << "CSHIFT: Cshift_local" <<std::endl;
    Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
@@ -64,8 +63,6 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
    //std::cout << "CSHIFT: Cshift_comms" <<std::endl;
    Cshift_comms(ret,rhs,dimension,shift);
  }
  t1=usecond();
  //  std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
  return ret;
 }
@@ -130,20 +127,16 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
  int cb= (cbmask==0x2)? Odd : Even;
  int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
-  RealD tcopy=0.0;
+
  RealD tgather=0.0;
  RealD tscatter=0.0;
  RealD tcomms=0.0;
  uint64_t xbytes=0;
  for(int x=0;x<rd;x++){       
    int sx        =  (x+sshift)%rd;
    int comm_proc = ((x+sshift)/rd)%pd;
    if (comm_proc==0) {
-      tcopy-=usecond();
+
      Copy_plane(ret,rhs,dimension,x,sx,cbmask); 
-      tcopy+=usecond();
+
    } else {
      int words = buffer_size;
@@ -151,39 +144,26 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
      int bytes = words * sizeof(vobj);
      tgather-=usecond();
      Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
      tgather+=usecond();
      //      int rank           = grid->_processor;
      int recv_from_rank;
      int xmit_to_rank;
      grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
-      tcomms-=usecond();
+      grid->Barrier();
      //      grid->Barrier();
      grid->SendToRecvFrom((void *)&send_buf[0],
 			   xmit_to_rank,
 			   (void *)&recv_buf[0],
 			   recv_from_rank,
 			   bytes);
      xbytes+=bytes;
      //      grid->Barrier();
      tcomms+=usecond();
-      tscatter-=usecond();
+      grid->Barrier();
      Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
      tscatter+=usecond();
    }
  }
  /*
  std::cout << GridLogPerformance << " Cshift copy    "<<tcopy/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift gather  "<<tgather/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift comm    "<<tcomms/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
  */
 }
 template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@@ -210,12 +190,6 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
  assert(shift>=0);
  assert(shift<fd);
  RealD tcopy=0.0;
  RealD tgather=0.0;
  RealD tscatter=0.0;
  RealD tcomms=0.0;
  uint64_t xbytes=0;
  int permute_type=grid->PermuteType(dimension);
  ///////////////////////////////////////////////
@@ -253,9 +227,7 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
      pointers[i] = &send_buf_extract[i][0];
    }
    int sx   = (x+sshift)%rd;
    tgather-=usecond();
    Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
    tgather+=usecond();
    for(int i=0;i<Nsimd;i++){
@@ -280,8 +252,7 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
      if(nbr_proc){
 	grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 
-	tcomms-=usecond();
+	grid->Barrier();
 	//	grid->Barrier();
 	send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
 	recv_buf_extract_mpi = &recv_buf_extract[i][0];
@@ -291,9 +262,7 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
 			     recv_from_rank,
 			     bytes);
-	xbytes+=bytes;
+	grid->Barrier();
 	//	grid->Barrier();
 	tcomms+=usecond();
 	rpointers[i] = &recv_buf_extract[i][0];
      } else { 
@@ -301,17 +270,9 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
      }
    }
    tscatter-=usecond();
    Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
    tscatter+=usecond();
  }
-  /*
+
  std::cout << GridLogPerformance << " Cshift (s) copy    "<<tcopy/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift (s) gather  "<<tgather/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift (s) comm    "<<tcomms/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
  */
 }
 #else 
 template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@@ -331,11 +292,6 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
  assert(comm_dim==1);
  assert(shift>=0);
  assert(shift<fd);
  RealD tcopy=0.0;
  RealD tgather=0.0;
  RealD tscatter=0.0;
  RealD tcomms=0.0;
  uint64_t xbytes=0;
  int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
  static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
@@ -359,9 +315,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
    if (comm_proc==0) {
      tcopy-=usecond();
      Copy_plane(ret,rhs,dimension,x,sx,cbmask); 
      tcopy+=usecond();
    } else {
@@ -370,9 +324,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
      int bytes = words * sizeof(vobj);
      tgather-=usecond();
      Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
      tgather+=usecond();
      //      int rank           = grid->_processor;
      int recv_from_rank;
@@ -380,8 +332,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
      grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
-      tcomms-=usecond();
+      grid->Barrier();
      //      grid->Barrier();
      acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
      grid->SendToRecvFrom((void *)&send_buf[0],
@@ -389,24 +340,13 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
 			   (void *)&recv_buf[0],
 			   recv_from_rank,
 			   bytes);
      xbytes+=bytes;
      acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
-      //      grid->Barrier();
+      grid->Barrier();
      tcomms+=usecond();
      tscatter-=usecond();
      Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
      tscatter+=usecond();
    }
  }
  /*
  std::cout << GridLogPerformance << " Cshift copy    "<<tcopy/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift gather  "<<tgather/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift comm    "<<tcomms/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
  */
 }
 template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@@ -432,11 +372,6 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
  assert(simd_layout==2);
  assert(shift>=0);
  assert(shift<fd);
  RealD tcopy=0.0;
  RealD tgather=0.0;
  RealD tscatter=0.0;
  RealD tcomms=0.0;
  uint64_t xbytes=0;
  int permute_type=grid->PermuteType(dimension);
@@ -479,10 +414,8 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
    for(int i=0;i<Nsimd;i++){       
      pointers[i] = &send_buf_extract[i][0];
    }
    tgather-=usecond();
    int sx   = (x+sshift)%rd;
    Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
    tgather+=usecond();
    for(int i=0;i<Nsimd;i++){
@@ -507,8 +440,7 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
      if(nbr_proc){
 	grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 
-	tcomms-=usecond();
+	grid->Barrier();
 	//	grid->Barrier();
 	acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
 	grid->SendToRecvFrom((void *)send_buf_extract_mpi,
@@ -517,28 +449,17 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
 			     recv_from_rank,
 			     bytes);
 	acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
 	xbytes+=bytes;
-	//	grid->Barrier();
+	grid->Barrier();
 	tcomms+=usecond();
 	rpointers[i] = &recv_buf_extract[i][0];
      } else { 
 	rpointers[i] = &send_buf_extract[nbr_lane][0];
      }
    }
    tscatter-=usecond();
    Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
    tscatter+=usecond();
  }
-  /*
+
  std::cout << GridLogPerformance << " Cshift (s) copy    "<<tcopy/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift (s) gather  "<<tgather/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift (s) comm    "<<tcomms/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl;
  */
 }
 #endif
 NAMESPACE_END(Grid); 
--- a/Grid/cshift/Cshift_table.cc
+++ b/Grid/cshift/Cshift_table.cc
@@ -1,5 +1,4 @@
 #include <Grid/GridCore.h>       
 NAMESPACE_BEGIN(Grid);
-std::vector<std::pair<int,int> > Cshift_table; 
+Vector<std::pair<int,int> > Cshift_table; 
 commVector<std::pair<int,int> > Cshift_table_device; 
 NAMESPACE_END(Grid);
--- a/Grid/lattice/Lattice.h
+++ b/Grid/lattice/Lattice.h
@@ -35,7 +35,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_transpose.h>
 #include <Grid/lattice/Lattice_local.h>
 #include <Grid/lattice/Lattice_reduction.h>
 #include <Grid/lattice/Lattice_crc.h>
 #include <Grid/lattice/Lattice_peekpoke.h>
 #include <Grid/lattice/Lattice_reality.h>
 #include <Grid/lattice/Lattice_real_imag.h>
@@ -47,4 +46,5 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_unary.h>
 #include <Grid/lattice/Lattice_transfer.h>
 #include <Grid/lattice/Lattice_basis.h>
 #include <Grid/lattice/Lattice_crc.h>
 #include <Grid/lattice/PaddedCell.h>
--- a/Grid/lattice/Lattice_arith.h
+++ b/Grid/lattice/Lattice_arith.h
@@ -270,42 +270,5 @@ RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const L
    return axpby_norm_fast(ret,a,b,x,y);
 }
 /// Trace product
 // Site-wise trace product of two lattices.
 // Allocates a result lattice on rhs_1's grid whose site type is
 // decltype(trace(obj())), then for every site index ss writes
 // traceProduct(rhs1(ss),rhs2(ss)) into the result.
 // NOTE(review): presumably the site-level traceProduct equals trace(a*b)
 // without forming the full product; confirm against its definition.
 template<class obj> auto traceProduct(const Lattice<obj> &rhs_1,const Lattice<obj> &rhs_2)
  -> Lattice<decltype(trace(obj()))>
 {
  typedef decltype(trace(obj())) robj;
  Lattice<robj> ret_i(rhs_1.Grid());
  // Views: both inputs opened for accelerator read, the result for accelerator write
  autoView( rhs1 , rhs_1, AcceleratorRead);
  autoView( rhs2 , rhs_2, AcceleratorRead);
  autoView( ret , ret_i, AcceleratorWrite);
  // NOTE(review): the checkerboard is assigned through the view `ret`, not on
  // ret_i itself; confirm this reaches the returned lattice.
  ret.Checkerboard() = rhs_1.Checkerboard();
  accelerator_for(ss,rhs1.size(),obj::Nsimd(),{
      coalescedWrite(ret[ss],traceProduct(rhs1(ss),rhs2(ss)));
  });
  return ret_i;
 }
 // Site-wise trace product of a lattice with a single object rhs2 that is
 // used unchanged at every site.
 // The result lattice lives on rhs_1's grid, with site type decltype(trace(obj1())).
 template<class obj1,class obj2> auto traceProduct(const Lattice<obj1> &rhs_1,const obj2 &rhs2)
  -> Lattice<decltype(trace(obj1()))>
 {
  typedef decltype(trace(obj1())) robj;
  Lattice<robj> ret_i(rhs_1.Grid());
  // Views: the lattice input opened for accelerator read, the result for accelerator write
  autoView( rhs1 , rhs_1, AcceleratorRead);
  autoView( ret , ret_i, AcceleratorWrite);
  // NOTE(review): the checkerboard is assigned through the view `ret`, not on
  // ret_i itself; confirm this reaches the returned lattice.
  ret.Checkerboard() = rhs_1.Checkerboard();
  accelerator_for(ss,rhs1.size(),obj1::Nsimd(),{
      coalescedWrite(ret[ss],traceProduct(rhs1(ss),rhs2));
  });
  return ret_i;
 }
 // Object-first overload: forwards to the (lattice, object) version with the
 // arguments swapped.
 // NOTE(review): this assumes the trace product does not depend on argument
 // order; confirm for the site types it is used with.
 template<class obj1,class obj2> auto traceProduct(const obj2 &rhs_2,const Lattice<obj1> &rhs_1)
  -> Lattice<decltype(trace(obj1()))>
 {
  return traceProduct(rhs_1,rhs_2);
 }
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/lattice/Lattice_basis.h
+++ b/Grid/lattice/Lattice_basis.h
@@ -62,7 +62,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
    basis_v.push_back(basis[k].View(AcceleratorWrite));
  }
-#if ( !(defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)) )
+#if ( (!defined(GRID_CUDA)) )
  int max_threads = thread_max();
  Vector < vobj > Bt(Nm * max_threads);
  thread_region
--- a/Grid/lattice/Lattice_crc.h
+++ b/Grid/lattice/Lattice_crc.h
@@ -42,13 +42,13 @@ template<class vobj> void DumpSliceNorm(std::string s,Lattice<vobj> &f,int mu=-1
  }
 }
-template<class vobj> uint32_t crc(const Lattice<vobj> & buf)
+template<class vobj> uint32_t crc(Lattice<vobj> & buf)
 {
  autoView( buf_v , buf, CpuRead);
  return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites());
 }
-#define CRC(U) std::cerr << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
+#define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
 NAMESPACE_END(Grid);
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -31,7 +31,6 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #if defined(GRID_SYCL)
 #include <Grid/lattice/Lattice_reduction_sycl.h>
 #endif
 #include <Grid/lattice/Lattice_slicesum_core.h>
 NAMESPACE_BEGIN(Grid);
@@ -285,7 +284,6 @@ template<class vobj>
 inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
  GridBase *grid = left.Grid();
  ComplexD nrm = rankInnerProduct(left,right);
  //  std::cerr<<"flight log " << std::hexfloat << nrm <<" "<<crc(left)<<std::endl;
  grid->GlobalSum(nrm);
  return nrm;
 }
@@ -450,10 +448,19 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
  int e1=    grid->_slice_nblock[orthogdim];
  int e2=    grid->_slice_block [orthogdim];
  int stride=grid->_slice_stride[orthogdim];
  int ostride=grid->_ostride[orthogdim];
-  //Reduce Data down to lvSum
+  // sum over reduced dimension planes, breaking out orthog dir
-  sliceSumReduction(Data,lvSum,rd, e1,e2,stride,ostride,Nsimd);
+  // Parallel over orthog direction
  autoView( Data_v, Data, CpuRead);
  thread_for( r,rd, {
    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
 	int ss= so+n*stride+b;
 	lvSum[r]=lvSum[r]+Data_v[ss];
      }
    }
  });
  // Sum across simd lanes in the plane, breaking out orthog dir.
  Coordinate icoor(Nd);
@@ -497,7 +504,6 @@ sliceSum(const Lattice<vobj> &Data,int orthogdim)
  return result;
 }
 template<class vobj>
 static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim) 
 {
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -30,7 +30,7 @@ int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &
  cudaGetDevice(&device);
 #endif
 #ifdef GRID_HIP
-  auto r=hipGetDevice(&device);
+  hipGetDevice(&device);
 #endif
  Iterator warpSize            = gpu_props[device].warpSize;
--- a/Grid/lattice/Lattice_rng.h
+++ b/Grid/lattice/Lattice_rng.h
@@ -152,7 +152,6 @@ public:
 #ifdef RNG_FAST_DISCARD
  static void Skip(RngEngine &eng,uint64_t site)
  {
 #if 0
    /////////////////////////////////////////////////////////////////////////////////////
    // Skip by 2^40 elements between successive lattice sites
    // This goes by 10^12.
@@ -163,9 +162,9 @@ public:
    // tens of seconds per trajectory so this is clean in all reasonable cases,
    // and margin of safety is orders of magnitude.
    // We could hack Sitmo to skip in the higher order words of state if necessary
-    //
+      //
-    // Replace with 2^30 ; avoid problem on large volumes
+      // Replace with 2^30 ; avoid problem on large volumes
-    //
+      //
    /////////////////////////////////////////////////////////////////////////////////////
    //      uint64_t skip = site+1;  //   Old init Skipped then drew.  Checked compat with faster init
    const int shift = 30;
@@ -180,9 +179,6 @@ public:
    assert((skip >> shift)==site); // check for overflow
    eng.discard(skip);
 #else
    eng.discardhi(site);
 #endif
    //      std::cout << " Engine  " <<site << " state " <<eng<<std::endl;
  } 
 #endif
--- a/Grid/lattice/Lattice_slicesum_core.h
+++ b/Grid/lattice/Lattice_slicesum_core.h
@@ -1,213 +0,0 @@
 #pragma once
 #include <type_traits>
 #if defined(GRID_CUDA)
 #include <cub/cub.cuh>
 #define gpucub cub
 #define gpuError_t cudaError_t
 #define gpuSuccess cudaSuccess
 #elif defined(GRID_HIP)
 #include <hipcub/hipcub.hpp>
 #define gpucub hipcub
 #define gpuError_t hipError_t
 #define gpuSuccess hipSuccess
 #endif
 NAMESPACE_BEGIN(Grid);
 #if defined(GRID_CUDA) || defined(GRID_HIP)
 // Slice sum on the GPU using cub/hipcub segmented reduction.
 //
 // For each r in [0,rd) the e1*e2 elements Data[r*ostride + n*stride + b] (n in [0,e1), b in [0,e2))
 // are gathered into a contiguous segment of a device buffer; one segmented reduction then sums
 // every segment and the rd totals are copied into lvSum[0..rd-1].
 //
 //   Data     device-readable pointer to the site data
 //   lvSum    receives rd results starting at &lvSum[0] (assumes lvSum.size() >= rd -- verify against caller)
 //   rd       number of segments (planes) to reduce
 //   e1,e2    extents of the two nested index loops inside one plane
 //   stride   index step between successive n values
 //   ostride  index step between successive planes
 //   Nsimd    forwarded to accelerator_for2dNB
 //
 // On a cub error the function logs to GridLogError and terminates the process with exit(EXIT_FAILURE).
 // NOTE(review): the error text names "Lattice_slicesum_gpu.h" although the diff header places this
 // code in Lattice_slicesum_core.h; the message is runtime text and is left untouched here.
 template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
  size_t subvol_size = e1*e2;
  // contiguous staging buffer: segment r occupies [r*subvol_size, (r+1)*subvol_size)
  commVector<vobj> reduction_buffer(rd*subvol_size);
  auto rb_p = &reduction_buffer[0];
  // zero element handed to cub as the initial value of every segment sum
  vobj zero_init;
  zeroit(zero_init);

  
  void *temp_storage_array = NULL;
  size_t temp_storage_bytes = 0;
  vobj *d_out;
  int* d_offsets;

  // offsets[i] is the start of segment i; offsets[i+1] is its end (hence rd+1 entries)
  std::vector<int> offsets(rd+1,0);

  for (int i = 0; i < offsets.size(); i++) {
    offsets[i] = i*subvol_size;
  }
  
  //Allocate memory for output and offset arrays on device
  d_out = static_cast<vobj*>(acceleratorAllocDevice(rd*sizeof(vobj)));
  
  d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));
  
  //copy offsets to device
  acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
  
  // First call is the cub size query: temp_storage_array is still NULL, so (per the cub API
  // convention) only temp_storage_bytes is filled in; the actual reduction is issued further down.
  gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
  if (gpuErr!=gpuSuccess) {
    std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce (setup)! Error: " << gpuErr <<std::endl;
    exit(EXIT_FAILURE);
  }

  //allocate memory for temp_storage_array  
  temp_storage_array = acceleratorAllocDevice(temp_storage_bytes);
  
  //prepare buffer for reduction
  //use non-blocking accelerator_for to avoid syncs (ok because we submit to same computeStream)
  //use 2d accelerator_for to avoid launch latencies found when serially looping over rd 
  accelerator_for2dNB( s,subvol_size, r,rd, Nsimd,{ 
  
    // s enumerates the e1*e2 sites of one plane; split it back into the (n,b) loop pair
    int n = s / e2;
    int b = s % e2;
    int so=r*ostride; // base offset for start of plane 
    int ss= so+n*stride+b;

    coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss]));

  });
  
  //issue segmented reductions in computeStream
  gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p, d_out, rd, d_offsets, d_offsets+1,::gpucub::Sum(), zero_init, computeStream);
  if (gpuErr!=gpuSuccess) {
    std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce! Error: " << gpuErr <<std::endl;
    exit(EXIT_FAILURE);
  }
  
  // bring the rd segment sums back into lvSum
  acceleratorCopyFromDeviceAsync(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
  
  //sync after copy
  accelerator_barrier();
 
  // release the three device allocations made above (only reached on the success path)
  acceleratorFreeDevice(temp_storage_array);
  acceleratorFreeDevice(d_out);
  acceleratorFreeDevice(d_offsets);
  

 }
 // Slice sum for objects too large to hand to cub directly (see the sizeof test in sliceSumReduction_cub).
 //
 // A vobj is treated as `words` consecutive elements of vobj::vector_type. For each word index w the
 // function copies word w of every site into a buffer of plain vector_type, reduces that buffer with
 // sliceSumReduction_cub_small, and writes the rd partial results into word w of lvSum[r].
 // Parameters have the same meaning as in sliceSumReduction_cub_small.
 // NOTE(review): the staging buffer holds rd*e1*e2 elements while the small reduction indexes it with
 // r*ostride + n*stride + b; this presumes those indices stay below rd*e1*e2 -- TODO confirm with caller.
 template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
  typedef typename vobj::vector_type vector;
  // number of vector_type words making up one vobj
  const int words = sizeof(vobj)/sizeof(vector);
  const int osites = rd*e1*e2;
  commVector<vector>buffer(osites);
  // view input and output as flat arrays of vector_type words
  vector *dat = (vector *)Data;
  vector *buf = &buffer[0];
  Vector<vector> lvSum_small(rd);
  vector *lvSum_ptr = (vector *)&lvSum[0];

  for (int w = 0; w < words; w++) {
    // gather word w of each site: site ss starts at dat[ss*words]
    accelerator_for(ss,osites,1,{
 	    buf[ss] = dat[ss*words+w];
    });

    sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
      
    // scatter the rd single-word sums into word w of the full-size results
    for (int r = 0; r < rd; r++) {
      lvSum_ptr[w+words*r]=lvSum_small[r];
    }

  }

  
 }
 // CUDA/HIP entry point for the slice sum.
 // Opens an accelerator read view on Data and forwards the raw site pointer to one of two
 // implementations: objects of at most 256 bytes go straight to cub, larger ones are reduced
 // word by word because cub/hipcub cannot deal with large vobjs.
 template<class vobj> inline void sliceSumReduction_cub(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
 {
  autoView(Data_v, Data, AcceleratorRead);
  const vobj *site_data = &Data_v[0];
  if constexpr (sizeof(vobj) > 256) {
    sliceSumReduction_cub_large(site_data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
  } else {
    sliceSumReduction_cub_small(site_data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
  }
 }
 #endif
 #if defined(GRID_SYCL)
 // Slice sum for the SYCL backend.
 //
 // Gathers, for every r in [0,rd), the e1*e2 elements Data_v[r*ostride + n*stride + b] into a
 // contiguous segment of reduction_buffer, then runs one sycl::reduction per segment and stores the
 // result in lvSum[r]. Each reduction is waited on before the next one is submitted.
 // A single vobj allocated with malloc_shared (mysum) serves as the reduction target and is freed
 // before returning.
 template<class vobj> inline void sliceSumReduction_sycl(const Lattice<vobj> &Data, Vector <vobj> &lvSum, const int  &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
 {
  typedef typename vobj::scalar_object sobj;
  size_t subvol_size = e1*e2;

  // one-element accumulator allocated through theGridAccelerator; written on the host, reduced into on the device
  vobj *mysum = (vobj *) malloc_shared(sizeof(vobj),*theGridAccelerator);
  vobj vobj_zero;
  zeroit(vobj_zero);
    
  // segment r occupies [r*subvol_size, (r+1)*subvol_size)
  commVector<vobj> reduction_buffer(rd*subvol_size);    

  auto rb_p = &reduction_buffer[0];

  autoView(Data_v, Data, AcceleratorRead);

  //prepare reduction buffer 
  accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{ 
  
      // s enumerates the e1*e2 sites of one plane; split it back into the (n,b) loop pair
      int n = s / e2;
      int b = s % e2;
      int so=r*ostride; // base offset for start of plane 
      int ss= so+n*stride+b;

      coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data_v[ss]));

  });

  for (int r = 0; r < rd; r++) {
      mysum[0] = vobj_zero; //dirty hack: cannot pass vobj_zero as identity to sycl::reduction as it's not device_copyable
      theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
          auto Reduction = cl::sycl::reduction(mysum,std::plus<>());
          cgh.parallel_for(cl::sycl::range<1>{subvol_size},
          Reduction,
          [=](cl::sycl::id<1> item, auto &sum) {
              auto s = item[0];
              sum += rb_p[r*subvol_size+s];
          });
      });
      // block until this segment's sum is complete before reading it on the host
      theGridAccelerator->wait();
      lvSum[r] = mysum[0];
  }
  
  free(mysum,*theGridAccelerator);
 }
 #endif
 template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
 {
  // sum over reduced dimension planes, breaking out orthog dir
  // Parallel over orthog direction
  autoView( Data_v, Data, CpuRead);
  thread_for( r,rd, {
    int so=r*ostride; // base offset for start of plane 
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
        int ss= so+n*stride+b;
        lvSum[r]=lvSum[r]+Data_v[ss];
      }
    }
  });
 }
 // Compile-time backend selector for the slice sum.
 // CUDA or HIP builds use the cub path, SYCL-only builds use the SYCL path, and builds with none of
 // the three macros defined use the threaded host loop. All arguments are forwarded unchanged.
 template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) 
 {
  #if !(defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL))
  // no accelerator backend selected
  sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
  #elif defined(GRID_CUDA) || defined(GRID_HIP)
  // cub takes priority whenever CUDA or HIP is enabled
  sliceSumReduction_cub(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
  #else
  // only GRID_SYCL can be left at this point
  sliceSumReduction_sycl(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
  #endif
 }
 NAMESPACE_END(Grid);
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -469,13 +469,15 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
  Coordinate fine_rdimensions = fine->_rdimensions;
  Coordinate coarse_rdimensions = coarse->_rdimensions;
  vobj zz = Zero();
  accelerator_for(sc,coarse->oSites(),1,{
      // One thread per sub block
      Coordinate coor_c(_ndimension);
      Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions);  // Block coordinate
-      vobj cd = Zero();
+      vobj cd = zz;
      for(int sb=0;sb<blockVol;sb++){
--- a/Grid/lattice/Lattice_view.h
+++ b/Grid/lattice/Lattice_view.h
@@ -45,7 +45,6 @@ public:
  };
  // Host only
  GridBase * getGrid(void) const { return _grid; };
  vobj* getHostPointer(void) const { return _odata; };
 };
 /////////////////////////////////////////////////////////////////////////////////////////
--- a/Grid/log/Log.h
+++ b/Grid/log/Log.h
@@ -179,11 +179,11 @@ extern GridLogger GridLogSolver;
 extern GridLogger GridLogError;
 extern GridLogger GridLogWarning;
 extern GridLogger GridLogMessage;
-extern GridLogger GridLogDebug;
+extern GridLogger GridLogDebug  ;
 extern GridLogger GridLogPerformance;
 extern GridLogger GridLogDslash;
-extern GridLogger GridLogIterative;
+extern GridLogger GridLogIterative  ;
-extern GridLogger GridLogIntegrator;
+extern GridLogger GridLogIntegrator  ;
 extern GridLogger GridLogHMC;
 extern GridLogger GridLogMemory;
 extern GridLogger GridLogTracing;
@@ -191,41 +191,6 @@ extern Colours    GridLogColours;
 std::string demangle(const char* name) ;
 // Concatenates the stream representation of every argument, left to right, with no separator,
 // and returns the result. With no arguments the result is the empty string.
 template<typename... Args>
 inline std::string sjoin(Args&&... args) noexcept {
    std::ostringstream joined;
    // insert the arguments one after another, in pack order
    (static_cast<void>(joined << args), ...);
    return joined.str();
 }
 /*!  @brief python-print style logging: all arguments are joined with sjoin and written as one
  *   line through GridLogMessage. The text is assembled before anything is sent to std::cout. */
 template <typename... Args>
 inline void Grid_log(Args&&... args) {
    const std::string text = sjoin(std::forward<Args>(args)...);
    // single chained expression: the logger's operator<< decides which stream the text goes to
    std::cout << GridLogMessage
              << text
              << std::endl;
 }
 /*!  @brief python-print style warning: all arguments are joined with sjoin and written through
  *   GridLogWarning, bracketed by the escape sequences \033[33m and \033[0m. */
 template <typename... Args>
 inline void Grid_warn(Args&&... args) {
    const std::string text = sjoin(std::forward<Args>(args)...);
    // single chained expression: the logger's operator<< decides which stream the text goes to
    std::cout << "\033[33m"
              << GridLogWarning
              << text
              << "\033[0m"
              << std::endl;
 }
 /*!  @brief python-print style error report: all arguments are joined with sjoin and written
  *   through GridLogError, bracketed by the escape sequences \033[31m and \033[0m. */
 template <typename... Args>
 inline void Grid_error(Args&&... args) {
    const std::string text = sjoin(std::forward<Args>(args)...);
    // single chained expression: the logger's operator<< decides which stream the text goes to
    std::cout << "\033[31m"
              << GridLogError
              << text
              << "\033[0m"
              << std::endl;
 }
 /*!  @brief python-print style "pass" report: all arguments are joined with sjoin and written
  *   through GridLogMessage, bracketed by the escape sequences \033[32m and \033[0m. */
 template <typename... Args>
 inline void Grid_pass(Args&&... args) {
    const std::string text = sjoin(std::forward<Args>(args)...);
    // single chained expression: the logger's operator<< decides which stream the text goes to
    std::cout << "\033[32m"
              << GridLogMessage
              << text
              << "\033[0m"
              << std::endl;
 }
 #define _NBACKTRACE (256)
 extern void * Grid_backtrace_buffer[_NBACKTRACE];
--- a/Grid/perfmon/Tracing.h
+++ b/Grid/perfmon/Tracing.h
@@ -34,7 +34,7 @@ class GridTracer {
 };
 inline void tracePush(const char *name) { roctxRangePushA(name); }
 inline void tracePop(const char *name) { roctxRangePop(); }
-inline int  traceStart(const char *name) { return roctxRangeStart(name); }
+// Opens a named roctx range and returns its identifier so that traceStop(ID) can close it.
+// The value must be returned: flowing off the end of a function declared int is undefined behaviour.
+inline int  traceStart(const char *name) { return roctxRangeStart(name); }
 inline void traceStop(int ID) { roctxRangeStop(ID); }
 #endif
--- a/Grid/qcd/action/ActionBase.h
+++ b/Grid/qcd/action/ActionBase.h
@@ -129,22 +129,6 @@ public:
  virtual ~Action(){}
 };
 // Action with no content: its value is always 0.0, and it reports itself in logs under the
 // name "Level Force Log". refresh() and deriv() are not meant to be reached and trip assert(0)
 // (a no-op when assertions are compiled out). Members are private, as in the original.
 template <class GaugeField >
 class EmptyAction : public Action <GaugeField>
 {
  // pseudofermion refresh: not supported
  virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG)
  {
    assert(0);
  }
  // value of the action
  virtual RealD S(const GaugeField& U)
  {
    return 0.0;
  }
  // derivative of the action: not supported
  virtual void deriv(const GaugeField& U, GaugeField& dSdU)
  {
    assert(0);
  }

  ///////////////////////////////
  // Logging
  ///////////////////////////////
  virtual std::string action_name()
  {
    return "Level Force Log";
  }
  virtual std::string LogParameters()
  {
    return "No parameters";
  }
 };
 NAMESPACE_END(Grid);
 #endif // ACTION_BASE_H
--- a/Grid/qcd/action/ActionCore.h
+++ b/Grid/qcd/action/ActionCore.h
@@ -67,6 +67,7 @@ NAMESPACE_CHECK(Scalar);
 #include <Grid/qcd/utils/Metric.h>
 NAMESPACE_CHECK(Metric);
 #include <Grid/qcd/utils/CovariantLaplacian.h>
 #include <Grid/qcd/utils/CovariantLaplacianRat.h>
 NAMESPACE_CHECK(CovariantLaplacian);
--- a/Grid/qcd/action/ActionParams.h
+++ b/Grid/qcd/action/ActionParams.h
@@ -65,6 +65,19 @@ struct WilsonImplParams {
  }
 };
 // Parameters for gauge actions: one complex boundary phase per lattice direction.
 // The overlapCommsCompute flag and the twist_n_2pi_L vector that appeared as commented-out
 // members in the original remain disabled.
 struct GaugeImplParams {
  AcceleratorVector<Complex,Nd> boundary_phases;

  // Default: Nd phases, each equal to 1.0
  GaugeImplParams()
  {
    boundary_phases.resize(Nd, 1.0);
  }

  // Takes the phases supplied by the caller
  GaugeImplParams(const AcceleratorVector<Complex,Nd> phi)
    : boundary_phases(phi)
  {
  }
 };
 struct StaggeredImplParams {
  Coordinate dirichlet; // Blocksize of dirichlet BCs
  int  partialDirichlet;
--- a/Grid/qcd/action/fermion/WilsonTMFermion.h
+++ b/Grid/qcd/action/fermion/WilsonTMFermion.h
@@ -63,8 +63,6 @@ public:
  virtual void MooeeDag(const FermionField &in, FermionField &out) ;
  virtual void MooeeInv(const FermionField &in, FermionField &out) ;
  virtual void MooeeInvDag(const FermionField &in, FermionField &out) ;
  virtual void M(const FermionField &in, FermionField &out) ;
  virtual void Mdag(const FermionField &in, FermionField &out) ;
 private:
  RealD mu; // TwistedMass parameter
--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
@@ -280,16 +280,20 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
  if( interior && exterior ) { 
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGeneric,1); return;}
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1);    return;}
 #ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1);    return;}
    if (Opt == OptInlineAsm  ) {  ASM_CALL(DhopSiteAsm);     return;}
 #endif
  } else if( interior ) {
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericInt,1); return;}
 #ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,1);    return;}
 #endif
  } else if( exterior ) { 
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericExt,1); return;}
 #ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,1);    return;}
 #endif
  }
  assert(0 && " Kernel optimisation case not covered ");
 }
@@ -318,13 +322,19 @@ void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
  if( interior && exterior ) { 
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGeneric,0); return;}
 #ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,0);    return;}
 #endif
  } else if( interior ) {
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericInt,0); return;}
 #ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,0);    return;}
 #endif
  } else if( exterior ) { 
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericExt,0); return;}
 #ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,0);    return;}
 #endif
  }
 }
--- a/Grid/qcd/action/fermion/implementation/WilsonTMFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonTMFermionImplementation.h
@@ -93,25 +93,5 @@ void WilsonTMFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &ou
  RealD b    = tm /sq;
  axpibg5x(out,in,a,b);
 }
 // Full operator: out = Dhop(in) + local_term, where local_term comes from
 // axpibg5x(local_term, in, 4+mass, mu).
 // (presumably local_term = (4+mass)*in + i*mu*gamma5*in -- confirm against axpibg5x)
 template<class Impl>
 void WilsonTMFermion<Impl>::M(const FermionField &in, FermionField &out) {
  out.Checkerboard() = in.Checkerboard();
  // hopping part first, written straight into out
  this->Dhop(in, out, DaggerNo);
  RealD diag  = 4.0+this->mass;
  RealD twist = this->mu;
  FermionField local_term(out.Grid());
  axpibg5x(local_term,in,diag,twist);
  // accumulate the site-local part on top of the hopping part
  axpy(out, 1.0, local_term, out);
 }
 // Adjoint counterpart of M: same construction with Dhop called with DaggerYes and the sign of
 // the twisted-mass coefficient flipped (axpibg5x(local_term, in, 4+mass, -mu)).
 template<class Impl>
 void WilsonTMFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
  out.Checkerboard() = in.Checkerboard();
  // hopping part first, written straight into out
  this->Dhop(in, out, DaggerYes);
  RealD diag  = 4.0+this->mass;
  RealD twist = -this->mu;
  FermionField local_term(out.Grid());
  axpibg5x(local_term,in,diag,twist);
  // accumulate the site-local part on top of the hopping part
  axpy(out, 1.0, local_term, out);
 }
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/gauge/GaugeImplTypes.h
+++ b/Grid/qcd/action/gauge/GaugeImplTypes.h
@@ -32,7 +32,7 @@ directory
 NAMESPACE_BEGIN(Grid);
-#define CPS_MD_TIME
+#undef CPS_MD_TIME
 #ifdef CPS_MD_TIME
 #define HMC_MOMENTUM_DENOMINATOR (2.0)
--- a/Grid/qcd/action/gauge/WilsonGaugeAction.h
+++ b/Grid/qcd/action/gauge/WilsonGaugeAction.h
@@ -42,9 +42,13 @@ template <class Gimpl>
 class WilsonGaugeAction : public Action<typename Gimpl::GaugeField> {
 public:  
  INHERIT_GIMPL_TYPES(Gimpl);
  typedef GaugeImplParams ImplParams;
  ImplParams Params;
  /////////////////////////// constructors
-  explicit WilsonGaugeAction(RealD beta_):beta(beta_){};
+  explicit WilsonGaugeAction(RealD beta_,
 		  const ImplParams &p = ImplParams()
 		  ):beta(beta_),Params(p){};
  virtual std::string action_name() {return "WilsonGaugeAction";}
@@ -56,14 +60,53 @@ public:
  virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG){};  // noop as no pseudoferms
 // Umu<->U maximally confusing
  virtual void boundary(const GaugeField &Umu, GaugeField &Ub){
    typedef typename Simd::scalar_type scalar_type;
    assert(Params.boundary_phases.size() == Nd);
    GridBase *GaugeGrid=Umu.Grid();
    GaugeLinkField U(GaugeGrid);
    GaugeLinkField tmp(GaugeGrid);
    Lattice<iScalar<vInteger> > coor(GaugeGrid);
    for (int mu = 0; mu < Nd; mu++) {
 	////////// boundary phase /////////////
      auto pha = Params.boundary_phases[mu];
      scalar_type phase( real(pha),imag(pha) );
      std::cout<< GridLogIterative << "[WilsonGaugeAction] boundary "<<mu<<" "<<phase<< std::endl; 
 	int L   = GaugeGrid->GlobalDimensions()[mu];
        int Lmu = L - 1;
      LatticeCoordinate(coor, mu);
      U = PeekIndex<LorentzIndex>(Umu, mu);
      tmp = where(coor == Lmu, phase * U, U);
      PokeIndex<LorentzIndex>(Ub, tmp, mu);
 //      PokeIndex<LorentzIndex>(Ub, U, mu);
 //      PokeIndex<LorentzIndex>(Umu, tmp, mu);
    }
  };
  virtual RealD S(const GaugeField &U) {
-    RealD plaq = WilsonLoops<Gimpl>::avgPlaquette(U);
+    GaugeField Ub(U.Grid());
-    RealD vol = U.Grid()->gSites();
+    this->boundary(U,Ub);
    static RealD lastG=0.;
    RealD plaq = WilsonLoops<Gimpl>::avgPlaquette(Ub);
    RealD vol = Ub.Grid()->gSites();
    RealD action = beta * (1.0 - plaq) * (Nd * (Nd - 1.0)) * vol * 0.5;
    std::cout << GridLogMessage << "[WilsonGaugeAction] dH: " << action-lastG << std::endl;
    RealD plaq_o = WilsonLoops<Gimpl>::avgPlaquette(U);
    RealD action_o = beta * (1.0 - plaq_o) * (Nd * (Nd - 1.0)) * vol * 0.5;
    std::cout << GridLogMessage << "[WilsonGaugeAction] U: " << action_o <<" Ub: "<< action  << std::endl;
    lastG=action;
    return action;
  };
  virtual void deriv(const GaugeField &U, GaugeField &dSdU) {
    GaugeField Ub(U.Grid());
    this->boundary(U,Ub);
    // not optimal implementation FIXME
    // extend Ta to include Lorentz indexes
@@ -73,10 +116,9 @@ public:
    GaugeLinkField dSdU_mu(U.Grid());
    for (int mu = 0; mu < Nd; mu++) {
-      Umu = PeekIndex<LorentzIndex>(U, mu);
+      Umu = PeekIndex<LorentzIndex>(Ub, mu);
      // Staple in direction mu
-      WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu);
+      WilsonLoops<Gimpl>::Staple(dSdU_mu, Ub, mu);
      dSdU_mu = Ta(Umu * dSdU_mu) * factor;
      PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
--- a/Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h
+++ b/Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h
@@ -178,7 +178,10 @@ NAMESPACE_BEGIN(Grid);
        // Use chronological inverter to forecast solutions across poles
        std::vector<FermionField> prev_solns;
        if(use_heatbath_forecasting){ prev_solns.reserve(param.degree); }
-        ChronoForecast<AbstractEOFAFermion<Impl>, FermionField> Forecast;
+	MdagMLinearOperator<AbstractEOFAFermion<Impl> ,FermionField> MdagML(Lop);
 	MdagMLinearOperator<AbstractEOFAFermion<Impl> ,FermionField> MdagMR(Rop);
 //        ChronoForecast<AbstractEOFAFermion<Impl>, FermionField> Forecast;
 	ChronoForecast<MdagMLinearOperator<AbstractEOFAFermion<Impl>, FermionField> , FermionField> Forecast;
        // \Phi = ( \alpha_{0} + \sum_{k=1}^{N_{p}} \alpha_{l} * \gamma_{l} ) * \eta
        RealD N(PowerNegHalf.norm);
@@ -198,7 +201,7 @@ NAMESPACE_BEGIN(Grid);
          heatbathRefreshShiftCoefficients(0, -gamma_l);
          if(use_heatbath_forecasting){ // Forecast CG guess using solutions from previous poles
            Lop.Mdag(CG_src, Forecast_src);
-            CG_soln = Forecast(Lop, Forecast_src, prev_solns);
+            CG_soln = Forecast(MdagML, Forecast_src, prev_solns);
            SolverHBL(Lop, CG_src, CG_soln);
            prev_solns.push_back(CG_soln);
          } else {
@@ -225,7 +228,7 @@ NAMESPACE_BEGIN(Grid);
 	  heatbathRefreshShiftCoefficients(1, -gamma_l*PowerNegHalf.poles[k]);
          if(use_heatbath_forecasting){
            Rop.Mdag(CG_src, Forecast_src);
-            CG_soln = Forecast(Rop, Forecast_src, prev_solns);
+            CG_soln = Forecast(MdagMR, Forecast_src, prev_solns);
            SolverHBR(Rop, CG_src, CG_soln);
            prev_solns.push_back(CG_soln);
          } else {
--- a/Grid/qcd/action/scalar/ScalarImpl.h
+++ b/Grid/qcd/action/scalar/ScalarImpl.h
@@ -1,6 +1,6 @@
 #pragma once
-#define CPS_MD_TIME 
+#undef CPS_MD_TIME 
 #ifdef CPS_MD_TIME
 #define HMC_MOMENTUM_DENOMINATOR (2.0)
--- a/Grid/qcd/hmc/GenericHMCrunner.h
+++ b/Grid/qcd/hmc/GenericHMCrunner.h
@@ -121,12 +121,19 @@ public:
  template <class SmearingPolicy>
  void Run(SmearingPolicy &S) {
-    Runner(S);
+    TrivialMetric<typename Implementation::Field> Mtr;
    Runner(S,Mtr);
  }
  template <class SmearingPolicy, class Metric>
  void Run(SmearingPolicy &S, Metric &Mtr) {
    Runner(S,Mtr);
  }
  void Run(){
    NoSmearing<Implementation> S;
-    Runner(S);
+    TrivialMetric<typename Implementation::Field> Mtr;
    Runner(S,Mtr);
  }
  //Use the checkpointer to initialize the RNGs and the gauge field, writing the resulting gauge field into U.
@@ -176,15 +183,15 @@ public:
  //////////////////////////////////////////////////////////////////
 private:
-  template <class SmearingPolicy>
+  template <class SmearingPolicy, class Metric>
-  void Runner(SmearingPolicy &Smearing) {
+  void Runner(SmearingPolicy &Smearing, Metric &Mtr) {
    auto UGrid = Resources.GetCartesian();
    Field U(UGrid);
    initializeGaugeFieldAndRNGs(U);
    typedef IntegratorType<SmearingPolicy> TheIntegrator;
-    TheIntegrator MDynamics(UGrid, Parameters.MD, TheAction, Smearing);
+    TheIntegrator MDynamics(UGrid, Parameters.MD, TheAction, Smearing,Mtr);
    // Sets the momentum filter
    MDynamics.setMomentumFilter(*(Resources.GetMomentumFilter()));
--- a/Grid/qcd/hmc/HMC.h
+++ b/Grid/qcd/hmc/HMC.h
@@ -55,6 +55,8 @@ struct HMCparameters: Serializable {
                                  Integer, NoMetropolisUntil,
 				  bool, PerformRandomShift, /* @brief Randomly shift the gauge configuration at the start of a trajectory */
                                  std::string, StartingType,
 				  Integer, SW,
                                  RealD, Kappa,
                                  IntegratorParameters, MD)
  HMCparameters() {
@@ -110,6 +112,8 @@ private:
  IntegratorType &TheIntegrator;
  ObsListType Observables;
  int traj_num;
  /////////////////////////////////////////////////////////
  // Metropolis step
  /////////////////////////////////////////////////////////
@@ -200,14 +204,14 @@ private:
    std::cout << GridLogMessage << "--------------------------------------------------\n";
    std::cout << GridLogMessage << " Molecular Dynamics evolution ";
-    TheIntegrator.integrate(U);
+    TheIntegrator.integrate(U,traj_num);
    std::cout << GridLogMessage << "--------------------------------------------------\n";
    //////////////////////////////////////////////////////////////////////////////////////////////////////
    // updated state action
    //////////////////////////////////////////////////////////////////////////////////////////////////////
    std::cout << GridLogMessage << "--------------------------------------------------\n";
-    std::cout << GridLogMessage << "Compute final action";
+    std::cout << GridLogMessage << "Compute final action" <<std::endl;
    RealD H1 = TheIntegrator.S(U);  
    std::cout << GridLogMessage << "--------------------------------------------------\n";
@@ -242,7 +246,7 @@ public:
  HybridMonteCarlo(HMCparameters _Pams, IntegratorType &_Int,
                   GridSerialRNG &_sRNG, GridParallelRNG &_pRNG, 
                   ObsListType _Obs, Field &_U)
-    : Params(_Pams), TheIntegrator(_Int), sRNG(_sRNG), pRNG(_pRNG), Observables(_Obs), Ucur(_U) {}
+    : Params(_Pams), TheIntegrator(_Int), sRNG(_sRNG), pRNG(_pRNG), Observables(_Obs), Ucur(_U),traj_num(0) {}
  ~HybridMonteCarlo(){};
  void evolve(void) {
@@ -258,8 +262,9 @@ public:
    for (int traj = Params.StartTrajectory; traj < FinalTrajectory; ++traj) {
      std::cout << GridLogHMC << "-- # Trajectory = " << traj << "\n";
      std::cout << GridLogHMC << "-- # Trajectory = " << traj << "\n";
      traj_num=traj;
      if (traj < Params.StartTrajectory + Params.NoMetropolisUntil) {
      	std::cout << GridLogHMC << "-- Thermalization" << std::endl;
      }
--- a/Grid/qcd/hmc/integrators/Integrator.h
+++ b/Grid/qcd/hmc/integrators/Integrator.h
@@ -9,6 +9,7 @@ Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Guido Cossu <cossu@post.kek.jp>
 Author: Chulwoo Jung <chulwoo@bnl.gov>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -33,6 +34,7 @@ directory
 #define INTEGRATOR_INCLUDED
 #include <memory>
 #include <Grid/parallelIO/NerscIO.h>
 NAMESPACE_BEGIN(Grid);
@@ -41,10 +43,19 @@ public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(IntegratorParameters,
 				  std::string, name,      // name of the integrator
 				  unsigned int, MDsteps,  // number of outer steps
 				  RealD, RMHMCTol,
                                  RealD, RMHMCCGTol,
                                  RealD, lambda0,
                                  RealD, lambda1,
                                  RealD, lambda2,
 				  RealD, trajL)           // trajectory length
  // Defaults: 10 outer MD steps, trajectory length 1.0, both RMHMC tolerances 1e-8 and
  // lambda0 = lambda1 = lambda2 = 0.1931833275037836. `name` is left default-constructed.
  // Initialisers are written in the order the members appear in the serialisable member list.
  IntegratorParameters(int MDsteps_ = 10, RealD trajL_ = 1.0)
    : MDsteps(MDsteps_),
      RMHMCTol(1e-8),
      RMHMCCGTol(1e-8),
      lambda0(0.1931833275037836),
      lambda1(0.1931833275037836),
      lambda2(0.1931833275037836),
      trajL(trajL_)
  {
  };
  template <class ReaderClass, typename std::enable_if<isReader<ReaderClass>::value, int >::type = 0 >
@@ -75,11 +86,14 @@ public:
  double t_U;  // Track time passing on each level and for U and for P
  std::vector<double> t_P;  
-  MomentaField P;
+//  MomentaField P;
  GeneralisedMomenta<FieldImplementation > P;
  SmearingPolicy& Smearer;
  RepresentationPolicy Representations;
  IntegratorParameters Params;
  RealD Saux,Smom,Sg;
  //Filters allow the user to manipulate the conjugate momentum, for example to freeze links in DDHMC
  //It is applied whenever the momentum is updated / refreshed
  //The default filter does nothing
@@ -87,8 +101,6 @@ public:
  const ActionSet<Field, RepresentationPolicy> as;
  ActionSet<Field,RepresentationPolicy> LevelForces;
  //Get a pointer to a shared static instance of the "do-nothing" momentum filter to serve as a default
  static MomentumFilterBase<MomentaField> const* getDefaultMomFilter(){ 
    static MomentumFilterNone<MomentaField> filter;
@@ -98,7 +110,16 @@ public:
  void update_P(Field& U, int level, double ep) 
  {
    t_P[level] += ep;
-    update_P(P, U, level, ep);
+    update_P(P.Mom, U, level, ep);
    std::cout << GridLogIntegrator << "[" << level << "] P " << " dt " << ep << " : t_P " << t_P[level] << std::endl;
  }
  // Convenience overload working on the integrator's own momentum: advances the level's momentum
  // clock t_P[level] by ep, calls the four-argument update_P2 on P.Mom, then logs the step and
  // the accumulated time on GridLogIntegrator.
  void update_P2(Field& U, int level, double ep)
  {
    t_P[level] += ep;
    update_P2(P.Mom, U, level, ep);
    std::cout << GridLogIntegrator
              << "[" << level << "] P "
              << " dt " << ep
              << " : t_P " << t_P[level]
              << std::endl;
  }
@@ -121,78 +142,174 @@ public:
    }
  } update_P_hireps{};
  void update_P(MomentaField& Mom, Field& U, int level, double ep) {
    // input U actually not used in the fundamental case
    // Fundamental updates, include smearing
    assert(as.size()==LevelForces.size());
    Field level_force(U.Grid()); level_force =Zero();
    for (int a = 0; a < as[level].actions.size(); ++a) {
      double start_full = usecond();
      Field force(U.Grid());
      conformable(U.Grid(), Mom.Grid());
      Field& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared);
      double start_force = usecond();
      as[level].actions.at(a)->deriv(Us, force);  // deriv should NOT include Ta
-      as[level].actions.at(a)->deriv_timer_start();
+      std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl;
-      as[level].actions.at(a)->deriv(Smearer, force);  // deriv should NOT include Ta
+      if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force);
      as[level].actions.at(a)->deriv_timer_stop();
      auto name = as[level].actions.at(a)->action_name();
      force = FieldImplementation::projectForce(force); // Ta for gauge fields
      double end_force = usecond();
-      
+      Real force_abs = std::sqrt(norm2(force)/U.Grid()->gSites());
-      MomFilter->applyFilter(force);
+      std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] Force average: " << force_abs << std::endl;
      std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["<<a <<"] "<<name<<" dt "<<ep<<  std::endl;
      // track the total
      level_force = level_force+force;
      Real force_abs   = std::sqrt(norm2(force)/U.Grid()->gSites()); //average per-site norm.  nb. norm2(latt) = \sum_x norm2(latt[x]) 
      Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;    
      Real force_max   = std::sqrt(maxLocalNorm2(force));
      Real impulse_max = force_max * ep * HMC_MOMENTUM_DENOMINATOR;    
      as[level].actions.at(a)->deriv_log(force_abs,force_max,impulse_abs,impulse_max);
      std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] dt           : " << ep <<" "<<name<<std::endl;
      std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Force average: " << force_abs <<" "<<name<<std::endl;
      std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Force max    : " << force_max <<" "<<name<<std::endl;
      std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Fdt average  : " << impulse_abs <<" "<<name<<std::endl;
      std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Fdt max      : " << impulse_max <<" "<<name<<std::endl;
      Mom -= force * ep* HMC_MOMENTUM_DENOMINATOR;; 
      double end_full = usecond();
      double time_full  = (end_full - start_full) / 1e3;
      double time_force = (end_force - start_force) / 1e3;
      std::cout << GridLogMessage << "["<<level<<"]["<<a<<"] P update elapsed time: " << time_full << " ms (force: " << time_force << " ms)"  << std::endl;
    }
    {
      // total force
      Real force_abs   = std::sqrt(norm2(level_force)/U.Grid()->gSites()); //average per-site norm.  nb. norm2(latt) = \sum_x norm2(latt[x]) 
      Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;    
      Real force_max   = std::sqrt(maxLocalNorm2(level_force));
      Real impulse_max = force_max * ep * HMC_MOMENTUM_DENOMINATOR;    
      LevelForces[level].actions.at(0)->deriv_log(force_abs,force_max,impulse_abs,impulse_max);
    }
    // Force from the other representations
    as[level].apply(update_P_hireps, Representations, Mom, U, ep);
  }
  // Explicit momentum update that includes the generalised kinetic term.
  //
  // Contributions are subtracted from `Mom` in this order:
  //   1. the derivative of the kinetic term w.r.t. U, P.DerivativeU(P.Mom, .),
  //      taken after re-importing U into the metric P.M;
  //   2. the auxiliary-field derivative, evaluated between two half-steps
  //      (ep*0.5 each) of the auxiliary momenta;
  //   3. the projected force of every action on `level`.
  // Each of 1-3 is scaled by ep * HMC_MOMENTUM_DENOMINATOR.  Finally
  // update_P_hireps is applied over the other representations.
  //
  // Mom   : momentum field, updated in place
  // U     : gauge field (imported into the metric; forces use the Smearer's field)
  // level : action level whose forces are applied
  // ep    : step size
  //
  // NOTE: the kinetic-term derivative is always taken from P.Mom, whatever
  // field is passed as `Mom`.
  void update_P2(MomentaField& Mom, Field& U, int level, double ep) {
    // input U actually not used in the fundamental case
    // Fundamental updates, include smearing
    std::cout << GridLogIntegrator << "U before update_P2: " << std::sqrt(norm2(U)) << std::endl;

    // Generalised momenta
    // Derivative of the kinetic term must be computed before
    // Mom is the momenta and gets updated by the
    // actions derivatives
    MomentaField MomDer(P.Mom.Grid());
    P.M.ImportGauge(U);
    P.DerivativeU(P.Mom, MomDer);
    std::cout << GridLogIntegrator << "MomDer update_P2: " << std::sqrt(norm2(MomDer)) << std::endl;
    Mom -= MomDer * ep * HMC_MOMENTUM_DENOMINATOR;
    std::cout << GridLogIntegrator << "Mom update_P2: " << std::sqrt(norm2(Mom)) << std::endl;

    // Auxiliary fields: half-step, derivative, half-step
    P.update_auxiliary_momenta(ep*0.5 );
    P.AuxiliaryFieldsDerivative(MomDer);
    // Report the auxiliary derivative itself (the label says MomDer, so the
    // value printed must be MomDer and not Mom).
    std::cout << GridLogIntegrator << "MomDer(Aux) update_P2: " << std::sqrt(norm2(MomDer)) << std::endl;
    Mom -= MomDer * ep * HMC_MOMENTUM_DENOMINATOR;
    P.update_auxiliary_momenta(ep*0.5 );

    for (int a = 0; a < as[level].actions.size(); ++a) {
      double start_full = usecond();
      Field force(U.Grid());
      conformable(U.Grid(), Mom.Grid());

      Field& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared);
      double start_force = usecond();
      as[level].actions.at(a)->deriv(Us, force);  // deriv should NOT include Ta

      std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl;
      if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force);
      force = FieldImplementation::projectForce(force); // Ta for gauge fields
      double end_force = usecond();

      Real force_abs = std::sqrt(norm2(force)/U.Grid()->gSites());
      std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] Force average: " << force_abs << std::endl;
      Mom -= force * ep* HMC_MOMENTUM_DENOMINATOR;
      double end_full = usecond();

      // usecond() differences are microseconds; /1e3 gives milliseconds
      double time_full  = (end_full - start_full) / 1e3;
      double time_force = (end_force - start_force) / 1e3;
      std::cout << GridLogMessage << "["<<level<<"]["<<a<<"] P update elapsed time: " << time_full << " ms (force: " << time_force << " ms)"  << std::endl;
    }

    // Force from the other representations
    as[level].apply(update_P_hireps, Representations, Mom, U, ep);
  }
  // Implicit (fixed-point) update of the generalised momentum P.Mom.
  //
  // U            : gauge field; imported into the metric P.M and used for the grid
  // level        : action level whose forces enter the update
  // ep           : total step; t_P[level] is advanced by ep
  // ep1          : first part of the step; ep2 = ep - ep1 is the remainder
  // intermediate : when true, the kinetic derivative at the incoming momentum
  //                (MomDer1) is included with weight ep2 and the implicit
  //                term's weight factor drops from 2 to 1
  //
  // Sequence:
  //   1. accumulate the projected action forces of this level into Msum (once);
  //   2. advance the auxiliary momenta by ep1 and add the auxiliary-field
  //      derivative to Msum;
  //   3. iterate
  //        NewMom = P.Mom - HMC_MOMENTUM_DENOMINATOR *
  //                 (ep*Msum + ep1*factor*DerivativeU(NewMom) + ep2*MomDer1)
  //      until the relative change falls to Params.RMHMCTol or the iteration
  //      cap is reached;
  //   4. store NewMom in P.Mom and advance the auxiliary momenta by ep2.
  void implicit_update_P(Field& U, int level, double ep, double ep1, bool intermediate = false) {
    t_P[level] += ep;
    double ep2= ep-ep1;

    std::cout << GridLogIntegrator << "[" << level << "] P "
              << " dt " << ep << " : t_P " << t_P[level] << std::endl;
    std::cout << GridLogIntegrator << "U before implicit_update_P: " << std::sqrt(norm2(U)) << std::endl;

    // Fundamental updates, include smearing
    MomentaField Msum(P.Mom.Grid());
    Msum = Zero();
    for (int a = 0; a < as[level].actions.size(); ++a) {
      // Compute the force terms for the lagrangian part
      // We need to compute the derivative of the actions
      // only once
      Field force(U.Grid());
      conformable(U.Grid(), P.Mom.Grid());
      Field& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared);
      as[level].actions.at(a)->deriv(Us, force);  // deriv should NOT include Ta

      std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl;
      if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force);
      force = FieldImplementation::projectForce(force);  // Ta for gauge fields
      Real force_abs = std::sqrt(norm2(force) / U.Grid()->gSites());
      std::cout << GridLogIntegrator << "|Force| site average: " << force_abs
                << std::endl;
      Msum += force;
    }

    MomentaField NewMom = P.Mom;
    MomentaField OldMom = P.Mom;
    double threshold = Params.RMHMCTol;
    P.M.ImportGauge(U);
    MomentaField MomDer(P.Mom.Grid());
    MomentaField MomDer1(P.Mom.Grid());
    MomentaField AuxDer(P.Mom.Grid());
    MomDer1 = Zero();
    MomentaField diff(P.Mom.Grid());
    double factor = 2.0;
    if (intermediate){
      P.DerivativeU(P.Mom, MomDer1);
      factor = 1.0;
    }

    // Auxiliary fields
    P.update_auxiliary_momenta(ep1);
    P.AuxiliaryFieldsDerivative(AuxDer);
    Msum += AuxDer;

    // Fixed-point iteration.  Capped the same way as implicit_update_U so a
    // non-converging solve cannot hang the trajectory.
    int counter = 1;
    const int MaxCounter = 100;
    RealD RelativeError;
    do {
      std::cout << GridLogIntegrator << "UpdateP implicit step "<< counter << std::endl;

      // Compute the derivative of the kinetic term
      // with respect to the gauge field
      P.DerivativeU(NewMom, MomDer);
      Real force_abs = std::sqrt(norm2(MomDer) / U.Grid()->gSites());
      std::cout << GridLogIntegrator << "|Force| laplacian site average: " << force_abs
                << std::endl;

      NewMom = P.Mom -  HMC_MOMENTUM_DENOMINATOR * (ep*Msum + ep1* factor*MomDer + ep2* MomDer1);// simplify
      diff = NewMom - OldMom;
      counter++;
      RelativeError = std::sqrt(norm2(diff))/std::sqrt(norm2(NewMom));
      std::cout << GridLogIntegrator << "UpdateP RelativeError: " << RelativeError << std::endl;
      OldMom = NewMom;
    } while (RelativeError > threshold && counter < MaxCounter);

    if (RelativeError > threshold) {
      std::cout << GridLogMessage << "implicit_update_P: WARNING not converged, RelativeError "
                << RelativeError << " > " << threshold << " at iteration cap " << MaxCounter << std::endl;
    }

    P.Mom = NewMom;
    std::cout << GridLogIntegrator << "NewMom implicit_update_P: " << std::sqrt(norm2(NewMom)) << std::endl;

    // update the auxiliary fields momenta
    P.update_auxiliary_momenta(ep2);
  }
  // Convenience overload: splits the step symmetrically, i.e. ep1 = ep/2.
  void implicit_update_P(Field& U, int level, double ep, bool intermediate = false) {
      const double first_part = ep*0.5;
      implicit_update_P(U, level, ep, first_part, intermediate);
  }
  void update_U(Field& U, double ep) 
  {
-    update_U(P, U, ep);
+    update_U(P.Mom, U, ep);
    t_U += ep;
    int fl = levels - 1;
@@ -201,12 +318,8 @@ public:
  void update_U(MomentaField& Mom, Field& U, double ep) 
  {
    MomentaField MomFiltered(Mom.Grid());
    MomFiltered = Mom;
    MomFilter->applyFilter(MomFiltered);
    // exponential of Mom*U in the gauge fields case
-    FieldImplementation::update_field(MomFiltered, U, ep);
+    FieldImplementation::update_field(Mom, U, ep);
    // Update the smeared fields, can be implemented as observer
    Smearer.set_Field(U);
@@ -215,18 +328,74 @@ public:
    Representations.update(U);  // void functions if fundamental representation
  }
  // Implicit (fixed-point) update of the gauge field U.
  //
  // U   : gauge field, replaced by the converged (or last) iterate
  // ep  : total step; t_U is advanced by ep
  // ep1 : first part of the step; ep2 = ep - ep1 is the remainder
  //
  // Sequence:
  //   1. Mom1 = P.DerivativeP() with the metric holding the incoming U;
  //   2. auxiliary fields advanced by ep1;
  //   3. iterate: Mom2 = P.DerivativeP() with the metric holding the latest
  //      iterate, then per direction
  //        NewU_mu = ProjectOnGroup( expMat(ep1*Mom1 + ep2*Mom2, 1, 12) * U_mu )
  //      until the relative change falls to Params.RMHMCTol or the counter
  //      reaches MaxCounter;
  //   4. U = NewU and auxiliary fields advanced by ep2.
  void implicit_update_U(Field&U, double ep, double ep1 ){
    double ep2=ep-ep1;
    t_U += ep;
    int fl = levels - 1;
    std::cout << GridLogIntegrator << "   " << "[" << fl << "] U " << " dt " << ep << " : t_U " << t_U << std::endl;
    std::cout << GridLogIntegrator << "U before implicit_update_U: " << std::sqrt(norm2(U)) << std::endl;

    MomentaField Mom1(P.Mom.Grid());
    MomentaField Mom2(P.Mom.Grid());
    RealD RelativeError;
    Field diff(U.Grid());
    Real threshold =  Params.RMHMCTol;
    int counter = 1;
    int MaxCounter = 100;

    Field OldU = U;
    Field NewU = U;

    P.M.ImportGauge(U);
    P.DerivativeP(Mom1); // first term in the derivative
    std::cout << GridLogIntegrator << "implicit_update_U: Mom1: " << std::sqrt(norm2(Mom1)) << std::endl;

    P.update_auxiliary_fields(ep1);

    MomentaField sum=Mom1;
    do {
      std::cout << GridLogIntegrator << "UpdateU implicit step "<< counter << std::endl;

      P.DerivativeP(Mom2); // second term in the derivative, on the updated U
      // Report the freshly computed term (Mom1 was already logged above).
      std::cout << GridLogIntegrator << "implicit_update_U: Mom2: " << std::sqrt(norm2(Mom2)) << std::endl;
      sum = (Mom1*ep1 + Mom2*ep2);

      for (int mu = 0; mu < Nd; mu++) {
        auto Umu = PeekIndex<LorentzIndex>(U, mu);
        auto Pmu = PeekIndex<LorentzIndex>(sum, mu);
        Umu = expMat(Pmu, 1, 12) * Umu;
        PokeIndex<LorentzIndex>(NewU, ProjectOnGroup(Umu), mu);
      }

      diff = NewU - OldU;
      RelativeError = std::sqrt(norm2(diff))/std::sqrt(norm2(NewU));
      std::cout << GridLogIntegrator << "UpdateU RelativeError: " << RelativeError << std::endl;

      // The next DerivativeP must see the latest iterate
      P.M.ImportGauge(NewU);
      OldU = NewU; // some redundancy to be eliminated
      counter++;
    } while (RelativeError > threshold && counter < MaxCounter);

    if (RelativeError > threshold) {
      std::cout << GridLogMessage << "implicit_update_U: WARNING not converged, RelativeError "
                << RelativeError << " > " << threshold << " at iteration cap " << MaxCounter << std::endl;
    }

    U = NewU;
    std::cout << GridLogIntegrator << "NewU implicit_update_U: " << std::sqrt(norm2(U)) << std::endl;
    P.update_auxiliary_fields(ep2);
  }
  virtual void step(Field& U, int level, int first, int last) = 0;
 public:
  Integrator(GridBase* grid, IntegratorParameters Par,
             ActionSet<Field, RepresentationPolicy>& Aset,
-             SmearingPolicy& Sm)
+             SmearingPolicy& Sm, Metric<MomentaField>& M)
    : Params(Par),
      as(Aset),
-      P(grid),
+      P(grid, M),
      levels(Aset.size()),
      Smearer(Sm),
-      Representations(grid) 
+      Representations(grid),
      Saux(0.),Smom(0.),Sg(0.)
  {
    t_P.resize(levels, 0.0);
    t_U = 0.0;
@@ -234,16 +403,6 @@ public:
    //Default the momentum filter to "do-nothing"
    MomFilter = getDefaultMomFilter();
    for (int level = 0; level < as.size(); ++level) {
      int multiplier = as.at(level).multiplier;
      ActionLevel<Field, RepresentationPolicy> * Level = new ActionLevel<Field, RepresentationPolicy>(multiplier);
      Level->push_back(new EmptyAction<Field>); 
      LevelForces.push_back(*Level);
      // does it copy by value or reference??
      // - answer it copies by value, BUT the action level contains a reference that is NOT updated.
      // Unsafe code in Guido's area
    }
  };
  virtual ~Integrator() {}
@@ -261,14 +420,10 @@ public:
  // Resets the timers of every action on every level, and of the single
  // entry each level holds in LevelForces.
  void reset_timer(void)
  {
    assert(as.size()==LevelForces.size());
    for (int lvl = 0; lvl < as.size(); ++lvl) {
      // per-action timers
      int act = 0;
      while (act < as[lvl].actions.size()) {
        as[lvl].actions.at(act)->reset_timer();
        ++act;
      }
      // per-level total: exactly one entry is expected
      assert(LevelForces.at(lvl).actions.size()==1);
      LevelForces.at(lvl).actions.at(0)->reset_timer();
    }
  }
  void print_timer(void)
@@ -330,16 +485,6 @@ public:
 		  <<" calls "     << as[level].actions.at(actionID)->deriv_num
 		  << std::endl;
      }
      int actionID=0;
      std::cout << GridLogMessage 
 		  << LevelForces[level].actions.at(actionID)->action_name()
 		  <<"["<<level<<"]["<< actionID<<"] :\n\t\t "
 		  <<" force max " << LevelForces[level].actions.at(actionID)->deriv_max_average()
 		  <<" norm "      << LevelForces[level].actions.at(actionID)->deriv_norm_average()
 		  <<" Fdt max  "  << LevelForces[level].actions.at(actionID)->Fdt_max_average()
 		  <<" Fdt norm "  << LevelForces[level].actions.at(actionID)->Fdt_norm_average()
 		  <<" calls "     << LevelForces[level].actions.at(actionID)->deriv_num
 		  << std::endl;
    }
    std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl;
  }
@@ -361,19 +506,13 @@ public:
 	std::cout << as[level].actions.at(actionID)->LogParameters();
      }
    }
    std::cout << " [Integrator] Total Force loggers: "<< LevelForces.size() <<std::endl;
    for (int level = 0; level < LevelForces.size(); ++level) {
      std::cout << GridLogMessage << "[Integrator] ---- Level: "<< level << std::endl;
      for (int actionID = 0; actionID < LevelForces[level].actions.size(); ++actionID) {
 	std::cout << GridLogMessage << "["<< LevelForces[level].actions.at(actionID)->action_name() << "] ID: " << actionID << std::endl;
      }
    }
    std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl;
  }
  void reverse_momenta()
  {
-    P *= -1.0;
+    P.Mom *= -1.0;
    P.AuxMom *= -1.0;
  }
  // to be used by the actionlevel class to iterate
@@ -392,11 +531,14 @@ public:
  // Initialization of momenta and actions
  void refresh(Field& U,  GridSerialRNG & sRNG, GridParallelRNG& pRNG) 
  {
-    assert(P.Grid() == U.Grid());
+    assert(P.Mom.Grid() == U.Grid());
    std::cout << GridLogIntegrator << "Integrator refresh" << std::endl;
    std::cout << GridLogIntegrator << "Generating momentum" << std::endl;
-    FieldImplementation::generate_momenta(P, sRNG, pRNG);
+//    FieldImplementation::generate_momenta(P.Mom, sRNG, pRNG);
    P.M.ImportGauge(U);
    P.MomentaDistribution(sRNG,pRNG);
    // Update the smeared fields, can be implemented as observer
    // necessary to keep the fields updated even after a reject
@@ -449,12 +591,24 @@ public:
  RealD S(Field& U) 
  {  // here also U not used
    assert(as.size()==LevelForces.size());
    std::cout << GridLogIntegrator << "Integrator action\n";
-    RealD H = - FieldImplementation::FieldSquareNorm(P)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom
+//    RealD H = - FieldImplementation::FieldSquareNorm(P.Mom)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom
 //    RealD Hterm;
 //    static RealD Saux=0.,Smom=0.,Sg=0.;
    RealD H = - FieldImplementation::FieldSquareNorm(P.Mom)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom
    std::cout << GridLogMessage << "S:FieldSquareNorm H_p = " << H << "\n";
    std::cout << GridLogMessage << "S:dSField = " << H-Smom << "\n";
    Smom=H;
    P.M.ImportGauge(U);
    RealD Hterm = - P.MomentaAction();
    std::cout << GridLogMessage << "S:Momentum action H_p = " << Hterm << "\n";
    std::cout << GridLogMessage << "S:dSMom = " << Hterm-Saux << "\n";
    Saux=Hterm;
    H = Hterm;
    RealD Hterm;
    // Actions
    for (int level = 0; level < as.size(); ++level) {
@@ -496,9 +650,18 @@ public:
    std::cout << GridLogIntegrator << "Integrator initial action\n";
-    RealD H = - FieldImplementation::FieldSquareNorm(P)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom
+//    RealD H = - FieldImplementation::FieldSquareNorm(P.Mom)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom
-
+//    RealD Hterm;
-    RealD Hterm;
+    RealD H = - FieldImplementation::FieldSquareNorm(P.Mom)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom
    std::cout << GridLogMessage << "S:FieldSquareNorm H_p = " << H << "\n";
    std::cout << GridLogMessage << "S:dSField = " << H-Smom << "\n";
    Smom=H;
    P.M.ImportGauge(U);
    RealD Hterm = - P.MomentaAction();
    std::cout << GridLogMessage << "S:Momentum action H_p = " << Hterm << "\n";
    std::cout << GridLogMessage << "S:dSMom = " << Hterm-Saux << "\n";
    Saux=Hterm;
    H = Hterm;
    // Actions
    for (int level = 0; level < as.size(); ++level) {
@@ -521,7 +684,7 @@ public:
  }
-  void integrate(Field& U) 
+  void integrate(Field& U, int traj=-1 ) 
  {
    // reset the clocks
    t_U = 0;
@@ -533,6 +696,12 @@ public:
      int first_step = (stp == 0);
      int last_step = (stp == Params.MDsteps - 1);
      this->step(U, 0, first_step, last_step);
      if (traj>=0){
        std::string file("./config."+std::to_string(traj)+"_"+std::to_string(stp+1) );
        int precision32 = 0;
        int tworow      = 0;
        NerscIO::writeConfiguration(U,file,tworow,precision32);
      }
    }
    // Check the clocks all match on all levels
@@ -542,7 +711,6 @@ public:
    }
    FieldImplementation::Project(U);
    // and that we indeed got to the end of the trajectory
    assert(fabs(t_U - Params.trajL) < 1.0e-6);
--- a/Grid/qcd/hmc/integrators/Integrator_algorithm.h
+++ b/Grid/qcd/hmc/integrators/Integrator_algorithm.h
@@ -102,8 +102,8 @@ public:
  std::string integrator_name(){return "LeapFrog";}
-  LeapFrog(GridBase* grid, IntegratorParameters Par, ActionSet<Field, RepresentationPolicy>& Aset, SmearingPolicy& Sm)
+  LeapFrog(GridBase* grid, IntegratorParameters Par, ActionSet<Field, RepresentationPolicy>& Aset, SmearingPolicy& Sm, Metric<Field>& M)
-    : Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>(grid, Par, Aset, Sm){};
+    : Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>(grid, Par, Aset, Sm,M){};
  void step(Field& U, int level, int _first, int _last) {
    int fl = this->as.size() - 1;
@@ -140,14 +140,14 @@ template <class FieldImplementation_, class SmearingPolicy, class Representation
 class MinimumNorm2 : public Integrator<FieldImplementation_, SmearingPolicy, RepresentationPolicy> 
 {
 private:
-  const RealD lambda = 0.1931833275037836;
+//  const RealD lambda = 0.1931833275037836;
 public:
  typedef FieldImplementation_ FieldImplementation;
  INHERIT_FIELD_TYPES(FieldImplementation);
-  MinimumNorm2(GridBase* grid, IntegratorParameters Par, ActionSet<Field, RepresentationPolicy>& Aset, SmearingPolicy& Sm)
+  MinimumNorm2(GridBase* grid, IntegratorParameters Par, ActionSet<Field, RepresentationPolicy>& Aset, SmearingPolicy& Sm, Metric<Field>& M)
-    : Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>(grid, Par, Aset, Sm){};
+    : Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>(grid, Par, Aset, Sm,M){};
  std::string integrator_name(){return "MininumNorm2";}
@@ -155,6 +155,11 @@ public:
    // level  : current level
    // fl     : final level
    // eps    : current step size
    assert(level<3);
    RealD lambda= this->Params.lambda0;
    if (level>0) lambda= this->Params.lambda1;
    if (level>1) lambda= this->Params.lambda2;
    std::cout << GridLogMessage << "level: "<<level<< "lambda: "<<lambda<<std::endl;
    int fl = this->as.size() - 1;
@@ -210,9 +215,9 @@ public:
  // Looks like dH scales as dt^4. tested wilson/wilson 2 level.
  ForceGradient(GridBase* grid, IntegratorParameters Par,
                ActionSet<Field, RepresentationPolicy>& Aset,
-                SmearingPolicy& Sm)
+                SmearingPolicy& Sm, Metric<Field>& M)
    : Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>(
-									    grid, Par, Aset, Sm){};
+									    grid, Par, Aset, Sm,M){};
  std::string integrator_name(){return "ForceGradient";}
@@ -275,6 +280,255 @@ public:
  }
 };
 ////////////////////////////////
 // Riemannian Manifold HMC
 // Girolami et al
 ////////////////////////////////
 // correct
 // Leapfrog-style integrator built from the implicit momentum and gauge
 // updates of the Integrator base class (implicit_update_P / implicit_update_U),
 // closing each trajectory with the explicit update_P2.
 template <class FieldImplementation, class SmearingPolicy,
          class RepresentationPolicy =
              Representations<FundamentalRepresentation> >
 class ImplicitLeapFrog : public Integrator<FieldImplementation, SmearingPolicy,
                                           RepresentationPolicy> {
 public:
  typedef ImplicitLeapFrog<FieldImplementation, SmearingPolicy, RepresentationPolicy>
      Algorithm;
  INHERIT_FIELD_TYPES(FieldImplementation);
  // Riemannian manifold metric operator
  // Hermitian operator Fisher
  std::string integrator_name(){return "ImplicitLeapFrog";}
  // All arguments, including the metric M, are forwarded unchanged to the
  // Integrator base class.
  ImplicitLeapFrog(GridBase* grid, IntegratorParameters Par,
           ActionSet<Field, RepresentationPolicy>& Aset, SmearingPolicy& Sm, Metric<Field>& M)
      : Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>(
            grid, Par, Aset, Sm, M){};
  // One integration step at `level`.  _first/_last flag whether this call
  // contains the first/last sub-step of the whole trajectory; they are
  // narrowed to first_step/last_step per sub-step below.
  void step(Field& U, int level, int _first, int _last) {
    int fl = this->as.size() - 1;
    // level  : current level
    // fl     : final level
    // eps    : current step size
    // Get current level step size
    // eps = trajL/MDsteps divided by the multipliers of levels 0..level
    RealD eps = this->Params.trajL/this->Params.MDsteps;
    for (int l = 0; l <= level; ++l) eps /= this->as[l].multiplier;
    int multiplier = this->as[level].multiplier;
    for (int e = 0; e < multiplier; ++e) {
      int first_step = _first && (e == 0);
      int last_step = _last && (e == multiplier - 1);
      if (first_step) {  // initial half step
       this->implicit_update_P(U, level, eps / 2.0);
      }
      if (level == fl) {  // lowest level
        // full gauge step eps, split as ep1 = eps/2 (so ep2 = eps/2)
        this->implicit_update_U(U, eps,eps/2.);
      } else {  // recursive function call
        this->step(U, level + 1, first_step, last_step);
      }
      //int mm = last_step ? 1 : 2;
      // Closing momentum update: explicit half step at the very end of the
      // trajectory, otherwise a full implicit step flagged as intermediate.
      if (last_step){
        this->update_P2(U, level, eps / 2.0);
      } else {
      this->implicit_update_P(U, level, eps, true);// works intermediate step
      }
    }
  }
 };
 // Minimum-norm style integrator: levels above the final one use the explicit
 // update_P with recursion into the next level; the final level uses the
 // implicit momentum/gauge updates of the Integrator base class.
 // The per-level lambda is read from Params.lambda0/1/2, so at most three
 // levels are supported (enforced by the assert in step()).
 template <class FieldImplementation, class SmearingPolicy,
          class RepresentationPolicy =
              Representations<FundamentalRepresentation> >
 class ImplicitMinimumNorm2 : public Integrator<FieldImplementation, SmearingPolicy,
                                       RepresentationPolicy> {
 private:
 //  const RealD lambda = 0.1931833275037836;
 public:
  INHERIT_FIELD_TYPES(FieldImplementation);
  // All arguments, including the metric M, are forwarded unchanged to the
  // Integrator base class.
  ImplicitMinimumNorm2(GridBase* grid, IntegratorParameters Par,
               ActionSet<Field, RepresentationPolicy>& Aset, SmearingPolicy& Sm, Metric<Field>& M)
      : Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>(
            grid, Par, Aset, Sm, M){};
  std::string integrator_name(){return "ImplicitMininumNorm2";}
  // One integration step at `level`.  _first/_last flag whether this call
  // contains the first/last sub-step of the whole trajectory.
  void step(Field& U, int level, int _first, int _last) {
    // level  : current level
    // fl     : final level
    // eps    : current step size
    int fl = this->as.size() - 1;
 //    assert(Params.lambda.size()>level);
 //    RealD lambda= Params.lambda[level];
    assert(level<3);
    // select lambda0 / lambda1 / lambda2 for level 0 / 1 / 2
    RealD lambda= this->Params.lambda0;
    if (level>0) lambda= this->Params.lambda1;
    if (level>1) lambda= this->Params.lambda2;
    std::cout << GridLogMessage << "level: "<<level<< "lambda: "<<lambda<<std::endl;
  if(level<fl){
    // Not the final level: explicit P updates around two recursive calls.
    RealD eps = this->Params.trajL/this->Params.MDsteps * 2.0;
    for (int l = 0; l <= level; ++l) eps /= 2.0 * this->as[l].multiplier;
    // Nesting:  2xupdate_U of size eps/2
    // Next level is eps/2/multiplier
    int multiplier = this->as[level].multiplier;
    for (int e = 0; e < multiplier; ++e) {  // steps per step
      int first_step = _first && (e == 0);
      int last_step = _last && (e == multiplier - 1);
      if (first_step) {  // initial half step
        this->update_P(U, level, lambda * eps);
      }
        this->step(U, level + 1, first_step, 0);
      this->update_P(U, level, (1.0 - 2.0 * lambda) * eps);
        this->step(U, level + 1, 0, last_step);
      // a non-final closing update is doubled (mm = 2): it also stands in for
      // the opening update of the next sub-step, which only runs on first_step
      int mm = (last_step) ? 1 : 2;
      this->update_P(U, level, lambda * eps * mm);
    }
  } 
  else 
  { // last level
    RealD eps = this->Params.trajL/this->Params.MDsteps * 2.0;
    for (int l = 0; l <= level; ++l) eps /= 2.0 * this->as[l].multiplier;
    // Nesting:  2xupdate_U of size eps/2
    // Next level is eps/2/multiplier
    int multiplier = this->as[level].multiplier;
    for (int e = 0; e < multiplier; ++e) {  // steps per step
      int first_step = _first && (e == 0);
      int last_step = _last && (e == multiplier - 1);
      if (first_step) {  // initial half step
        this->implicit_update_P(U, level, lambda * eps);
      }
      // two gauge half-steps of eps/2; the ep1 splits are lambda*eps and
      // (0.5-lambda)*eps respectively
      this->implicit_update_U(U, 0.5 * eps,lambda*eps);
      this->implicit_update_P(U, level, (1.0 - 2.0 * lambda) * eps, true);
      this->implicit_update_U(U, 0.5 * eps, (0.5-lambda)*eps);
      // explicit closing update at the end of the trajectory, otherwise a
      // doubled implicit update flagged as intermediate
      if (last_step) {
        this->update_P2(U, level, eps * lambda);
      } else {
        this->implicit_update_P(U, level, lambda * eps*2.0, true);
      }
    }
  }
  }
 };
 // Integrator whose final level composes three implicit gauge sub-steps of
 // sizes epsilon, -sigma*epsilon, epsilon with sigma = 2^(1/3) and
 // epsilon = dt/(2 - sigma), so the sub-steps add up to dt.
 // Levels above the final one use the same explicit scheme as
 // ImplicitMinimumNorm2 (see the "Still Omelyan" remark below).
 template <class FieldImplementation, class SmearingPolicy,
          class RepresentationPolicy =
              Representations<FundamentalRepresentation> >
 class ImplicitCampostrini : public Integrator<FieldImplementation, SmearingPolicy,
                                       RepresentationPolicy> {
 private:
 //  const RealD lambda = 0.1931833275037836;
 public:
  INHERIT_FIELD_TYPES(FieldImplementation);
  // All arguments, including the metric M, are forwarded unchanged to the
  // Integrator base class.
  ImplicitCampostrini(GridBase* grid, IntegratorParameters Par,
               ActionSet<Field, RepresentationPolicy>& Aset, SmearingPolicy& Sm, Metric<Field>& M)
      : Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>(
            grid, Par, Aset, Sm, M){};
  std::string integrator_name(){return "ImplicitCampostrini";}
  // One integration step at `level`.  _first/_last flag whether this call
  // contains the first/last sub-step of the whole trajectory.
  void step(Field& U, int level, int _first, int _last) {
    // level  : current level
    // fl     : final level
    // eps    : current step size
    int fl = this->as.size() - 1;
 //    assert(Params.lambda.size()>level);
 //    RealD lambda= Params.lambda[level];
    assert(level<3);
    // lambda is only used by the non-final-level branch
    RealD lambda= this->Params.lambda0;
    if (level>0) lambda= this->Params.lambda1;
    if (level>1) lambda= this->Params.lambda2;
    std::cout << GridLogMessage << "level: "<<level<< "lambda: "<<lambda<<std::endl;
    RealD sigma=pow(2.0,1./3.);
  if(level<fl){
 //Still Omelyan. Needs to change step() to accept variable stepsize
    RealD eps = this->Params.trajL/this->Params.MDsteps * 2.0;
    for (int l = 0; l <= level; ++l) eps /= 2.0 * this->as[l].multiplier;
    // Nesting:  2xupdate_U of size eps/2
    // Next level is eps/2/multiplier
    int multiplier = this->as[level].multiplier;
    for (int e = 0; e < multiplier; ++e) {  // steps per step
      int first_step = _first && (e == 0);
      int last_step = _last && (e == multiplier - 1);
      if (first_step) {  // initial half step
        this->update_P(U, level, lambda * eps);
      }
        this->step(U, level + 1, first_step, 0);
      this->update_P(U, level, (1.0 - 2.0 * lambda) * eps);
        this->step(U, level + 1, 0, last_step);
      int mm = (last_step) ? 1 : 2;
      this->update_P(U, level, lambda * eps * mm);
    }
  } 
  else 
  { // last level
    // dt is the step of this level; epsilon rescales it so that
    // epsilon - sigma*epsilon + epsilon == dt
    RealD dt = this->Params.trajL/this->Params.MDsteps * 2.0;
    for (int l = 0; l <= level; ++l) dt /= 2.0 * this->as[l].multiplier;
    RealD epsilon = dt/(2.0 - sigma);
    int multiplier = this->as[level].multiplier;
    for (int e = 0; e < multiplier; ++e) {  // steps per step
      int first_step = _first && (e == 0);
      int last_step = _last && (e == multiplier - 1);
      // initial half step
      if (first_step) {  this->implicit_update_P(U, level, epsilon*0.5); }
      this->implicit_update_U(U, epsilon,epsilon*0.5);
      // momentum step (1-sigma)*epsilon/2 with ep1 = epsilon/2
      this->implicit_update_P(U, level, (1.0 - sigma) * epsilon *0.5, epsilon*0.5, true);
      // backward gauge sub-step of size -sigma*epsilon
      this->implicit_update_U(U, -epsilon*sigma, -epsilon*sigma*0.5);
      // momentum step (1-sigma)*epsilon/2 with ep1 = -sigma*epsilon/2
      this->implicit_update_P(U, level, (1.0 - sigma) * epsilon *0.5, -epsilon*sigma*0.5, true);
      this->implicit_update_U(U, epsilon,epsilon*0.5);
      // NOTE(review): unlike the other implicit integrators in this file, the
      // non-final closing update below is called without intermediate=true —
      // confirm this is intended.
      if (last_step) { this->update_P2(U, level, epsilon*0.5 ); } 
      else
      this->implicit_update_P(U, level, epsilon,epsilon*0.5);
    }
  }
  }
 };
 NAMESPACE_END(Grid);
 #endif  // INTEGRATOR_INCLUDED
--- a/Grid/qcd/smearing/GaugeConfigurationMasked.h
+++ b/Grid/qcd/smearing/GaugeConfigurationMasked.h
@@ -1,4 +1,3 @@
 /*!
  @file GaugeConfiguration.h
  @brief Declares the GaugeConfiguration class
@@ -7,15 +6,6 @@
 NAMESPACE_BEGIN(Grid);
 // Debug helper: reads the scalar object stored at `site` (origin by default)
 // from `lat` and prints it to stdout, tagged with the label `s`.
 template<class T> void Dump(const Lattice<T> & lat,
 			    std::string s,
 			    Coordinate site = Coordinate({0,0,0,0}))
 {
  typedef typename T::scalar_object SiteObject;
  SiteObject site_value;
  peekSite(site_value,lat,site);
  std::cout << " Dump " << s << " " << site_value << std::endl;
 }
 /*!
  @brief Smeared configuration masked container
  Modified for a multi-subset smearing (aka Luscher Flowed HMC)
@@ -38,101 +28,6 @@ private:
  typedef typename SU3Adjoint::LatticeAdjMatrix  AdjMatrixField;
  typedef typename SU3Adjoint::LatticeAdjVector  AdjVectorField;
  // Accumulates into SigmaTerm (via Gimpl::AddLink) the staple-derivative
  // terms that involve direction mmu, each weighted by rho.
  //
  // SigmaTerm : accumulated into; not cleared here
  // iLambda   : per-direction Lambda field entering the terms below
  // U         : gauge field
  // mmu       : the direction that must appear as mu or nu for a term to contribute
  // rho       : smearing weight (used for both rho_munu and rho_numu)
  void BaseSmearDerivative(GaugeField& SigmaTerm,
 			   const GaugeField& iLambda,
 			   const GaugeField& U,
 			   int mmu, RealD rho)
  {
    // Reference
    // Morningstar, Peardon, Phys.Rev.D69,054501(2004)
    // Equation 75
    // Computing Sigma_mu, derivative of S[fat links] with respect to the thin links
    // Output SigmaTerm
    GridBase *grid = U.Grid();
    WilsonLoops<Gimpl> WL;
    GaugeLinkField staple(grid), u_tmp(grid);
    GaugeLinkField iLambda_mu(grid), iLambda_nu(grid);
    GaugeLinkField U_mu(grid), U_nu(grid);
    GaugeLinkField sh_field(grid), temp_Sigma(grid);
    Real rho_munu, rho_numu;
    rho_munu = rho;
    rho_numu = rho;
    for(int mu = 0; mu < Nd; ++mu){
      U_mu       = peekLorentz(      U, mu);
      iLambda_mu = peekLorentz(iLambda, mu);
      for(int nu = 0; nu < Nd; ++nu){
 	if(nu==mu) continue;
 	U_nu       = peekLorentz(      U, nu);
 	// Nd(nd-1) = 12 staples normally.
 	// We must compute 6 of these
 	// in FTHMC case
 	// The staple is only refreshed when it will be used below; for other
 	// (mu,nu) pairs it holds a stale value that is never read.
 	if ( (mu==mmu)||(nu==mmu) )
 	  WL.StapleUpper(staple, U, mu, nu);
 	if(nu==mmu) {
 	  iLambda_nu = peekLorentz(iLambda, nu);
 	  temp_Sigma = -rho_numu*staple*iLambda_nu;  //ok
 	  //-r_numu*U_nu(x+mu)*Udag_mu(x+nu)*Udag_nu(x)*Lambda_nu(x)
 	  Gimpl::AddLink(SigmaTerm, temp_Sigma, mu);
 	  sh_field = Cshift(iLambda_nu, mu, 1);// general also for Gparity?
 	  temp_Sigma = rho_numu*sh_field*staple; //ok
 	  //r_numu*Lambda_nu(mu)*U_nu(x+mu)*Udag_mu(x+nu)*Udag_nu(x)
 	  Gimpl::AddLink(SigmaTerm, temp_Sigma, mu);
 	}
 	if ( mu == mmu ) { 
 	  sh_field = Cshift(iLambda_mu, nu, 1);
 	  temp_Sigma = -rho_munu*staple*U_nu*sh_field*adj(U_nu); //ok
 	  //-r_munu*U_nu(x+mu)*Udag_mu(x+nu)*Lambda_mu(x+nu)*Udag_nu(x)
 	  Gimpl::AddLink(SigmaTerm, temp_Sigma, mu);
 	}
 	//	staple = Zero();
 	// Remaining terms are built at the current site and then shifted by -1
 	// in nu before being added; temp_Sigma stays zero when neither mu nor
 	// nu equals mmu, so that case adds nothing.
 	sh_field = Cshift(U_nu, mu, 1);
 	temp_Sigma = Zero();
 	if ( mu == mmu )
 	  temp_Sigma = -rho_munu*adj(sh_field)*adj(U_mu)*iLambda_mu*U_nu;
 	if ( nu == mmu ) {
 	  temp_Sigma += rho_numu*adj(sh_field)*adj(U_mu)*iLambda_nu*U_nu;
 	  u_tmp = adj(U_nu)*iLambda_nu;
 	  sh_field = Cshift(u_tmp, mu, 1);
 	  temp_Sigma += -rho_numu*sh_field*adj(U_mu)*U_nu;
 	}
 	sh_field = Cshift(temp_Sigma, nu, -1);
 	Gimpl::AddLink(SigmaTerm, sh_field, mu);
       }
    }
  }
  // Fills Cup with the sum over nu != mu of adj(rho * staple(mu,nu)).
  // Cup is cleared first, so any previous content is discarded.
  void BaseSmear(GaugeLinkField& Cup, const GaugeField& U,int mu,RealD rho) {
    WilsonLoops<Gimpl> WL;
    GaugeLinkField staple_munu(U.Grid());

    Cup = Zero();
    for (int nu = 0; nu < Nd; ++nu) {
      if (nu == mu) continue;   // no staple in the link's own direction
      // nb staple conventions of IroIro and Grid differ by a dagger
      WL.Staple(staple_munu, U, mu, nu);
      Cup += adj(staple_munu*rho);
    }
  }
  // Adjoint vector to GaugeField force
  void InsertForce(GaugeField &Fdet,AdjVectorField &Fdet_nu,int nu)
  {
@@ -152,54 +47,27 @@ private:
    GaugeLinkField UtaU(PlaqL.Grid());
    GaugeLinkField D(PlaqL.Grid());
    AdjMatrixField Dbc(PlaqL.Grid());
    AdjMatrixField Dbc_opt(PlaqL.Grid());
    LatticeComplex tmp(PlaqL.Grid());
    const int Ngen = SU3Adjoint::Dimension;
    Complex ci(0,1);
    ColourMatrix   ta,tb,tc;
-    RealD t=0;
+    
    RealD tp=0;
    RealD tta=0;
    RealD tpk=0;
    t-=usecond();
    for(int a=0;a<Ngen;a++) {
      tta-=usecond();
      SU3::generator(a, ta);
      ta = 2.0 * ci * ta;
      // Qlat Tb = 2i Tb^Grid
-      UtaU= adj(PlaqL)*ta*PlaqR; // 6ms
+      UtaU= 2.0*ci*adj(PlaqL)*ta*PlaqR;
      tta+=usecond();
      ////////////////////////////////////////////
      // Could add this entire C-loop to a projection routine
      // for performance. Could also pick checkerboard on UtaU
      // and set checkerboard on result for 2x perf
      ////////////////////////////////////////////
      for(int c=0;c<Ngen;c++) {
 	SU3::generator(c, tc);
-	tc = 2.0*ci*tc;
+	D = Ta( (2.0)*ci*tc *UtaU);
 	tp-=usecond(); 
 	D = Ta( tc *UtaU); // 2ms
 #if 1
 	SU3::LieAlgebraProject(Dbc_opt,D,c); // 5.5ms
 #else
 	for(int b=0;b<Ngen;b++){
 	  SU3::generator(b, tb);
 	  tmp =-trace(ci*tb*D); 
 	  PokeIndex<ColourIndex>(Dbc,tmp,b,c);  // Adjoint rep
 	}
 #endif
 	tp+=usecond();
      }
-      //      Dump(Dbc_opt,"Dbc_opt");
+      tmp = trace(MpInvJx * Dbc);
      //      Dump(Dbc,"Dbc");
      tpk-=usecond();
      tmp = trace(MpInvJx * Dbc_opt);
      PokeIndex<ColourIndex>(Fdet2,tmp,a);
      tpk+=usecond();
    }
    t+=usecond();
    std::cout << GridLogPerformance << " Compute_MpInvJx_dNxxdSy " << t/1e3 << " ms  proj "<<tp/1e3<< " ms"
 	      << " ta "<<tta/1e3<<" ms" << " poke "<<tpk/1e3<< " ms"<<std::endl;
  }
  void ComputeNxy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR,AdjMatrixField &NxAd)
@@ -211,17 +79,12 @@ private:
    ColourMatrix   tc;
    for(int b=0;b<Ngen;b++) {
      SU3::generator(b, tb);
-      tb = 2.0 * ci * tb;
+      Nx = (2.0)*Ta( adj(PlaqL)*ci*tb * PlaqR );
      Nx = Ta( adj(PlaqL)*tb * PlaqR );
 #if 1
      SU3::LieAlgebraProject(NxAd,Nx,b);
 #else
      for(int c=0;c<Ngen;c++) {
 	SU3::generator(c, tc);
 	auto tmp =closure( -trace(ci*tc*Nx)); 
 	PokeIndex<ColourIndex>(NxAd,tmp,c,b); 
      }
 #endif
    }
  }
  void ApplyMask(GaugeField &U,int smr)
@@ -301,7 +164,8 @@ public:
    // Computes ALL the staples -- could compute one only and do it here
    RealD time;
    time=-usecond();
-    BaseSmear(Cmu, U,mu,rho);
+    this->StoutSmearing->BaseSmear(C, U);
    Cmu = peekLorentz(C, mu);
    //////////////////////////////////////////////////////////////////
    // Assemble Luscher exp diff map J matrix 
@@ -345,36 +209,6 @@ public:
    // dJ(x)/dxe
    //////////////////////////////////////
    time=-usecond();
 #if 1
    std::vector<AdjMatrixField>  dJdX;    dJdX.resize(8,grid);
    std::vector<AdjMatrix> TRb_s; TRb_s.resize(8);
    AdjMatrixField tbXn(grid);
    AdjMatrixField sumXtbX(grid);
    AdjMatrixField t2(grid);
    AdjMatrixField dt2(grid);
    AdjMatrixField t3(grid);
    AdjMatrixField dt3(grid);
    AdjMatrixField aunit(grid);
    for(int b=0;b<8;b++){
      SU3Adjoint::generator(b, TRb_s[b]);
      dJdX[b] = TRb_s[b];
    }
    aunit = ComplexD(1.0);
    // Could put into an accelerator_for
    X  = (-1.0)*ZxAd; 
    t2 = X;
    for (int j = 12; j > 1; --j) {
      t3  = t2*(1.0 / (j + 1))  + aunit;
      t2  = X * t3;
      for(int b=0;b<8;b++){
 	dJdX[b]= TRb_s[b] * t3 + X * dJdX[b]*(1.0 / (j + 1));
      }
    }
    for(int b=0;b<8;b++){
      dJdX[b] = -dJdX[b];
    }
 #else
    std::vector<AdjMatrixField>  dJdX;    dJdX.resize(8,grid);
    AdjMatrixField tbXn(grid);
    AdjMatrixField sumXtbX(grid);
@@ -390,15 +224,14 @@ public:
      X  = (-1.0)*ZxAd; 
      t2 = X;
      dt2 = TRb;
-      for (int j = 12; j > 1; --j) {
+      for (int j = 20; j > 1; --j) {
-	t3  = t2*(1.0 / (j + 1))  + aunit;
+	t3 = t2*(1.0 / (j + 1))  + aunit;
 	dt3 = dt2*(1.0 / (j + 1));
 	t2 = X * t3;
 	dt2 = TRb * t3 + X * dt3;
      }
      dJdX[b] = -dt2; 
    }
 #endif  
    time+=usecond();
    std::cout << GridLogMessage << "dJx took "<<time<< " us"<<std::endl;
    /////////////////////////////////////////////////////////////////
@@ -448,8 +281,8 @@ public:
    for(int e =0 ; e<8 ; e++){
      LatticeComplexD tr(grid);
-      //      ColourMatrix te;
+      ColourMatrix te;
-      //      SU3::generator(e, te);
+      SU3::generator(e, te);
      tr = trace(dJdX[e] * nMpInv);
      pokeColour(dJdXe_nMpInv,tr,e);
    }
@@ -660,25 +493,20 @@ public:
    //////////////////////////////////////////////////////////////////
    // Assemble the N matrix
    //////////////////////////////////////////////////////////////////
-    double rho=this->StoutSmearing->SmearRho[1];
+    // Computes ALL the staples -- could compute one only here
-    BaseSmear(Cmu, U,mu,rho);
+    this->StoutSmearing->BaseSmear(C, U);
-
+    Cmu = peekLorentz(C, mu);
    Umu = peekLorentz(U, mu);
    Complex ci(0,1);
    for(int b=0;b<Ngen;b++) {
      SU3::generator(b, Tb);
      // Qlat Tb = 2i Tb^Grid
      Nb = (2.0)*Ta( ci*Tb * Umu * adj(Cmu));
      // FIXME -- replace this with LieAlgebraProject
 #if 0
      SU3::LieAlgebraProject(Ncb,tmp,b);
 #else
      for(int c=0;c<Ngen;c++) {
 	SU3::generator(c, Tc);
 	auto tmp = -trace(ci*Tc*Nb); // Luchang's norm: (2Tc) (2Td) N^db = -2 delta cd N^db // - was important
 	PokeIndex<ColourIndex>(Ncb,tmp,c,b); 
      }
 #endif
    }      
    //////////////////////////////////////////////////////////////////
@@ -865,7 +693,7 @@ private:
 					  const GaugeField& GaugeK,int level) 
  {
    GridBase* grid = GaugeK.Grid();
-    GaugeField SigmaK(grid), iLambda(grid);
+    GaugeField C(grid), SigmaK(grid), iLambda(grid);
    GaugeField SigmaKPrimeA(grid);
    GaugeField SigmaKPrimeB(grid);
    GaugeLinkField iLambda_mu(grid);
@@ -873,11 +701,7 @@ private:
    GaugeLinkField SigmaKPrime_mu(grid);
    GaugeLinkField GaugeKmu(grid), Cmu(grid);
-    int mmu= (level/2) %Nd;
+    this->StoutSmearing->BaseSmear(C, GaugeK);
    int cb= (level%2);
    double rho=this->StoutSmearing->SmearRho[1];
    // Can override this to do one direction only.
    SigmaK = Zero();
    iLambda = Zero();
@@ -888,38 +712,18 @@ private:
    // Could get away with computing only one polarisation here
    // int mu= (smr/2) %Nd;
    // SigmaKprime_A has only one component
-#if 0
+    for (int mu = 0; mu < Nd; mu++)
    BaseSmear(Cmu, GaugeK,mu,rho);
    GaugeKmu = peekLorentz(GaugeK, mu);
    SigmaKPrime_mu = peekLorentz(SigmaKPrimeA, mu);
    iQ = Ta(Cmu * adj(GaugeKmu));
    this->set_iLambda(iLambda_mu, e_iQ, iQ, SigmaKPrime_mu, GaugeKmu);
    pokeLorentz(SigmaK, SigmaKPrime_mu * e_iQ + adj(Cmu) * iLambda_mu, mu);
    pokeLorentz(iLambda, iLambda_mu, mu);
    BaseSmearDerivative(SigmaK, iLambda,GaugeK,mu,rho);  // derivative of SmearBase
 #else
    //    GaugeField C(grid);
    //    this->StoutSmearing->BaseSmear(C, GaugeK);
    //    for (int mu = 0; mu < Nd; mu++)
    int mu =mmu;
    BaseSmear(Cmu, GaugeK,mu,rho);
    {
-      // Cmu = peekLorentz(C, mu);
+      Cmu = peekLorentz(C, mu);
      GaugeKmu = peekLorentz(GaugeK, mu);
      SigmaKPrime_mu = peekLorentz(SigmaKPrimeA, mu);
      iQ = Ta(Cmu * adj(GaugeKmu));
      this->set_iLambda(iLambda_mu, e_iQ, iQ, SigmaKPrime_mu, GaugeKmu);
      pokeLorentz(SigmaK, SigmaKPrime_mu * e_iQ + adj(Cmu) * iLambda_mu, mu);
      pokeLorentz(iLambda, iLambda_mu, mu);
      std::cout << " mu "<<mu<<" SigmaKPrime_mu"<<norm2(SigmaKPrime_mu)<< " iLambda_mu " <<norm2(iLambda_mu)<<std::endl;
    }
-    //    GaugeField SigmaKcopy(grid);
+    this->StoutSmearing->derivative(SigmaK, iLambda,GaugeK);  // derivative of SmearBase
-    //    SigmaKcopy = SigmaK;
+
    BaseSmearDerivative(SigmaK, iLambda,GaugeK,mu,rho);  // derivative of SmearBase
    //    this->StoutSmearing->derivative(SigmaK, iLambda,GaugeK);  // derivative of SmearBase
    //    SigmaKcopy = SigmaKcopy - SigmaK;
    //    std::cout << " BaseSmearDerivative fast path error" <<norm2(SigmaKcopy)<<std::endl;
 #endif
    ////////////////////////////////////////////////////////////////////////////////////
    // propagate the rest of the force as identity map, just add back
    ////////////////////////////////////////////////////////////////////////////////////
--- a/Grid/qcd/smearing/HISQSmearing.h
+++ b/Grid/qcd/smearing/HISQSmearing.h
@@ -1,389 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/smearing/HISQSmearing.h
 Copyright (C) 2023
 Author: D. A. Clarke <clarke.davida@gmail.com> 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*
    @file HISQSmearing.h
    @brief Declares classes related to HISQ smearing 
 */
 #pragma once
 #include <Grid/Grid.h>
 #include <Grid/lattice/PaddedCell.h>
 #include <Grid/stencil/GeneralLocalStencil.h>
 NAMESPACE_BEGIN(Grid);
 // TODO: find a way to fold this into the stencil header. need to access grid to get
 // Nd, since you don't want to inherit from QCD.h
 /*!  @brief append arbitrary shift path to shifts */
 template<typename... Args>
 void appendShift(std::vector<Coordinate>& shifts, int dir, Args... args) {
    Coordinate shift(Nd,0);
    generalShift(shift, dir, args...); 
    // push_back creates an element at the end of shifts and
    // assigns the data in the argument to it.
    shifts.push_back(shift);
 }
 /*!  @brief offset of the first stencil entry belonging to the (mu, nu) pair
  *
  *   Entries are laid out with Nshifts consecutive shifts per (mu, nu) pair,
  *   nu running fastest; Nshifts must match the number of shifts appended per
  *   pair when the stencil was built.
  */
 accelerator_inline int stencilIndex(int mu, int nu) {
    const int Nshifts = 6;
    const int pair    = nu + Nd*mu;
    return Nshifts*pair;
 }
 /*!  @brief structure holding the link treatment
  *
  *   One coefficient per path type. Every coefficient has a zero default member
  *   initialiser, so a default-constructed object holds well-defined values
  *   rather than indeterminate ones.
  */
 struct SmearingParameters{
    Real c_1    = 0.;       // 1 link
    Real c_naik = 0.;       // Naik term
    Real c_3    = 0.;       // 3 link
    Real c_5    = 0.;       // 5 link
    Real c_7    = 0.;       // 7 link
    Real c_lp   = 0.;       // 5 link Lepage
    /*!  @brief all coefficients zero */
    SmearingParameters() = default;
    /*!  @brief set every coefficient explicitly (same argument order as the members) */
    SmearingParameters(Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp) 
        : c_1(c1),
          c_naik(cnaik),
          c_3(c3),
          c_5(c5),
          c_7(c7),
          c_lp(clp){}
 };
 /*!  @brief create fat links from link variables
  *
  *   smear() combines 1-, 3-, 5- and 7-link paths plus the Lepage term into
  *   u_smr and builds the Naik (three-link straight) term in u_naik, weighted
  *   by the coefficients stored in SmearingParameters. projectU3() multiplies
  *   each link V by (V^dag V)^{-1/2} as assembled from f0, f1, f2 below.
  */
 template<class Gimpl> 
 class Smear_HISQ : public Gimpl {
 private:
    GridCartesian* const _grid;          // grid on which the unpadded fields live
    SmearingParameters _linkTreatment;   // path coefficients
 public:
    INHERIT_GIMPL_TYPES(Gimpl);
    typedef typename Gimpl::GaugeField     GF;
    typedef typename Gimpl::GaugeLinkField LF;
    typedef typename Gimpl::ComplexField   CF;
    // Don't allow default values here.
    // Coefficient order: c1, cnaik, c3, c5, c7, clp.
    Smear_HISQ(GridCartesian* grid, Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp) 
        : _grid(grid), 
          _linkTreatment(c1,cnaik,c3,c5,c7,clp) {
        assert(Nc == 3 && "HISQ smearing currently implemented only for Nc==3");
        assert(Nd == 4 && "HISQ smearing only defined for Nd==4");
    }
    // Allow to pass a pointer to a C-style, double array for MILC convenience
    // coeff[0..5] are read in the order c1, cnaik, c3, c5, c7, clp; the array
    // must therefore hold at least six entries.
    Smear_HISQ(GridCartesian* grid, double* coeff) 
        : _grid(grid), 
          _linkTreatment(coeff[0],coeff[1],coeff[2],coeff[3],coeff[4],coeff[5]) {
        assert(Nc == 3 && "HISQ smearing currently implemented only for Nc==3");
        assert(Nd == 4 && "HISQ smearing only defined for Nd==4");
    }
    ~Smear_HISQ() {}
    // Intent: OUT--u_smr, u_naik
    //          IN--u_thin
    // Steps: (1) pad u_thin with one layer of ghost sites, (2) accumulate the
    // c_3, c_5 and c_7 weighted constructs on the padded grid with a local
    // stencil, (3) extract to the original grid and add c_1*u_thin, (4) add
    // the Lepage term and build the Naik term with covariant shifts.
    void smear(GF& u_smr, GF& u_naik, GF& u_thin) const {
        SmearingParameters lt = this->_linkTreatment;
        auto grid = this->_grid;
        // Create a padded cell of extra padding depth=1 and fill the padding.
        int depth = 1;
        PaddedCell Ghost(depth,grid);
        GF Ughost = Ghost.Exchange(u_thin);
        // This is where auxiliary N-link fields and the final smear will be stored. 
        GF Ughost_fat(Ughost.Grid());
        GF Ughost_3link(Ughost.Grid());
        GF Ughost_5linkA(Ughost.Grid());
        GF Ughost_5linkB(Ughost.Grid());
        // mu-nu plane stencil. We allow mu==nu to make indexing the stencil easier,
        // but these entries will not be used. 
        // Six shifts are appended per (mu,nu) pair, in the order
        //   +mu, +nu, none, +mu-nu, -nu, -mu
        // which is what stencilIndex(mu,nu)+0..5 addresses in the kernels below.
        std::vector<Coordinate> shifts;
        for(int mu=0;mu<Nd;mu++)
        for(int nu=0;nu<Nd;nu++) {
            appendShift(shifts,mu);
            appendShift(shifts,nu);
            appendShift(shifts,shiftSignal::NO_SHIFT);
            appendShift(shifts,mu,Back(nu));
            appendShift(shifts,Back(nu));
            appendShift(shifts,Back(mu));
        }
        // A GeneralLocalStencil has two indices: a site and stencil index 
        GeneralLocalStencil gStencil(Ughost.Grid(),shifts);
        // This is where contributions from the smearing get added together
        Ughost_fat=Zero();
        // This loop handles 3-, 5-, and 7-link constructs, minus Lepage and Naik.
        for(int mu=0;mu<Nd;mu++) {
            // TODO: This approach is slightly memory inefficient. It uses 25% extra memory 
            // The auxiliary fields are reset for every mu.
            Ughost_3link =Zero();
            Ughost_5linkA=Zero();
            Ughost_5linkB=Zero();
            // Create the accessors
            autoView(U_v       , Ughost       , AcceleratorRead);
            autoView(U_fat_v   , Ughost_fat   , AcceleratorWrite);
            autoView(U_3link_v , Ughost_3link , AcceleratorWrite);
            autoView(U_5linkA_v, Ughost_5linkA, AcceleratorWrite);
            autoView(U_5linkB_v, Ughost_5linkB, AcceleratorWrite);
            // We infer some types that will be needed in the calculation.
            typedef decltype(gStencil.GetEntry(0,0)) stencilElement;
            typedef decltype(coalescedReadGeneralPermute(U_v[0](0),gStencil.GetEntry(0,0)->_permute,Nd)) U3matrix;
            int Nsites = U_v.size();
            auto gStencil_v = gStencil.View(); 
            accelerator_for(site,Nsites,Simd::Nsimd(),{ // ----------- 3-link constructs
                stencilElement SE0, SE1, SE2, SE3, SE4, SE5;
                U3matrix U0, U1, U2, U3, U4, U5, W;
                for(int nu=0;nu<Nd;nu++) {
                    if(nu==mu) continue;
                    int s = stencilIndex(mu,nu);
                    // The stencil gives us support points in the mu-nu plane that we will use to
                    // grab the links we need.
                    // (SE5 / x_m_mu is fetched here but not referenced again in this kernel.)
                    SE0 = gStencil_v.GetEntry(s+0,site); int x_p_mu      = SE0->_offset;
                    SE1 = gStencil_v.GetEntry(s+1,site); int x_p_nu      = SE1->_offset;
                    SE2 = gStencil_v.GetEntry(s+2,site); int x           = SE2->_offset;
                    SE3 = gStencil_v.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset;
                    SE4 = gStencil_v.GetEntry(s+4,site); int x_m_nu      = SE4->_offset;
                    SE5 = gStencil_v.GetEntry(s+5,site); int x_m_mu      = SE5->_offset;
                    // When you're deciding whether to take an adjoint, the question is: how is the
                    // stored link oriented compared to the one you want? If I imagine myself travelling
                    // with the to-be-updated link, I have two possible, alternative 3-link paths I can
                    // take, one starting by going to the left, the other starting by going to the right.
                    // U4 and U5 are both read at x_m_nu and therefore both use SE4's permute.
                    U0 = coalescedReadGeneralPermute(U_v[x_p_mu     ](nu),SE0->_permute,Nd);
                    U1 = coalescedReadGeneralPermute(U_v[x_p_nu     ](mu),SE1->_permute,Nd);
                    U2 = coalescedReadGeneralPermute(U_v[x          ](nu),SE2->_permute,Nd);
                    U3 = coalescedReadGeneralPermute(U_v[x_p_mu_m_nu](nu),SE3->_permute,Nd);
                    U4 = coalescedReadGeneralPermute(U_v[x_m_nu     ](mu),SE4->_permute,Nd);
                    U5 = coalescedReadGeneralPermute(U_v[x_m_nu     ](nu),SE4->_permute,Nd);
                    //  "left"          "right"
                    W = U2*U1*adj(U0) + adj(U5)*U4*U3;
                    // Save 3-link construct for later and add to smeared field.
                    // The construct is stored under Lorentz index nu, while the fat
                    // link being accumulated is the mu component.
                    coalescedWrite(U_3link_v[x](nu), W);
                    // The index operator (x) returns the coalesced read on GPU. The view [] index returns 
                    // a reference to the vector object. The [x](mu) returns a reference to the densely 
                    // packed (contiguous in memory) mu-th element of the vector object. On CPU, 
                    // coalescedRead/Write is the identity mapping assigning vector object to vector object.
                    // But on GPU it's non-trivial and maps scalar object to vector object and vice versa.
                    coalescedWrite(U_fat_v[x](mu), U_fat_v(x)(mu) + lt.c_3*W);
                }
            })
            accelerator_for(site,Nsites,Simd::Nsimd(),{ // ----------- 5-link 
                stencilElement SE0, SE1, SE2, SE3, SE4, SE5;
                U3matrix U0, U1, U2, U3, U4, U5, W;
                // sigmaIndex counts the (nu,rho) combinations visited for this mu.
                int sigmaIndex = 0;
                for(int nu=0;nu<Nd;nu++) {
                    if(nu==mu) continue;
                    int s = stencilIndex(mu,nu);
                    for(int rho=0;rho<Nd;rho++) {
                        if (rho == mu || rho == nu) continue;
                        SE0 = gStencil_v.GetEntry(s+0,site); int x_p_mu      = SE0->_offset;
                        SE1 = gStencil_v.GetEntry(s+1,site); int x_p_nu      = SE1->_offset;
                        SE2 = gStencil_v.GetEntry(s+2,site); int x           = SE2->_offset;
                        SE3 = gStencil_v.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset;
                        SE4 = gStencil_v.GetEntry(s+4,site); int x_m_nu      = SE4->_offset;
                        // Same staple shape as the 3-link kernel, but the mu-direction legs
                        // (U1, U4) are taken from the stored 3-link field, component rho.
                        U0 = coalescedReadGeneralPermute(      U_v[x_p_mu     ](nu ),SE0->_permute,Nd);
                        U1 = coalescedReadGeneralPermute(U_3link_v[x_p_nu     ](rho),SE1->_permute,Nd);
                        U2 = coalescedReadGeneralPermute(      U_v[x          ](nu ),SE2->_permute,Nd);
                        U3 = coalescedReadGeneralPermute(      U_v[x_p_mu_m_nu](nu ),SE3->_permute,Nd);
                        U4 = coalescedReadGeneralPermute(U_3link_v[x_m_nu     ](rho),SE4->_permute,Nd);
                        U5 = coalescedReadGeneralPermute(      U_v[x_m_nu     ](nu ),SE4->_permute,Nd);
                        W  = U2*U1*adj(U0) + adj(U5)*U4*U3;
                        // The first three combinations are stored in buffer A, later ones in buffer B.
                        if(sigmaIndex<3) {
                            coalescedWrite(U_5linkA_v[x](rho), W);
                        } else {
                            coalescedWrite(U_5linkB_v[x](rho), W);
                        }    
                        coalescedWrite(U_fat_v[x](mu), U_fat_v(x)(mu) + lt.c_5*W);
                        sigmaIndex++;
                    }
                }
            })
            accelerator_for(site,Nsites,Simd::Nsimd(),{ // ----------- 7-link
                stencilElement SE0, SE1, SE2, SE3, SE4, SE5;
                U3matrix U0, U1, U2, U3, U4, U5, W;
                int sigmaIndex = 0;
                for(int nu=0;nu<Nd;nu++) {
                    if(nu==mu) continue;
                    int s = stencilIndex(mu,nu);
                    for(int rho=0;rho<Nd;rho++) {
                        if (rho == mu || rho == nu) continue;
                        SE0 = gStencil_v.GetEntry(s+0,site); int x_p_mu      = SE0->_offset;
                        SE1 = gStencil_v.GetEntry(s+1,site); int x_p_nu      = SE1->_offset;
                        SE2 = gStencil_v.GetEntry(s+2,site); int x           = SE2->_offset;
                        SE3 = gStencil_v.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset;
                        SE4 = gStencil_v.GetEntry(s+4,site); int x_m_nu      = SE4->_offset;
                        U0 = coalescedReadGeneralPermute(U_v[x_p_mu](nu),SE0->_permute,Nd);
                        // Buffer selection is the opposite of the 5-link kernel: for
                        // sigmaIndex<3 the mu-direction legs come from buffer B, else from A.
                        if(sigmaIndex<3) {
                            U1 = coalescedReadGeneralPermute(U_5linkB_v[x_p_nu](rho),SE1->_permute,Nd);
                        } else {
                            U1 = coalescedReadGeneralPermute(U_5linkA_v[x_p_nu](rho),SE1->_permute,Nd);
                        }  
                        U2 = coalescedReadGeneralPermute(U_v[x](nu),SE2->_permute,Nd);
                        U3 = coalescedReadGeneralPermute(U_v[x_p_mu_m_nu](nu),SE3->_permute,Nd);
                        if(sigmaIndex<3) {
                            U4 = coalescedReadGeneralPermute(U_5linkB_v[x_m_nu](rho),SE4->_permute,Nd);
                        } else {
                            U4 = coalescedReadGeneralPermute(U_5linkA_v[x_m_nu](rho),SE4->_permute,Nd);
                        }  
                        U5 = coalescedReadGeneralPermute(U_v[x_m_nu](nu),SE4->_permute,Nd);
                        W  = U2*U1*adj(U0) + adj(U5)*U4*U3;
                        coalescedWrite(U_fat_v[x](mu), U_fat_v(x)(mu) + lt.c_7*W);
                        sigmaIndex++;
                    }
                }
            })
        } // end mu loop
        // c1, c3, c5, c7 construct contributions
        u_smr = Ghost.Extract(Ughost_fat) + lt.c_1*u_thin;
        // Load up U and V std::vectors to access thin and smeared links.
        std::vector<LF> U(Nd, grid);
        std::vector<LF> V(Nd, grid);
        std::vector<LF> Vnaik(Nd, grid);
        for (int mu = 0; mu < Nd; mu++) {
            U[mu] = PeekIndex<LorentzIndex>(u_thin, mu);
            V[mu] = PeekIndex<LorentzIndex>(u_smr, mu);
        }
        for(int mu=0;mu<Nd;mu++) {
            // Naik
            // Three thin links chained in the mu direction, scaled by c_naik.
            Vnaik[mu] = lt.c_naik*Gimpl::CovShiftForward(U[mu],mu,
                                    Gimpl::CovShiftForward(U[mu],mu,
                                      Gimpl::CovShiftIdentityForward(U[mu],mu)));
            // LePage
            // nu runs over the Nd-1 directions different from mu.
            for (int nu_h=1;nu_h<Nd;nu_h++) {
                int nu=(mu+nu_h)%Nd;
                                // nu, nu, mu, Back(nu), Back(nu)
                V[mu] = V[mu] + lt.c_lp*Gimpl::CovShiftForward(U[nu],nu,
                                          Gimpl::CovShiftForward(U[nu],nu,
                                            Gimpl::CovShiftForward(U[mu],mu,
                                              Gimpl::CovShiftBackward(U[nu],nu,
                                                Gimpl::CovShiftIdentityBackward(U[nu],nu)))))
                                // Back(nu), Back(nu), mu, nu, nu
                              + lt.c_lp*Gimpl::CovShiftBackward(U[nu],nu,
                                          Gimpl::CovShiftBackward(U[nu],nu,
                                            Gimpl::CovShiftForward(U[mu],mu,
                                              Gimpl::CovShiftForward(U[nu],nu,
                                                Gimpl::CovShiftIdentityForward(U[nu],nu)))));
            }
        }
        // Put V back into u_smr.
        for (int mu = 0; mu < Nd; mu++) {
            PokeIndex<LorentzIndex>(u_smr , V[mu]    , mu);
            PokeIndex<LorentzIndex>(u_naik, Vnaik[mu], mu);
        }
    };
    // Intent: OUT--u_proj
    //          IN--u_mu
    // For every direction: Q = V^dag V, g0..g2 are obtained from traces of
    // powers of Q, and u_proj_mu = V * (f0 + f1*Q + f2*Q^2).
    void projectU3(GF& u_proj, GF& u_mu) const {
        auto grid = this->_grid;
        // NOTE(review): `diff` is declared here but not referenced in this function.
        LF V(grid), Q(grid), sqrtQinv(grid), id_3(grid), diff(grid);
        CF c0(grid), c1(grid), c2(grid), g0(grid), g1(grid), g2(grid), S(grid), R(grid), theta(grid), 
           u(grid), v(grid), w(grid), den(grid), f0(grid), f1(grid), f2(grid);
        // Follow MILC 10.1103/PhysRevD.82.074501, eqs (B2-B3) and (C1-C8)
        for (int mu = 0; mu < Nd; mu++) {
            V  = PeekIndex<LorentzIndex>(u_mu, mu);
            Q  = adj(V)*V;
            c0 =        real(trace(Q));
            c1 = (1/2.)*real(trace(Q*Q));
            c2 = (1/3.)*real(trace(Q*Q*Q));
            S  = (1/3.)*c1-(1/18.)*c0*c0;
            // NOTE(review): S is a field (CF) while this is an ordinary `if`, so the
            // branch looks like it is selected once for the whole field from a
            // reduction over S rather than site by site -- confirm this is intended.
            if (norm2(S)<1e-28) {
                g0 = (1/3.)*c0; g1 = g0; g2 = g1;
            } else {
                R     = (1/2.)*c2-(1/3. )*c0*c1+(1/27.)*c0*c0*c0;
                theta = acos(R*pow(S,-1.5));
                g0    = (1/3.)*c0+2.*sqrt(S)*cos((1/3.)*theta-2*M_PI/3.);
                g1    = (1/3.)*c0+2.*sqrt(S)*cos((1/3.)*theta          );
                g2    = (1/3.)*c0+2.*sqrt(S)*cos((1/3.)*theta+2*M_PI/3.);
            }
 //            if (fabs(Q.determinant()/(g0*g1*g2)-1.0) > 1e-5) { SVD }
            // u, v, w are the symmetric combinations of sqrt(g_i) that enter f0..f2.
            u     = sqrt(g0) + sqrt(g1) + sqrt(g2);
            v     = sqrt(g0*g1) + sqrt(g0*g2) + sqrt(g1*g2);
            w     = sqrt(g0*g1*g2);
            den   = w*(u*v-w);
            f0    = (-w*(u*u+v)+u*v*v)/den;
            f1    = (-w-u*u*u+2.*u*v)/den;
            f2    = u/den;
            id_3  = 1.;
            sqrtQinv = f0*id_3 + f1*Q + f2*Q*Q;
            PokeIndex<LorentzIndex>(u_proj, V*sqrtQinv, mu);
        }
    };
 //    void derivative(const GaugeField& Gauge) const {
 //    };
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/smearing/Smearing.h
+++ b/Grid/qcd/smearing/Smearing.h
@@ -5,5 +5,4 @@
 #include <Grid/qcd/smearing/StoutSmearing.h>
 #include <Grid/qcd/smearing/GaugeConfiguration.h>
 #include <Grid/qcd/smearing/WilsonFlow.h>
 #include <Grid/qcd/smearing/HISQSmearing.h>
--- a/Grid/qcd/smearing/StoutSmearing.h
+++ b/Grid/qcd/smearing/StoutSmearing.h
@@ -69,7 +69,7 @@ public:
  /*! Construct stout smearing object from explicitly specified rho matrix */
  Smear_Stout(const std::vector<double>& rho_)
    : OwnedBase{new Smear_APE<Gimpl>(rho_)}, SmearBase{OwnedBase.get()} {
-    std::cout << GridLogDebug << "Stout smearing constructor : Smear_Stout(const std::vector<double>& " << rho_ << " )" << std::endl;
+    std::cout << GridLogDebug << "Stout smearing constructor : Smear_Stout(const std::vector<double>& " << rho_ << " )" << std::endl
    assert(Nc == 3 && "Stout smearing currently implemented only for Nc==3");
    }
--- a/Grid/qcd/utils/CovariantLaplacian.h
+++ b/Grid/qcd/utils/CovariantLaplacian.h
@@ -54,7 +54,361 @@ struct LaplacianParams : Serializable {
      precision(precision){};
 };
 // LEG_LOAD(Dir): load the neighbour value for stencil leg Dir at site ss into chi.
 // The stencil entry is looked up with st.GetEntry (which also sets ptype); a
 // local entry is read from in[] with its permute flag, a non-local one from
 // the buffer buf[]. The expansion ends with acceleratorSynchronise().
 // The names SE, st, ptype, ss, in, buf, chi and lane must be in scope where
 // the macro is expanded.
 #define LEG_LOAD(Dir)						 \
  SE = st.GetEntry(ptype, Dir, ss);				 \
  if (SE->_is_local ) {						 \
    int perm= SE->_permute;					 \
    chi = coalescedReadPermute(in[SE->_offset],ptype,perm,lane); \
  } else {							 \
    chi = coalescedRead(buf[SE->_offset],lane);			 \
  }								 \
  acceleratorSynchronise();
 // Eight stencil legs: entries 0-3 are X,Y,Z,T with displacement +1,
 // entries 4-7 are X,Y,Z,T with displacement -1.
 const std::vector<int> directions4D{Xdir, Ydir, Zdir, Tdir,
                                     Xdir, Ydir, Zdir, Tdir};
 const std::vector<int> displacements4D{ 1,  1,  1,  1,
                                        -1, -1, -1, -1};
 template<class Gimpl,class Field> class CovariantAdjointLaplacianStencil : public SparseMatrixBase<Field>
 {
 public:
  INHERIT_GIMPL_TYPES(Gimpl);
 //  RealD kappa;
  typedef typename Field::vector_object siteObject;
  template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Nc> >, Nds>;
  typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
  typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
  typedef CartesianStencil<siteObject, siteObject, DefaultImplParams> StencilImpl;
  GridBase *grid;
  StencilImpl Stencil;
  SimpleCompressor<siteObject> Compressor;
  DoubledGaugeField Uds;
  // Set up the 8-leg stencil and allocate the doubled gauge field on _grid.
  // No gauge links are imported here; GaugeImport() has to be called separately.
  CovariantAdjointLaplacianStencil( GridBase *_grid)
    : grid   (_grid),
      Stencil(_grid, 8, Even, directions4D, displacements4D),
      Uds    (_grid)
  {}
  // Build the operator on the grid of Umu and import its links straight away.
  CovariantAdjointLaplacianStencil(GaugeField &Umu)
    : CovariantAdjointLaplacianStencil(Umu.Grid())
  {
    GaugeImport(Umu);
  }
  // Fill the doubled gauge field Uds from Umu (which must live on this
  // operator's grid): slot mu holds the link U_mu itself, slot mu+4 holds
  // adj(Cshift(U_mu, mu, -1)).
  void GaugeImport (const GaugeField &Umu)
  {
    assert(grid == Umu.Grid());
    for (int mu = 0; mu < Nd; ++mu) {
      // forward leg
      auto link = PeekIndex<LorentzIndex>(Umu, mu);
      PokeIndex<LorentzIndex>(Uds, link, mu);
      // backward leg: shifted link, daggered
      link = adj(Cshift(link, mu, -1));
      PokeIndex<LorentzIndex>(Uds, link, mu + 4);
    }
  }
  // Grid this operator was constructed on.
  virtual GridBase *Grid(void) {
    return this->grid;
  }
 //broken
 #if 0
  // Disabled (enclosed in #if 0 and marked broken) derivative kernel.
  // As written, it accumulates, for the four forward legs only,
  //   res += phi * (adjoint-sandwiched neighbour value)
  // on top of the current content of the output field.
  // NOTE(review): several names used below are not declared in this function
  // and would have to be fixed before the block could be enabled:
  //   - `_lef`  (the parameter is `_left`),
  //   - `left`  (the view of `_left` is called `in`),
  //   - `def`   (the output view is called `der`).
  // The `mu` parameter is not used in the body.
  virtual void  MDeriv(const Field &_left, Field &_right,Field &_der, int mu)
  {
    ///////////////////////////////////////////////
    // Halo exchange for this geometry of stencil
    ///////////////////////////////////////////////
    Stencil.HaloExchange(_lef, Compressor);
    ///////////////////////////////////
    // Arithmetic expressions
    ///////////////////////////////////
    autoView( st     , Stencil    , AcceleratorRead);
    auto buf = st.CommBuf();
    autoView( in     , _left    , AcceleratorRead);
    autoView( right    , _right   , AcceleratorRead);
    autoView( der    , _der   , AcceleratorWrite);
    autoView( U     , Uds    , AcceleratorRead);
    typedef typename Field::vector_object        vobj;
    typedef decltype(coalescedRead(left[0]))    calcObj;
    typedef decltype(coalescedRead(U[0](0))) calcLink;
    const int      Nsimd = vobj::Nsimd();
    const uint64_t NN = grid->oSites();
    accelerator_for( ss, NN, Nsimd, {
 	StencilEntry *SE;
 	const int lane=acceleratorSIMTlane(Nsimd);
 	calcObj chi;
 	calcObj phi;
 	calcObj res;
 	calcObj Uchi;
 	calcObj Utmp;
 	calcObj Utmp2;
 	calcLink UU;
 	calcLink Udag;
 	int ptype;
 	// NOTE(review): the output view was opened AcceleratorWrite but is read here.
 	res                 = coalescedRead(def[ss]);
 	phi                 = coalescedRead(right[ss]);
 	// Per leg: load the neighbour into chi, apply the link on both sides via two
 	// mult/adj steps, multiply by phi from the left and accumulate into res.
 	// Udag is assigned but not used afterwards.
 #define LEG_LOAD_MULT_LINK(leg,polarisation)			\
 	UU = coalescedRead(U[ss](polarisation));	\
 	Udag = adj(UU);					\
 	LEG_LOAD(leg);					\
 	mult(&Utmp(), &UU, &chi());			\
 	Utmp2 = adj(Utmp);				\
 	mult(&Utmp(), &UU, &Utmp2());			\
 	Utmp2 = adj(Utmp);				\
 	mult(&Uchi(), &phi(), &Utmp2());			\
 	res = res + Uchi;
 	LEG_LOAD_MULT_LINK(0,Xp);
 	LEG_LOAD_MULT_LINK(1,Yp);
 	LEG_LOAD_MULT_LINK(2,Zp);
 	LEG_LOAD_MULT_LINK(3,Tp);
 	coalescedWrite(der[ss], res,lane);
    });
  };
 #endif
  // Apply the operator with a blocking halo exchange:
  //   out(x) = -8*in(x) + sum over the 8 legs of the link-sandwiched neighbour,
  // where each leg contributes adj( U * adj( U * chi ) ) with chi the neighbour
  // value loaded by LEG_LOAD and U the matching entry of Uds
  // (assuming mult(r,a,b) stores the product a*b in r -- TODO confirm).
  virtual void  Morig(const Field &_in, Field &_out)
  {
    ///////////////////////////////////////////////
    // Halo exchange for this geometry of stencil
    ///////////////////////////////////////////////
    Stencil.HaloExchange(_in, Compressor);
    ///////////////////////////////////
    // Arithmetic expressions
    ///////////////////////////////////
 //    auto st = Stencil.View(AcceleratorRead);
    autoView( st     , Stencil    , AcceleratorRead);
    auto buf = st.CommBuf();
    autoView( in     , _in    , AcceleratorRead);
    autoView( out    , _out   , AcceleratorWrite);
    autoView( U     , Uds    , AcceleratorRead);
    typedef typename Field::vector_object        vobj;
    typedef decltype(coalescedRead(in[0]))    calcObj;
    typedef decltype(coalescedRead(U[0](0))) calcLink;
    const int      Nsimd = vobj::Nsimd();
    const uint64_t NN = grid->oSites();
    accelerator_for( ss, NN, Nsimd, {
 	StencilEntry *SE;
 	const int lane=acceleratorSIMTlane(Nsimd);
 	calcObj chi;
 	calcObj res;
 	calcObj Uchi;
 	calcObj Utmp;
 	calcObj Utmp2;
 	calcLink UU;
 	calcLink Udag;
 	int ptype;
 	// Diagonal term: -8 times the local value.
 	res                 = coalescedRead(in[ss])*(-8.0);
 	// LEG_LOAD_MULT adds one leg to res. Udag is assigned but not used afterwards.
 	// The macro is not #undef'ed in this function, so it stays defined for the
 	// code that follows in the file.
 #define LEG_LOAD_MULT(leg,polarisation)			\
 	UU = coalescedRead(U[ss](polarisation));	\
 	Udag = adj(UU);					\
 	LEG_LOAD(leg);					\
 	mult(&Utmp(), &UU, &chi());			\
 	Utmp2 = adj(Utmp);				\
 	mult(&Utmp(), &UU, &Utmp2());			\
 	Uchi = adj(Utmp);				\
 	res = res + Uchi;
 	// Legs 0-3 are the forward neighbours, legs 4-7 the backward ones.
 	LEG_LOAD_MULT(0,Xp);
 	LEG_LOAD_MULT(1,Yp);
 	LEG_LOAD_MULT(2,Zp);
 	LEG_LOAD_MULT(3,Tp);
 	LEG_LOAD_MULT(4,Xm);
 	LEG_LOAD_MULT(5,Ym);
 	LEG_LOAD_MULT(6,Zm);
 	LEG_LOAD_MULT(7,Tm);
 	coalescedWrite(out[ss], res,lane);
    });
  };
  // Application of the stencil Laplacian, _out = Lap(_in), with the halo
  // exchange split into phases around the arithmetic:
  //   1. Prepare / HaloGather / CommunicateBegin / CommsMergeSHM
  //   2. kernel A: res = -8*in(x) + contributions of every leg whose stencil
  //      entry has _is_local set; the partial result is written to _out
  //   3. CommunicateComplete / CommsMerge
  //   4. kernel B: re-reads the partial result from _out and adds the legs
  //      whose stencil entry is NOT local.
  // Each leg is evaluated by LEG_LOAD_MULT, the macro #defined inside Morig;
  // it relies on the local names SE, ptype, lane, chi, res, Uchi, Utmp, Utmp2,
  // UU, Udag, st, buf, in, U and ss, so none of them may be renamed here.
  virtual void  Mnew (const Field &_in, Field &_out)
  {
    ///////////////////////////////////////////////
    // Halo exchange for this geometry of stencil (phased)
    ///////////////////////////////////////////////
    std::vector<std::vector<CommsRequest_t> > requests;
    Stencil.Prepare();
    {
      GRID_TRACE("Laplace Gather");
      Stencil.HaloGather(_in,Compressor);
    }
    tracePush("Laplace Communication");
    Stencil.CommunicateBegin(requests);
    {
      GRID_TRACE("MergeSHM");
      Stencil.CommsMergeSHM(Compressor);
    }
    ///////////////////////////////////
    // Arithmetic expressions
    ///////////////////////////////////
    autoView( st     , Stencil    , AcceleratorRead);
    auto buf = st.CommBuf();
    autoView( in     , _in    , AcceleratorRead);
    autoView( out    , _out   , AcceleratorWrite);
    autoView( U     , Uds    , AcceleratorRead);
    typedef typename Field::vector_object        vobj;
    typedef decltype(coalescedRead(in[0]))    calcObj;
    typedef decltype(coalescedRead(U[0](0))) calcLink;
    const int      Nsimd = vobj::Nsimd();
    const uint64_t NN = grid->oSites();
    ///////////////////////////////////
    // Kernel A: diagonal term plus the legs flagged _is_local
    ///////////////////////////////////
    accelerator_for( ss, NN, Nsimd, {
 	StencilEntry *SE;
 	const int lane=acceleratorSIMTlane(Nsimd);
 	calcObj chi;
 	calcObj res;
 	calcObj Uchi;
 	calcObj Utmp;
 	calcObj Utmp2;
 	calcLink UU;
 	calcLink Udag;
 	int ptype;
 	res = coalescedRead(in[ss])*(-8.0);
 	SE = st.GetEntry(ptype, 0, ss);
 	if (SE->_is_local ) {
 	  LEG_LOAD_MULT(0,Xp);
 	}
 	SE = st.GetEntry(ptype, 1, ss);
 	if (SE->_is_local ) {
 	  LEG_LOAD_MULT(1,Yp);
 	}
 	SE = st.GetEntry(ptype, 2, ss);
 	if (SE->_is_local ) {
 	  LEG_LOAD_MULT(2,Zp);
 	}
 	SE = st.GetEntry(ptype, 3, ss);
 	if (SE->_is_local ) {
 	  LEG_LOAD_MULT(3,Tp);
 	}
 	SE = st.GetEntry(ptype, 4, ss);
 	if (SE->_is_local ) {
 	  LEG_LOAD_MULT(4,Xm);
 	}
 	SE = st.GetEntry(ptype, 5, ss);
 	if (SE->_is_local ) {
 	  LEG_LOAD_MULT(5,Ym);
 	}
 	SE = st.GetEntry(ptype, 6, ss);
 	if (SE->_is_local ) {
 	  LEG_LOAD_MULT(6,Zm);
 	}
 	SE = st.GetEntry(ptype, 7, ss);
 	if (SE->_is_local ) {
 	  LEG_LOAD_MULT(7,Tm);
 	}
 	coalescedWrite(out[ss], res,lane);
    });
    Stencil.CommunicateComplete(requests);
    // Same label as the tracePush above so the traced range is balanced by name.
    tracePop("Laplace Communication");
    {
      GRID_TRACE("Merge");
      Stencil.CommsMerge(Compressor);
    }
    ///////////////////////////////////
    // Kernel B: add the legs that are not local, on top of kernel A's result
    ///////////////////////////////////
    accelerator_for( ss, NN, Nsimd, {
 	StencilEntry *SE;
 	const int lane=acceleratorSIMTlane(Nsimd);
 	calcObj chi;
 	calcObj res;
 	calcObj Uchi;
 	calcObj Utmp;
 	calcObj Utmp2;
 	calcLink UU;
 	calcLink Udag;
 	int ptype;
 	// Start from the partial sum written by kernel A (not from -8*in again).
 	res = coalescedRead(out[ss]);
 	SE = st.GetEntry(ptype, 0, ss);
 	if ((SE->_is_local )==0){
 	  LEG_LOAD_MULT(0,Xp);
 	}
 	SE = st.GetEntry(ptype, 1, ss);
 	if ((SE->_is_local )==0){
 	  LEG_LOAD_MULT(1,Yp);
 	}
 	SE = st.GetEntry(ptype, 2, ss);
 	if ((SE->_is_local )==0){
 	  LEG_LOAD_MULT(2,Zp);
 	}
 	SE = st.GetEntry(ptype, 3, ss);
 	if ((SE->_is_local )==0){
 	  LEG_LOAD_MULT(3,Tp);
 	}
 	SE = st.GetEntry(ptype, 4, ss);
 	if ((SE->_is_local )==0){
 	  LEG_LOAD_MULT(4,Xm);
 	}
 	SE = st.GetEntry(ptype, 5, ss);
 	if ((SE->_is_local )==0){
 	  LEG_LOAD_MULT(5,Ym);
 	}
 	SE = st.GetEntry(ptype, 6, ss);
 	if ((SE->_is_local )==0){
 	  LEG_LOAD_MULT(6,Zm);
 	}
 	SE = st.GetEntry(ptype, 7, ss);
 	if ((SE->_is_local )==0){
 	  LEG_LOAD_MULT(7,Tm);
 	}
 	coalescedWrite(out[ss], res,lane);
    });
  };
  // Operator entry point: forwards to the phased-communication kernel.
  virtual void  M(const Field &in, Field &out)
  {
    Mnew(in,out);
  }
  // Laplacian is hermitian, so the adjoint is the operator itself.
  virtual void  Mdag (const Field &in, Field &out)
  {
    M(in,out);
  }
  // The three hooks below are unimplemented; they are needed only for multigrid.
  virtual  void Mdiag    (const Field &in, Field &out)
  {
    assert(0);
  }
  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp)
  {
    assert(0);
  }
  virtual  void MdirAll  (const Field &in, std::vector<Field> &out)
  {
    assert(0);
  }
 };
 #undef LEG_LOAD_MULT
 #undef LEG_LOAD_MULT_LINK
 #undef LEG_LOAD
 ////////////////////////////////////////////////////////////
 // Laplacian operator L on adjoint fields
@@ -76,29 +430,40 @@ class LaplacianAdjointField: public Metric<typename Impl::Field> {
  LaplacianParams param;
  MultiShiftFunction PowerHalf;    
  MultiShiftFunction PowerInvHalf;    
 //template<class Gimpl,class Field> class CovariantAdjointLaplacianStencil : public SparseMatrixBase<Field>
  CovariantAdjointLaplacianStencil<Impl,typename Impl::LinkField> LapStencil;
 public:
  INHERIT_GIMPL_TYPES(Impl);
-  LaplacianAdjointField(GridBase* grid, OperatorFunction<GaugeField>& S, LaplacianParams& p, const RealD k = 1.0)
+  LaplacianAdjointField(GridBase* grid, OperatorFunction<GaugeField>& S, LaplacianParams& p, const RealD k = 1.0, bool if_remez=true)
-    : U(Nd, grid), Solver(S), param(p), kappa(k){
+    : U(Nd, grid), Solver(S), param(p), kappa(k)
 	,LapStencil(grid){
    AlgRemez remez(param.lo,param.hi,param.precision);
    std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/2)"<<std::endl;
    if(if_remez){
    remez.generateApprox(param.degree,1,2);
    PowerHalf.Init(remez,param.tolerance,false);
    PowerInvHalf.Init(remez,param.tolerance,true);
    }
    this->triv=0;
  };
-
+  LaplacianAdjointField(){this->triv=0; printf("triv=%d\n",this->Trivial());}
  void Mdir(const GaugeField&, GaugeField&, int, int){ assert(0);}
  void MdirAll(const GaugeField&, std::vector<GaugeField> &){ assert(0);}
  void Mdiag(const GaugeField&, GaugeField&){ assert(0);}
  void ImportGauge(const GaugeField& _U) {
    RealD total=0.;
    for (int mu = 0; mu < Nd; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(_U, mu);
      total += norm2(U[mu]);
    }
    LapStencil.GaugeImport (_U);
    std::cout << GridLogDebug <<"ImportGauge:norm2(U _U) = "<<total<<std::endl;
  }
  void M(const GaugeField& in, GaugeField& out) {
@@ -106,10 +471,12 @@ public:
    // test
    //GaugeField herm = in + adj(in);
    //std::cout << "AHermiticity: " << norm2(herm) << std::endl;
 //    std::cout << GridLogDebug <<"M:Kappa = "<<kappa<<std::endl;
    GaugeLinkField sum(in.Grid());
 #if 0
    GaugeLinkField tmp(in.Grid());
    GaugeLinkField tmp2(in.Grid());
    GaugeLinkField sum(in.Grid());
    for (int nu = 0; nu < Nd; nu++) {
      sum = Zero();
@@ -123,10 +490,22 @@ public:
      out_nu = (1.0 - kappa) * in_nu - kappa / (double(4 * Nd)) * sum;
      PokeIndex<LorentzIndex>(out, out_nu, nu);
    }
 #else
    for (int nu = 0; nu < Nd; nu++) {
      GaugeLinkField in_nu = PeekIndex<LorentzIndex>(in, nu);
      GaugeLinkField out_nu(out.Grid());
      LapStencil.M(in_nu,sum);
      out_nu = (1.0 - kappa) * in_nu - kappa / (double(4 * Nd)) * sum;
      PokeIndex<LorentzIndex>(out, out_nu, nu);
    }
 #endif
 //    std::cout << GridLogDebug <<"M:norm2(out) = "<<norm2(out)<<std::endl;
  }
  void MDeriv(const GaugeField& in, GaugeField& der) {
    // in is anti-hermitian
 //    std::cout << GridLogDebug <<"MDeriv:Kappa = "<<kappa<<std::endl;
    RealD factor = -kappa / (double(4 * Nd));
    for (int mu = 0; mu < Nd; mu++){
@@ -140,6 +519,7 @@ public:
      // adjoint in the last multiplication
      PokeIndex<LorentzIndex>(der,  -2.0 * factor * der_mu, mu);
    } 
    std::cout << GridLogDebug <<"MDeriv: Kappa= "<< kappa << " norm2(der) = "<<norm2(der)<<std::endl;
  }
  // separating this temporarily
@@ -159,11 +539,22 @@ public:
      }
      PokeIndex<LorentzIndex>(der, -factor * der_mu, mu);
    }
    std::cout << GridLogDebug <<"MDeriv: Kappa= "<< kappa << " norm2(der) = "<<norm2(der)<<std::endl;
  }
  void Minv(const GaugeField& in, GaugeField& inverted){
    HermitianLinearOperator<LaplacianAdjointField<Impl>,GaugeField> HermOp(*this);
    Solver(HermOp, in, inverted);
    std::cout << GridLogDebug <<"Minv:norm2(inverted) = "<<norm2(inverted)<<std::endl;
  }
  void MinvDeriv(const GaugeField& in, GaugeField& der) {
    GaugeField X(in.Grid());
    Minv(in,X);
    MDeriv(X,der);
    der *=-1.0;
    std::cout << GridLogDebug <<"MinvDeriv:norm2(der) = "<<norm2(der)<<std::endl;
  }
  void MSquareRoot(GaugeField& P){
@@ -172,6 +563,7 @@ public:
    ConjugateGradientMultiShift<GaugeField> msCG(param.MaxIter,PowerHalf);
    msCG(HermOp,P,Gp);
    P = Gp; 
    std::cout << GridLogDebug <<"MSquareRoot:norm2(P) = "<<norm2(P)<<std::endl;
  }
  void MInvSquareRoot(GaugeField& P){
@@ -180,6 +572,7 @@ public:
    ConjugateGradientMultiShift<GaugeField> msCG(param.MaxIter,PowerInvHalf);
    msCG(HermOp,P,Gp);
    P = Gp; 
    std::cout << GridLogDebug <<"MInvSquareRoot:norm2(P) = "<<norm2(P)<<std::endl;
  }
--- a/Grid/qcd/utils/CovariantLaplacianRat.h
+++ b/Grid/qcd/utils/CovariantLaplacianRat.h
@@ -0,0 +1,403 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./Grid/qcd/utils/CovariantLaplacianRat.h
 Copyright (C) 2021
 Author: Chulwoo Jung <chulwoo@bnl.gov>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 			   /*  END LEGAL */
 #pragma once 
 #define MIXED_CG
 //enable/disable push_back
 #undef USE_CHRONO 
 //#include <roctracer/roctx.h>
 NAMESPACE_BEGIN(Grid);
 struct LaplacianRatParams {
  RealD offset;
  int order;
  std::vector<RealD> a0;
  std::vector<RealD> a1;
  std::vector<RealD> b0;
  std::vector<RealD> b1;
  RealD b2; //for debugging
  int   MaxIter;
  RealD tolerance;
  int   precision;
  // constructor 
  LaplacianRatParams(int ord = 1,
                  int maxit     = 1000,
                  RealD tol     = 1.0e-8, 
                  int precision = 64)
    : offset(1.), order(ord),b2(1.),
      MaxIter(maxit),
      tolerance(tol),
      precision(precision){ 
      a0.resize(ord,0.);
      a1.resize(ord,0.);
      b0.resize(ord,0.);
      b1.resize(ord,0.);
      };
 };
 ////////////////////////////////////////////////////////////
 // Laplacian operator L on adjoint fields
 //
 // phi: adjoint field
 // L: D_mu^dag D_mu
 //
 // L phi(x) = Sum_mu [ U_mu(x)phi(x+mu)U_mu(x)^dag + 
 //                     U_mu(x-mu)^dag phi(x-mu)U_mu(x-mu)
 //                     -2phi(x)]
 //
 // Operator designed to be encapsulated by
 // an HermitianLinearOperator<.. , ..>
 ////////////////////////////////////////////////////////////
 template <class Impl, class ImplF>
 class LaplacianAdjointRat: public Metric<typename Impl::Field> {
  OperatorFunction<typename Impl::Field> &Solver;
  LaplacianRatParams Gparam;
  LaplacianRatParams Mparam;
  GridBase *grid;
  GridBase *grid_f;
  CovariantAdjointLaplacianStencil<Impl,typename Impl::LinkField> LapStencil;
  CovariantAdjointLaplacianStencil<ImplF,typename ImplF::LinkField> LapStencilF;
 public:
  INHERIT_GIMPL_TYPES(Impl);
 //   typedef typename GImpl::LinkField GaugeLinkField; \
 //  typedef typename GImpl::Field GaugeField;         
  typedef typename ImplF::Field GaugeFieldF;
  typedef typename ImplF::LinkField GaugeLinkFieldF; \
  GaugeField Usav;
  GaugeFieldF UsavF;
  std::vector< std::vector<GaugeLinkField> > prev_solnsM;
  std::vector< std::vector<GaugeLinkField> > prev_solnsMinv;
  std::vector< std::vector<GaugeLinkField> > prev_solnsMDeriv;
  std::vector< std::vector<GaugeLinkField> > prev_solnsMinvDeriv;
 	  LaplacianAdjointRat(GridBase* _grid, GridBase* _grid_f, OperatorFunction<GaugeField>& S, LaplacianRatParams& gpar, LaplacianRatParams& mpar)
    // Initialisers are listed in member declaration order, which is the order
    // in which C++ runs them regardless of how the list is written.
    : Solver(S),
      Gparam(gpar),
      Mparam(mpar),
      grid(_grid),
      grid_f(_grid_f),
      LapStencil(_grid),
      LapStencilF(_grid_f),
      Usav(_grid),
      UsavF(_grid_f),
      prev_solnsM(4),
      prev_solnsMinv(4),
      prev_solnsMDeriv(4),
      prev_solnsMinvDeriv(4),
      U(Nd, _grid)
  {
    // Mark this metric as non-trivial.
    this->triv=0;
  }
  // Default constructor: marks the metric non-trivial and prints the flag.
  // NOTE(review): it has no initialiser list although the class holds a
  // reference member (Solver); it looks unusable if ever instantiated - confirm.
  LaplacianAdjointRat(){this->triv=0; printf("triv=%d\n",this->Trivial());}
  // Directional / diagonal pieces are not provided by this metric: each hook
  // aborts through assert(0) when called.
  void Mdir(const GaugeField&, GaugeField&, int, int)
  {
    assert(0);
  }
  void MdirAll(const GaugeField&, std::vector<GaugeField> &)
  {
    assert(0);
  }
  void Mdiag(const GaugeField&, GaugeField&)
  {
    assert(0);
  }
  void ImportGauge(const GaugeField& _U) {
    RealD total=0.;
    for (int mu = 0; mu < Nd; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(_U, mu);
      total += norm2(U[mu]);
    }
    Usav = _U;
    precisionChange(UsavF,Usav);
    std::cout <<GridLogDebug << "ImportGauge:norm2(_U) = "<<" "<<total<<std::endl;
  }
  void MDerivLink(const GaugeLinkField& left, const GaugeLinkField& right,
              GaugeField& der) {
    std::cout<<GridLogMessage << "MDerivLink start "<< std::endl;
    RealD factor = -1. / (double(4 * Nd));
    for (int mu = 0; mu < Nd; mu++) {
      GaugeLinkField der_mu(der.Grid());
      der_mu = Zero();
 //      for (int nu = 0; nu < Nd; nu++) {
 //        GaugeLinkField left_nu = PeekIndex<LorentzIndex>(left, nu);
 //        GaugeLinkField right_nu = PeekIndex<LorentzIndex>(right, nu);
        der_mu += U[mu] * Cshift(left, mu, 1) * adj(U[mu]) * right;
        der_mu += U[mu] * Cshift(right, mu, 1) * adj(U[mu]) * left;
 //      }
      PokeIndex<LorentzIndex>(der, -factor * der_mu, mu);
    }
 //    std::cout << GridLogDebug <<"MDerivLink:  norm2(der) = "<<norm2(der)<<std::endl;
    std::cout<<GridLogMessage << "MDerivLink end "<< std::endl;
  }
  void MDerivLink(const GaugeLinkField& left, const GaugeLinkField& right,
              std::vector<GaugeLinkField> & der) {
 //    std::cout<<GridLogMessage << "MDerivLink "<< std::endl;
    RealD factor = -1. / (double(4 * Nd));
    for (int mu = 0; mu < Nd; mu++) {
      GaugeLinkField der_mu(left.Grid());
      der_mu = Zero();
        der_mu += U[mu] * Cshift(left, mu, 1) * adj(U[mu]) * right;
        der_mu += U[mu] * Cshift(right, mu, 1) * adj(U[mu]) * left;
 //      PokeIndex<LorentzIndex>(der, -factor * der_mu, mu);
      der[mu] = -factor*der_mu;
 //      std::cout << GridLogDebug <<"MDerivLink:  norm2(der) = "<<norm2(der[mu])<<std::endl;
    }
 //    std::cout<<GridLogMessage << "MDerivLink end "<< std::endl;
  }
  // Accumulates into `der` the link derivative associated with the rational
  // approximation `par`, for the pair of momentum-like fields (left, right).
  //
  // With L = LapStencil.M, fac = -1/(4 Nd) and, for each term i,
  //   QuadOp_i = b0[i] + fac*b1[i]*L + fac^2*b2*L^2 ,
  // the routine does, for every Lorentz component nu:
  //   1. solve QuadOp_i x_i = right_nu and build
  //        GMom = offset*right_nu + sum_i ( a0[i] + a1[i]*fac*L ) x_i
  //   2. for each i solve QuadOp_i y_i = GMom, form the Laplacian images of
  //      x_i, y_i and of their a0/a1 combinations, and sum the eight weighted
  //      MDerivLink contractions into `der`.
  //
  // par        : coefficients and tolerance of the rational approximation
  // left,right : input fields (only their nu-th components are paired)
  // der        : accumulated into, never cleared here (callers zero it first)
  // prev_solns : per-direction solution history used by ChronoForecast; it is
  //              only appended to when USE_CHRONO is enabled.
  void MDerivInt(LaplacianRatParams &par, const GaugeField& left, const GaugeField& right,
              GaugeField& der ,  std::vector< std::vector<GaugeLinkField> >& prev_solns ) {
    std::cout<<GridLogMessage << "LaplaceStart " <<std::endl;
    RealD fac =  - 1. / (double(4 * Nd)) ;
    RealD coef=0.5;
    LapStencil.GaugeImport(Usav);
    LapStencilF.GaugeImport(UsavF);
    for (int nu=0;nu<Nd;nu++){
        GaugeLinkField right_nu = PeekIndex<LorentzIndex>(right, nu);
        GaugeLinkField left_nu = PeekIndex<LorentzIndex>(left, nu);
        GaugeLinkField LMinvMom(left.Grid());
        GaugeLinkField GMom(left.Grid());
        GaugeLinkField LMinvGMom(left.Grid());
        GaugeLinkField MinvAGMom(left.Grid());
        GaugeLinkField LMinvAGMom(left.Grid());
        GaugeLinkField AMinvMom(left.Grid());
        GaugeLinkField LMinvAMom(left.Grid());
        GaugeLinkField temp(left.Grid());
        std::vector<GaugeLinkField> MinvMom(par.order,left.Grid());
        GaugeLinkField MinvGMom(left.Grid());
        GaugeLinkField Gtemp2(left.Grid());
        ConjugateGradient<GaugeLinkField> CG(par.tolerance,10000,false);
        ChronoForecast< QuadLinearOperator<CovariantAdjointLaplacianStencil<Impl,GaugeLinkField>,GaugeLinkField> , GaugeLinkField> Forecast;
        GMom = par.offset * right_nu;
        ///////////////////////////////////////////////
        // Pass 1: x_i = QuadOp_i^{-1} right_nu, and GMom
        ///////////////////////////////////////////////
        for(int i =0;i<par.order;i++){
        QuadLinearOperator<CovariantAdjointLaplacianStencil<Impl,typename Impl::LinkField>,GaugeLinkField> QuadOp(LapStencil,par.b0[i],fac*par.b1[i],fac*fac*par.b2);
        // Always seed the solver from the forecast, exactly as pass 2 and
        // MSquareRootInt do; previously MinvMom[i] was only assigned under
        // USE_CHRONO and was otherwise passed to the solver unassigned.
        MinvMom[i] = Forecast(QuadOp, right_nu, prev_solns[nu]);
 #ifndef MIXED_CG
        CG(QuadOp,right_nu,MinvMom[i]);
 #else
        QuadLinearOperator<CovariantAdjointLaplacianStencil<ImplF,typename ImplF::LinkField>,GaugeLinkFieldF> QuadOpF(LapStencilF,par.b0[i],fac*par.b1[i],fac*fac*par.b2);
        MixedPrecisionConjugateGradient<GaugeLinkField,GaugeLinkFieldF> MixedCG(par.tolerance,10000,10000,grid_f,QuadOpF,QuadOp);
        MixedCG.InnerTolerance=par.tolerance;
        MixedCG(right_nu,MinvMom[i]);
 #endif
 #if USE_CHRONO
        prev_solns[nu].push_back(MinvMom[i]);
 #endif
        GMom += par.a0[i]*MinvMom[i];
        LapStencil.M(MinvMom[i],Gtemp2);
        GMom += par.a1[i]*fac*Gtemp2;
        }
        ///////////////////////////////////////////////
        // Pass 2: y_i = QuadOp_i^{-1} GMom and the force contractions
        ///////////////////////////////////////////////
        for(int i =0;i<par.order;i++){
        QuadLinearOperator<CovariantAdjointLaplacianStencil<Impl,typename Impl::LinkField>,GaugeLinkField> QuadOp(LapStencil,par.b0[i],fac*par.b1[i],fac*fac*par.b2);
        MinvGMom = Forecast(QuadOp, GMom, prev_solns[nu]);
 #ifndef MIXED_CG
        CG(QuadOp,GMom,MinvGMom);
        LapStencil.M(MinvGMom, Gtemp2); LMinvGMom=fac*Gtemp2;
        CG(QuadOp,right_nu,MinvMom[i]);
 #else
        QuadLinearOperator<CovariantAdjointLaplacianStencil<ImplF,typename ImplF::LinkField>,GaugeLinkFieldF> QuadOpF(LapStencilF,par.b0[i],fac*par.b1[i],fac*fac*par.b2);
        MixedPrecisionConjugateGradient<GaugeLinkField,GaugeLinkFieldF> MixedCG(par.tolerance,10000,10000,grid_f,QuadOpF,QuadOp);
        MixedCG.InnerTolerance=par.tolerance;
        MixedCG(GMom,MinvGMom);
        LapStencil.M(MinvGMom, Gtemp2); LMinvGMom=fac*Gtemp2;
        MixedCG(right_nu,MinvMom[i]);
 #endif
 #if USE_CHRONO
        prev_solns[nu].push_back(MinvGMom);
 #endif
        // Laplacian images and a0/a1 combinations of x_i (MinvMom) and y_i (MinvGMom).
        LapStencil.M(MinvMom[i], Gtemp2); LMinvMom=fac*Gtemp2;
        AMinvMom = par.a1[i]*LMinvMom;
        AMinvMom += par.a0[i]*MinvMom[i];
        LapStencil.M(AMinvMom, Gtemp2); LMinvAMom=fac*Gtemp2;
        LapStencil.M(MinvGMom, Gtemp2); temp=fac*Gtemp2;
        MinvAGMom = par.a1[i]*temp;
        MinvAGMom += par.a0[i]*MinvGMom;
        LapStencil.M(MinvAGMom, Gtemp2); LMinvAGMom=fac*Gtemp2;
        GaugeField tempDer(left.Grid());
        std::vector<GaugeLinkField> DerLink(Nd,left.Grid());
        std::vector<GaugeLinkField> tempDerLink(Nd,left.Grid());
        std::cout<<GridLogMessage << "force contraction "<< i <<std::endl;
        for (int mu=0;mu<Nd;mu++) DerLink[mu]=Zero();
        // DerLink[mu] += weight * MDerivLink(a,b)[mu]
        auto addContraction = [&](const GaugeLinkField &a, const GaugeLinkField &b, RealD weight) {
          this->MDerivLink(a,b,tempDerLink);
          for (int mu=0;mu<Nd;mu++) DerLink[mu] += weight*tempDerLink[mu];
        };
        addContraction(GMom      ,MinvMom[i], coef*2*par.a1[i]);
        addContraction(left_nu   ,MinvGMom  , coef*2*par.a1[i]);
        addContraction(LMinvAGMom,MinvMom[i], coef*-2.*par.b2);
        addContraction(LMinvAMom ,MinvGMom  , coef*-2.*par.b2);
        addContraction(MinvAGMom ,LMinvMom  , coef*-2.*par.b2);
        addContraction(AMinvMom  ,LMinvGMom , coef*-2.*par.b2);
        addContraction(MinvAGMom ,MinvMom[i], coef*-2.*par.b1[i]);
        addContraction(AMinvMom  ,MinvGMom  , coef*-2.*par.b1[i]);
        // Add the weighted sum (DerLink) to der. The earlier code poked
        // tempDerLink here, i.e. only the last, unweighted contraction, and
        // left the accumulated DerLink unused.
        for (int mu=0;mu<Nd;mu++) PokeIndex<LorentzIndex>(tempDer, DerLink[mu], mu);
        der += tempDer;
        std::cout<<GridLogMessage << "coef =  force contraction "<< i << "done "<< coef <<std::endl;
        }
    }
    std::cout<<GridLogMessage << "LaplaceEnd " <<std::endl;
  }
  void MDeriv(const GaugeField& in, GaugeField& der) {
    MDeriv(in,in, der);
  }
  // Derivative built from the Mparam rational approximation; `der` is cleared
  // first because MDerivInt only accumulates.
  void MDeriv(const GaugeField& left, const GaugeField& right,
              GaugeField& der) {
    der = Zero();
    MDerivInt(Mparam, left, right, der, prev_solnsMDeriv);
    const auto derNorm2 = norm2(der);
    std::cout << GridLogDebug << "MDeriv:norm2(der) = " << derNorm2 << std::endl;
  }
  // Derivative built from the Gparam rational approximation, with `in` on both
  // sides of the contraction. `der` is cleared first because MDerivInt only
  // accumulates. The solution history used is the member prev_solnsMinvDeriv
  // (an unused local vector of the same shape was dropped).
  void MinvDeriv(const GaugeField& in, GaugeField& der) {
    der=Zero();
    MDerivInt(Gparam, in, in, der,prev_solnsMinvDeriv);
    std::cout <<GridLogDebug << "MinvDeriv:norm2(der) = "<<norm2(der)<<std::endl;
  }
  // Applies the rational approximation `par` to P in place, one Lorentz
  // component at a time. With L = LapStencil.M, fac = -1/(4 Nd) and
  //   QuadOp_i = b0[i] + fac*b1[i]*L + fac^2*b2*L^2 ,
  // each component becomes
  //   P_nu <- offset*P_nu + sum_i ( a0[i] + a1[i]*fac*L ) QuadOp_i^{-1} P_nu .
  // prev_solns[nu] seeds the forecast and is appended to only under USE_CHRONO.
  void MSquareRootInt(LaplacianRatParams &par, GaugeField& P, std::vector< std::vector<GaugeLinkField> > & prev_solns ){
    std::cout << GridLogMessage << "LaplaceStart " << std::endl;
    RealD fac = -1. / (double(4 * Nd));
    LapStencil.GaugeImport(Usav);
    LapStencilF.GaugeImport(UsavF);
    for (int nu = 0; nu < Nd; ++nu) {
        GaugeLinkField source = PeekIndex<LorentzIndex>(P, nu);
        GaugeLinkField result(P.Grid());
        result = par.offset * source;
        ConjugateGradient<GaugeLinkField> CG(par.tolerance,10000);
        ChronoForecast< QuadLinearOperator<CovariantAdjointLaplacianStencil<Impl,typename Impl::LinkField>,GaugeLinkField> , GaugeLinkField> Forecast;
        GaugeLinkField soln(P.Grid());
        GaugeLinkField lapSoln(P.Grid());
        for (int term = 0; term < par.order; ++term) {
          QuadLinearOperator<CovariantAdjointLaplacianStencil<Impl,typename Impl::LinkField>,GaugeLinkField> QuadOp(LapStencil,par.b0[term],fac*par.b1[term],fac*fac*par.b2);
          // Initial guess from the solution history, then solve QuadOp soln = source.
          soln = Forecast(QuadOp, source, prev_solns[nu]);
 #ifndef MIXED_CG
          CG(QuadOp,source,soln);
 #else
          QuadLinearOperator<CovariantAdjointLaplacianStencil<ImplF,typename ImplF::LinkField>,GaugeLinkFieldF> QuadOpF(LapStencilF,par.b0[term],fac*par.b1[term],fac*fac*par.b2);
          MixedPrecisionConjugateGradient<GaugeLinkField,GaugeLinkFieldF> MixedCG(par.tolerance,10000,10000,grid_f,QuadOpF,QuadOp);
          MixedCG.InnerTolerance=par.tolerance;
          MixedCG(source,soln);
 #endif
 #if USE_CHRONO
          prev_solns[nu].push_back(soln);
 #endif
          // Numerator of this term: ( a0 + a1*fac*L ) soln.
          result += par.a0[term]*soln;
          LapStencil.M(soln,lapSoln);
          result += par.a1[term]*fac*lapSoln;
        }
        PokeIndex<LorentzIndex>(P, result, nu);
    }
    std::cout << GridLogMessage << "LaplaceEnd " << std::endl;
  }
  // In-place application of the Mparam rational approximation to P, starting
  // from an empty solution history.
  void MSquareRoot(GaugeField& P){
    std::vector< std::vector<GaugeLinkField> > emptyHistory(4);
    MSquareRootInt(Mparam, P, emptyHistory);
    const auto pNorm2 = norm2(P);
    std::cout << GridLogDebug << "MSquareRoot:norm2(P) = " << pNorm2 << std::endl;
  }
  // In-place application of the Gparam rational approximation to P, starting
  // from an empty solution history.
  void MInvSquareRoot(GaugeField& P){
    std::vector< std::vector<GaugeLinkField> > emptyHistory(4);
    MSquareRootInt(Gparam, P, emptyHistory);
    const auto pNorm2 = norm2(P);
    std::cout << GridLogDebug << "MInvSquareRoot:norm2(P) = " << pNorm2 << std::endl;
  }
  // out = (Mparam approximation applied twice) in.
  // Both applications share one, initially empty, solution history.
  void M(const GaugeField& in, GaugeField& out) {
      std::vector< std::vector<GaugeLinkField> > history(4);
      out = in;
      for (int pass = 0; pass < 2; ++pass) {
        MSquareRootInt(Mparam, out, history);
      }
      const auto outNorm2 = norm2(out);
      std::cout << GridLogDebug << "M:norm2(out) = " << outNorm2 << std::endl;
  }
  void Minv(const GaugeField& in, GaugeField& inverted){
      inverted = in;
      std::vector< std::vector<GaugeLinkField> > prev_solns(4);
      MSquareRootInt(Gparam,inverted,prev_solns);
      MSquareRootInt(Gparam,inverted,prev_solns);
      std::cout <<GridLogDebug << "Minv:norm2(inverted) = "<<norm2(inverted)<<std::endl;
  }
 private:
  std::vector<GaugeLinkField> U;
 };
 #undef MIXED_CG
 NAMESPACE_END(Grid);
--- a/Grid/qcd/utils/GaugeGroup.h
+++ b/Grid/qcd/utils/GaugeGroup.h
@@ -100,9 +100,6 @@ class GaugeGroup {
  using iGroupMatrix = iScalar<iScalar<iMatrix<vtype, ncolour> > >;
  template <typename vtype>
  using iAlgebraVector = iScalar<iScalar<iVector<vtype, AdjointDimension> > >;
  template <typename vtype>
  using iSUnAlgebraMatrix =
    iScalar<iScalar<iMatrix<vtype, AdjointDimension> > >;
  static int su2subgroups(void) { return su2subgroups(group_name()); }
  //////////////////////////////////////////////////////////////////////////////////////////////////
@@ -136,15 +133,6 @@ class GaugeGroup {
  typedef Lattice<vAlgebraVectorF> LatticeAlgebraVectorF;
  typedef Lattice<vAlgebraVectorD> LatticeAlgebraVectorD;
  typedef iSUnAlgebraMatrix<vComplex>  vAlgebraMatrix;
  typedef iSUnAlgebraMatrix<vComplexF> vAlgebraMatrixF;
  typedef iSUnAlgebraMatrix<vComplexD> vAlgebraMatrixD;
  typedef Lattice<vAlgebraMatrix>  LatticeAlgebraMatrix;
  typedef Lattice<vAlgebraMatrixF> LatticeAlgebraMatrixF;
  typedef Lattice<vAlgebraMatrixD> LatticeAlgebraMatrixD;
  typedef iSU2Matrix<Complex> SU2Matrix;
  typedef iSU2Matrix<ComplexF> SU2MatrixF;
  typedef iSU2Matrix<ComplexD> SU2MatrixD;
@@ -172,7 +160,7 @@ class GaugeGroup {
    return generator(lieIndex, ta, group_name());
  }
-  static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index) {
+  static void su2SubGroupIndex(int &i1, int &i2, int su2_index) {
    return su2SubGroupIndex(i1, i2, su2_index, group_name());
  }
@@ -401,52 +389,6 @@ class GaugeGroup {
    }
  }
 // Ta are hermitian (?)
 // Anti herm is i Ta basis
 // Projects the matrix field `in` onto algebra components and stores them in
 // column `b` of the algebra-matrix field `out`.
 //   - for each su2 subgroup index, rows 2*idx and 2*idx+1 receive the
 //     off-diagonal combinations of elements (i1,i2) and (i2,i1);
 //   - for each diagonal generator k = 1..N-1, row N(N-1)+k-1 receives
 //     imag( sum_{i<k} in(i,i) - k*in(k,k) ) / sqrt(2k(k+1)).
 // Unused locals of the earlier version (a lattice temporary, a generator
 // matrix and two constants) were removed; the arithmetic is unchanged.
 static void LieAlgebraProject(LatticeAlgebraMatrix &out,const LatticeMatrix &in, int b)
 {
  conformable(in, out);
  GridBase *grid = out.Grid();
  // Using Luchang's projection convention
  //  2 Tr{Ta Tb} A_b= 2/2 delta ab A_b = A_a
  autoView(out_v,out,AcceleratorWrite);
  autoView(in_v,in,AcceleratorRead);
  int N = ncolour;
  int NNm1 = N * (N - 1);
  int hNNm1= NNm1/2;
  // Off-diagonal generators, two per su2 subgroup.
  for(int su2Index=0;su2Index<hNNm1;su2Index++){
    int i1, i2;
    su2SubGroupIndex(i1, i2, su2Index);
    int ax = su2Index*2;
    int ay = su2Index*2+1;
    accelerator_for(ss,grid->oSites(),1,{
 	// in is traceless ANTI-hermitian whereas Grid generators are Hermitian.
 	// trace( Ta x Ci in)
 	// Bet I need to move to real part with mult by -i
 	out_v[ss]()()(ax,b) = 0.5*(real(in_v[ss]()()(i2,i1)) - real(in_v[ss]()()(i1,i2)));
 	out_v[ss]()()(ay,b) = 0.5*(imag(in_v[ss]()()(i1,i2)) + imag(in_v[ss]()()(i2,i1)));
      });
  }
  // Diagonal generators.
  for(int diagIndex=0;diagIndex<N-1;diagIndex++){
    int k = diagIndex + 1; // diagIndex starts from 0
    int a = NNm1+diagIndex;
    RealD scale = 1.0/sqrt(2.0*k*(k+1));
    accelerator_for(ss,grid->oSites(),vComplex::Nsimd(),{
 	auto tmp = in_v[ss]()()(0,0);
 	for(int i=1;i<k;i++){
 	  tmp=tmp+in_v[ss]()()(i,i);
 	}
 	tmp = tmp - in_v[ss]()()(k,k)*k;
 	out_v[ss]()()(a,b) =imag(tmp) * scale;
      });
    }
 }
 };
 template <int ncolour>
--- a/Grid/qcd/utils/Metric.h
+++ b/Grid/qcd/utils/Metric.h
@@ -7,6 +7,7 @@ Source file: ./lib/qcd/hmc/integrators/Integrator.h
 Copyright (C) 2015
 Author: Guido Cossu <guido.cossu@ed.ac.uk>
 Author: Chulwoo Jung <chulwoo@bnl.gov>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -33,7 +34,12 @@ NAMESPACE_BEGIN(Grid);
 template <typename Field> 
 class Metric{
 protected:
  int triv;
 public:
  Metric(){this->triv=1;}
  int Trivial(){ return triv;}
 //printf("Metric::Trivial=%d\n",triv); ;
  virtual void ImportGauge(const Field&)   = 0;
  virtual void M(const Field&, Field&)     = 0;
  virtual void Minv(const Field&, Field&)  = 0;
@@ -41,6 +47,8 @@ public:
  virtual void MInvSquareRoot(Field&) = 0;
  virtual void MDeriv(const Field&, Field&) = 0;
  virtual void MDeriv(const Field&, const Field&, Field&) = 0;
  virtual void MinvDeriv(const Field&, Field&) = 0;
 //  virtual void MinvDeriv(const Field&, const Field&, Field&) = 0;
 };
@@ -48,23 +56,36 @@ public:
 template <typename Field>
 class TrivialMetric : public Metric<Field>{
 public:
 //  TrivialMetric(){this->triv=1;printf("TrivialMetric::triv=%d\n",this->Trivial());}
  virtual void ImportGauge(const Field&){};
  // Identity metric: logs the norm of the input and copies it to the output.
  virtual void M(const Field& in, Field& out){
    const auto inNorm = std::sqrt(norm2(in));
    std::cout << GridLogIntegrator << " M:norm(in)= " << inNorm << std::endl;
    out = in;
  }
  // Inverse of the identity metric: logs the norm of the input and copies it.
  virtual void Minv(const Field& in, Field& out){
    const auto inNorm = std::sqrt(norm2(in));
    std::cout << GridLogIntegrator << " Minv:norm(in)= " << inNorm << std::endl;
    out = in;
  }
  // Square root of the identity metric: P is only logged, never modified.
  virtual void MSquareRoot(Field& P){
    const auto pNorm = std::sqrt(norm2(P));
    std::cout << GridLogIntegrator << " MSquareRoot:norm(P)= " << pNorm << std::endl;
  }
  // Inverse square root of the identity metric: P is only logged, never modified.
  virtual void MInvSquareRoot(Field& P){
    const auto pNorm = std::sqrt(norm2(P));
    std::cout << GridLogIntegrator << " MInvSquareRoot:norm(P)= " << pNorm << std::endl;
  }
  // The identity metric has no field dependence: log the input norm, return zero.
  virtual void MDeriv(const Field& in, Field& out){
    const auto inNorm = std::sqrt(norm2(in));
    std::cout << GridLogIntegrator << " MDeriv:norm(in)= " << inNorm << std::endl;
    out = Zero();
  }
  // Derivative of the inverse identity metric: log the input norm, return zero.
  virtual void MinvDeriv(const Field& in, Field& out){
    const auto inNorm = std::sqrt(norm2(in));
    std::cout << GridLogIntegrator << " MinvDeriv:norm(in)= " << inNorm << std::endl;
    out = Zero();
  }
  // Two-field form of the derivative: logs both input norms and returns zero.
  virtual void MDeriv(const Field& left, const Field& right, Field& out){
    const auto leftNorm = std::sqrt(norm2(left));
    std::cout << GridLogIntegrator << " MDeriv:norm(left)= " << leftNorm << std::endl;
    const auto rightNorm = std::sqrt(norm2(right));
    std::cout << GridLogIntegrator << " MDeriv:norm(right)= " << rightNorm << std::endl;
    out = Zero();
  }
@@ -101,14 +122,15 @@ public:
    // Generate gaussian momenta
    Implementation::generate_momenta(Mom, sRNG, pRNG);
    // Modify the distribution with the metric
 //    if(M.Trivial()) return;
    M.MSquareRoot(Mom);
    if (1) {
      // Auxiliary momenta
      // do nothing if trivial, so hide in the metric
      MomentaField AuxMomTemp(Mom.Grid());
-      Implementation::generate_momenta(AuxMom, sRNG, pRNG);
+      Implementation::generate_momenta(AuxMom, sRNG,pRNG);
-      Implementation::generate_momenta(AuxField, sRNG, pRNG);
+      Implementation::generate_momenta(AuxField, sRNG,pRNG);
      // Modify the distribution with the metric
      // Aux^dag M Aux
      M.MInvSquareRoot(AuxMom);  // AuxMom = M^{-1/2} AuxMomTemp
@@ -117,11 +139,12 @@ public:
  // Correct
  RealD MomentaAction(){
    static RealD Saux=0.,Smom=0.;
    MomentaField inv(Mom.Grid());
    inv = Zero();
    M.Minv(Mom, inv);
-    LatticeComplex Hloc(Mom.Grid());
+    LatticeComplex Hloc(Mom.Grid()); Hloc = Zero();
-    Hloc = Zero();
+    LatticeComplex Hloc2(Mom.Grid()); Hloc2 = Zero();
    for (int mu = 0; mu < Nd; mu++) {
      // This is not very general
      // hide in the metric
@@ -129,8 +152,15 @@ public:
      auto inv_mu = PeekIndex<LorentzIndex>(inv, mu);
      Hloc += trace(Mom_mu * inv_mu);
    }
    auto Htmp1 = TensorRemove(sum(Hloc));
    std::cout << GridLogMessage << "S:dSmom = " << Htmp1.real()-Smom << "\n";
    Smom=Htmp1.real()/HMC_MOMENTUM_DENOMINATOR;
-    if (1) {
+
 //    if(!M.Trivial()) 
    {
      // Auxiliary Fields
      // hide in the metric
      M.M(AuxMom, inv);
@@ -140,13 +170,18 @@ public:
        auto inv_mu = PeekIndex<LorentzIndex>(inv, mu);
        auto am_mu = PeekIndex<LorentzIndex>(AuxMom, mu);
        auto af_mu = PeekIndex<LorentzIndex>(AuxField, mu);
-        Hloc += trace(am_mu * inv_mu);// p M p
+        Hloc += trace(am_mu * inv_mu);
-        Hloc += trace(af_mu * af_mu);
+        Hloc2 += trace(af_mu * af_mu);
      }
    }
    auto Htmp2 = TensorRemove(sum(Hloc))-Htmp1;
    std::cout << GridLogMessage << "S:dSaux = " << Htmp2.real()-Saux << "\n";
    Saux=Htmp2.real();
-    auto Hsum = TensorRemove(sum(Hloc));
+    auto Hsum = TensorRemove(sum(Hloc))/HMC_MOMENTUM_DENOMINATOR;
-    return Hsum.real();
+    auto Hsum2 = TensorRemove(sum(Hloc2));
    std::cout << GridLogIntegrator << "MomentaAction: " <<  Hsum.real()+Hsum2.real() << std::endl;
    return Hsum.real()+Hsum2.real();
  }
  // Correct
@@ -157,15 +192,17 @@ public:
    MomentaField MDer(in.Grid());
    MomentaField X(in.Grid());
    X = Zero();
-    M.Minv(in, X);  // X = G in
+    M.MinvDeriv(in, MDer);  // MDer = U * dS/dU
-    M.MDeriv(X, MDer);  // MDer = U * dS/dU
+    der = -1.0* Implementation::projectForce(MDer);  // Ta if gauge fields
-    der = Implementation::projectForce(MDer);  // Ta if gauge fields
+//    std::cout << GridLogIntegrator << " DerivativeU: norm(in)= " << std::sqrt(norm2(in)) << std::endl;
 //    std::cout << GridLogIntegrator << " DerivativeU: norm(der)= " << std::sqrt(norm2(der)) << std::endl;
  }
  void AuxiliaryFieldsDerivative(MomentaField& der){
    der = Zero();
-    if (1){
+//    if(!M.Trivial()) 
    {
      // Auxiliary fields
      MomentaField der_temp(der.Grid());
      MomentaField X(der.Grid());
@@ -173,6 +210,7 @@ public:
      //M.M(AuxMom, X); // X = M Aux
      // Two derivative terms
      // the Mderiv need separation of left and right terms
    std::cout << GridLogIntegrator << " AuxiliaryFieldsDerivative:norm(AuxMom)= " << std::sqrt(norm2(AuxMom)) << std::endl;
      M.MDeriv(AuxMom, der); 
@@ -180,6 +218,7 @@ public:
      //M.MDeriv(X, AuxMom, der_temp); der += der_temp;
      der = -1.0*Implementation::projectForce(der);
      std::cout << GridLogIntegrator << " AuxiliaryFieldsDerivative:norm(der)= " << std::sqrt(norm2(der)) << std::endl;
    }
  }
@@ -189,22 +228,28 @@ public:
    // is the projection necessary here?
    // no for fields in the algebra
    der = Implementation::projectForce(der); 
    std::cout << GridLogIntegrator << " DerivativeP:norm(der)= " << std::sqrt(norm2(der)) << std::endl;
  }
  void update_auxiliary_momenta(RealD ep){
-    if(1){
+      std::cout << GridLogIntegrator << "AuxMom update_auxiliary_fields: " << std::sqrt(norm2(AuxMom)) << std::endl;
-      AuxMom -= ep * AuxField;
+      std::cout << GridLogIntegrator << "AuxField update_auxiliary_fields: " << std::sqrt(norm2(AuxField)) << std::endl;
    {
      AuxMom -= ep * AuxField * HMC_MOMENTUM_DENOMINATOR;
      std::cout << GridLogIntegrator << "AuxMom update_auxiliary_fields: " << std::sqrt(norm2(AuxMom)) << std::endl;
    }
  }
  void update_auxiliary_fields(RealD ep){
-    if (1) {
+//    if(!M.Trivial()) 
    {
      MomentaField tmp(AuxMom.Grid());
      MomentaField tmp2(AuxMom.Grid());
      M.M(AuxMom, tmp);
      // M.M(tmp, tmp2);
      AuxField += ep * tmp;  // M^2 AuxMom
      // factor of 2?
      std::cout << GridLogIntegrator << "AuxField update_auxiliary_fields: " << std::sqrt(norm2(AuxField)) << std::endl;
    }
  }
--- a/Grid/qcd/utils/SUn.impl.h
+++ b/Grid/qcd/utils/SUn.impl.h
@@ -10,7 +10,6 @@
 // doesn't get found by the scripts/filelist during bootstrapping.
 private:
 template <ONLY_IF_SU>
 static int su2subgroups(GroupName::SU) { return (ncolour * (ncolour - 1)) / 2; }
 ////////////////////////////////////////////////////////////////////////
@@ -577,4 +576,3 @@ static void RandomGaugeTransform(GridParallelRNG &pRNG, typename Gimpl::GaugeFie
  LieRandomize(pRNG,g,1.0);
  GaugeTransform<Gimpl>(Umu,g);
 }
--- a/Grid/simd/Grid_vector_types.h
+++ b/Grid/simd/Grid_vector_types.h
@@ -1133,13 +1133,4 @@ static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths inc
 NAMESPACE_END(Grid);
 #ifdef GRID_SYCL
 // Explicitly mark each Grid SIMD vector type as device copyable for SYCL
 // by specialising sycl::is_device_copyable to std::true_type.
 template<> struct sycl::is_device_copyable<Grid::vComplexF> : public std::true_type {};
 template<> struct sycl::is_device_copyable<Grid::vComplexD> : public std::true_type {};
 template<> struct sycl::is_device_copyable<Grid::vRealF   > : public std::true_type {};
 template<> struct sycl::is_device_copyable<Grid::vRealD   > : public std::true_type {};
 template<> struct sycl::is_device_copyable<Grid::vInteger > : public std::true_type {};
 #endif
 #endif
--- a/Grid/sitmo_rng/sitmo_prng_engine.hpp
+++ b/Grid/sitmo_rng/sitmo_prng_engine.hpp
@@ -218,10 +218,6 @@ public:
    // -------------------------------------------------
    // misc
    // -------------------------------------------------
    // Adds z to element 3 of _s, then calls encrypt_counter().
    // NOTE(review): presumably _s holds the engine's counter words and element 3
    // is the most significant one (hence "hi") -- confirm against the class.
    void discardhi(uint64_t z) {
      _s[3] += z;
      encrypt_counter();  // must follow the update of _s so it sees the new value
    }
    // req: 26.5.1.4 Random number engine requirements, p.908 table 117, row 9
    // Advances e’s state ei to ei+z by any means equivalent to z
--- a/Grid/stencil/GeneralLocalStencil.h
+++ b/Grid/stencil/GeneralLocalStencil.h
@@ -137,55 +137,5 @@ public:
 };
 ////////////////////////////////////////////////
 // Some machinery to streamline making a stencil 
 ////////////////////////////////////////////////
 // Sentinel integers used when a lattice direction is passed around as a
 // plain int and has to carry "backward" / "no step" information as well.
 class shiftSignal {
 public:
    enum {
        NO_SHIFT       = -1,  // direction value meaning "do not move"
        BACKWARD_CONST = 16   // offset marking a direction as a backward step
    };
 };
 // TODO: put a check somewhere that BACKWARD_CONST > Nd!
 /*!  @brief encode "step backwards along dir" as a single int
  *
  *   Returns dir offset by shiftSignal::BACKWARD_CONST. generalShift treats
  *   any value at or above that constant as a backward step and removes the
  *   offset again. Trick inspired by SIMULATeQCD.
  */
 inline int Back(const int dir) {
    const int backwardTag = shiftSignal::BACKWARD_CONST;
    return backwardTag + dir;
 }
 /*!  @brief apply one unit step to shift
  *
  *   dir >= shiftSignal::BACKWARD_CONST : step -1 along (dir - BACKWARD_CONST)
  *   dir == shiftSignal::NO_SHIFT       : leave shift untouched
  *   anything else                      : step +1 along dir
  */
 template<typename... Args>
 void generalShift(Coordinate& shift, int dir) {
    if (dir == shiftSignal::NO_SHIFT) return;   // explicit "stay put"

    const bool goingBack = (dir >= shiftSignal::BACKWARD_CONST);
    const int  axis      = goingBack ? dir - shiftSignal::BACKWARD_CONST : dir;
    shift[axis] += goingBack ? -1 : 1;
 }
 /*!  @brief walk a path of directions, one unit step per argument
  *
  *   Handles the leading direction exactly like the single-direction
  *   overload (backward if >= BACKWARD_CONST, skipped if NO_SHIFT, forward
  *   otherwise) and then recurses on the remaining directions.
  */
 template<typename... Args>
 void generalShift(Coordinate& shift, int dir, Args... args) {
    if (dir != shiftSignal::NO_SHIFT) {
        const bool goingBack = (dir >= shiftSignal::BACKWARD_CONST);
        const int  axis      = goingBack ? dir - shiftSignal::BACKWARD_CONST : dir;
        shift[axis] += goingBack ? -1 : 1;
    }
    generalShift(shift, args...);   // consume the rest of the path
 }
 NAMESPACE_END(Grid);
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -706,7 +706,7 @@ public:
 	}
      }
    }
-    //std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
+    std::cout << GridLogDebug << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
  }
  /// Introduce a block structure and switch off comms on boundaries
  void DirichletBlock(const Coordinate &dirichlet_block)
@@ -761,8 +761,7 @@ public:
 		   int checkerboard,
 		   const std::vector<int> &directions,
 		   const std::vector<int> &distances,
-		   Parameters p=Parameters(),
+		   Parameters p=Parameters())
 		   bool preserve_shm=false)
  {
    face_table_computed=0;
    _grid    = grid;
@@ -856,9 +855,7 @@ public:
    /////////////////////////////////////////////////////////////////////////////////
    const int Nsimd = grid->Nsimd();
-    // Allow for multiple stencils to exist simultaneously
+    _grid->ShmBufferFreeAll();
    if (!preserve_shm)
      _grid->ShmBufferFreeAll();
    int maxl=2;
    u_simd_send_buf.resize(maxl);
--- a/Grid/tensors/Tensor_trace.h
+++ b/Grid/tensors/Tensor_trace.h
@@ -69,35 +69,6 @@ accelerator_inline auto trace(const iVector<vtype,N> &arg) -> iVector<decltype(t
  }
  return ret;
 }
 ////////////////////////////
 // Fast path traceProduct
 ////////////////////////////
 // Terminal overload of traceProduct: selected only when neither operand is a
 // Grid tensor (both IfNotGridTensor constraints must hold). At this level
 // there is no index left to trace, so the result is simply arg1*arg2.
 template<class S1 , class S2, IfNotGridTensor<S1> = 0, IfNotGridTensor<S2> = 0>
 accelerator_inline auto traceProduct( const S1 &arg1,const S2 &arg2)
  -> decltype(arg1*arg2)
 {
  return arg1*arg2;
 }
 // traceProduct for two N x N iMatrix operands: accumulates
 //   sum over (row,col) of traceProduct(arg1[row][col], arg2[col][row])
 // into a zero-initialised iScalar, without forming the product matrix.
 template<class vtype,class rtype,int N >
 accelerator_inline auto traceProduct(const iMatrix<vtype,N> &arg1,const iMatrix<rtype,N> &arg2) -> iScalar<decltype(trace(arg1._internal[0][0]*arg2._internal[0][0]))>
 {
  typedef iScalar<decltype(trace(arg1._internal[0][0]*arg2._internal[0][0]))> result_t;
  result_t acc;
  zeroit(acc._internal);
  for(int row=0;row<N;row++){
    for(int col=0;col<N;col++){
      acc._internal = acc._internal + traceProduct(arg1._internal[row][col],arg2._internal[col][row]);
    }
  }
  return acc;
 }
 // traceProduct for two iScalar operands: forwards to traceProduct on the
 // wrapped values and re-wraps the result in an iScalar.
 template<class vtype,class rtype >
 accelerator_inline auto traceProduct(const iScalar<vtype> &arg1,const iScalar<rtype> &arg2) -> iScalar<decltype(trace(arg1._internal*arg2._internal))>
 {
  typedef iScalar<decltype(trace(arg1._internal*arg2._internal))> result_t;
  result_t wrapped;
  wrapped._internal = traceProduct(arg1._internal,arg2._internal);
  return wrapped;
 }
 NAMESPACE_END(Grid);
--- a/Grid/tensors/Tensor_traits.h
+++ b/Grid/tensors/Tensor_traits.h
@@ -34,12 +34,9 @@ NAMESPACE_BEGIN(Grid);
  // These are the Grid tensors
  template<typename T>     struct isGridTensor                : public std::false_type { static constexpr bool notvalue = true; };
-  template<class T>        struct isGridTensor<iScalar<T> >   : public std::true_type  { static constexpr bool notvalue = false; };
+  template<class T>        struct isGridTensor<iScalar<T>>    : public std::true_type  { static constexpr bool notvalue = false; };
-  template<class T, int N> struct isGridTensor<iVector<T, N> >: public std::true_type  { static constexpr bool notvalue = false; };
+  template<class T, int N> struct isGridTensor<iVector<T, N>> : public std::true_type  { static constexpr bool notvalue = false; };
-  template<class T, int N> struct isGridTensor<iMatrix<T, N> >: public std::true_type  { static constexpr bool notvalue = false; };
+  template<class T, int N> struct isGridTensor<iMatrix<T, N>> : public std::true_type  { static constexpr bool notvalue = false; };
  template <typename T>  using IfGridTensor    = Invoke<std::enable_if<isGridTensor<T>::value, int> >;
  template <typename T>  using IfNotGridTensor = Invoke<std::enable_if<!isGridTensor<T>::value, int> >;
  // Traits to identify scalars
  template<typename T>     struct isGridScalar                : public std::false_type { static constexpr bool notvalue = true; };
@@ -404,12 +401,3 @@ NAMESPACE_BEGIN(Grid);
  };
 NAMESPACE_END(Grid);
 #ifdef GRID_SYCL
 // Partial specialisation: any type T that is a Grid tensor
 // (Grid::isGridTensor<T>::value) but is NOT trivially copyable is declared
 // device copyable. Trivially copyable tensors are excluded by the
 // enable_if condition and therefore do not match this specialisation.
 template<typename T> struct
 sycl::is_device_copyable<T, typename std::enable_if<
 			      Grid::isGridTensor<T>::value  && (!std::is_trivially_copyable<T>::value),
 			      void>::type>
  : public std::true_type {};
 #endif
--- a/Grid/threads/Accelerator.cc
+++ b/Grid/threads/Accelerator.cc
@@ -7,8 +7,6 @@ uint32_t accelerator_threads=2;
 // Read accessor for the file-scope accelerator_threads setting.
 uint32_t acceleratorThreads(void)
 {
   return accelerator_threads;
 }
 // Write accessor: replaces accelerator_threads with t (no validation).
 void acceleratorThreads(uint32_t t)
 {
   accelerator_threads = t;
 }
 #define ENV_LOCAL_RANK_PALS    "PALS_LOCAL_RANKID"
 #define ENV_RANK_PALS          "PALS_RANKID"
 #define ENV_LOCAL_RANK_OMPI    "OMPI_COMM_WORLD_LOCAL_RANK"
 #define ENV_RANK_OMPI          "OMPI_COMM_WORLD_RANK"
 #define ENV_LOCAL_RANK_SLURM   "SLURM_LOCALID"
@@ -149,7 +147,7 @@ void acceleratorInit(void)
 #define GPU_PROP_FMT(canMapHostMemory,FMT)     printf("AcceleratorHipInit:   " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
 #define GPU_PROP(canMapHostMemory)             GPU_PROP_FMT(canMapHostMemory,"%d");
-    auto r=hipGetDeviceProperties(&gpu_props[i], i);
+    hipGetDeviceProperties(&gpu_props[i], i);
    hipDeviceProp_t prop; 
    prop = gpu_props[i];
    totalDeviceMem = prop.totalGlobalMem;
@@ -230,17 +228,8 @@ void acceleratorInit(void)
  {
    rank = atoi(localRankStr);		
  }
  if ((localRankStr = getenv(ENV_LOCAL_RANK_PALS)) != NULL)
  {
    rank = atoi(localRankStr);		
  }
  if ((localRankStr = getenv(ENV_RANK_OMPI   )) != NULL) { world_rank = atoi(localRankStr);}
  if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
  if ((localRankStr = getenv(ENV_RANK_PALS   )) != NULL) { world_rank = atoi(localRankStr);}
  char hostname[HOST_NAME_MAX+1];
  gethostname(hostname, HOST_NAME_MAX+1);
  if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname);
  auto devices = cl::sycl::device::get_devices();
  for(int d = 0;d<devices.size();d++){
@@ -252,10 +241,9 @@ void acceleratorInit(void)
    printf("AcceleratorSyclInit:   " #prop ": " FMT" \n",devices[d].get_info<cl::sycl::info::device::prop>());
 #define GPU_PROP(prop)             GPU_PROP_FMT(prop,"%ld");
    if ( world_rank == 0) {
-      GPU_PROP_STR(vendor);
+    GPU_PROP_STR(vendor);
-      GPU_PROP_STR(version);
+    GPU_PROP_STR(version);
    //    GPU_PROP_STR(device_type);
    /*
    GPU_PROP(max_compute_units);
@@ -271,8 +259,7 @@ void acceleratorInit(void)
    GPU_PROP(single_fp_config);
    */
    //    GPU_PROP(double_fp_config);
-      GPU_PROP(global_mem_size);
+    GPU_PROP(global_mem_size);
    }
  }
  if ( world_rank == 0 ) {
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -225,8 +225,6 @@ inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
 // Thin wrappers over the CUDA runtime. Note the (from, to) argument order
 // here is the reverse of cudaMemcpy's (dst, src) order.
 // NOTE(review): none of these wrappers inspects the value returned by the
 // underlying CUDA call, so failures pass silently.

 // Releases ptr via cudaFree.
 inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
 // Copies `bytes` bytes from `from` to `to` with cudaMemcpyHostToDevice.
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
 // Copies `bytes` bytes from `from` to `to` with cudaMemcpyDeviceToHost.
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
 // Host-to-device copy issued with cudaMemcpyAsync on `stream` (defaults to copyStream).
 inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
 // Device-to-host copy issued with cudaMemcpyAsync on `stream` (defaults to copyStream).
 inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
 // Fills `bytes` bytes starting at `base` with `value` via cudaMemset.
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
 {
@@ -255,13 +253,17 @@ inline int  acceleratorIsCommunicable(void *ptr)
 #define GRID_SYCL_LEVEL_ZERO_IPC
 NAMESPACE_END(Grid);
-
+#if 0
-// Force deterministic reductions
+#include <CL/sycl.hpp>
-#define SYCL_REDUCTION_DETERMINISTIC
+#include <CL/sycl/usm.hpp>
 #include <level_zero/ze_api.h>
 #include <CL/sycl/backend/level_zero.hpp>
 #else
 #include <sycl/CL/sycl.hpp>
 #include <sycl/usm.hpp>
 #include <level_zero/ze_api.h>
 #include <sycl/ext/oneapi/backend/level_zero.hpp>
 #endif
 NAMESPACE_BEGIN(Grid);
@@ -285,24 +287,23 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {		\
-    unsigned long nt=acceleratorThreads();				\
+      unsigned long nt=acceleratorThreads();				\
-    if(nt < 8)nt=8;							\
+      unsigned long unum1 = num1;					\
-    unsigned long unum1 = num1;						\
+      unsigned long unum2 = num2;					\
-    unsigned long unum2 = num2;						\
+      if(nt < 8)nt=8;							\
-    unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt;	\
+      cl::sycl::range<3> local {nt,1,nsimd};				\
-    cl::sycl::range<3> local {nt,1,nsimd};				\
+      cl::sycl::range<3> global{unum1,unum2,nsimd};			\
-    cl::sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd};	\
+      cgh.parallel_for(					\
-    cgh.parallel_for(							\
+      cl::sycl::nd_range<3>(global,local), \
-		     cl::sycl::nd_range<3>(global,local),		\
+      [=] (cl::sycl::nd_item<3> item) /*mutable*/     \
-		     [=] (cl::sycl::nd_item<3> item) /*mutable*/	\
+      [[intel::reqd_sub_group_size(16)]]	      \
-		     [[intel::reqd_sub_group_size(16)]]			\
+      {						      \
-		     {							\
+      auto iter1    = item.get_global_id(0);	      \
-		       auto iter1    = item.get_global_id(0);		\
+      auto iter2    = item.get_global_id(1);	      \
-		       auto iter2    = item.get_global_id(1);		\
+      auto lane     = item.get_global_id(2);	      \
-		       auto lane     = item.get_global_id(2);		\
+      { __VA_ARGS__ };				      \
-		       { if (iter1 < unum1){ __VA_ARGS__ } };		\
+     });	   			              \
-		     });						\
+    });
  });
 #define accelerator_barrier(dummy) { theGridAccelerator->wait(); }
@@ -404,7 +405,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
 #define accelerator_barrier(dummy)				\
  {								\
-    auto r=hipStreamSynchronize(computeStream);			\
+    hipStreamSynchronize(computeStream);			\
    auto err = hipGetLastError();				\
    if ( err != hipSuccess ) {					\
      printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \
@@ -437,21 +438,19 @@ inline void *acceleratorAllocDevice(size_t bytes)
  return ptr;
 };
-inline void acceleratorFreeShared(void *ptr){ auto r=hipFree(ptr);};
+inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);};
-inline void acceleratorFreeDevice(void *ptr){ auto r=hipFree(ptr);};
+inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { auto r=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto r=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
 inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream);}
 inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream);}
 //inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
 //inline void acceleratorCopySynchronise(void) {  }
-inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto r=hipMemset(base,value,bytes);}
+inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(base,value,bytes);}
 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
 {
-  auto r=hipMemcpyDtoDAsync(to,from,bytes, copyStream);
+  hipMemcpyDtoDAsync(to,from,bytes, copyStream);
 }
-inline void acceleratorCopySynchronise(void) { auto r=hipStreamSynchronize(copyStream); };
+inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); };
 #endif
@@ -576,11 +575,4 @@ accelerator_inline void acceleratorFence(void)
  return;
 }
 // Device-to-device copy of `bytes` bytes from `from` to `to`: starts the
 // copy with acceleratorCopyDeviceToDeviceAsynch and then calls
 // acceleratorCopySynchronise() before returning.
 inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
 {
  acceleratorCopyDeviceToDeviceAsynch(from,to,bytes);
  acceleratorCopySynchronise();  // must come after the asynchronous launch above
 }
 NAMESPACE_END(Grid);
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -77,10 +77,6 @@ feenableexcept (unsigned int excepts)
 }
 #endif
 #ifndef HOST_NAME_MAX
 #define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
 #endif
 NAMESPACE_BEGIN(Grid);
 //////////////////////////////////////////////////////
@@ -397,9 +393,6 @@ void Grid_init(int *argc,char ***argv)
  std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl;
  std::cout << GridLogMessage << "================================================ "<<std::endl;
  char hostname[HOST_NAME_MAX+1];
  gethostname(hostname, HOST_NAME_MAX+1);
  std::cout << GridLogMessage << "This rank is running on host "<< hostname<<std::endl;
  /////////////////////////////////////////////////////////
  // Reporting
--- a/HMC/FTHMC2p1f.cc
+++ b/HMC/FTHMC2p1f.cc
@@ -54,16 +54,15 @@ int main(int argc, char **argv)
  //  MD.name    = std::string("Force Gradient");
  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
  MD.name    = std::string("MinimumNorm2");
-  MD.MDsteps = 24;
+  MD.MDsteps = 12;
  MD.trajL   = 1.0;
  HMCparameters HMCparams;
-  HMCparams.StartTrajectory  = 104;
+  HMCparams.StartTrajectory  = 0;
  HMCparams.Trajectories     = 200;
  HMCparams.NoMetropolisUntil=  20;
  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
-  //  HMCparams.StartingType     =std::string("HotStart");
+  HMCparams.StartingType     =std::string("HotStart");
  HMCparams.StartingType     =std::string("CheckpointStart");
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);
@@ -88,7 +87,6 @@ int main(int argc, char **argv)
  // here there is too much indirection
  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
  const int Ls      = 16;
@@ -136,6 +134,7 @@ int main(int argc, char **argv)
  ////////////////////////////////////
  ActionLevel<HMCWrapper::Field> Level1(1);
  ActionLevel<HMCWrapper::Field> Level2(2);
  ActionLevel<HMCWrapper::Field> Level3(4);
  ////////////////////////////////////
  // Strange action
@@ -192,7 +191,7 @@ int main(int argc, char **argv)
  Smear_Stout<HMCWrapper::ImplPolicy> Stout(rho);
  SmearedConfigurationMasked<HMCWrapper::ImplPolicy> SmearingPolicy(GridPtr, Nstep, Stout);
  JacobianAction<HMCWrapper::ImplPolicy> Jacobian(&SmearingPolicy);
-  if( ApplySmearing ) Level1.push_back(&Jacobian);
+  if( ApplySmearing ) Level2.push_back(&Jacobian);
  std::cout << GridLogMessage << " Built the Jacobian "<< std::endl;
@@ -201,7 +200,7 @@ int main(int argc, char **argv)
  /////////////////////////////////////////////////////////////
  //  GaugeAction.is_smeared = ApplySmearing;
  GaugeAction.is_smeared = true;
-  Level2.push_back(&GaugeAction);
+  Level3.push_back(&GaugeAction);
  std::cout << GridLogMessage << " ************************************************"<< std::endl;
  std::cout << GridLogMessage << " Action complete -- NO FERMIONS FOR NOW -- FIXME"<< std::endl;
@@ -211,11 +210,10 @@ int main(int argc, char **argv)
  std::cout << GridLogMessage << " Running the FT HMC "<< std::endl;
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
-
+  TheHMC.TheAction.push_back(Level3);
  TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file
  TheHMC.initializeGaugeFieldAndRNGs(U);
  TheHMC.Run(SmearingPolicy); // for smearing
--- a/HMC/FTHMC2p1f_3GeV.cc
+++ b/HMC/FTHMC2p1f_3GeV.cc
@@ -1,226 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Copyright (C) 2023
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
 #include <Grid/qcd/smearing/JacobianAction.h>
 using namespace Grid;
 // Driver for the "FT HMC" run announced in the log output below.
 // Sequence: initialise Grid, configure the MinimumNorm2 integrator and the HMC
 // parameters, register the grid / checkpointer / RNG seeds / plaquette
 // observable, build the strange (EOFA) and light (Hasenbusch quotient chain)
 // pseudofermion actions plus the smearing Jacobian on Level1 and the Iwasaki
 // gauge action on Level2, then hand both levels to the HMC runner and run it.
 int main(int argc, char **argv)
 {
  std::cout << std::setprecision(12);
  Grid_init(&argc, &argv);
  int threads = GridThread::GetThreads();
  // here make a routine to print all the relevant information on the run
  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
   // Typedefs to simplify notation
  typedef WilsonImplR FermionImplPolicy;
  typedef MobiusFermionD FermionAction;
  typedef typename FermionAction::FermionField FermionField;
  // NOTE(review): Serialiser is not referenced anywhere else in this function.
  typedef Grid::XmlReader       Serialiser;
  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  IntegratorParameters MD;
  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;
  //  MD.name    = std::string("Leap Frog");
  //  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
  //  MD.name    = std::string("Force Gradient");
  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
  MD.name    = std::string("MinimumNorm2");
  MD.MDsteps = 24;
  MD.trajL   = 1.0;
  HMCparameters HMCparams;
  HMCparams.StartTrajectory  = 0;
  HMCparams.Trajectories     = 200;
  HMCparams.NoMetropolisUntil=  20;
  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
  //  HMCparams.StartingType     =std::string("HotStart");
  HMCparams.StartingType     =std::string("ColdStart");
  //  HMCparams.StartingType     =std::string("CheckpointStart");
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);
  // Grid from the command line arguments --grid and --mpi
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
  // Checkpointing: save every trajectory, including the smeared configuration
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_EODWF_lat";
  CPparams.smeared_prefix = "ckpoint_EODWF_lat_smr";
  CPparams.rng_prefix    = "ckpoint_EODWF_rng";
  CPparams.saveInterval  = 1;
  CPparams.saveSmeared   = true;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  // Fixed seeds for the serial and parallel RNGs
  RNGModuleParameters RNGpar;
  RNGpar.serial_seeds = "1 2 3 4 5";
  RNGpar.parallel_seeds = "6 7 8 9 10";
  TheHMC.Resources.SetRNGSeeds(RNGpar);
  // Construct observables
  // here there is too much indirection
  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
  // Action parameters
  const int Ls      = 12;
  Real beta         = 2.37;
  Real light_mass   = 0.0047;
  Real strange_mass = 0.0186;
  Real pv_mass      = 1.0;
  RealD M5  = 1.8;
  RealD b   = 1.0; // Scale factor one, Shamir
  RealD c   = 0.0;
  // Rational approximation settings passed to the EOFA strange action below
  OneFlavourRationalParams OFRp;
  OFRp.lo       = 1.0e-2;
  OFRp.hi       = 64;
  OFRp.MaxIter  = 10000;
  OFRp.tolerance= 1.0e-10;
  OFRp.degree   = 14;
  OFRp.precision= 40;
  // Intermediate masses for the light-quark quotient chain built below
  std::vector<Real> hasenbusch({ 0.05, 0.1, 0.25, 0.5 });
  auto GridPtr   = TheHMC.Resources.GetCartesian();
  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
  IwasakiGaugeActionR GaugeAction(beta);
  // temporarily need a gauge field
  LatticeGaugeField U(GridPtr);
  // NOTE(review): Uhot is constructed but not used anywhere below.
  LatticeGaugeField Uhot(GridPtr);
  // These lines are unnecessary if BC are all periodic
  std::vector<Complex> boundary = {1,1,1,-1};
  FermionAction::ImplParams Params(boundary);
  double StoppingCondition = 1e-10;
  double MaxCGIterations = 30000;
  // One CG solver instance, shared by every pseudofermion action below
  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
  bool ApplySmearing = true;
  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
  ActionLevel<HMCWrapper::Field> Level1(1);
  ActionLevel<HMCWrapper::Field> Level2(2);
  ////////////////////////////////////
  // Strange action
  ////////////////////////////////////
  MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
  MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c);
  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
    EOFA(Strange_Op_L, Strange_Op_R, 
 	 CG,
 	 CG, CG,
 	 CG, CG, 
 	 OFRp, false);
  EOFA.is_smeared = ApplySmearing;
  Level1.push_back(&EOFA);
  ////////////////////////////////////
  // up down action
  ////////////////////////////////////
  // Build the mass chain so that quotient h is light_num[h] / light_den[h]:
  //   light_den = { light_mass, hasenbusch[0..] }
  //   light_num = { hasenbusch[0..], pv_mass }
  std::vector<Real> light_den;
  std::vector<Real> light_num;
  int n_hasenbusch = hasenbusch.size();
  light_den.push_back(light_mass);
  for(int h=0;h<n_hasenbusch;h++){
    light_den.push_back(hasenbusch[h]);
    light_num.push_back(hasenbusch[h]);
  }
  light_num.push_back(pv_mass);
  std::vector<FermionAction *> Numerators;
  std::vector<FermionAction *> Denominators;
  std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
  // NOTE(review): the objects allocated with new in this loop are never
  // deleted in this function.
  for(int h=0;h<n_hasenbusch+1;h++){
    std::cout << GridLogMessage << " 2f quotient Action  "<< light_num[h] << " / " << light_den[h]<< std::endl;
    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params));
    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params));
    Quotients.push_back   (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],CG,CG));
  }
  for(int h=0;h<n_hasenbusch+1;h++){
    Quotients[h]->is_smeared = ApplySmearing;
    Level1.push_back(Quotients[h]);
  }
  /////////////////////////////////////////////////////////////
  // lnDetJacobianAction
  /////////////////////////////////////////////////////////////
  double rho = 0.1;  // smearing parameter
  int Nsmear = 1;    // number of smearing levels - must be multiple of 2Nd
  int Nstep  = 8*Nsmear;    // number of smearing levels - must be multiple of 2Nd
  Smear_Stout<HMCWrapper::ImplPolicy> Stout(rho);
  SmearedConfigurationMasked<HMCWrapper::ImplPolicy> SmearingPolicy(GridPtr, Nstep, Stout);
  JacobianAction<HMCWrapper::ImplPolicy> Jacobian(&SmearingPolicy);
  if( ApplySmearing ) Level1.push_back(&Jacobian);
  std::cout << GridLogMessage << " Built the Jacobian "<< std::endl;
  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  GaugeAction.is_smeared = ApplySmearing;
  Level2.push_back(&GaugeAction);
  // NOTE(review): the "NO FERMIONS" message below looks stale -- the EOFA and
  // two-flavour quotient actions were pushed onto Level1 above.
  std::cout << GridLogMessage << " ************************************************"<< std::endl;
  std::cout << GridLogMessage << " Action complete -- NO FERMIONS FOR NOW -- FIXME"<< std::endl;
  std::cout << GridLogMessage << " ************************************************"<< std::endl;
  std::cout << GridLogMessage <<  std::endl;
  std::cout << GridLogMessage <<  std::endl;
  std::cout << GridLogMessage << " Running the FT HMC "<< std::endl;
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file
  TheHMC.initializeGaugeFieldAndRNGs(U);
  TheHMC.Run(SmearingPolicy); // for smearing
  Grid_finalize();
 } // main
--- a/HMC/HMC2p1f_3GeV.cc
+++ b/HMC/HMC2p1f_3GeV.cc
@@ -1,226 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Copyright (C) 2023
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
 #include <Grid/qcd/smearing/JacobianAction.h>
 using namespace Grid;
 // Driver program: builds an HMC runner (MinimumNorm2 integrator), attaches a
 // NERSC checkpointer, RNG seeds and a plaquette observable, assembles the
 // action (strange EOFA term, a chain of two-flavour light-quark quotients,
 // optionally a smearing Jacobian, and an Iwasaki gauge action) and runs it.
 // argc/argv are forwarded to Grid_init and TheHMC.ReadCommandLine.
 // There is no explicit return statement; Grid_finalize() is the last call.
 int main(int argc, char **argv)
 {
  std::cout << std::setprecision(12);
  Grid_init(&argc, &argv);
  int threads = GridThread::GetThreads();
  // here make a routine to print all the relevant information on the run
  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
   // Typedefs to simplify notation
  typedef WilsonImplR FermionImplPolicy;
  typedef MobiusFermionD FermionAction;
  typedef typename FermionAction::FermionField FermionField;
  // NOTE(review): Serialiser is not referenced again in this function.
  typedef Grid::XmlReader       Serialiser;
  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  // Integrator choice: exactly one HMCWrapper typedef / MD.name pair is active;
  // the commented-out pairs are the alternatives.
  IntegratorParameters MD;
  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;
  //  MD.name    = std::string("Leap Frog");
  //  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
  //  MD.name    = std::string("Force Gradient");
  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
  MD.name    = std::string("MinimumNorm2");
  MD.MDsteps = 24;
  MD.trajL   = 1.0;
  HMCparameters HMCparams;
  HMCparams.StartTrajectory  = 0;
  HMCparams.Trajectories     = 200;
  HMCparams.NoMetropolisUntil=  20;
  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
  //  HMCparams.StartingType     =std::string("HotStart");
  HMCparams.StartingType     =std::string("ColdStart");
  //  HMCparams.StartingType     =std::string("CheckpointStart");
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);
  // Grid from the command line arguments --grid and --mpi
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
  // Checkpointing: file-name prefixes for configurations, smeared
  // configurations and RNG state; saveInterval is set to 1.
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_EODWF_lat";
  CPparams.smeared_prefix = "ckpoint_EODWF_lat_smr";
  CPparams.rng_prefix    = "ckpoint_EODWF_rng";
  CPparams.saveInterval  = 1;
  CPparams.saveSmeared   = true;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  RNGModuleParameters RNGpar;
  RNGpar.serial_seeds = "1 2 3 4 5";
  RNGpar.parallel_seeds = "6 7 8 9 10";
  TheHMC.Resources.SetRNGSeeds(RNGpar);
  // Construct observables
  // here there is too much indirection
  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
  // Hard-coded run parameters (fifth-dimension extent, coupling, masses,
  // Mobius coefficients).
  const int Ls      = 12;
  Real beta         = 2.37;
  Real light_mass   = 0.0047;
  Real strange_mass = 0.0186;
  Real pv_mass      = 1.0;
  RealD M5  = 1.8;
  RealD b   = 1.0; // Scale factor one, Shamir
  RealD c   = 0.0;
  // Rational-approximation parameters handed to the EOFA strange action below.
  OneFlavourRationalParams OFRp;
  OFRp.lo       = 1.0e-2;
  OFRp.hi       = 64;
  OFRp.MaxIter  = 10000;
  OFRp.tolerance= 1.0e-10;
  OFRp.degree   = 14;
  OFRp.precision= 40;
  // Intermediate masses used to split the light-quark determinant into quotients.
  std::vector<Real> hasenbusch({ 0.05, 0.1, 0.25, 0.5 });
  auto GridPtr   = TheHMC.Resources.GetCartesian();
  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
  IwasakiGaugeActionR GaugeAction(beta);
  // temporarily need a gauge field
  LatticeGaugeField U(GridPtr);
  // NOTE(review): Uhot is declared but never used in this function.
  LatticeGaugeField Uhot(GridPtr);
  // These lines are unnecessary if BC are all periodic
  std::vector<Complex> boundary = {1,1,1,-1};
  FermionAction::ImplParams Params(boundary);
  double StoppingCondition = 1e-10;
  // NOTE(review): an iteration count held in a double; presumably converted to
  // an integer by the ConjugateGradient constructor - confirm.
  double MaxCGIterations = 30000;
  // A single CG object is shared by every fermion action built below.
  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
  // Master switch: copied into is_smeared on every action and gates whether
  // the Jacobian action is added to Level1.
  bool ApplySmearing = false;
  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
  // NOTE(review): the constructor argument is presumably the integrator
  // step multiplier of the level - confirm against ActionLevel.
  ActionLevel<HMCWrapper::Field> Level1(1);
  ActionLevel<HMCWrapper::Field> Level2(2);
  ////////////////////////////////////
  // Strange action
  ////////////////////////////////////
  MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
  MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c);
  // All five solver slots of the EOFA action receive the same CG instance.
  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
    EOFA(Strange_Op_L, Strange_Op_R, 
 	 CG,
 	 CG, CG,
 	 CG, CG, 
 	 OFRp, false);
  EOFA.is_smeared = ApplySmearing;
  Level1.push_back(&EOFA);
  ////////////////////////////////////
  // up down action
  ////////////////////////////////////
  // Build the mass ladder so that quotient h pairs light_num[h] with light_den[h]:
  //   light_den = { light_mass, hasenbusch[0..n-1] }
  //   light_num = { hasenbusch[0..n-1], pv_mass }
  std::vector<Real> light_den;
  std::vector<Real> light_num;
  int n_hasenbusch = hasenbusch.size();
  light_den.push_back(light_mass);
  for(int h=0;h<n_hasenbusch;h++){
    light_den.push_back(hasenbusch[h]);
    light_num.push_back(hasenbusch[h]);
  }
  light_num.push_back(pv_mass);
  // The operators and actions below are allocated with new and are not
  // deleted anywhere in this function.
  std::vector<FermionAction *> Numerators;
  std::vector<FermionAction *> Denominators;
  std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
  for(int h=0;h<n_hasenbusch+1;h++){
    std::cout << GridLogMessage << " 2f quotient Action  "<< light_num[h] << " / " << light_den[h]<< std::endl;
    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params));
    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params));
    Quotients.push_back   (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],CG,CG));
  }
  // Every quotient goes on Level1, alongside the strange EOFA action.
  for(int h=0;h<n_hasenbusch+1;h++){
    Quotients[h]->is_smeared = ApplySmearing;
    Level1.push_back(Quotients[h]);
  }
  /////////////////////////////////////////////////////////////
  // lnDetJacobianAction
  /////////////////////////////////////////////////////////////
  double rho = 0.1;  // smearing parameter
  int Nsmear = 1;    // number of smearing levels - must be multiple of 2Nd
  // NOTE(review): the original comment here duplicated the Nsmear one.
  // Nstep is 8*Nsmear; presumably 8 == 2*Nd with Nd == 4 - confirm.
  int Nstep  = 8*Nsmear;    // number of smearing levels - must be multiple of 2Nd
  Smear_Stout<HMCWrapper::ImplPolicy> Stout(rho);
  SmearedConfigurationMasked<HMCWrapper::ImplPolicy> SmearingPolicy(GridPtr, Nstep, Stout);
  // The Jacobian action is always constructed but only added to Level1 when
  // ApplySmearing is true (it is false above).
  JacobianAction<HMCWrapper::ImplPolicy> Jacobian(&SmearingPolicy);
  if( ApplySmearing ) Level1.push_back(&Jacobian);
  std::cout << GridLogMessage << " Built the Jacobian "<< std::endl;
  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  GaugeAction.is_smeared = ApplySmearing;
  Level2.push_back(&GaugeAction);
  // NOTE(review): the banner below says "NO FERMIONS FOR NOW", yet Level1
  // already holds the EOFA action and the two-flavour quotients; the message
  // looks stale - confirm before relying on the log output.
  std::cout << GridLogMessage << " ************************************************"<< std::endl;
  std::cout << GridLogMessage << " Action complete -- NO FERMIONS FOR NOW -- FIXME"<< std::endl;
  std::cout << GridLogMessage << " ************************************************"<< std::endl;
  std::cout << GridLogMessage <<  std::endl;
  std::cout << GridLogMessage <<  std::endl;
  std::cout << GridLogMessage << " Running the FT HMC "<< std::endl;
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file
  TheHMC.initializeGaugeFieldAndRNGs(U);
  // SmearingPolicy is passed to Run() regardless of the ApplySmearing flag.
  TheHMC.Run(SmearingPolicy); // for smearing
  Grid_finalize();
 } // main
--- a/HMC/Mobius2p1f_DD_EOFA_96I_double.cc
+++ b/HMC/Mobius2p1f_DD_EOFA_96I_double.cc
@@ -1,350 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./HMC/Mobius2p1f_DD_EOFA_96I_double.cc
 Copyright (C) 2015-2016
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Guido Cossu <guido.cossu@ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 // Driver program: builds a Force Gradient HMC runner that starts from a
 // checkpoint, installs a DDHMCFilter momentum filter sized from the MPI /
 // shared-memory layout, and assembles a three-level action:
 //   Level1 - two rational "boundary" actions built from the h==0 light quotient
 //   Level2 - strange EOFA action plus the two-flavour light quotients
 //   Level3 - Iwasaki gauge action
 // argc/argv are forwarded to Grid_init and TheHMC.ReadCommandLine.
 // There is no explicit return statement; Grid_finalize() is the last call.
 int main(int argc, char **argv) {
  using namespace Grid;
  Grid_init(&argc, &argv);
  CartesianCommunicator::BarrierWorld();
  std::cout << GridLogMessage << " Clock skew check" <<std::endl;
  // NOTE(review): threads is not read again in this function.
  int threads = GridThread::GetThreads();
   // Typedefs to simplify notation
  typedef WilsonImplD FermionImplPolicy;
  typedef MobiusFermionD FermionAction;
  typedef MobiusEOFAFermionD FermionEOFAAction;
  typedef typename FermionAction::FermionField FermionField;
  // NOTE(review): Serialiser is not referenced again in this function.
  typedef Grid::XmlReader       Serialiser;
  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  // Integrator choice: exactly one HMCWrapper typedef / MD.name pair is active;
  // the commented-out pairs are the alternatives.
  IntegratorParameters MD;
  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;
  //  MD.name    = std::string("Leap Frog");
  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
  MD.name    = std::string("Force Gradient");
  //typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
  // MD.name    = std::string("MinimumNorm2");
  // TrajL = 2
  // 4/2 => 0.6 dH
  // 3/3 => 0.8 dH .. depth 3, slower
  //MD.MDsteps =  4;
  MD.MDsteps =  3;
  MD.trajL   = 0.5;
  HMCparameters HMCparams;
  HMCparams.StartTrajectory  = 1077;
  HMCparams.Trajectories     = 1;
  HMCparams.NoMetropolisUntil=  0;
  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
  //  HMCparams.StartingType     =std::string("ColdStart");
  HMCparams.StartingType     =std::string("CheckpointStart");
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);
  // Grid from the command line arguments --grid and --mpi
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
  // Checkpointing: file-name prefixes for configurations and RNG state;
  // saveInterval is set to 1.
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_DDHMC_lat";
  CPparams.rng_prefix    = "ckpoint_DDHMC_rng";
  CPparams.saveInterval  = 1;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  std::cout << "loaded NERSC checpointer"<<std::endl;
  RNGModuleParameters RNGpar;
  RNGpar.serial_seeds = "1 2 3 4 5";
  RNGpar.parallel_seeds = "6 7 8 9 10";
  TheHMC.Resources.SetRNGSeeds(RNGpar);
  // Construct observables
  // here there is too much indirection
  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
  // Hard-coded run parameters (fifth-dimension extent, Mobius coefficients,
  // coupling, masses).
  const int Ls      = 12;
  RealD M5  = 1.8;
  RealD b   = 1.5;
  RealD c   = 0.5;
  Real beta         = 2.13;
  //  Real light_mass   = 5.4e-4;
  Real light_mass     = 7.8e-4;
  // NOTE(review): light_mass_dir is not read again in this function.
  Real light_mass_dir = 0.01;
  Real strange_mass = 0.0362;
  Real pv_mass      = 1.0;
  // Active mass ladder; note that its last entry is pv_mass itself. The
  // commented-out lines are earlier alternatives.
  std::vector<Real> hasenbusch({ 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
  //  std::vector<Real> hasenbusch({ light_mass, 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
  //  std::vector<Real> hasenbusch({ light_mass, 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated
  //  std::vector<Real> hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass });
  int SP_iters=9000;
  // Rational-approximation parameters for the two actions placed in Bdys below.
  RationalActionParams OFRp; // Up/down
  OFRp.lo       = 6.0e-5;
  OFRp.hi       = 90.0;
  OFRp.inv_pow  = 2;
  OFRp.MaxIter  = SP_iters; // get most shifts by 2000, stop sharing space
  OFRp.action_tolerance= 1.0e-8;
  OFRp.action_degree   = 18;
  OFRp.md_tolerance= 1.0e-7;
  OFRp.md_degree   = 14;
  //  OFRp.degree   = 20; converges
  //  OFRp.degree   = 16;
  OFRp.precision= 80;
  OFRp.BoundsCheckFreq=0;
  // Per-pole tolerances passed to SetTolerances() on the Bdys actions.
  // 18 entries, matching OFRp.action_degree.
  std::vector<RealD> ActionTolByPole({
      //      1.0e-8,1.0e-8,1.0e-8,1.0e-8,
      3.0e-7,1.0e-7,1.0e-8,1.0e-8,
      1.0e-8,1.0e-8,1.0e-8,1.0e-8,
      1.0e-8,1.0e-8,1.0e-8,1.0e-8,
      1.0e-8,1.0e-8,1.0e-8,1.0e-8,
      1.0e-8,1.0e-8
    });
  // 14 entries, matching OFRp.md_degree.
  std::vector<RealD> MDTolByPole({
      //      1.6e-5,5.0e-6,1.0e-6,3.0e-7, // soften convergence more more
      //      1.0e-6,3.0e-7,1.0e-7,1.0e-7,
      1.0e-5,1.0e-6,1.0e-7,1.0e-7, // soften convergence
      1.0e-8,1.0e-8,1.0e-8,1.0e-8,
      1.0e-8,1.0e-8,1.0e-8,1.0e-8,
      1.0e-8,1.0e-8
    });
  auto GridPtr   = TheHMC.Resources.GetCartesian();
  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
  typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
  typedef SchurDiagMooeeOperator<FermionEOFAAction ,FermionField > LinearOperatorEOFAD;
  ////////////////////////////////////////////////////////////////
  // Domain decomposed
  ////////////////////////////////////////////////////////////////
  Coordinate latt4  = GridPtr->GlobalDimensions();
  Coordinate mpi    = GridPtr->ProcessorGrid();
  Coordinate shm;
  GlobalSharedMemory::GetShmDims(mpi,shm);
  // CommDim[d] is 1 when mpi[d]/shm[d] exceeds 1, otherwise 0.
  Coordinate CommDim(Nd);
  for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
  // Both coordinates have Nd+1 entries initialised to 0 (presumably one per
  // dimension of the five-dimensional grid - confirm).
  Coordinate NonDirichlet(Nd+1,0);
  Coordinate Dirichlet(Nd+1,0);
  // Entries 1..4 take the block extent latt4[d]/mpi[d]*shm[d] of 4d dimension
  // d = index-1, zeroed when CommDim[d] is 0. Entry 0 keeps its initial 0.
  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
  //Dirichlet[1] = 0;
  //Dirichlet[2] = 0;
  //Dirichlet[3] = 0;
  //
  // Four-dimensional copy of the block extents, handed to the momentum filter.
  Coordinate Block4(Nd);
  Block4[0] = Dirichlet[1];
  Block4[1] = Dirichlet[2];
  Block4[2] = Dirichlet[3];
  Block4[3] = Dirichlet[4];
  int Width=4;
  // The filter is allocated with new and not deleted in this function
  // (presumably owned by Resources afterwards - confirm).
  TheHMC.Resources.SetMomentumFilter(new DDHMCFilter<WilsonImplD::Field>(Block4,Width));
  //////////////////////////
  // Fermion Grids
  //////////////////////////
  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
  IwasakiGaugeActionR GaugeAction(beta);
  // temporarily need a gauge field
  LatticeGaugeFieldD  U(GridPtr); U=Zero();
  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
  // Command line is parsed and U initialised here, before any fermion
  // operator is constructed from U.
  TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file
  TheHMC.initializeGaugeFieldAndRNGs(U);
  std::cout << "loaded NERSC gauge field"<<std::endl;
  // These lines are unnecessary if BC are all periodic
  std::vector<Complex> boundary = {1,1,1,-1};
  FermionAction::ImplParams Params(boundary);
  // NOTE(review): ParamsDir is configured here but not passed to any operator
  // in this function; the loop below builds its own ParamsNum/ParamsDen.
  FermionAction::ImplParams ParamsDir(boundary);
  Params.dirichlet=NonDirichlet;
  ParamsDir.dirichlet=Dirichlet;
  ParamsDir.partialDirichlet=0;
  // dwf_compressor_depth is not declared in this function; presumably a
  // global provided by the Grid headers - confirm.
  std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;
  //  double StoppingCondition = 1e-14;
  //  double MDStoppingCondition = 1e-9;
  double StoppingCondition = 1e-8;
  double MDStoppingCondition = 1e-8;
  double MDStoppingConditionLoose = 1e-8;
  // NOTE(review): MDStoppingConditionStrange is not read again in this function.
  double MDStoppingConditionStrange = 1e-8;
  // NOTE(review): an iteration count held in a double; presumably converted to
  // an integer by the ConjugateGradient constructor - confirm.
  double MaxCGIterations = 300000;
  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
  ConjugateGradient<FermionField>  MDCG(MDStoppingCondition,MaxCGIterations);
  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
  // NOTE(review): the constructor argument is presumably the integrator
  // step multiplier of the level - confirm against ActionLevel.
  ActionLevel<HMCWrapper::Field> Level1(1);
  ActionLevel<HMCWrapper::Field> Level2(3);
  ActionLevel<HMCWrapper::Field> Level3(15);
  ////////////////////////////////////
  // Strange action
  ////////////////////////////////////
  // NOTE(review): StrangeOp and StrangePauliVillarsOp are constructed but not
  // referenced again in this function; the strange sector uses the EOFA
  // operators below.
  FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params);
  FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass,  M5,b,c, Params);
  // Probably dominates the force - back to EOFA.
  OneFlavourRationalParams SFRp;
  SFRp.lo       = 0.1;
  SFRp.hi       = 25.0;
  SFRp.MaxIter  = 10000;
  SFRp.tolerance= 1.0e-8;
  SFRp.mdtolerance= 2.0e-6;
  SFRp.degree   = 12;
  SFRp.precision= 50;
  MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
  MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c);
  ConjugateGradient<FermionField>      ActionCG(StoppingCondition,MaxCGIterations);
  ConjugateGradient<FermionField>  DerivativeCG(MDStoppingCondition,MaxCGIterations);
  // NOTE(review): Strange_LinOp_L / Strange_LinOp_R are not referenced again
  // in this function.
  LinearOperatorEOFAD Strange_LinOp_L (Strange_Op_L);
  LinearOperatorEOFAD Strange_LinOp_R (Strange_Op_R);
  // The first three solver slots use ActionCG, the last two DerivativeCG.
  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
    EOFA(Strange_Op_L, Strange_Op_R, 
 	 ActionCG, 
 	 ActionCG, ActionCG,
 	 DerivativeCG, DerivativeCG,
 	 SFRp, true);
  Level2.push_back(&EOFA);
  ////////////////////////////////////
  // up down action
  ////////////////////////////////////
  // Build the mass ladder so that quotient h pairs light_num[h] with light_den[h]:
  //   light_den = { light_mass, hasenbusch[0..n-1] }, dirichlet_den = { 0, 1, ..., 1 }
  //   light_num = { hasenbusch[0..n-1], pv_mass },    dirichlet_num = { 1, ..., 1, 0 }
  // The dirichlet_* flags select which ImplParams each operator receives.
  std::vector<Real> light_den;
  std::vector<Real> light_num;
  std::vector<int> dirichlet_den;
  std::vector<int> dirichlet_num;
  int n_hasenbusch = hasenbusch.size();
  light_den.push_back(light_mass);  dirichlet_den.push_back(0);
  for(int h=0;h<n_hasenbusch;h++){
    light_den.push_back(hasenbusch[h]); dirichlet_den.push_back(1);
  }
  for(int h=0;h<n_hasenbusch;h++){
    light_num.push_back(hasenbusch[h]); dirichlet_num.push_back(1);
  }
  light_num.push_back(pv_mass);  dirichlet_num.push_back(0);
  // The operators and actions below are allocated with new and are not
  // deleted anywhere in this function.
  std::vector<FermionAction *> Numerators;
  std::vector<FermionAction *> Denominators;
  std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
  std::vector<GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys;
  // Repeats the LinearOperatorD typedef already made earlier in this function.
  typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
  // NOTE(review): LinOpD is filled in the loop but never read afterwards.
  std::vector<LinearOperatorD *> LinOpD;
  for(int h=0;h<n_hasenbusch+1;h++){
    // Log the quotient; the printed order is denominator mass, then numerator mass.
    std::cout << GridLogMessage
 	      << " 2f quotient Action ";
    std::cout << "det D("<<light_den[h]<<")";
    if ( dirichlet_den[h] ) std::cout << "^dirichlet    ";
    std::cout << "/ det D("<<light_num[h]<<")";
    if ( dirichlet_num[h] ) std::cout << "^dirichlet    ";
    std::cout << std::endl;
    // Per-operator parameters: flagged operators get the Dirichlet block
    // coordinate with partialDirichlet = 1, the others NonDirichlet with 0.
    FermionAction::ImplParams ParamsNum(boundary);
    FermionAction::ImplParams ParamsDen(boundary);
    if ( dirichlet_num[h]==1) ParamsNum.dirichlet = Dirichlet;
    else                      ParamsNum.dirichlet = NonDirichlet;
    if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet;
    else                      ParamsDen.dirichlet = NonDirichlet;
    if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
    else                      ParamsNum.partialDirichlet = 0;
    if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
    else                      ParamsDen.partialDirichlet = 0;
    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));
    LinOpD.push_back(new LinearOperatorD(*Denominators[h]));
    // NOTE(review): conv is computed but never used below. Also h<3 covers
    // h = 0,1,2, whereas the trailing comment says "first two" - confirm intent.
    double conv  = MDStoppingCondition;
    if (h<3) conv= MDStoppingConditionLoose; // Relax on first two hasenbusch factors
    if(h!=0) {
      Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],MDCG,CG));
    } else {
      // h==0: two rational actions are built from the same operator pair
      // (OFRp.inv_pow is 2; presumably each represents one root of the
      // ratio - confirm).
      Bdys.push_back( new GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
      Bdys.push_back( new GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
    }
  }
  // Apply the per-pole tolerance tables to both rational actions.
  for(int h=0;h<Bdys.size();h++){
    Bdys[h]->SetTolerances(ActionTolByPole,MDTolByPole);
  }
  int nquo=Quotients.size();
  Level1.push_back(Bdys[0]);
  Level1.push_back(Bdys[1]);
  // First / middle / last pushes together add every quotient once, in order,
  // when nquo >= 2 (nquo is 6 with the ladder above).
  // NOTE(review): with nquo == 1 this pattern would push Quotients[0] twice,
  // and with nquo == 0 it would index an empty vector.
  Level2.push_back(Quotients[0]);
  for(int h=1;h<nquo-1;h++){
    Level2.push_back(Quotients[h]);
  }
  Level2.push_back(Quotients[nquo-1]);
  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  Level3.push_back(&GaugeAction);
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  TheHMC.TheAction.push_back(Level3);
  std::cout << GridLogMessage << " Action complete "<< std::endl;
  /////////////////////////////////////////////////////////////
  TheHMC.Run();  // no smearing
  Grid_finalize();
 } // main
--- a/HMC/Mobius2p1f_DD_EOFA_96I_mixed.cc
+++ b/HMC/Mobius2p1f_DD_EOFA_96I_mixed.cc
@@ -343,7 +343,7 @@ int main(int argc, char **argv) {
  // Probably dominates the force - back to EOFA.
  OneFlavourRationalParams SFRp;
  SFRp.lo       = 0.1;
-  SFRp.hi       = 30.0;
+  SFRp.hi       = 25.0;
  SFRp.MaxIter  = 10000;
  SFRp.tolerance= 1.0e-5;
  SFRp.mdtolerance= 2.0e-4;
--- a/HMC/Mobius2p1f_EOFA_96I_hmc.cc
+++ b/HMC/Mobius2p1f_EOFA_96I_hmc.cc
@@ -128,7 +128,7 @@ template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, c
      ////////////////////////////////////////////////////////////////////////////////////
      // Make a mixed precision conjugate gradient
      ////////////////////////////////////////////////////////////////////////////////////
-#if 0
+#if 1
      RealD delta=1.e-4;
      std::cout << GridLogMessage << "Calling reliable update Conjugate Gradient" <<std::endl;
      ConjugateGradientReliableUpdate<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations*MaxOuterIterations,delta,SinglePrecGrid5,LinOpF,LinOpD);
@@ -180,7 +180,7 @@ int main(int argc, char **argv) {
  // 4/2 => 0.6 dH
  // 3/3 => 0.8 dH .. depth 3, slower
  //MD.MDsteps =  4;
-  MD.MDsteps =  12;
+  MD.MDsteps =  14;
  MD.trajL   = 0.5;
  HMCparameters HMCparams;
@@ -204,7 +204,7 @@ int main(int argc, char **argv) {
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  std::cout << "loaded NERSC checpointer"<<std::endl;
  RNGModuleParameters RNGpar;
-  RNGpar.serial_seeds = "1 2 3 4 5 6 7 8 9 10";
+  RNGpar.serial_seeds = "1 2 3 4 5";
  RNGpar.parallel_seeds = "6 7 8 9 10";
  TheHMC.Resources.SetRNGSeeds(RNGpar);
@@ -218,14 +218,15 @@ int main(int argc, char **argv) {
  RealD M5  = 1.8;
  RealD b   = 1.5;
  RealD c   = 0.5;
-  RealD beta         = 2.13;
+  Real beta         = 2.13;
  //  Real light_mass   = 5.4e-4;
  Real light_mass     = 7.8e-4;
  //  Real light_mass     = 7.8e-3;
  Real strange_mass = 0.0362;
  Real pv_mass      = 1.0;
-  std::vector<Real> hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.35 , 0.51, 0.6, 0.8 }); // Updated
+  //  std::vector<Real> hasenbusch({ 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
-  //std::vector<Real> hasenbusch({ 0.0145, 0.045, 0.108, 0.25, 0.35 , 0.51, 0.6, 0.8 }); // Updated
+  //  std::vector<Real> hasenbusch({ light_mass, 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
  std::vector<Real> hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 }); // Updated
  //  std::vector<Real> hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass });
  auto GridPtr   = TheHMC.Resources.GetCartesian();
  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
@@ -276,20 +277,20 @@ int main(int argc, char **argv) {
  //  double StoppingCondition = 1e-14;
  //  double MDStoppingCondition = 1e-9;
-  double StoppingCondition = 1e-14;
+  double StoppingCondition = 1e-9;
-  double MDStoppingCondition = 1e-9;
+  double MDStoppingCondition = 1e-8;
-  double MDStoppingConditionLoose = 1e-9;
+  double MDStoppingConditionLoose = 1e-8;
-  double MDStoppingConditionStrange = 1e-9;
+  double MDStoppingConditionStrange = 1e-8;
-  double MaxCGIterations = 50000;
+  double MaxCGIterations = 300000;
  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
  ConjugateGradient<FermionField>  MDCG(MDStoppingCondition,MaxCGIterations);
  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
-  ActionLevel<HMCWrapper::Field> Level1(1);
+  //  ActionLevel<HMCWrapper::Field> Level1(1);
-  ActionLevel<HMCWrapper::Field> Level2(2);
+  ActionLevel<HMCWrapper::Field> Level2(1);
-  ActionLevel<HMCWrapper::Field> Level3(4);
+  ActionLevel<HMCWrapper::Field> Level3(15);
  ////////////////////////////////////
  // Strange action
@@ -299,11 +300,11 @@ int main(int argc, char **argv) {
  // Probably dominates the force - back to EOFA.
  OneFlavourRationalParams SFRp;
-  SFRp.lo       = 0.8;
+  SFRp.lo       = 0.1;
  SFRp.hi       = 30.0;
  SFRp.MaxIter  = 10000;
-  SFRp.tolerance= 1.0e-12;
+  SFRp.tolerance= 1.0e-8;
-  SFRp.mdtolerance= 1.0e-9;
+  SFRp.mdtolerance= 2.0e-6;
  SFRp.degree   = 10;
  SFRp.precision= 50;
@@ -354,10 +355,8 @@ int main(int argc, char **argv) {
  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
    EOFA(Strange_Op_L, Strange_Op_R, 
 	 ActionCG, 
-	 //	 ActionCGL, ActionCGR,
+	 ActionCGL, ActionCGR,
-	 //	 DerivativeCGL, DerivativeCGR,
+	 DerivativeCGL, DerivativeCGR,
 	 ActionCG, ActionCG,
 	 DerivativeCG, DerivativeCG,
 	 SFRp, true);
  Level2.push_back(&EOFA);
@@ -444,14 +443,13 @@ int main(int argc, char **argv) {
  }
  int nquo=Quotients.size();
  for(int h=0;h<nquo;h++){
-    Level1.push_back(Quotients[h]);
+    Level2.push_back(Quotients[h]);
  }
  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  Level3.push_back(&GaugeAction);
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  TheHMC.TheAction.push_back(Level3);
  std::cout << GridLogMessage << " Action complete "<< std::endl;
--- a/HMC/Mobius2p1f_EOFA_96I_hmc_double.cc
+++ b/HMC/Mobius2p1f_EOFA_96I_hmc_double.cc
@@ -1,268 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./HMC/Mobius2p1f_EOFA_96I_hmc_double.cc
 Copyright (C) 2015-2016
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Guido Cossu <guido.cossu@ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 int main(int argc, char **argv) {
  using namespace Grid;
  std::cout << " Grid Initialise "<<std::endl;
  Grid_init(&argc, &argv);
  CartesianCommunicator::BarrierWorld();
  std::cout << GridLogMessage << " Clock skew check" <<std::endl;
  int threads = GridThread::GetThreads();
   // Typedefs to simplify notation
  typedef WilsonImplD FermionImplPolicy;
  typedef MobiusFermionD FermionAction;
  typedef MobiusEOFAFermionD FermionEOFAAction;
  typedef typename FermionAction::FermionField FermionField;
  typedef WilsonImplF FermionImplPolicyF;
  typedef MobiusFermionF FermionActionF;
  typedef MobiusEOFAFermionF FermionEOFAActionF;
  typedef typename FermionActionF::FermionField FermionFieldF;
  typedef Grid::XmlReader       Serialiser;
  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  IntegratorParameters MD;
  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;
  //  MD.name    = std::string("Leap Frog");
  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
  MD.name    = std::string("Force Gradient");
  //  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
  //  MD.name    = std::string("MinimumNorm2");
  // TrajL = 2
  // 4/2 => 0.6 dH
  // 3/3 => 0.8 dH .. depth 3, slower
  //MD.MDsteps =  4;
  MD.MDsteps =  8;
  MD.trajL   = 0.5;
  HMCparameters HMCparams;
  HMCparams.StartTrajectory  = 1077;
  HMCparams.Trajectories     = 20;
  HMCparams.NoMetropolisUntil=  0;
  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
  HMCparams.StartingType     =std::string("ColdStart");
  //  HMCparams.StartingType     =std::string("CheckpointStart");
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);
  // Grid from the command line arguments --grid and --mpi
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_HMC_lat";
  CPparams.rng_prefix    = "ckpoint_HMC_rng";
  CPparams.saveInterval  = 1;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  std::cout << "loaded NERSC checpointer"<<std::endl;
  RNGModuleParameters RNGpar;
  RNGpar.serial_seeds = "1 2 3 4 5 6 7 8 9 10";
  RNGpar.parallel_seeds = "6 7 8 9 10";
  TheHMC.Resources.SetRNGSeeds(RNGpar);
  // Construct observables
  // here there is too much indirection
  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
  const int Ls      = 12;
  RealD M5  = 1.8;
  RealD b   = 1.5;
  RealD c   = 0.5;
  RealD beta         = 2.13;
  //  Real light_mass   = 5.4e-4;
  Real light_mass     = 7.8e-4;
  //  Real light_mass     = 7.8e-3;
  Real strange_mass = 0.0362;
  Real pv_mass      = 1.0;
  std::vector<Real> hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.35 , 0.51, 0.6, 0.8 }); // Updated
  //std::vector<Real> hasenbusch({ 0.0145, 0.045, 0.108, 0.25, 0.35 , 0.51, 0.6, 0.8 }); // Updated
  auto GridPtr   = TheHMC.Resources.GetCartesian();
  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
  typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
  typedef SchurDiagMooeeOperator<FermionEOFAAction ,FermionField > LinearOperatorEOFAD;
  ////////////////////////////////////////////////////////////////
  // Domain decomposed
  ////////////////////////////////////////////////////////////////
  Coordinate latt4  = GridPtr->GlobalDimensions();
  Coordinate mpi    = GridPtr->ProcessorGrid();
  Coordinate shm;
  GlobalSharedMemory::GetShmDims(mpi,shm);
  //////////////////////////
  // Fermion Grids
  //////////////////////////
  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
  IwasakiGaugeActionR GaugeAction(beta);
  // temporarily need a gauge field
  LatticeGaugeFieldD  U(GridPtr); U=Zero();
  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
  TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file
  TheHMC.initializeGaugeFieldAndRNGs(U);
  std::cout << "loaded NERSC gauge field"<<std::endl;
  // These lines are unecessary if BC are all periodic
  std::vector<Complex> boundary = {1,1,1,-1};
  FermionAction::ImplParams Params(boundary);
  //  double StoppingCondition = 1e-14;
  //  double MDStoppingCondition = 1e-9;
  double StoppingCondition = 1e-14;
  double MDStoppingCondition = 1e-9;
  double MDStoppingConditionLoose = 1e-9;
  double MDStoppingConditionStrange = 1e-9;
  double MaxCGIterations = 50000;
  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
  ConjugateGradient<FermionField>  MDCG(MDStoppingCondition,MaxCGIterations);
  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
  ActionLevel<HMCWrapper::Field> Level1(1);
  ActionLevel<HMCWrapper::Field> Level2(2);
  ActionLevel<HMCWrapper::Field> Level3(4);
  ////////////////////////////////////
  // Strange action
  ////////////////////////////////////
  FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params);
  FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass,  M5,b,c, Params);
  // Probably dominates the force - back to EOFA.
  OneFlavourRationalParams SFRp;
  SFRp.lo       = 0.8;
  SFRp.hi       = 30.0;
  SFRp.MaxIter  = 10000;
  SFRp.tolerance= 1.0e-12;
  SFRp.mdtolerance= 1.0e-9;
  SFRp.degree   = 10;
  SFRp.precision= 50;
  MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
  MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c);
  ConjugateGradient<FermionField>      ActionCG(StoppingCondition,MaxCGIterations);
  ConjugateGradient<FermionField>  DerivativeCG(MDStoppingCondition,MaxCGIterations);
  LinearOperatorEOFAD Strange_LinOp_L (Strange_Op_L);
  LinearOperatorEOFAD Strange_LinOp_R (Strange_Op_R);
  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
    EOFA(Strange_Op_L, Strange_Op_R, 
 	 ActionCG, 
 	 ActionCG, ActionCG,
 	 DerivativeCG, DerivativeCG,
 	 SFRp, true);
  Level2.push_back(&EOFA);
  ////////////////////////////////////
  // up down action
  ////////////////////////////////////
  std::vector<Real> light_den;
  std::vector<Real> light_num;
  int n_hasenbusch = hasenbusch.size();
  light_den.push_back(light_mass); 
  for(int h=0;h<n_hasenbusch;h++){
    light_den.push_back(hasenbusch[h]);
  }
  for(int h=0;h<n_hasenbusch;h++){
    light_num.push_back(hasenbusch[h]);
  }
  light_num.push_back(pv_mass);
  std::vector<FermionAction *> Numerators;
  std::vector<FermionAction *> Denominators;
  std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
  std::vector<OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys;
  typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
  std::vector<LinearOperatorD *> LinOpD;
  for(int h=0;h<n_hasenbusch+1;h++){
    std::cout << GridLogMessage
 	      << " 2f quotient Action ";
    std::cout << "det D("<<light_den[h]<<")";
    std::cout << "/ det D("<<light_num[h]<<")";
    std::cout << std::endl;
    FermionAction::ImplParams ParamsNum(boundary);
    FermionAction::ImplParams ParamsDen(boundary);
    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));
    LinOpD.push_back(new LinearOperatorD(*Denominators[h]));
    double conv  = MDStoppingCondition;
    if (h<3) conv= MDStoppingConditionLoose; // Relax on first two hasenbusch factors
    Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],MDCG,CG,CG));
  }
  int nquo=Quotients.size();
  for(int h=0;h<nquo;h++){
    Level1.push_back(Quotients[h]);
  }
  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  Level3.push_back(&GaugeAction);
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  TheHMC.TheAction.push_back(Level3);
  std::cout << GridLogMessage << " Action complete "<< std::endl;
  /////////////////////////////////////////////////////////////
  TheHMC.Run();  // no smearing
  Grid_finalize();
 } // main
--- a/HMC/Mobius2p1p1fEOFA_4Gev.cc
+++ b/HMC/Mobius2p1p1fEOFA_4Gev.cc
@@ -0,0 +1,637 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./HMC/Mobius2p1p1fEOFA_4Gev.cc
 Copyright (C) 2015-2016
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Guido Cossu
 Author: David Murphy
 Author: Chulwoo Jung <chulwoo@bnl.gov>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 #define MIXED_PRECISION
 #endif
 // second level EOFA
 #undef EOFA_H
 #undef USE_OBC
 #define DO_IMPLICIT
 NAMESPACE_BEGIN(Grid);
  /*
   * Need a plan for gauge field update for mixed precision in HMC                      (2x speed up)
   *    -- Store the single prec action operator.
   *    -- Clone the gauge field from the operator function argument.
   *    -- Build the mixed precision operator dynamically from the passed operator and single prec clone.
   */
  /*
   * OperatorFunction that solves with a mixed precision conjugate gradient.
   *
   * The object stores references to a double precision fermion operator / Schur operator
   * pair and to their single precision counterparts. On every call it refreshes the
   * single precision operator's gauge links from the double precision operator and then
   * runs MixedPrecisionConjugateGradient on (LinOpF, LinOpD).
   *
   * Template parameters:
   *   FermionOperatorD / FermionOperatorF : double / single precision fermion operators;
   *                                         must expose FermionField, GaugeLinkField, Umu
   *                                         (and UmuEven / UmuOdd for the F operator).
   *   SchurOperatorD  / SchurOperatorF    : linear operators wrapping the above; must expose _Mat.
   *
   * NOTE: InnerTolerance and OuterLoopNormMult are stored here but are not forwarded to
   * the MixedPrecisionConjugateGradient built in operator(); the Total*Iterations counters
   * are never updated by this class.
   */
  template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF> 
  class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
  public:
    typedef typename FermionOperatorD::FermionField FieldD;
    typedef typename FermionOperatorF::FermionField FieldF;
    using OperatorFunction<FieldD>::operator();
    RealD   Tolerance;
    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
    Integer MaxInnerIterations;
    Integer MaxOuterIterations;
    GridBase* SinglePrecGrid4; //Grid for single-precision fields
    GridBase* SinglePrecGrid5; //Grid for single-precision fields
    RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
    FermionOperatorF &FermOpF;
    FermionOperatorD &FermOpD;
    SchurOperatorF &LinOpF;
    SchurOperatorD &LinOpD;
    Integer TotalInnerIterations; //Number of inner CG iterations
    Integer TotalOuterIterations; //Number of restarts
    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
    // tol        : stopping tolerance (also used as the initial InnerTolerance)
    // maxinnerit : iteration cap handed to the mixed precision CG as MaxInnerIterations
    // maxouterit : iteration cap handed to the mixed precision CG as MaxOuterIterations
    // _sp_grid4  : single precision grid on which the gauge links are converted
    // _sp_grid5  : single precision grid handed to the mixed precision CG
    // _FermOpF/_FermOpD, _LinOpF/_LinOpD : operators held by reference; they must outlive this object
    MixedPrecisionConjugateGradientOperatorFunction(RealD tol, 
                                                    Integer maxinnerit, 
                                                    Integer maxouterit, 
                                                    GridBase* _sp_grid4, 
                                                    GridBase* _sp_grid5, 
                                                    FermionOperatorF &_FermOpF,
                                                    FermionOperatorD &_FermOpD,
                                                    SchurOperatorF   &_LinOpF,
                                                    SchurOperatorD   &_LinOpD): 
      // Initialisers are listed in member declaration order (the order in which they run)
      Tolerance(tol), 
      InnerTolerance(tol), 
      MaxInnerIterations(maxinnerit), 
      MaxOuterIterations(maxouterit), 
      SinglePrecGrid4(_sp_grid4),
      SinglePrecGrid5(_sp_grid5),
      OuterLoopNormMult(100.),
      FermOpF(_FermOpF),
      FermOpD(_FermOpD),
      LinOpF(_LinOpF),
      LinOpD(_LinOpD),
      TotalInnerIterations(0),
      TotalOuterIterations(0),
      TotalFinalStepIterations(0)
    { 
      /* Debugging instances of objects; references are stored
      std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpF " <<std::hex<< &LinOpF<<std::dec <<std::endl;
      std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpD " <<std::hex<< &LinOpD<<std::dec <<std::endl;
      std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpF " <<std::hex<< &FermOpF<<std::dec <<std::endl;
      std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpD " <<std::hex<< &FermOpD<<std::dec <<std::endl;
      */
    };
    // Solve for psi given src.
    // LinOpU must be the SchurOperatorD wrapping the same fermion operator as the stored
    // LinOpD (asserted below); it is used only for that consistency check.
    void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;
      // Unchecked downcast: the caller is trusted to pass a SchurOperatorD
      SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
      //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpU " <<std::hex<< &(SchurOpU->_Mat)<<std::dec <<std::endl;
      //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpD " <<std::hex<< &(LinOpD._Mat) <<std::dec <<std::endl;
      // Assumption made in code to extract gauge field
      // We could avoid storing LinopD reference altogether ?
      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
      ////////////////////////////////////////////////////////////////////////////////////
      // Must snarf a single precision copy of the gauge field in Linop_d argument
      ////////////////////////////////////////////////////////////////////////////////////
      typedef typename FermionOperatorF::GaugeLinkField GaugeLinkFieldF;
      typedef typename FermionOperatorD::GaugeLinkField GaugeLinkFieldD;
      GridBase * GridPtrF = SinglePrecGrid4;
      GridBase * GridPtrD = FermOpD.Umu.Grid();
      GaugeLinkFieldF Umu_f(GridPtrF);
      //      std::cout << " Dim gauge field "<<GridPtrF->Nd()<<std::endl; // 4d
      //      std::cout << " Dim gauge field "<<GridPtrD->Nd()<<std::endl; // 4d
      ////////////////////////////////////////////////////////////////////////////////////
      // Moving this to a Clone method of fermion operator would allow to duplicate the 
      // physics parameters and decrease gauge field copies
      ////////////////////////////////////////////////////////////////////////////////////
      GaugeLinkFieldD Umu_d(GridPtrD);
      // Convert link by link: Nd*2 Lorentz components of Umu are copied
      // (presumably the forward and backward links of the doubled field -- confirm)
      for(int mu=0;mu<Nd*2;mu++){ 
        Umu_d = PeekIndex<LorentzIndex>(FermOpD.Umu, mu);
        precisionChange(Umu_f,Umu_d);
        PokeIndex<LorentzIndex>(FermOpF.Umu, Umu_f, mu);
      }
      // Keep the checkerboarded copies of the single precision links in sync
      pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
      pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
      ////////////////////////////////////////////////////////////////////////////////////
      // Make a mixed precision conjugate gradient
      ////////////////////////////////////////////////////////////////////////////////////
      MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
      std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
      MPCG(src,psi);
    }
  };
 NAMESPACE_END(Grid);
 int main(int argc, char **argv) {
  using namespace Grid;
  Grid_init(&argc, &argv);
  int threads = GridThread::GetThreads();
  // here make a routine to print all the relevant information on the run
  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
   // Typedefs to simplify notation
  typedef WilsonImplR FermionImplPolicy;
  typedef MobiusFermionD FermionAction;
  typedef MobiusFermionF FermionActionF;
  typedef MobiusEOFAFermionD FermionEOFAAction;
  typedef MobiusEOFAFermionF FermionEOFAActionF;
  typedef typename FermionAction::FermionField FermionField;
  typedef typename FermionActionF::FermionField FermionFieldF;
  typedef Grid::XmlReader       Serialiser;
  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  HMCparameters HMCparams;
 #if 1
  {
    XmlReader  HMCrd("HMCparameters.xml");
    read(HMCrd,"HMCparameters",HMCparams);
  }
 #else
  {
 //    HMCparameters HMCparams;
  //  "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
  //  HMCparams.StartingType     =std::string("ColdStart");
    HMCparams.StartingType     =std::string("CheckpointStart");
    HMCparams.StartTrajectory  =7;
    HMCparams.SW  =4;
    HMCparams.Trajectories     =1000;
    HMCparams.NoMetropolisUntil=0;
    HMCparams.MD.name          =std::string("Force Gradient");
    HMCparams.MD.MDsteps       = 10;
    HMCparams.MD.trajL         = 1.0;
  }
 #endif
 #ifdef DO_IMPLICIT
 //    typedef GenericHMCRunner<ImplicitLeapFrog> HMCWrapper; 
  typedef GenericHMCRunner<ImplicitMinimumNorm2> HMCWrapper; 
  HMCparams.MD.name          =std::string("ImplicitMinimumNorm2");
 #else
 //  typedef GenericHMCRunner<LeapFrog> HMCWrapper; 
  typedef GenericHMCRunner<ForceGradient> HMCWrapper; 
 //  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper; 
  HMCparams.MD.name          =std::string("ForceGradient");
 #endif
  std::cout << GridLogMessage<< HMCparams <<std::endl;
  HMCWrapper TheHMC(HMCparams);
  TheHMC.ReadCommandLine(argc, argv);
  { 
    XmlWriter HMCwr("HMCparameters.xml.out");
    write(HMCwr,"HMCparameters",TheHMC.Parameters);
  }
  // Grid from the command line arguments --grid and --mpi
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_lat";
  CPparams.rng_prefix    = "ckpoint_rng";
  CPparams.saveInterval  = 1;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  RNGModuleParameters RNGpar;
  RNGpar.serial_seeds = "1 2 3 4 5";
  RNGpar.parallel_seeds = "6 7 8 9 10";
  TheHMC.Resources.SetRNGSeeds(RNGpar);
  // Construct observables
  // here there is too much indirection 
  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
  const int Ls      = 12;
  Real beta         = 5.983;
  std::cout << GridLogMessage << " beta  "<< beta << std::endl;
  Real light_mass   = 0.00049;
  Real strange_mass = 0.0158;
  Real charm_mass = 0.191;
  Real pv_mass    = 1.0;
  RealD M5  = 1.4;
  RealD b   = 2.0; 
  RealD c   = 1.0;
  // Copied from paper
 //  std::vector<Real> hasenbusch({ 0.045 }); // Paper values from F1 incorrect run
  std::vector<Real> hasenbusch({ 0.0038, 0.0145, 0.045, 0.108 , 0.25, 0.51 }); // Paper values from F1 incorrect run
  std::vector<Real> hasenbusch2({ 0.4 }); // Paper values from F1 incorrect run
 //  RealD eofa_mass=0.05 ;
  ///////////////////////////////////////////////////////////////////////////////////////////////
  //Bad choices with large dH. Equalising force L2 norm was not wise.
  ///////////////////////////////////////////////////////////////////////////////////////////////
  //std::vector<Real> hasenbusch({ 0.03, 0.2, 0.3, 0.5, 0.8 }); 
  auto GridPtr   = TheHMC.Resources.GetCartesian();
  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
  Coordinate latt  = GridDefaultLatt();
  Coordinate mpi   = GridDefaultMpi();
  Coordinate simdF = GridDefaultSimd(Nd,vComplexF::Nsimd());
  Coordinate simdD = GridDefaultSimd(Nd,vComplexD::Nsimd());
 //  auto GridPtrF   = SpaceTimeGrid::makeFourDimGrid(latt,simdF,mpi);
  auto UGrid_f    = SpaceTimeGrid::makeFourDimGrid(latt,simdF,mpi);
  auto GridRBPtrF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid_f);
  auto FGridF     = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid_f);
  auto FrbGridF   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid_f);
 #ifndef USE_OBC
 //  IwasakiGaugeActionR GaugeAction(beta);
  WilsonGaugeActionR GaugeAction(beta);
 #else
  std::vector<Complex> boundaryG = {1,1,1,0};
  WilsonGaugeActionR::ImplParams ParamsG(boundaryG);
  WilsonGaugeActionR GaugeAction(beta,ParamsG);
 #endif
  // temporarily need a gauge field
  LatticeGaugeField U(GridPtr);
  LatticeGaugeFieldF UF(UGrid_f);
  // These lines are unecessary if BC are all periodic
 #ifndef USE_OBC
  std::vector<Complex> boundary = {1,1,1,-1};
 #else
  std::vector<Complex> boundary = {1,1,1,0};
 #endif
  FermionAction::ImplParams Params(boundary);
  FermionActionF::ImplParams ParamsF(boundary);
  double ActionStoppingCondition     = 1e-8;
  double DerivativeStoppingCondition = 1e-8;
  double MaxCGIterations =  100000;
  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
  ActionLevel<HMCWrapper::Field> Level1(1);
  ActionLevel<HMCWrapper::Field> Level2(HMCparams.SW);
  ////////////////////////////////////
  // Strange action
  ////////////////////////////////////
  typedef SchurDiagMooeeOperator<FermionActionF,FermionFieldF> LinearOperatorF;
  typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
  typedef SchurDiagMooeeOperator<FermionEOFAActionF,FermionFieldF> LinearOperatorEOFAF;
  typedef SchurDiagMooeeOperator<FermionEOFAAction ,FermionField > LinearOperatorEOFAD;
  typedef MixedPrecisionConjugateGradientOperatorFunction<MobiusFermionD,MobiusFermionF,LinearOperatorD,LinearOperatorF> MxPCG;
  typedef MixedPrecisionConjugateGradientOperatorFunction<MobiusEOFAFermionD,MobiusEOFAFermionF,LinearOperatorEOFAD,LinearOperatorEOFAF> MxPCG_EOFA;
  // DJM: setup for EOFA ratio (Mobius)
  OneFlavourRationalParams OFRp;
  OFRp.lo       = 0.99; // How do I know this on F1?
  OFRp.hi       = 20;
  OFRp.MaxIter  = 100000;
  OFRp.tolerance= 1.0e-12;
  OFRp.degree   = 12;
  OFRp.precision= 50;
  MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, charm_mass, 0.0, -1, M5, b, c);
  MobiusEOFAFermionF Strange_Op_LF(UF, *FGridF, *FrbGridF, *UGrid_f, *GridRBPtrF, strange_mass, strange_mass, charm_mass, 0.0, -1, M5, b, c);
  MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , charm_mass, strange_mass,      charm_mass, -1.0, 1, M5, b, c);
  MobiusEOFAFermionF Strange_Op_RF(UF, *FGridF, *FrbGridF, *UGrid_f, *GridRBPtrF, charm_mass, strange_mass,      charm_mass, -1.0, 1, M5, b, c);
 #ifdef EOFA_H
  MobiusEOFAFermionD Strange2_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , eofa_mass, eofa_mass, charm_mass , 0.0, -1, M5, b, c);
  MobiusEOFAFermionF Strange2_Op_LF(UF, *FGridF, *FrbGridF, *UGrid_f, *GridRBPtrF, eofa_mass, eofa_mass, charm_mass , 0.0, -1, M5, b, c);
  MobiusEOFAFermionD Strange2_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , charm_mass , eofa_mass,      charm_mass , -1.0, 1, M5, b, c);
  MobiusEOFAFermionF Strange2_Op_RF(UF, *FGridF, *FrbGridF, *UGrid_f, *GridRBPtrF, charm_mass , eofa_mass,      charm_mass , -1.0, 1, M5, b, c);
 #endif
  ConjugateGradient<FermionField>      ActionCG(ActionStoppingCondition,MaxCGIterations);
  ConjugateGradient<FermionField>  DerivativeCG(DerivativeStoppingCondition,MaxCGIterations);
 #ifdef MIXED_PRECISION
  const int MX_inner = 50000;
  // Mixed precision EOFA
  LinearOperatorEOFAD Strange_LinOp_L (Strange_Op_L);
  LinearOperatorEOFAD Strange_LinOp_R (Strange_Op_R);
  LinearOperatorEOFAF Strange_LinOp_LF(Strange_Op_LF);
  LinearOperatorEOFAF Strange_LinOp_RF(Strange_Op_RF);
 #ifdef EOFA_H
  // Mixed precision EOFA
  LinearOperatorEOFAD Strange2_LinOp_L (Strange2_Op_L);
  LinearOperatorEOFAD Strange2_LinOp_R (Strange2_Op_R);
  LinearOperatorEOFAF Strange2_LinOp_LF(Strange2_Op_LF);
  LinearOperatorEOFAF Strange2_LinOp_RF(Strange2_Op_RF);
 #endif
  MxPCG_EOFA ActionCGL(ActionStoppingCondition,
 		       MX_inner,
 		       MaxCGIterations,
 		       UGrid_f,
 		       FrbGridF,
 		       Strange_Op_LF,Strange_Op_L,
 		       Strange_LinOp_LF,Strange_LinOp_L);
 #ifdef EOFA_H
  MxPCG_EOFA ActionCGL2(ActionStoppingCondition,
 		       MX_inner,
 		       MaxCGIterations,
 		       UGrid_f,
 		       FrbGridF,
 		       Strange2_Op_LF,Strange2_Op_L,
 		       Strange2_LinOp_LF,Strange2_LinOp_L);
 #endif
  MxPCG_EOFA DerivativeCGL(DerivativeStoppingCondition,
 			   MX_inner,
 			   MaxCGIterations,
 			   UGrid_f,
 			   FrbGridF,
 			   Strange_Op_LF,Strange_Op_L,
 			   Strange_LinOp_LF,Strange_LinOp_L);
 #ifdef EOFA_H
  MxPCG_EOFA DerivativeCGL2(DerivativeStoppingCondition,
 			   MX_inner,
 			   MaxCGIterations,
 			   UGrid_f,
 			   FrbGridF,
 			   Strange2_Op_LF,Strange2_Op_L,
 			   Strange2_LinOp_LF,Strange2_LinOp_L);
 #endif
  MxPCG_EOFA ActionCGR(ActionStoppingCondition,
 		       MX_inner,
 		       MaxCGIterations,
 		       UGrid_f,
 		       FrbGridF,
 		       Strange_Op_RF,Strange_Op_R,
 		       Strange_LinOp_RF,Strange_LinOp_R);
 #ifdef EOFA_H
  MxPCG_EOFA ActionCGR2(ActionStoppingCondition,
 		       MX_inner,
 		       MaxCGIterations,
 		       UGrid_f,
 		       FrbGridF,
 		       Strange2_Op_RF,Strange2_Op_R,
 		       Strange2_LinOp_RF,Strange2_LinOp_R);
 #endif
  MxPCG_EOFA DerivativeCGR(DerivativeStoppingCondition,
 			   MX_inner,
 			   MaxCGIterations,
 			   UGrid_f,
 			   FrbGridF,
 			   Strange_Op_RF,Strange_Op_R,
 			   Strange_LinOp_RF,Strange_LinOp_R);
 #ifdef EOFA_H
  MxPCG_EOFA DerivativeCGR2(DerivativeStoppingCondition,
 			   MX_inner,
 			   MaxCGIterations,
 			   UGrid_f,
 			   FrbGridF,
 			   Strange2_Op_RF,Strange2_Op_R,
 			   Strange2_LinOp_RF,Strange2_LinOp_R);
 #endif
  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
    EOFA(Strange_Op_L, Strange_Op_R, 
 	 ActionCG, 
 	 ActionCGL, ActionCGR,
 	 DerivativeCGL, DerivativeCGR,
 	 OFRp, true);
 #ifdef EOFA_H
  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
    EOFA2(Strange2_Op_L, Strange2_Op_R, 
 	 ActionCG, 
 	 ActionCGL2, ActionCGR2,
 	 DerivativeCGL2, DerivativeCGR2,
 	 OFRp, true);
 #endif
  Level1.push_back(&EOFA);
 #ifdef EOFA_H
  Level1.push_back(&EOFA2);
 #endif
 #else
  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
    EOFA(Strange_Op_L, Strange_Op_R, 
 	 ActionCG, 
 	 ActionCG, ActionCG,
 	 ActionCG, ActionCG,
 	 //         DerivativeCG, DerivativeCG,
 	 OFRp, true);
  Level1.push_back(&EOFA);
 #endif
  ////////////////////////////////////
  // up down action
  ////////////////////////////////////
  std::vector<Real> light_den;
  std::vector<Real> light_num;
  int n_hasenbusch = hasenbusch.size();
  light_den.push_back(light_mass);
  for(int h=0;h<n_hasenbusch;h++){
    light_den.push_back(hasenbusch[h]);
    light_num.push_back(hasenbusch[h]);
  }
  light_num.push_back(pv_mass);
  int n_hasenbusch2 = hasenbusch2.size();
  light_den.push_back(charm_mass);
  for(int h=0;h<n_hasenbusch2;h++){
    light_den.push_back(hasenbusch2[h]);
    light_num.push_back(hasenbusch2[h]);
  }
  light_num.push_back(pv_mass);
  //////////////////////////////////////////////////////////////
  // Forced to replicate the MxPCG and DenominatorsF etc.. because
  // there is no convenient way to "Clone" physics params from double op
  // into single op for any operator pair.
  // Same issue prevents using MxPCG in the Heatbath step
  //////////////////////////////////////////////////////////////
  std::vector<FermionAction *> Numerators;
  std::vector<FermionAction *> Denominators;
  std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
  std::vector<MxPCG *> ActionMPCG;
  std::vector<MxPCG *> MPCG;
  std::vector<FermionActionF *> DenominatorsF;
  std::vector<LinearOperatorD *> LinOpD;
  std::vector<LinearOperatorF *> LinOpF; 
  for(int h=0;h<light_den.size();h++){
    std::cout << GridLogMessage << " 2f quotient Action  "<< light_num[h] << " / " << light_den[h]<< std::endl;
    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params));
    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params));
 #ifdef MIXED_PRECISION
    ////////////////////////////////////////////////////////////////////////////
    // Mixed precision CG for 2f force
    ////////////////////////////////////////////////////////////////////////////
    double DerivativeStoppingConditionLoose = 1e-8;
    DenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*UGrid_f,*GridRBPtrF,light_den[h],M5,b,c, ParamsF));
    LinOpD.push_back(new LinearOperatorD(*Denominators[h]));
    LinOpF.push_back(new LinearOperatorF(*DenominatorsF[h]));
    double conv  = DerivativeStoppingCondition;
    if (h<3) conv= DerivativeStoppingConditionLoose; // Relax on first two hasenbusch factors
    MPCG.push_back(new MxPCG(conv,
 			     MX_inner,
 			     MaxCGIterations,
 			     UGrid_f,
 			     FrbGridF,
 			     *DenominatorsF[h],*Denominators[h],
 			     *LinOpF[h], *LinOpD[h]) );
    ActionMPCG.push_back(new MxPCG(ActionStoppingCondition,
 				   MX_inner,
 				   MaxCGIterations,
 				   UGrid_f,
 				   FrbGridF,
 				   *DenominatorsF[h],*Denominators[h],
 				   *LinOpF[h], *LinOpD[h]) );
    // Heatbath not mixed yet. As inverts numerators not so important as raised mass.
    Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],*MPCG[h],*ActionMPCG[h],ActionCG));
 #else
    ////////////////////////////////////////////////////////////////////////////
    // Standard CG for 2f force
    ////////////////////////////////////////////////////////////////////////////
    Quotients.push_back   (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],DerivativeCG,ActionCG));
 #endif
  }
  for(int h=0;h<n_hasenbusch+1;h++){
    Level1.push_back(Quotients[h]);
  }
  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  Level2.push_back(&GaugeAction);
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  std::cout << GridLogMessage << " Action complete "<< std::endl;
  /////////////////////////////////////////////////////////////
  // HMC parameters are serialisable
  NoSmearing<HMCWrapper::ImplPolicy> S;
 #ifndef DO_IMPLICIT
  TrivialMetric<HMCWrapper::ImplPolicy::Field> Mtr;
 #else
    LaplacianRatParams gpar(2),mpar(2);
    gpar.offset = 1.;
    gpar.a0[0] = 500.;
    gpar.a1[0] = 0.;
    gpar.b0[0] = 0.25;
    gpar.b1[0] = 1.;
    gpar.a0[1] = -500.;
    gpar.a1[1] = 0.;
    gpar.b0[1] = 0.36;
    gpar.b1[1] = 1.2;
    gpar.b2=1.;
    mpar.offset = 1.;
    mpar.a0[0] =  -0.850891906532;
    mpar.a1[0] = -1.54707654538;
    mpar. b0[0] = 2.85557166137;
    mpar. b1[0] = 5.74194794773;
    mpar.a0[1] = -13.5120056831218384729709214298;
    mpar.a1[1] = 1.54707654538396877086370295729;
    mpar.b0[1] = 19.2921090880640520026645390317;
    mpar.b1[1] = -3.54194794773029020262811172870;
    mpar.b2=1.;
    for(int i=0;i<2;i++){
       gpar.a1[i] *=16.;
       gpar.b1[i] *=16.;
       mpar.a1[i] *=16.;
       mpar.b1[i] *=16.;
    }
    gpar.b2 *= 16.*16.;
    mpar.b2 *= 16.*16.;
    ConjugateGradient<LatticeGaugeField> CG(1.0e-8,10000);
    LaplacianParams LapPar(0.0001, 1.0, 10000, 1e-8, 12, 64);
    std::cout << GridLogMessage << "LaplacianRat " << std::endl;
    gpar.tolerance=HMCparams.MD.RMHMCCGTol;
    mpar.tolerance=HMCparams.MD.RMHMCCGTol;
    std::cout << GridLogMessage << "gpar offset= " << gpar.offset <<std::endl;
    std::cout << GridLogMessage << " a0= " << gpar.a0 <<std::endl;
    std::cout << GridLogMessage << " a1= " << gpar.a1 <<std::endl;
    std::cout << GridLogMessage << " b0= " << gpar.b0 <<std::endl;
    std::cout << GridLogMessage << " b1= " << gpar.b1 <<std::endl;
    std::cout << GridLogMessage << " b2= " << gpar.b2 <<std::endl ;;
    std::cout << GridLogMessage << "mpar offset= " << mpar.offset <<std::endl;
    std::cout << GridLogMessage << " a0= " << mpar.a0 <<std::endl;
    std::cout << GridLogMessage << " a1= " << mpar.a1 <<std::endl;
    std::cout << GridLogMessage << " b0= " << mpar.b0 <<std::endl;
    std::cout << GridLogMessage << " b1= " << mpar.b1 <<std::endl;
    std::cout << GridLogMessage << " b2= " << mpar.b2 <<std::endl;
 //  Assumes PeriodicGimplR or D at the moment
    auto UGrid = TheHMC.Resources.GetCartesian("gauge");
 //    auto UGrid_f   = GridPtrF;
 //  auto GridPtrF   = SpaceTimeGrid::makeFourDimGrid(latt,simdF,mpi);
 //    std::cout << GridLogMessage << " UGrid= " << UGrid <<std::endl;
 //    std::cout << GridLogMessage << " UGrid_f= " << UGrid_f <<std::endl;
    LaplacianAdjointRat<HMCWrapper::ImplPolicy, PeriodicGimplF> Mtr(UGrid, UGrid_f ,CG, gpar, mpar);
 #endif
  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
  TheHMC.Run(S,Mtr);  // no smearing
  Grid_finalize();
 } // main
--- a/MPI_benchmark/bench2.pbs
+++ b/MPI_benchmark/bench2.pbs
@@ -1,22 +0,0 @@
 #!/bin/bash
 #PBS -q EarlyAppAccess
 #PBS -l select=2
 #PBS -l walltime=01:00:00
 #PBS -A LatticeQCD_aesp_CNDA
 # Two-node PBS job that runs the halo_mpi benchmark with one rank per node,
 # each rank launched through gpu_tile_compact.sh.
 export TZ='/usr/share/zoneinfo/US/Central'
 export OMP_PROC_BIND=spread
 export OMP_NUM_THREADS=3
 unset OMP_PLACES
 # Run from the submission directory; stop if it is not reachable so the
 # benchmark is never started from the wrong place.
 cd "$PBS_O_WORKDIR" || exit 1
 # NOTE: the values below are computed for reference only; CMD hard-codes
 # "-np 2 -ppn 1" to match "--mpi 2.1.1.1".
 NNODES=$(wc -l < "$PBS_NODEFILE")
 NRANKS=12         # Number of MPI ranks per node
 NDEPTH=4          # Number of hardware threads per rank, spacing between MPI ranks on a node
 NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS
 NTOTRANKS=$(( NNODES * NRANKS ))
 CMD="mpiexec -np 2 -ppn 1  -envall ./gpu_tile_compact.sh ./halo_mpi --mpi 2.1.1.1"
 # Deliberately unquoted: CMD must be word-split into the command and its arguments
 $CMD
--- a/MPI_benchmark/compile-command
+++ b/MPI_benchmark/compile-command
@@ -1 +0,0 @@
 mpicxx  -fsycl halo_mpi.cc -o halo_mpi
--- a/MPI_benchmark/gpu_tile_compact.sh
+++ b/MPI_benchmark/gpu_tile_compact.sh
@@ -1,30 +0,0 @@
 #!/bin/bash
 # Per-rank binding wrapper: selects a GPU tile and NUMA placement from the
 # node-local rank id ($PALS_LOCAL_RANKID), then runs the command given as arguments.
 # Each table below has 12 entries and is indexed by $PALS_LOCAL_RANKID.
 export NUMA_PMAP=(2 2 2 3 3 3 2 2 2 3 3 3 )
 export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
 export  GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )
 export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
 # Look up this rank's entries
 export PNUMA=${NUMA_PMAP[$PALS_LOCAL_RANKID]}
 export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
 export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
 export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}
 # Affinity mask is "<gpu>.<tile>" for this rank
 export ZE_AFFINITY_MASK=$gpu_id.$tile_id
 export ONEAPI_DEVICE_FILTER=gpu,level_zero
 # Alternative runtime settings, kept for reference (all disabled)
 #unset EnableWalkerPartition
 #export EnableImplicitScaling=0
 #export GRID_MPICH_NIC_BIND=$NIC
 #export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id
 #export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
 #export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
 #export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 #export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
 #export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
 #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
 echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
 # numactl: -m takes the memory node ($PNUMA), -N the CPU node ($NUMA); "$@" is the wrapped command
 numactl -m $PNUMA -N $NUMA  "$@"
--- a/MPI_benchmark/halo_mpi.cc
+++ b/MPI_benchmark/halo_mpi.cc
@@ -1,333 +0,0 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <strings.h>
 #include <sys/time.h>
 #include <algorithm>
 #include <array>
 #include <cassert>
 #include <cctype>
 #include <complex>
 #include <cstdint>
 #include <ctime>
 #include <memory>
 #include <sstream>
 #include <string>
 #include <vector>
 #include <mpi.h>
 /**************************************************************
 * GPU - GPU memory cartesian halo exchange benchmark
 * Config: what is the target
 **************************************************************
 */
 #undef ACC_CUDA
 #undef  ACC_HIP
 #define  ACC_SYCL
 #undef  ACC_NONE
 /**************************************************************
 * Some MPI globals
 **************************************************************
 */
 MPI_Comm WorldComm;
 MPI_Comm WorldShmComm;
 int WorldSize;
 int WorldRank;
 int WorldShmSize;
 int WorldShmRank;
 /**************************************************************
 * Allocate buffers on the GPU, SYCL needs an init call and context
 **************************************************************
 */
 #ifdef ACC_CUDA
 // CUDA backend: no initialisation step; buffers come from cudaMalloc/cudaFree.
 // NOTE(review): only <cuda.h> is included here; confirm that the toolchain in use
 // makes cudaMalloc/cudaFree visible through it.
 #include <cuda.h>
 void acceleratorInit(void){}
 // Allocate `bytes` of device memory. Failure is only detected through assert(),
 // so it goes unnoticed when assertions are compiled out.
 void *acceleratorAllocDevice(size_t bytes)
 {
  void *ptr=NULL;
  auto err = cudaMalloc((void **)&ptr,bytes);
  assert(err==cudaSuccess);
  return ptr;
 }
 // Release a buffer obtained from acceleratorAllocDevice; the cudaFree status is ignored.
 void acceleratorFreeDevice(void *ptr){  cudaFree(ptr);}
 #endif
 #ifdef ACC_HIP
 // HIP backend: no initialisation step; buffers come from hipMalloc/hipFree.
 #include <hip/hip_runtime.h>
 void acceleratorInit(void){}
 // Allocate `bytes` of device memory. On failure a message with the HIP error
 // string is printed and NULL is returned (no abort).
 inline void *acceleratorAllocDevice(size_t bytes)
 {
  void *ptr=NULL;
  auto err = hipMalloc((void **)&ptr,bytes);
  if( err != hipSuccess ) {
    ptr = (void *) NULL;
    printf(" hipMalloc failed for %ld %s \n",bytes,hipGetErrorString(err));
  }
  return ptr;
 };
 // Release a buffer obtained from acceleratorAllocDevice; the hipFree status is
 // captured in `r` but not inspected.
 inline void acceleratorFreeDevice(void *ptr){ auto r=hipFree(ptr);};
 #endif
 #ifdef ACC_SYCL
 // SYCL backend: a single global queue is created by acceleratorInit() and is
 // used for every device allocation and free.
 #include <sycl/CL/sycl.hpp>
 #include <sycl/usm.hpp>
 cl::sycl::queue *theAccelerator;
 // Create the global queue on a GPU device and print the selected device's name.
 // The queue is allocated with new and is not deleted anywhere in this file.
 void acceleratorInit(void)
 {
  int nDevices = 1; // unused
  // Two ways of selecting the GPU device; the first branch is the one compiled.
 #if 1
  cl::sycl::gpu_selector selector;
  cl::sycl::device selectedDevice { selector };
  theAccelerator = new sycl::queue (selectedDevice);
 #else
  cl::sycl::device selectedDevice {cl::sycl::gpu_selector_v  };
  theAccelerator = new sycl::queue (selectedDevice);
 #endif
  auto name = theAccelerator->get_device().get_info<sycl::info::device::name>();
  printf("AcceleratorSyclInit: Selected device is %s\n",name.c_str()); fflush(stdout);
 }
 // Device allocation / release against the global queue; acceleratorInit() must
 // have run first because both dereference theAccelerator.
 inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theAccelerator);};
 inline void acceleratorFreeDevice(void *ptr){free(ptr,*theAccelerator);};
 #endif
 #ifdef ACC_NONE
 // No accelerator: "device" buffers are ordinary host memory from malloc/free.
 void acceleratorInit(void){}
 inline void *acceleratorAllocDevice(size_t bytes){ return malloc(bytes);};
 inline void acceleratorFreeDevice(void *ptr){free(ptr);};
 #endif
 /**************************************************************
 * Microsecond timer
 **************************************************************
 */
 // Current gettimeofday() reading expressed in microseconds, as a double.
 inline double usecond(void)
 {
   timeval now;
   gettimeofday(&now, NULL);
   const double from_seconds = 1.0e6 * now.tv_sec;
   const double from_usec    = 1.0 * now.tv_usec;
   return from_seconds + from_usec;
 }
 /**************************************************************
 * Main benchmark routine
 **************************************************************
 */
 void Benchmark(int64_t L,std::vector<int> cart_geom,bool use_device,int ncall)
 {
  int64_t words = 3*4*2;
  int64_t face,vol;
  int Nd=cart_geom.size();
  /**************************************************************
   * L^Nd volume, L^(Nd-1) faces, 12 complex per site
   * Allocate memory for these
   **************************************************************
   */
  face=1; for( int d=0;d<Nd-1;d++) face = face*L;
  vol=1;  for( int d=0;d<Nd;d++) vol = vol*L;
  std::vector<void *> send_bufs;
  std::vector<void *> recv_bufs;
  size_t vw = face*words;
  size_t bytes = face*words*sizeof(double);
  if ( use_device ) {
    for(int d=0;d<2*Nd;d++){
      send_bufs.push_back(acceleratorAllocDevice(bytes));
      recv_bufs.push_back(acceleratorAllocDevice(bytes));
    }
  } else {
    for(int d=0;d<2*Nd;d++){
      send_bufs.push_back(malloc(bytes));
      recv_bufs.push_back(malloc(bytes));
    }
  }
  /*********************************************************
   * Build cartesian communicator
   *********************************************************
   */
  int ierr;
  int rank;
  std::vector<int> coor(Nd);
  MPI_Comm communicator;
  std::vector<int> periodic(Nd,1);
  MPI_Cart_create(WorldComm,Nd,&cart_geom[0],&periodic[0],0,&communicator);
  MPI_Comm_rank(communicator,&rank);
  MPI_Cart_coords(communicator,rank,Nd,&coor[0]);
  static int reported;
  if ( ! reported ) { 
    printf("World Rank %d Shm Rank %d CartCoor %d %d %d %d\n",WorldRank,WorldShmRank,
 	 coor[0],coor[1],coor[2],coor[3]); fflush(stdout);
    reported =1 ;
  }
  /*********************************************************
   * Perform halo exchanges
   *********************************************************
   */
  for(int d=0;d<Nd;d++){
    if ( cart_geom[d]>1 ) {
      double t0=usecond();
      int from,to;
      MPI_Barrier(communicator);
      for(int n=0;n<ncall;n++){
 	void *xmit = (void *)send_bufs[d];
 	void *recv = (void *)recv_bufs[d];
 	ierr=MPI_Cart_shift(communicator,d,1,&from,&to);
 	assert(ierr==0);
 	ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,to,rank,
 			  recv,bytes,MPI_CHAR,from, from,
 			  communicator,MPI_STATUS_IGNORE);
 	assert(ierr==0);
 	xmit = (void *)send_bufs[Nd+d];
 	recv = (void *)recv_bufs[Nd+d];
 	ierr=MPI_Cart_shift(communicator,d,-1,&from,&to);
 	assert(ierr==0);
 	ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,to,rank,
 			  recv,bytes,MPI_CHAR,from, from,
 			  communicator,MPI_STATUS_IGNORE);
 	assert(ierr==0);
      }
      MPI_Barrier(communicator);
      double t1=usecond();
      double dbytes    = bytes*WorldShmSize;
      double xbytes    = dbytes*2.0*ncall;
      double rbytes    = xbytes;
      double bidibytes = xbytes+rbytes;
      if ( ! WorldRank ) {
 	printf("\t%12ld\t %12ld %16.0lf\n",L,bytes,bidibytes/(t1-t0)); fflush(stdout);
      }
    }
  }
  /*********************************************************
   * Free memory
   *********************************************************
   */
  if ( use_device ) {
    for(int d=0;d<2*Nd;d++){
      acceleratorFreeDevice(send_bufs[d]);
      acceleratorFreeDevice(recv_bufs[d]);
    }
  } else {
    for(int d=0;d<2*Nd;d++){
      free(send_bufs[d]);
      free(recv_bufs[d]);
    }
  }
 }
 /**************************************
 * Command line junk
 **************************************/
 // Return the argument that follows the first occurrence of `option` in
 // [begin,end), or an empty string when the option is absent or is the last argument.
 std::string CmdOptionPayload(char ** begin, char ** end, const std::string & option)
 {
   for (char **arg = begin; arg != end; ++arg) {
     if (option != *arg) continue;
     char **value = arg + 1;
     if (value == end) break;   // only the first match is considered
     return std::string(*value);
   }
   return std::string();
 }
 // True when some argument in [begin,end) equals `option` exactly.
 bool CmdOptionExists(char** begin, char** end, const std::string& option)
 {
   for (char **arg = begin; arg != end; ++arg) {
     if (option == *arg) {
       return true;
     }
   }
   return false;
 }
 // Parse integers separated by single punctuation characters (e.g. "2.1.1.1")
 // into `vec`, replacing its previous contents. Parsing stops at the first token
 // that cannot be read as an int.
 void CmdOptionIntVector(const std::string &str,std::vector<int> & vec)
 {
   vec.clear();
   std::stringstream parser(str);
   for (int value; parser >> value; ) {
     vec.push_back(value);
     const int upcoming = parser.peek();
     if (std::ispunct(upcoming)) {
       parser.ignore();   // drop the one separator character
     }
   }
 }
 /**************************************
 * Main program
 **************************************/
 /**
  * Initialise the accelerator and MPI, read the rank geometry from
  * "--mpi <n1.n2.n3.n4>", then run the halo-exchange benchmark for
  * L = 16..64 in steps of 4, first with host buffers and then with
  * device buffers. Banners are printed by World rank 0 only.
  *
  * When "--mpi" is missing the usage message is printed and the program
  * exits with status 0; MPI is finalised first so that the launcher sees
  * a clean shutdown rather than ranks exiting inside an active MPI session.
  */
 int main(int argc, char **argv)
 {
   std::string arg;
   acceleratorInit();
   MPI_Init(&argc,&argv);
   WorldComm = MPI_COMM_WORLD;
   // Ranks that share memory (i.e. live on one node) form WorldShmComm
   MPI_Comm_split_type(WorldComm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
   MPI_Comm_rank(WorldComm     ,&WorldRank);
   MPI_Comm_size(WorldComm     ,&WorldSize);
   MPI_Comm_rank(WorldShmComm     ,&WorldShmRank);
   MPI_Comm_size(WorldShmComm     ,&WorldShmSize);
   // Advisory only: the run continues even with more than two nodes
   if ( WorldSize/WorldShmSize > 2) {
     printf("This benchmark is meant to run on at most two nodes only\n");
   }
   auto mpi =std::vector<int>({1,1,1,1});
   if( CmdOptionExists(argv,argv+argc,"--mpi") ){
     arg = CmdOptionPayload(argv,argv+argc,"--mpi");
     CmdOptionIntVector(arg,mpi);
   } else {
     printf("Must specify --mpi <n1.n2.n3.n4> command line argument\n");
     MPI_Finalize();
     exit(0);
   }
   if( !WorldRank ) {
     printf("***********************************\n");
     printf("%d ranks\n",WorldSize); 
     printf("%d ranks-per-node\n",WorldShmSize);
     printf("%d nodes\n",WorldSize/WorldShmSize);fflush(stdout);
     printf("Cartesian layout: ");
     for(size_t d=0;d<mpi.size();d++){
       printf("%d ",mpi[d]);
     }
     printf("\n");fflush(stdout);
     printf("***********************************\n");
   }
   if( !WorldRank ) {
     printf("=========================================================\n");
     printf("= Benchmarking HOST memory MPI performance               \n");
     printf("=========================================================\n");fflush(stdout);
     printf("= L\t pkt bytes\t MB/s           \n");
     printf("=========================================================\n");fflush(stdout);
   }
   for(int L=16;L<=64;L+=4){
     Benchmark(L,mpi,false,100);
   }  
   if( !WorldRank ) {
     printf("=========================================================\n");
     printf("= Benchmarking DEVICE memory MPI performance             \n");
     printf("=========================================================\n");fflush(stdout);
   }
   for(int L=16;L<=64;L+=4){
     Benchmark(L,mpi,true,100);
   }  
   if( !WorldRank ) {
     printf("=========================================================\n");
     printf("= DONE   \n");
     printf("=========================================================\n");
   }
   MPI_Finalize();
 }
--- a/benchmarks/Benchmark_ITT.cc
+++ b/benchmarks/Benchmark_ITT.cc
@@ -365,9 +365,15 @@ public:
    GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
 #if 1
    typedef DomainWallFermionF Action;
    typedef typename Action::FermionField Fermion;
    typedef LatticeGaugeFieldF Gauge;
 #else
    typedef GparityDomainWallFermionF Action;
    typedef typename Action::FermionField Fermion;
    typedef LatticeGaugeFieldF Gauge;
 #endif
    ///////// Source preparation ////////////
    Gauge Umu(UGrid);  SU<Nc>::HotConfiguration(RNG4,Umu); 
@@ -635,6 +641,170 @@ public:
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    return mflops_best;
  }
  // Benchmark the quadratic operator built on the covariant adjoint Laplacian
  // stencil in single precision, on a 4D grid with an L^4 local volume per rank.
  //
  // The operator is QuadLinearOperator(LapStencilF, c2, c1, 1.), whose HermOp
  // applies the stencil twice and combines the results with those coefficients.
  // After 10 warm-up calls, 500 HermOp calls are timed one by one; the rate is
  // flops / mean time, with flops = 2*2*8*216 per global site per call.
  //
  // Returns the best rate found over the cases (a single case at present), which
  // is in Mflop/s assuming usecond() returns microseconds.
  //
  // NOTE(review): TmpGrid, FGrid and FrbGrid are obtained from the make* helpers
  // and are not released in this function; mass, u0 and FrbGrid are unused.
  static double Laplace(int L)
  {
    double mflops;
    double mflops_best = 0;
    double mflops_worst= 0;
    std::vector<double> mflops_all;
    ///////////////////////////////////////////////////////
    // Set/Get the layout & grid size
    ///////////////////////////////////////////////////////
    int threads = GridThread::GetThreads();
    Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
    Coordinate local({L,L,L,L});
    Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
    // TmpGrid is only queried for the rank and node counts
    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(latt4,
 								       GridDefaultSimd(Nd,vComplex::Nsimd()),
 								       GridDefaultMpi());
    uint64_t NP = TmpGrid->RankCount();
    uint64_t NN = TmpGrid->NodeCount();
    NN_global=NN;
    uint64_t SHM=NP/NN;
    ///////// Welcome message ////////////
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "Benchmark Laplace on "<<L<<"^4 local volume "<<std::endl;
    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
    std::cout<<GridLogMessage << "* ranks          : "<<NP  <<std::endl;
    std::cout<<GridLogMessage << "* nodes          : "<<NN  <<std::endl;
    std::cout<<GridLogMessage << "* ranks/node     : "<<SHM <<std::endl;
    std::cout<<GridLogMessage << "* ranks geom     : "<<GridCmdVectorIntToString(mpi)<<std::endl;
    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    ///////// Lattice Init ////////////
    GridCartesian         * FGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);
    ///////// RNG Init ////////////
    std::vector<int> seeds4({1,2,3,4});
    GridParallelRNG          RNG4(FGrid);  RNG4.SeedFixedIntegers(seeds4);
    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
    RealD mass=0.1;
    RealD c1=9.0/8.0;
    RealD c2=-1.0/24.0;
    RealD u0=1.0;
 //    typedef ImprovedStaggeredFermionF Action;
 //    typedef typename Action::FermionField Fermion; 
    typedef LatticeGaugeFieldF Gauge;
    Gauge Umu(FGrid);  SU<Nc>::HotConfiguration(RNG4,Umu); 
 //    typename Action::ImplParams params;
 //    Action Ds(Umu,Umu,*FGrid,*FrbGrid,mass,c1,c2,u0,params);
 //  PeriodicGimplF
    typedef typename PeriodicGimplF::LinkField GaugeLinkFieldF;
    ///////// Source preparation ////////////
    GaugeLinkFieldF src   (FGrid); random(RNG4,src);
 //    GaugeLinkFieldF src_e (FrbGrid);
 //    GaugeLinkFieldF src_o (FrbGrid);
 //    GaugeLinkFieldF r_e   (FrbGrid);
 //    GaugeLinkFieldF r_o   (FrbGrid);
    GaugeLinkFieldF r_eo  (FGrid);
    {
 //     pickCheckerboard(Even,src_e,src);
 //     pickCheckerboard(Odd,src_o,src);
      const int num_cases = 1;
      std::string fmt("G/O/C  ");
      // The StaggeredKernelsStatic settings are reused here as the case selector
      controls Cases [] = {
 	{  StaggeredKernelsStatic::OptGeneric   ,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
      }; 
      for(int c=0;c<num_cases;c++) {
        CovariantAdjointLaplacianStencil<PeriodicGimplF,typename PeriodicGimplF::LinkField> LapStencilF(FGrid);
        // Constructor argument order is (a0,a1,a2): a0=c2, a1=c1, a2=1
        QuadLinearOperator<CovariantAdjointLaplacianStencil<PeriodicGimplF,typename PeriodicGimplF::LinkField>,PeriodicGimplF::LinkField> QuadOpF(LapStencilF,c2,c1,1.);
        LapStencilF.GaugeImport(Umu);
 	StaggeredKernelsStatic::Comms = Cases[c].CommsOverlap;
 	StaggeredKernelsStatic::Opt   = Cases[c].Opt;
 	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 	if ( StaggeredKernelsStatic::Opt == StaggeredKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using Stencil Nc Laplace" <<std::endl;
 	if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
 	if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential Comms/Compute" <<std::endl;
 	std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 	// Warm-up: t0/t1 taken here are overwritten in the timed loop below
 	int nwarm = 10;
 	double t0=usecond();
 	FGrid->Barrier();
 	for(int i=0;i<nwarm;i++){
 //	  Ds.DhopEO(src_o,r_e,DaggerNo);
          QuadOpF.HermOp(src,r_eo);
 	}
 	FGrid->Barrier();
 	double t1=usecond();
 	uint64_t ncall = 500;
 	FGrid->Broadcast(0,&ncall,sizeof(ncall));
 	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
 	time_statistics timestat;
 	std::vector<double> t_time(ncall);
 	// Timed loop: each call is timed on its own, with no barrier between calls
 	for(uint64_t i=0;i<ncall;i++){
 	  t0=usecond();
 //	  Ds.DhopEO(src_o,r_e,DaggerNo);
          QuadOpF.HermOp(src,r_eo);
 	  t1=usecond();
 	  t_time[i] = t1-t0;
 	}
 	FGrid->Barrier();
 	double volume=1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
 //	double flops=(1146.0*volume)/2;
 	double flops=(2*2*8*216.0*volume);
 	double mf_hi, mf_lo, mf_err;
 	timestat.statistics(t_time);
 	// Fastest call gives the high estimate, slowest call the low estimate
 	mf_hi = flops/timestat.min;
 	mf_lo = flops/timestat.max;
 	mf_err= flops/timestat.min * timestat.err/timestat.mean;
 	mflops = flops/timestat.mean;
 	mflops_all.push_back(mflops);
 	if ( mflops_best == 0   ) mflops_best = mflops;
 	if ( mflops_worst== 0   ) mflops_worst= mflops;
 	if ( mflops>mflops_best ) mflops_best = mflops;
 	if ( mflops<mflops_worst) mflops_worst= mflops;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Quad mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Quad mflop/s per rank   "<< mflops/NP<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Quad mflop/s per node   "<< mflops/NN<<std::endl;
 	FGrid->Barrier();
      }
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
      std::cout<<GridLogMessage << L<<"^4  Quad Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage << L<<"^4  Quad Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage <<fmt << std::endl;
      std::cout<<GridLogMessage ;
 	FGrid->Barrier();
      // Per-node rate of every case on one line
      for(int i=0;i<mflops_all.size();i++){
 	std::cout<<mflops_all[i]/NN<<" ; " ;
      }
      std::cout<<std::endl;
    }
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    return mflops_best;
  }
 };
@@ -662,6 +832,7 @@ int main (int argc, char ** argv)
  std::vector<double> wilson;
  std::vector<double> dwf4;
  std::vector<double> staggered;
  std::vector<double> lap;
  int Ls=1;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
@@ -688,12 +859,20 @@ int main (int argc, char ** argv)
    staggered.push_back(result);
  }
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " Laplace QuadOp 4D " <<std::endl;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  for(int l=0;l<L_list.size();l++){
    double result = Benchmark::Laplace(L_list[l]) ;
    lap.push_back(result);
  }
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\t Staggered" <<std::endl;
+  std::cout<<GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\t Staggered \t\t Quad Laplace" <<std::endl;
  for(int l=0;l<L_list.size();l++){
-    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<std::endl;
+    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<< " \t\t "<< lap[l]<< std::endl;
  }
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
--- a/benchmarks/Benchmark_dwf_fp32.cc
+++ b/benchmarks/Benchmark_dwf_fp32.cc
@@ -90,11 +90,11 @@ int main (int argc, char ** argv)
  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
  for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
-  //  Dirichlet[0] = 0;
+  Dirichlet[0] = 0;
-  //  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
+  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
-  //  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
+  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
-  //  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
+  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
-  //  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
+  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
  Benchmark(Ls,Dirichlet);
@@ -105,11 +105,11 @@ int main (int argc, char ** argv)
  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
  for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
-  //  Dirichlet[0] = 0;
+  Dirichlet[0] = 0;
-  //  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
+  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
-  //  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
+  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
-  //  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
+  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
-  //  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
+  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
  Benchmark(Ls,Dirichlet);
@@ -185,7 +185,6 @@ void Benchmark(int Ls, Coordinate Dirichlet)
  GaugeField Umu(UGrid);
  GaugeField UmuCopy(UGrid);
  SU<Nc>::HotConfiguration(RNG4,Umu);
  //  SU<Nc>::ColdConfiguration(Umu);
  UmuCopy=Umu;
  std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
@@ -308,14 +307,6 @@ void Benchmark(int Ls, Coordinate Dirichlet)
    if(( n2e>1.0e-4) ) {
      std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
      FGrid->Barrier();
      std::cout<<GridLogMessage << "RESULT" << std::endl;
      //      std::cout << result<<std::endl;
      std::cout << norm2(result)<<std::endl;
      std::cout<<GridLogMessage << "REF" << std::endl;
      std::cout << norm2(ref)<<std::endl;
      std::cout<<GridLogMessage << "ERR" << std::endl;
      std::cout << norm2(err)<<std::endl;
      FGrid->Barrier();
      exit(-1);
    }
    assert (n2e< 1.0e-4 );
--- a/benchmarks/Benchmark_usqcd.cc
+++ b/benchmarks/Benchmark_usqcd.cc
@@ -1,968 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./benchmarks/Benchmark_usqcd.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/algorithms/blas/BatchedBlas.h>
 using namespace Grid;
 std::vector<int> L_list;
 std::vector<int> Ls_list;
 std::vector<double> mflop_list;
 double mflop_ref;
 double mflop_ref_err;
 int NN_global;
 FILE * FP;
 /**
  * Summary statistics over a set of timing samples.
  *
  * After statistics(v):
  *   mean - arithmetic mean of the samples
  *   err  - standard error of the mean, sqrt( sum (x-mean)^2 / (n*(n-1)) )
  *   min  - smallest sample
  *   max  - largest sample
  *
  * Degenerate inputs are well defined: an empty sample set yields all zeros,
  * and a single sample yields err == 0 (there is no spread to estimate).
  */
 struct time_statistics{
   double mean;
   double err;
   double min;
   double max;
   /// Recompute all four members from the samples in v; v is not modified.
   void statistics(const std::vector<double> &v){
     const size_t n = v.size();
     if ( n == 0 ) {
       // Nothing to average and no element to take min/max of
       mean = 0.0; err = 0.0; min = 0.0; max = 0.0;
       return;
     }
     const double sum = std::accumulate(v.begin(), v.end(), 0.0);
     mean = sum / n;
     const double centre = mean;
     double sq_sum = 0.0;
     for (double x : v) {
       const double d = x - centre;
       sq_sum += d*d;
     }
     // n*(n-1) is zero for a single sample; report zero error instead of 0/0
     err = (n > 1) ? std::sqrt(sq_sum / (n*(n - 1))) : 0.0;
     const auto extremes = std::minmax_element(v.begin(), v.end());
     min = *extremes.first;
     max = *extremes.second;
   }
 };
 // Print the column headings for the halo-exchange bandwidth table.
 void comms_header()
 {
   std::cout << GridLogMessage;
   std::cout << " L  " << "\t" << " Ls  " << "\t";
   std::cout << "bytes\t MB/s uni  \t\t MB/s bidi ";
   std::cout << std::endl;
 }
 // One benchmark configuration: a kernel variant, a comms/compute scheduling
 // choice and a communicator policy.
 // NOTE(review): the integer meanings of Opt and CommsOverlap are not defined
 // here; presumably they take the kernel classes' static option constants.
 struct controls {
  int Opt;          // kernel implementation selector
  int CommsOverlap; // comms/compute scheduling selector
  Grid::CartesianCommunicator::CommunicatorPolicy_t CommsAsynch; // communicator policy
 };
 class Benchmark {
 public:
  // Report the thread count, the MPI rank geometry and, for each SIMD vector
  // type, its width in bits together with the default 4D SIMD layout.
  static void Decomposition (void ) {
    const char *rule = "==================================================================================";
    // One line per vector type: label, width in bits, default SIMD layout
    auto simd_line = [](const char *label, size_t bits, int nsimd) {
      std::cout<<GridLogMessage<<label<<bits<<"bits ; "
               <<GridCmdVectorIntToString(GridDefaultSimd(4,nsimd))<<std::endl;
    };
    int threads = GridThread::GetThreads();
    std::cout<<GridLogMessage << rule<<std::endl;
    std::cout<<GridLogMessage << "= Grid is setup to use "<<threads<<" threads"<<std::endl;
    std::cout<<GridLogMessage << rule<<std::endl;
    std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
    std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
    std::cout<<GridLogMessage<<"\tMPI tasks      : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
    simd_line("\tvReal          : ", sizeof(vReal    )*8, vReal::Nsimd());
    simd_line("\tvRealF         : ", sizeof(vRealF   )*8, vRealF::Nsimd());
    simd_line("\tvRealD         : ", sizeof(vRealD   )*8, vRealD::Nsimd());
    simd_line("\tvComplex       : ", sizeof(vComplex )*8, vComplex::Nsimd());
    simd_line("\tvComplexF      : ", sizeof(vComplexF)*8, vComplexF::Nsimd());
    simd_line("\tvComplexD      : ", sizeof(vComplexD)*8, vComplexD::Nsimd());
    std::cout<<GridLogMessage << rule<<std::endl;
  }
  // Benchmark point-to-point halo exchange with Grid.SendToRecvFrom.
  //
  // For local extents lat = 16, 24, 32 and a fixed Ls = 12, eight device buffer
  // pairs of lat^3 * Ls HalfSpinColourVectorD are allocated. For each of the 8
  // directions whose dimension is split across ranks, Nloop exchanges are timed
  // individually and the mean time is turned into a per-node rate, written both
  // to the log and to the global FILE FP.
  static void Comms(void)
  {
    int Nloop=200;
    int nmu=0;
    int maxlat=32;
    Coordinate simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
    Coordinate mpi_layout  = GridDefaultMpi();
    // Count the dimensions that are actually distributed over ranks
    for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
    std::vector<double> t_time(Nloop);
    time_statistics timestat;
    std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
    std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
    comms_header();
    fprintf(FP,"Communications\n\n");
    fprintf(FP,"Packet bytes, direction, GB/s per node\n");
    for(int lat=16;lat<=maxlat;lat+=8){
      //      for(int Ls=8;Ls<=8;Ls*=2){
      { int Ls=12;
 	Coordinate latt_size  ({lat*mpi_layout[0],
 	      lat*mpi_layout[1],
 	      lat*mpi_layout[2],
 	      lat*mpi_layout[3]});
 	GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 	RealD Nrank = Grid._Nprocessors;
 	RealD Nnode = Grid.NodeCount();
 	RealD ppn = Nrank/Nnode;
 	std::vector<HalfSpinColourVectorD *> xbuf(8);
 	std::vector<HalfSpinColourVectorD *> rbuf(8);
 	//Grid.ShmBufferFreeAll();
 	uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
 	for(int d=0;d<8;d++){
 	  xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
 	  rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
 	  //	  bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	  //	  bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	}
 	//	int ncomm;
 	double dbytes;
        // dir 0..3 shift by +1 along mu; dir 4..7 shift by mpi_layout[mu]-1 along mu
        for(int dir=0;dir<8;dir++) {
 	  int mu =dir % 4;
 	  if (mpi_layout[mu]>1 ) {
 	    // NOTE(review): `times` is never written or read; samples go into t_time
 	    std::vector<double> times(Nloop);
 	    for(int i=0;i<Nloop;i++){
 	      // Reset every iteration, so after the loop dbytes equals one packet
 	      dbytes=0;	        
 	      double start=usecond();
 	      int xmit_to_rank;
 	      int recv_from_rank;
 	      if ( dir == mu ) { 
 		int comm_proc=1;
 		Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 	      } else { 
 		int comm_proc = mpi_layout[mu]-1;
 		Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 	      }
 	      Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
 				  (void *)&rbuf[dir][0], recv_from_rank,
 				  bytes);
 	      dbytes+=bytes;
 	      double stop=usecond();
 	      t_time[i] = stop-start; // microseconds
 	    }
 	    timestat.statistics(t_time);
 	    // Scale by ranks per node to get a per-node byte count
 	    dbytes=dbytes*ppn;
 	    double xbytes    = dbytes*0.5;
 	    double bidibytes = dbytes;
 	    std::cout<<GridLogMessage << lat<<"\t"<<Ls<<"\t "
 		     << bytes << " \t "
 		     <<xbytes/timestat.mean
 		     << "\t\t"
 		     << bidibytes/timestat.mean<< std::endl;
 	    // bytes per microsecond divided by 1000, reported under the GB/s heading
 	    fprintf(FP,"%ld, %d, %f\n",(long)bytes,dir,bidibytes/timestat.mean/1000.);
 	  }
 	}
 	for(int d=0;d<8;d++){
 	  acceleratorFreeDevice(xbuf[d]);
 	  acceleratorFreeDevice(rbuf[d]);
 	}
      }
    }
    fprintf(FP,"\n\n");
    return;
  }
  // Benchmark streaming memory bandwidth with the lattice expression z = a*x - y
  // on fields of Nvec=8 real vectors, for local extents lat = 8..40 in steps of 8.
  // Results go to the log and to the global FILE FP (bytes, GB/s per node).
  static void Memory(void)
  {
    const int Nvec=8;
    typedef Lattice< iVector< vReal,Nvec> > LatticeVec;
    typedef iVector<vReal,Nvec> Vec;
    Coordinate simd_layout = GridDefaultSimd(Nd,vReal::Nsimd());
    Coordinate mpi_layout  = GridDefaultMpi();
    fprintf(FP,"Memory Bandwidth\n\n");
    fprintf(FP,"Bytes, GB/s per node\n");
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    // NOTE(review): the banner says a*x + y but the timed loop evaluates a*x-y
    std::cout<<GridLogMessage << "= Benchmarking a*x + y bandwidth"<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<< "\t\tGB/s / node"<<std::endl;
    std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
    //    uint64_t NP;
    uint64_t NN;
  uint64_t lmax=40;
 // Iteration count scales as (lmax/lat)^4: 1000 iterations at lat==lmax, more for
 // smaller volumes. The macro is not #undef'd and stays defined after this point.
 #define NLOOP (1000*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
    GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
    for(int lat=8;lat<=lmax;lat+=8){
      Coordinate latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
      //      NP= Grid.RankCount();
      NN =Grid.NodeCount();
      // rn is filled with random numbers but not used afterwards
      Vec rn ; random(sRNG,rn);
      LatticeVec z(&Grid); z=Zero();
      LatticeVec x(&Grid); x=Zero();
      LatticeVec y(&Grid); y=Zero();
      double a=2.0;
      uint64_t Nloop=NLOOP;
      double start=usecond();
      for(int i=0;i<Nloop;i++){
 	z=a*x-y;
      }
      double stop=usecond();
      // Per-iteration time scaled by 1000: nanoseconds if usecond() is in
      // microseconds, which makes bytes/time a GB/s figure
      double time = (stop-start)/Nloop*1000;
      double flops=vol*Nvec*2;// mul,add
      // Three fields touched per site: x and y read, z written
      double bytes=3.0*vol*Nvec*sizeof(Real);
      std::cout<<GridLogMessage<<std::setprecision(3) 
 	       << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.
 	       << "\t\t"<< bytes/time/NN <<std::endl;
      fprintf(FP,"%ld, %f\n",(long)bytes,bytes/time/NN);
    }
    fprintf(FP,"\n\n");
  };
  static void BLAS(void)
  {
    //int nbasis, int nrhs, int coarseVol
    int  basis[] = { 16,32,64 };
    int  rhs[]   = { 8,16,32 };
    int  vol  = 4*4*4*4;
    GridBLAS blas;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "= batched GEMM (double precision) "<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "  M  "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (coarse mrhs)"<<std::endl;
    std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
    fprintf(FP,"GEMM\n\n M, N, K, BATCH, GF/s per rank\n");
    for(int b=0;b<3;b++){
    for(int r=0;r<3;r++){
      int M=basis[b];
      int N=rhs[r];
      int K=basis[b];
      int BATCH=vol;
      double p=blas.benchmark(M,N,K,BATCH);
      fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
      std::cout<<GridLogMessage<<std::setprecision(3) 
 	       << M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<BATCH<<"\t\t"<<p<<std::endl;
    }}
    std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
    std::cout<<GridLogMessage << "  M  "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (block project)"<<std::endl;
    std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
    for(int b=0;b<3;b++){
    for(int r=0;r<3;r++){
      int M=basis[b];
      int N=rhs[r];
      int K=vol;
      int BATCH=vol;
      double p=blas.benchmark(M,N,K,BATCH);
      fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
      std::cout<<GridLogMessage<<std::setprecision(3) 
 	       << M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<BATCH<<"\t\t"<<p<<std::endl;
    }}
    std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
    std::cout<<GridLogMessage << "  M  "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (block promote)"<<std::endl;
    std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
    for(int b=0;b<3;b++){
    for(int r=0;r<3;r++){
      int M=rhs[r];
      int N=vol;
      int K=basis[b];
      int BATCH=vol;
      double p=blas.benchmark(M,N,K,BATCH);
      fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
      std::cout<<GridLogMessage<<std::setprecision(3) 
 	       << M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<BATCH<<"\t\t"<<p<<std::endl;
    }}
    fprintf(FP,"\n\n\n");
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  };
  ////////////////////////////////////////////////////////////////////////////
  // SU4: bandwidth / flop rate of the lattice-wide product z = x*y for
  // fields of 4x4 single precision complex matrices.
  //
  // The local extent per rank runs over 8,16,24,32 in every direction; the
  // global lattice is that extent scaled by the MPI layout. For each size
  // the product is repeated NLOOP times and one row is logged with the
  // bytes moved, GB/s, Gflop/s, elapsed seconds and GB/s per node.
  // Nothing is written to the CSV stream by this routine.
  ////////////////////////////////////////////////////////////////////////////
  static void SU4(void)
  {
    const int Nc4=4;
    typedef Lattice< iMatrix< vComplexF,Nc4> > LatticeSU4;

    Coordinate simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd());
    Coordinate mpi_layout  = GridDefaultMpi();

    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "= Benchmarking z = y*x SU(4) bandwidth"<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<< "\t\tGB/s / node"<<std::endl;
    std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;

    uint64_t NN;
    uint64_t lmax=32;

    // Seeded but not consumed below: the operands are zero-initialised.
    GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));

    for(int lat=8;lat<=lmax;lat+=8){

      Coordinate latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});

      // Widen before multiplying: a product of four narrow integer extents
      // can exceed 32 bits on large global lattices (e.g. 256^4 = 2^32)
      // before it would ever reach the 64 bit destination.
      int64_t vol= static_cast<int64_t>(latt_size[0])*latt_size[1]*latt_size[2]*latt_size[3];

      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);

      NN =Grid.NodeCount();

      LatticeSU4 z(&Grid); z=Zero();
      LatticeSU4 x(&Grid); x=Zero();
      LatticeSU4 y(&Grid); y=Zero();

      uint64_t Nloop=NLOOP;

      double start=usecond();
      for(int i=0;i<Nloop;i++){
        z=x*y;
      }
      double stop=usecond();

      // Elapsed time per iteration scaled by 1000. Presumably usecond() is
      // in microseconds (the "seconds" column divides by 1e6), which makes
      // this nanoseconds and bytes/time a GB/s figure -- TODO confirm.
      double time = (stop-start)/Nloop*1000;

      // Per output element: Nc4 complex multiplies (6 flops each) and
      // Nc4-1 complex additions (2 flops each) = 6+(Nc4-1)*8.
      double flops=vol*Nc4*Nc4*(6+(Nc4-1)*8);
      // Three fields (x, y, z) of Nc4*Nc4 complex single precision numbers per site.
      double bytes=3.0*vol*Nc4*Nc4*2*sizeof(RealF);

      std::cout<<GridLogMessage<<std::setprecision(3)
               << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.
               << "\t\t"<< bytes/time/NN <<std::endl;
    }
  };
  static double DWF(int Ls,int L)
  {
    RealD mass=0.1;
    RealD M5  =1.8;
    double mflops;
    double mflops_best = 0;
    double mflops_worst= 0;
    std::vector<double> mflops_all;
    ///////////////////////////////////////////////////////
    // Set/Get the layout & grid size
    ///////////////////////////////////////////////////////
    int threads = GridThread::GetThreads();
    Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
    Coordinate local({L,L,L,L});
    Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, 
 								       GridDefaultSimd(Nd,vComplex::Nsimd()),
 								       GridDefaultMpi());
    uint64_t NP = TmpGrid->RankCount();
    uint64_t NN = TmpGrid->NodeCount();
    NN_global=NN;
    uint64_t SHM=NP/NN;
    ///////// Welcome message ////////////
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "Benchmark DWF on "<<L<<"^4 local volume "<<std::endl;
    std::cout<<GridLogMessage << "* Nc             : "<<Nc<<std::endl;
    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
    std::cout<<GridLogMessage << "* Ls             : "<<Ls<<std::endl;
    std::cout<<GridLogMessage << "* ranks          : "<<NP  <<std::endl;
    std::cout<<GridLogMessage << "* nodes          : "<<NN  <<std::endl;
    std::cout<<GridLogMessage << "* ranks/node     : "<<SHM <<std::endl;
    std::cout<<GridLogMessage << "* ranks geom     : "<<GridCmdVectorIntToString(mpi)<<std::endl;
    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    ///////// Lattice Init ////////////
    GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
    GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
    GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
    ///////// RNG Init ////////////
    std::vector<int> seeds4({1,2,3,4});
    std::vector<int> seeds5({5,6,7,8});
    GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
    GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
    typedef DomainWallFermionF Action;
    typedef typename Action::FermionField Fermion;
    typedef LatticeGaugeFieldF Gauge;
    ///////// Source preparation ////////////
    Gauge Umu(UGrid);  SU<Nc>::HotConfiguration(RNG4,Umu); 
    Fermion src   (FGrid); random(RNG5,src);
    Fermion src_e (FrbGrid);
    Fermion src_o (FrbGrid);
    Fermion r_e   (FrbGrid);
    Fermion r_o   (FrbGrid);
    Fermion r_eo  (FGrid);
    Action Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
    {
      pickCheckerboard(Even,src_e,src);
      pickCheckerboard(Odd,src_o,src);
 #ifdef AVX512
      const int num_cases = 3;
 #else 
      const int num_cases = 2;
 #endif      
      std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
      controls Cases [] = {
 	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent },
 	{  WilsonKernelsStatic::OptHandUnroll,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent },
 	{  WilsonKernelsStatic::OptInlineAsm ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent }
      }; 
      for(int c=0;c<num_cases;c++) {
 	WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
 	WilsonKernelsStatic::Opt   = Cases[c].Opt;
 	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using ASM      WilsonKernels" <<std::endl;
 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using UNROLLED WilsonKernels" <<std::endl;
 	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
 	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential Comms/Compute" <<std::endl;
 	std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 	int nwarm = 10;
 	double t0=usecond();
 	FGrid->Barrier();
 	for(int i=0;i<nwarm;i++){
 	  Dw.DhopEO(src_o,r_e,DaggerNo);
 	}
 	FGrid->Barrier();
 	double t1=usecond();
 	uint64_t ncall = 500;
 	FGrid->Broadcast(0,&ncall,sizeof(ncall));
 	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
 	time_statistics timestat;
 	std::vector<double> t_time(ncall);
 	for(uint64_t i=0;i<ncall;i++){
 	  t0=usecond();
 	  Dw.DhopEO(src_o,r_e,DaggerNo);
 	  t1=usecond();
 	  t_time[i] = t1-t0;
 	}
 	FGrid->Barrier();
 	double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
 	// Nc=3 gives
 	// 1344= 3*(2*8+6)*2*8 + 8*3*2*2 + 3*4*2*8
 	// 1344 = Nc* (6+(Nc-1)*8)*2*Nd + Nd*Nc*2*2  + Nd*Nc*Ns*2
 	//	double flops=(1344.0*volume)/2;
 	double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + 2*Nd*Nc*Ns  + 2*Nd*Nc*Ns*2;
 	double flops=(fps*volume)/2;
 	double mf_hi, mf_lo, mf_err;
 	timestat.statistics(t_time);
 	mf_hi = flops/timestat.min;
 	mf_lo = flops/timestat.max;
 	mf_err= flops/timestat.min * timestat.err/timestat.mean;
 	mflops = flops/timestat.mean;
 	mflops_all.push_back(mflops);
 	if ( mflops_best == 0   ) mflops_best = mflops;
 	if ( mflops_worst== 0   ) mflops_worst= mflops;
 	if ( mflops>mflops_best ) mflops_best = mflops;
 	if ( mflops<mflops_worst) mflops_worst= mflops;
 	std::cout<<GridLogMessage<< "Deo FlopsPerSite is "<<fps<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;
      }
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage <<fmt << std::endl;
      std::cout<<GridLogMessage ;
      for(int i=0;i<mflops_all.size();i++){
 	std::cout<<mflops_all[i]/NN<<" ; " ;
      }
      std::cout<<std::endl;
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    }
    return mflops_best;
  }
  static double Staggered(int L)
  {
    double mflops;
    double mflops_best = 0;
    double mflops_worst= 0;
    std::vector<double> mflops_all;
    ///////////////////////////////////////////////////////
    // Set/Get the layout & grid size
    ///////////////////////////////////////////////////////
    int threads = GridThread::GetThreads();
    Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
    Coordinate local({L,L,L,L});
    Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(latt4,
 								       GridDefaultSimd(Nd,vComplex::Nsimd()),
 								       GridDefaultMpi());
    uint64_t NP = TmpGrid->RankCount();
    uint64_t NN = TmpGrid->NodeCount();
    NN_global=NN;
    uint64_t SHM=NP/NN;
    ///////// Welcome message ////////////
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "Benchmark ImprovedStaggered on "<<L<<"^4 local volume "<<std::endl;
    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
    std::cout<<GridLogMessage << "* ranks          : "<<NP  <<std::endl;
    std::cout<<GridLogMessage << "* nodes          : "<<NN  <<std::endl;
    std::cout<<GridLogMessage << "* ranks/node     : "<<SHM <<std::endl;
    std::cout<<GridLogMessage << "* ranks geom     : "<<GridCmdVectorIntToString(mpi)<<std::endl;
    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    ///////// Lattice Init ////////////
    GridCartesian         * FGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);
    ///////// RNG Init ////////////
    std::vector<int> seeds4({1,2,3,4});
    GridParallelRNG          RNG4(FGrid);  RNG4.SeedFixedIntegers(seeds4);
    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
    RealD mass=0.1;
    RealD c1=9.0/8.0;
    RealD c2=-1.0/24.0;
    RealD u0=1.0;
    typedef ImprovedStaggeredFermionF Action;
    typedef typename Action::FermionField Fermion; 
    typedef LatticeGaugeFieldF Gauge;
    Gauge Umu(FGrid);  SU<Nc>::HotConfiguration(RNG4,Umu); 
    typename Action::ImplParams params;
    Action Ds(Umu,Umu,*FGrid,*FrbGrid,mass,c1,c2,u0,params);
    ///////// Source preparation ////////////
    Fermion src   (FGrid); random(RNG4,src);
    Fermion src_e (FrbGrid);
    Fermion src_o (FrbGrid);
    Fermion r_e   (FrbGrid);
    Fermion r_o   (FrbGrid);
    Fermion r_eo  (FGrid);
    {
      pickCheckerboard(Even,src_e,src);
      pickCheckerboard(Odd,src_o,src);
      const int num_cases = 2;
      std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
      controls Cases [] = {
 	{  StaggeredKernelsStatic::OptGeneric   ,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
 	{  StaggeredKernelsStatic::OptHandUnroll,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
 	{  StaggeredKernelsStatic::OptInlineAsm ,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  }
      }; 
      for(int c=0;c<num_cases;c++) {
 	StaggeredKernelsStatic::Comms = Cases[c].CommsOverlap;
 	StaggeredKernelsStatic::Opt   = Cases[c].Opt;
 	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 	if ( StaggeredKernelsStatic::Opt == StaggeredKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc StaggeredKernels" <<std::endl;
 	std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 	int nwarm = 10;
 	double t0=usecond();
 	FGrid->Barrier();
 	for(int i=0;i<nwarm;i++){
 	  Ds.DhopEO(src_o,r_e,DaggerNo);
 	}
 	FGrid->Barrier();
 	double t1=usecond();
 	uint64_t ncall = 500;
 	FGrid->Broadcast(0,&ncall,sizeof(ncall));
 	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
 	time_statistics timestat;
 	std::vector<double> t_time(ncall);
 	for(uint64_t i=0;i<ncall;i++){
 	  t0=usecond();
 	  Ds.DhopEO(src_o,r_e,DaggerNo);
 	  t1=usecond();
 	  t_time[i] = t1-t0;
 	}
 	FGrid->Barrier();
 	double volume=1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
 	double flops=(1146.0*volume)/2;
 	double mf_hi, mf_lo, mf_err;
 	timestat.statistics(t_time);
 	mf_hi = flops/timestat.min;
 	mf_lo = flops/timestat.max;
 	mf_err= flops/timestat.min * timestat.err/timestat.mean;
 	mflops = flops/timestat.mean;
 	mflops_all.push_back(mflops);
 	if ( mflops_best == 0   ) mflops_best = mflops;
 	if ( mflops_worst== 0   ) mflops_worst= mflops;
 	if ( mflops>mflops_best ) mflops_best = mflops;
 	if ( mflops<mflops_worst) mflops_worst= mflops;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;
      }
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
      std::cout<<GridLogMessage << L<<"^4  Deo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage << L<<"^4  Deo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage <<fmt << std::endl;
      std::cout<<GridLogMessage ;
      for(int i=0;i<mflops_all.size();i++){
 	std::cout<<mflops_all[i]/NN<<" ; " ;
      }
      std::cout<<std::endl;
    }
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    return mflops_best;
  }
  ////////////////////////////////////////////////////////////////////////////
  // Clover: time the full operator Dc.M of the single precision Wilson
  // clover action (WilsonCloverFermionF) on an L^4 local volume per rank.
  //
  // A single case (generic kernel) is run: 10 untimed warm-up calls, then
  // 500 individually timed calls. Rates use (1344 + 24 + 6*6*8*2) flops per
  // site on the full global volume.
  //
  // The summary lines are labelled "Dclov", in agreement with the per-case
  // lines, since this routine does not time an even-odd hopping term.
  //
  // Side effect: the node count is stored in NN_global.
  // Returns the best mean rate seen over the cases (not divided by node count).
  //
  // NOTE(review): the grids obtained from SpaceTimeGrid::make* are not
  // deleted anywhere in this function.
  ////////////////////////////////////////////////////////////////////////////
  static double Clover(int L)
  {
    double mflops;
    double mflops_best = 0;
    double mflops_worst= 0;
    std::vector<double> mflops_all;

    ///////////////////////////////////////////////////////
    // Set/Get the layout & grid size
    ///////////////////////////////////////////////////////
    int threads = GridThread::GetThreads();
    Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
    Coordinate local({L,L,L,L});
    Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});

    // Scratch grid used only to query the rank and node counts.
    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(latt4,
                                                                       GridDefaultSimd(Nd,vComplex::Nsimd()),
                                                                       GridDefaultMpi());
    uint64_t NP = TmpGrid->RankCount();
    uint64_t NN = TmpGrid->NodeCount();
    NN_global=NN;
    uint64_t SHM=NP/NN;

    ///////// Welcome message ////////////
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "Benchmark Clover on "<<L<<"^4 local volume "<<std::endl;
    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
    std::cout<<GridLogMessage << "* ranks          : "<<NP  <<std::endl;
    std::cout<<GridLogMessage << "* nodes          : "<<NN  <<std::endl;
    std::cout<<GridLogMessage << "* ranks/node     : "<<SHM <<std::endl;
    std::cout<<GridLogMessage << "* ranks geom     : "<<GridCmdVectorIntToString(mpi)<<std::endl;
    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

    ///////// Lattice Init ////////////
    GridCartesian         * FGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);

    ///////// RNG Init ////////////
    std::vector<int> seeds4({1,2,3,4});
    GridParallelRNG          RNG4(FGrid);  RNG4.SeedFixedIntegers(seeds4);
    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;

    RealD mass=0.1;
    RealD csw=1.0;

    typedef WilsonCloverFermionF Action;
    typedef typename Action::FermionField Fermion;
    typedef LatticeGaugeFieldF Gauge;

    Gauge Umu(FGrid);  SU<Nc>::HotConfiguration(RNG4,Umu);

    // The same csw value is supplied for both clover coefficient arguments.
    Action Dc(Umu,*FGrid,*FrbGrid,mass,csw,csw);

    ///////// Source preparation ////////////
    Fermion src   (FGrid); random(RNG4,src);
    Fermion r     (FGrid);

    {
      const int num_cases = 1;
      std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");

      controls Cases [] = {
        {  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
      };

      for(int c=0;c<num_cases;c++) {

        WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
        WilsonKernelsStatic::Opt   = Cases[c].Opt;
        CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);

        std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
        std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
        std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

        // Warm-up phase, bracketed by barriers; its timing is not reported.
        int nwarm = 10;
        double t0=usecond();
        FGrid->Barrier();
        for(int i=0;i<nwarm;i++){
          Dc.M(src,r);
        }
        FGrid->Barrier();
        double t1=usecond();

        // Call count is broadcast from rank 0 so that every rank agrees on it.
        uint64_t ncall = 500;
        FGrid->Broadcast(0,&ncall,sizeof(ncall));

        // Timed phase: one sample per call.
        time_statistics timestat;
        std::vector<double> t_time(ncall);
        for(uint64_t i=0;i<ncall;i++){
          t0=usecond();
          Dc.M(src,r);
          t1=usecond();
          t_time[i] = t1-t0;
        }
        FGrid->Barrier();

        // Full global volume: M acts on the whole lattice, so no factor of one half.
        double volume=1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
        double flops=(1344+ 24+6*6*8*2)*volume;
        double mf_hi, mf_lo, mf_err;

        timestat.statistics(t_time);
        mf_hi = flops/timestat.min;
        mf_lo = flops/timestat.max;
        mf_err= flops/timestat.min * timestat.err/timestat.mean;

        mflops = flops/timestat.mean;
        mflops_all.push_back(mflops);
        // A stored value of zero means "not yet set".
        if ( mflops_best == 0   ) mflops_best = mflops;
        if ( mflops_worst== 0   ) mflops_worst= mflops;
        if ( mflops>mflops_best ) mflops_best = mflops;
        if ( mflops<mflops_worst) mflops_worst= mflops;

        std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
        std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per rank   "<< mflops/NP<<std::endl;
        std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per node   "<< mflops/NN<<std::endl;
      }

      ///////// Summary over all cases ////////////
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
      std::cout<<GridLogMessage << L<<"^4  Dclov Best  mflop/s      =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage << L<<"^4  Dclov Worst mflop/s      =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage <<fmt << std::endl;
      std::cout<<GridLogMessage ;

      for(int i=0;i<mflops_all.size();i++){
        std::cout<<mflops_all[i]/NN<<" ; " ;
      }
      std::cout<<std::endl;
    }
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    return mflops_best;
  }
 };
 ////////////////////////////////////////////////////////////////////////////
 // Benchmark driver.
 //
 // Runs, in order: the decomposition report, the DWF kernel with Ls=1
 // (reported in the "Clover" column, "temporarily Wilson"), the DWF kernel
 // with Ls=12, and the improved staggered kernel, each over the local sizes
 // in L_list. The memory, batched BLAS (accelerator builds only), SU(4)
 // and communications benchmarks follow according to the do_* switches.
 // Finally a per-node summary is printed and appended to the CSV stream,
 // together with a single "comparison point" (mean of two DWF results).
 //
 // World rank 0 writes Benchmark_usqcd.csv; every other rank writes to
 // /dev/null so that all ranks can call fprintf(FP,...) unconditionally.
 ////////////////////////////////////////////////////////////////////////////
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);

  // Open the CSV sink and refuse to continue without it: every benchmark
  // writes through FP, and a null stream would crash at the first fprintf.
  const char *csv_name = (GlobalSharedMemory::WorldRank==0) ? "Benchmark_usqcd.csv" : "/dev/null";
  FP = fopen(csv_name,"w");
  if ( FP == nullptr ) {
    perror(csv_name);
    assert(FP != nullptr); // abort the job in the same way as the other checks in this file
    return 1;              // reached only when assertions are compiled out
  }

  CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
  LebesgueOrder::Block = std::vector<int>({2,2,2,2});

  Benchmark::Decomposition();

  // Switches for the optional benchmark sections.
  int do_su4=0;
  int do_memory=1;
  int do_comms =1;
  int do_blas  =1;

  // Local volumes to sweep; sel and sel-1 index the two entries that are
  // averaged into the comparison point at the end.
  int sel=4;
  std::vector<int> L_list({8,12,16,24,32});
  int selm1=sel-1;
  assert(selm1>=0 && sel<static_cast<int>(L_list.size()));

  std::vector<double> clover;
  std::vector<double> dwf4;
  std::vector<double> staggered;

  int Ls=1;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " Clover dslash 4D vectorised (temporarily Wilson)" <<std::endl;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  for(int l=0;l<L_list.size();l++){
    clover.push_back(Benchmark::DWF(1,L_list[l]));
  }

  Ls=12;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  for(int l=0;l<L_list.size();l++){
    double result = Benchmark::DWF(Ls,L_list[l]) ;
    dwf4.push_back(result);
  }

  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " Improved Staggered dslash 4D vectorised" <<std::endl;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  for(int l=0;l<L_list.size();l++){
    double result = Benchmark::Staggered(L_list[l]) ;
    staggered.push_back(result);
  }

  // Whole-machine rates exactly as returned by the kernels.
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "L \t\t Clover \t\t DWF4 \t\t Staggered" <<std::endl;
  for(int l=0;l<L_list.size();l++){
    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<std::endl;
  }
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

  // Node count recorded by the most recent kernel benchmark.
  int NN=NN_global;

  if ( do_memory ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Memory benchmark " <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    Benchmark::Memory();
  }

  if ( do_blas ) {
 #if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Batched BLAS benchmark " <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    Benchmark::BLAS();
 #endif
  }

  if ( do_su4 ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " SU(4) benchmark " <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    Benchmark::SU4();
  }

  if ( do_comms ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Communications benchmark " <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    Benchmark::Comms();
  }

  // Per-node table. The log shows the rates in the kernels' own unit
  // (the same quantity the comparison point below calls Mflop/s), whereas
  // the CSV divides by 1000 and is therefore headed GF/s.
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered (Mflop/s per node)" <<std::endl;
  fprintf(FP,"Per node summary table\n");
  fprintf(FP,"\n");
  fprintf(FP,"L , Wilson, DWF4, Staggered, GF/s per node\n");
  fprintf(FP,"\n");
  for(int l=0;l<L_list.size();l++){
    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<std::endl;
    fprintf(FP,"%d , %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.);
  }
  fprintf(FP,"\n");
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

  // Comparison point: mean of the DWF results at L_list[sel] and L_list[sel-1].
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " Comparison point     result: "  << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl;
  std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl;
  std::cout<<std::setprecision(3);
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

  Grid_finalize();
  fclose(FP);
 }
--- a/bootstrap.sh
+++ b/bootstrap.sh
@@ -1,12 +1,12 @@
 #!/usr/bin/env bash
 set -e
-EIGEN_URL='https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.tar.bz2'
+EIGEN_URL='https://gitlab.com/libeigen/eigen/-/archive/3.3.7/eigen-3.3.7.tar.bz2'
-EIGEN_SHA256SUM='b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626'
+EIGEN_SHA256SUM='685adf14bd8e9c015b78097c1dc22f2f01343756f196acdc76a678e1ae352e11'
 echo "-- deploying Eigen source..."
-ARC=$(basename ${EIGEN_URL})
+ARC=`basename ${EIGEN_URL}`
 wget ${EIGEN_URL} --no-check-certificate
 if command -v sha256sum; then
   echo "$EIGEN_SHA256SUM  $(basename "$EIGEN_URL")" \
@@ -14,8 +14,13 @@ if command -v sha256sum; then
 else
   echo "WARNING: could not verify checksum, please install sha256sum" >&2
 fi
-./scripts/update_eigen.sh "${ARC}"
+./scripts/update_eigen.sh ${ARC}
-rm "${ARC}"
+rm ${ARC}
 # patch for non-portable includes in Eigen 3.3.5
 # apparently already fixed in Eigen HEAD so it should not be 
 # a problem in the future (A.P.)
 patch Eigen/unsupported/Eigen/CXX11/Tensor scripts/eigen-3.3.5.Tensor.patch
 echo '-- generating Make.inc files...'
 ./scripts/filelist
 echo '-- generating configure script...'
--- a/examples/Example_plaquette.cc
+++ b/examples/Example_plaquette.cc
@@ -1,183 +0,0 @@
 /* 
 * Example_plaquette.cc                                                               
 * 
 * D. Clarke 
 * 
 * Here I just want to create an incredibly simple main to get started with GRID and get used
 * to its syntax. If the reader is like me, they vaguely understand something about lattice coding,
 * they don't know a ton of C++, don't know much of the fine details, and certainly know nothing about GRID.
 *
 * Once you've made a new executable, like this one, you can bootstrap.sh again. At this point,
 * the code should be able to find your new executable. You can tell that bootstrap.sh worked by
 * having a look at Make.inc. You should see your executable inside there.
 *
 * Warning: This code is illustrative only, not well tested, and not meant for production use. The best
 * way to read this code is to start at main().
 * 
 */
 // All your mains should have this
 #include <Grid/Grid.h>
 using namespace Grid;
 // This copies what already exists in WilsonLoops.h. The point here is to be pedagogical and explain in
 // detail what everything does so we can see how GRID works.
 template <class Gimpl> class WLoops : public Gimpl {
 public:
    // Gimpl seems to be an arbitrary class. Within this class, it is expected that certain types are
    // already defined, things like Scalar and Field. This macro includes a bunch of #typedefs that
    // implement this equivalence at compile time.
    INHERIT_GIMPL_TYPES(Gimpl);
    // Some example Gimpls can be found in GaugeImplementations.h, at the bottom. These are in turn built
    // out of GaugeImplTypes, which can be found in GaugeImplTypes.h. The GaugeImplTypes contain the base
    // field/vector/link/whatever types. These inherit from iScalar, iVector, and iMatrix objects, which
    // are sort of the building blocks for gerenal math objects. The "i" at the beginning of these names
    // indicates that they should be for internal use only. It seems like these base types have the
    // acceleration, e.g. SIMD or GPU or what-have-you, abstracted away. How you accelerate these things
    // appears to be controlled through a template parameter called vtype.
    // The general math/physics objects, such as a color matrix, are built up by nesting these objects.
    // For instance a general color matrix has two color indices, so it's built up like
    //     iScalar<iScalar<iMatrix<vtype ...
    // where the levels going from the inside out are color, spin, then Lorentz indices. Scalars have
    // no indices, so it's what we use when such an index isn't needed. Lattice objects are made by one
    // higher level of indexing using iVector.
    // These types will be used for U and U_mu objects, respectively.
    typedef typename Gimpl::GaugeLinkField GaugeMat;
    typedef typename Gimpl::GaugeField GaugeLorentz;
    // U_mu_nu(x)
    static void dirPlaquette(GaugeMat &plaq, const std::vector<GaugeMat> &U, const int mu, const int nu) {
        // Calls like CovShiftForward and CovShiftBackward have 3 arguments, and they multiply together
        // the first and last argument. (Second arg gives the shift direction.) The CovShiftIdentityBackward
        // has meanwhile only two arguments; it just returns the shifted (adjoint since backward) link. 
        plaq = Gimpl::CovShiftForward(U[mu],mu,
                   // Means Link*Cshift(field,mu,1), arguments are Link, mu, field in that order.
                   Gimpl::CovShiftForward(U[nu],nu,
                       Gimpl::CovShiftBackward(U[mu],mu,
                           // This means Cshift(adj(Link), mu, -1)
                           Gimpl::CovShiftIdentityBackward(U[nu], nu))));
    }
    // tr U_mu_nu(x)
    static void traceDirPlaquette(ComplexField &plaq, const std::vector<GaugeMat> &U, const int mu, const int nu) {
        // This .Grid() syntax seems to get the pointer to the GridBase. Apparently this is needed as argument
        // to instantiate a Lattice object.
        GaugeMat sp(U[0].Grid());
        dirPlaquette(sp, U, mu, nu);
        plaq = trace(sp);
    }
    // sum_mu_nu tr U_mu_nu(x)
    static void sitePlaquette(ComplexField &Plaq, const std::vector<GaugeMat> &U) {
        ComplexField sitePlaq(U[0].Grid());
        Plaq = Zero();
        // Nd=4 and Nc=3 are set as global constants in QCD.h
        for (int mu = 1; mu < Nd; mu++) {
            for (int nu = 0; nu < mu; nu++) {
                traceDirPlaquette(sitePlaq, U, mu, nu);
                Plaq = Plaq + sitePlaq;
            }
        }
    }
    // sum_mu_nu_x Re tr U_mu_nu(x)
    static RealD sumPlaquette(const GaugeLorentz &Umu) {
        std::vector<GaugeMat> U(Nd, Umu.Grid());
        for (int mu = 0; mu < Nd; mu++) {
            // Umu is a GaugeLorentz object, and as such has a non-trivial Lorentz index. We can
            // access the element in the mu Lorentz index with this PeekIndex syntax.
            U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
        }
        ComplexField Plaq(Umu.Grid());
        sitePlaquette(Plaq, U);
        // I guess this should be the line that sums over all space-time sites.
        auto Tp = sum(Plaq);
        // Until now, we have been working with objects inside the tensor nest. This TensorRemove gets
        // rid of the tensor nest to return whatever is inside.
        auto p  = TensorRemove(Tp);
        return p.real();
    }
    // < Re tr U_mu_nu(x) >
    static RealD avgPlaquette(const GaugeLorentz &Umu) {
        // Real double type
        RealD sumplaq = sumPlaquette(Umu);
        // gSites() is the number of global sites. there is also lSites() for local sites.
        double vol = Umu.Grid()->gSites();
        // The number of orientations. 4*3/2=6 for Nd=4, as known.
        double faces = (1.0 * Nd * (Nd - 1)) / 2.0;
        return sumplaq / vol / faces / Nc;
    }
 };
 // Next we show an example of how to construct an input parameter class. We first inherit
 // from Serializable. Then all class data members have to be defined using the
 // GRID_SERIALIZABLE_CLASS_MEMBERS macro. This variadic macro allows for arbitrarily many
 // class data members. In the case below, the parameter class holds a single member: the
 // configuration name, which must be labeled "conf_name" in the parameter file.
 struct ConfParameters: Serializable {
    // Macro arguments: the class name first, then one (type, name) pair per data member.
    GRID_SERIALIZABLE_CLASS_MEMBERS(
        ConfParameters,
        std::string, conf_name);
    // Construct the parameters by reading the "parameters" node from an already-open reader.
    template <class ReaderClass>
    ConfParameters(Reader<ReaderClass>& Reader){
        // If we are reading an XML file, it should be structured like:
        // <grid>
        //   <parameters>
        //     <conf_name>l20t20b06498a_nersc.302500</conf_name>
        //   </parameters>
        // </grid>
        read(Reader, "parameters", *this);
    }
 };
 // This signature lets main receive the command line arguments: argc is the number of arguments
 // and argv points to the first element of an array of C strings (char *), one per argument.
 int main (int argc, char **argv)
 {
    // This initializes Grid. Some command line options include
    //   --mpi n.n.n.n
    //   --threads n
    //   --grid n.n.n.n
    Grid_init(&argc, &argv);
    // This is where you would specify a custom lattice size, if not from the command line. Here
    // Nd is a global quantity that is currently set to 4.
    Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
    Coordinate mpi_layout  = GridDefaultMpi();
    Coordinate latt_size   = GridDefaultLatt();
    // Instantiate the spacetime Grid on which everything will be built.
    GridCartesian GRID(latt_size,simd_layout,mpi_layout);
    // The PeriodicGimplD type is what you want for gauge matrices. There is also a LatticeGaugeFieldD
    // type that you can use, which will work perfectly with what follows.
    PeriodicGimplD::Field U(&GRID);
    // Here we read the parameter file Example_plaquette.xml to get conf_name. The last argument is
    // what the top organizational level is called in the parameter file.
    XmlReader Reader("Example_plaquette.xml",false, "grid");
    ConfParameters param(Reader);
    // Load a lattice from SIMULATeQCD into U. SIMULATeQCD finds plaquette = 0.6381995717
    FieldMetaData header;
    NerscIO::readConfiguration(U, header, param.conf_name);
    // Let's see what we find.
    RealD plaq = WLoops<PeriodicGimplD>::avgPlaquette(U);
    // This is how you make log messages. The precision is derived from RealD, the type of the
    // value being printed, so the digit count does not depend on what Real is typedef'd to.
    std::cout << GridLogMessage << std::setprecision(std::numeric_limits<RealD>::digits10 + 1) << "Plaquette = " << plaq << std::endl;
    // To wrap things up.
    Grid_finalize();
    return 0;
 }
--- a/scripts/eigen-3.3.5.Tensor.patch
+++ b/scripts/eigen-3.3.5.Tensor.patch
@@ -0,0 +1,19 @@
 --- ./Eigen/unsupported/Eigen/CXX11/Tensor	2018-07-23 10:33:42.000000000 +0100
 +++ Tensor	2018-08-28 16:15:56.000000000 +0100
@@ -25,7 +25,7 @@
 #include <utility>
 #endif
 -#include <Eigen/src/Core/util/DisableStupidWarnings.h>
 +#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
 #include "../SpecialFunctions"
 #include "src/util/CXX11Meta.h"
@@ -147,6 +147,6 @@
 #include "src/Tensor/TensorIO.h"
 -#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
 +#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 //#endif // EIGEN_CXX11_TENSOR_MODULE
--- a/systems/Aurora/benchmarks/bench1024.pbs
+++ b/systems/Aurora/benchmarks/bench1024.pbs
@@ -1,60 +0,0 @@
 #!/bin/bash
 ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
 #PBS -q EarlyAppAccess
 #PBS -l select=1024
 #PBS -l walltime=01:00:00
 #PBS -A LatticeQCD_aesp_CNDA
 #export OMP_PROC_BIND=spread
 #unset OMP_PLACES
 cd $PBS_O_WORKDIR
 source ../sourceme.sh
 cat $PBS_NODEFILE
 export OMP_NUM_THREADS=3
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
 #export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 export MPICH_OFI_NIC_POLICY=GPU
 export FI_CXI_CQ_FILL_PERCENT=10
 export FI_CXI_DEFAULT_CQ_SIZE=262144
 #export FI_CXI_DEFAULT_CQ_SIZE=131072
 #export FI_CXI_CQ_FILL_PERCENT=20
 # 12 ppn, 1024 nodes, 12288 ranks
 #
 CMD="mpiexec -np 12288 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_comms_host_device --mpi 8.6.16.16 --grid 64.48.64.284 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
 $CMD 
 CMD="mpiexec -np 12288 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 8.8.8.24 --grid 128.128.128.384 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
 $CMD | tee 1024node.dwf.small.cq
 CMD="mpiexec -np 12288 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 16.8.8.12 --grid 256.256.256.384 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
 $CMD | tee 1024node.dwf.cq
--- a/systems/Aurora/benchmarks/bench12.pbs
+++ b/systems/Aurora/benchmarks/bench12.pbs
@@ -1,60 +0,0 @@
 #!/bin/bash
 ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
 #PBS -q EarlyAppAccess
 #PBS -l select=2
 #PBS -l walltime=01:00:00
 #PBS -A LatticeQCD_aesp_CNDA
 #export OMP_PROC_BIND=spread
 #unset OMP_PLACES
 cd $PBS_O_WORKDIR
 source ../sourceme.sh
 export OMP_NUM_THREADS=3
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
 export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 export MPICH_OFI_NIC_POLICY=GPU
 CMD="mpiexec -np 24 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_comms_host_device --mpi 2.3.2.2 --grid 32.24.32.192 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
 #$CMD 
 CMD="mpiexec -np 24 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 2.3.2.2 --grid 64.96.64.64 --comms-overlap \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
 #$CMD 
 CMD="mpiexec -np 1 -ppn 1  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
 $CMD 
 CMD="mpiexec -np 1 -ppn 1  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
 $CMD 
--- a/systems/Aurora/benchmarks/bench2048.pbs
+++ b/systems/Aurora/benchmarks/bench2048.pbs
@@ -1,56 +0,0 @@
 #!/bin/bash
 ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
 #PBS -q EarlyAppAccess
 #PBS -l select=2048
 #PBS -l walltime=01:00:00
 #PBS -A LatticeQCD_aesp_CNDA
 #export OMP_PROC_BIND=spread
 #unset OMP_PLACES
 cd $PBS_O_WORKDIR
 source ../sourceme.sh
 cat $PBS_NODEFILE
 export OMP_NUM_THREADS=3
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
 export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 export MPICH_OFI_NIC_POLICY=GPU
 # 12 ppn, 2048 nodes, 24576 ranks
 #
 CMD="mpiexec -np 24576 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_comms_host_device --mpi 8.12.16.16 --grid 64.48.64.284 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
 $CMD 
 CMD="mpiexec -np 24576 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 16.8.8.24 --grid 128.128.128.384 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
 $CMD | tee 2048node.dwf.small
 CMD="mpiexec -np 24576 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 16.8.8.24 --grid 256.256.256.768 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
 $CMD | tee 2048node.dwf
--- a/systems/Aurora/benchmarks/bench256.pbs
+++ b/systems/Aurora/benchmarks/bench256.pbs
@@ -1,48 +0,0 @@
 #!/bin/bash
 ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
 #PBS -q EarlyAppAccess
 #PBS -l select=256
 #PBS -l walltime=01:00:00
 #PBS -A LatticeQCD_aesp_CNDA
 #export OMP_PROC_BIND=spread
 #unset OMP_PLACES
 cd $PBS_O_WORKDIR
 source ../sourceme.sh
 cat $PBS_NODEFILE
 export OMP_NUM_THREADS=3
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
 export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 export MPICH_OFI_NIC_POLICY=GPU
 # 12 ppn, 256 nodes, 3072 ranks
 #
 CMD="mpiexec -np 3072 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_comms_host_device --mpi 8.6.8.8 --grid 32.24.32.192 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
 $CMD 
 CMD="mpiexec -np 3072 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 8.8.4.12 --grid 128.128.128.768 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
 $CMD | tee 256node.dwf.large
--- a/systems/Aurora/benchmarks/bench512.pbs
+++ b/systems/Aurora/benchmarks/bench512.pbs
@@ -1,48 +0,0 @@
 #!/bin/bash
 ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
 #PBS -q EarlyAppAccess
 #PBS -l select=512
 #PBS -l walltime=01:00:00
 #PBS -A LatticeQCD_aesp_CNDA
 #export OMP_PROC_BIND=spread
 #unset OMP_PLACES
 cd $PBS_O_WORKDIR
 source ../sourceme.sh
 cat $PBS_NODEFILE
 export OMP_NUM_THREADS=3
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
 export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 export MPICH_OFI_NIC_POLICY=GPU
 # 12 ppn, 512 nodes, 6144 ranks
 #
 CMD="mpiexec -np 6144 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_comms_host_device --mpi 8.6.8.16 --grid 32.24.32.192 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
 $CMD 
 CMD="mpiexec -np 6144 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 8.8.8.12 --grid 256.128.128.768 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
 $CMD | tee 512node.dwf.large
--- a/systems/Aurora/benchmarks/bench_scaling.pbs
+++ b/systems/Aurora/benchmarks/bench_scaling.pbs
@@ -1,80 +0,0 @@
 #!/bin/bash
 ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
 #PBS -q EarlyAppAccess
 #PBS -l select=32
 #PBS -l walltime=01:00:00
 #PBS -A LatticeQCD_aesp_CNDA
 #export OMP_PROC_BIND=spread
 #unset OMP_PLACES
 cd $PBS_O_WORKDIR
 source ../sourceme.sh
 cat $PBS_NODEFILE
 export OMP_NUM_THREADS=3
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
 export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 export MPICH_OFI_NIC_POLICY=GPU
 # 12 ppn, 32 nodes, 384 ranks
 #
 CMD="mpiexec -np 384 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_comms_host_device --mpi 4.6.4.4 --grid 32.24.32.192 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
 $CMD 
 CMD="mpiexec -np 12 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 1.2.2.3 --grid 16.64.64.96 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
 $CMD | tee 1node.dwf
 CMD="mpiexec -np 24 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.64.64.96 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
 $CMD | tee 2node.dwf
 CMD="mpiexec -np 48 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 2.2.2.6 --grid 32.64.64.192 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
 $CMD | tee 4node.dwf
 CMD="mpiexec -np 96 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 2.2.4.6 --grid 32.64.128.192 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
 $CMD | tee 8node.dwf
 CMD="mpiexec -np 192 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 2.4.4.6 --grid 32.128.128.192 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
 $CMD | tee 16node.dwf
 CMD="mpiexec -np 384 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 4.4.4.6 --grid 64.128.128.192 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
 $CMD | tee 32node.dwf
--- a/systems/Aurora/benchmarks/gpu_tile_compact.sh
+++ b/systems/Aurora/benchmarks/gpu_tile_compact.sh
@@ -1,33 +0,0 @@
 #!/bin/bash
 # Per-rank launch wrapper: selects the NUMA binding and GPU tile for this process from
 # $PALS_LOCAL_RANKID, then runs the command given as arguments under numactl.
 # Every table below has 12 entries and is indexed by the local rank id.
 # Memory node handed to "numactl -m".
 export NUMA_MAP=(2 2 2 3 3 3 2 2 2 3 3 3 )
 #export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
 # CPU node handed to "numactl -N".
 export NUMA_PMAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
 # NIC index per local rank; only consumed by the commented-out GRID_MPICH_NIC_BIND line below.
 export  NIC_MAP=(0 1 2 4 5 6 0 1 2 4 5 6 )
 # GPU and tile index per local rank; combined into ZE_AFFINITY_MASK below.
 export  GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )
 export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
 # Look up this rank's entries.
 export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
 export NUMAP=${NUMA_PMAP[$PALS_LOCAL_RANKID]}
 export NIC=${NIC_MAP[$PALS_LOCAL_RANKID]}
 export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
 export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}
 #export GRID_MPICH_NIC_BIND=$NIC
 #export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id
 unset EnableWalkerPartition
 export EnableImplicitScaling=0
 # The affinity mask is "<gpu>.<tile>" for this rank.
 export ZE_AFFINITY_MASK=$gpu_id.$tile_id
 export ONEAPI_DEVICE_FILTER=gpu,level_zero
 #export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
 #export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
 #export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 #export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
 #export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
 #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
 #echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
 # Run the wrapped command ("$@") with memory bound to $NUMA and CPUs bound to $NUMAP.
 numactl -m $NUMA -N $NUMAP  "$@"
--- a/systems/Aurora/benchmarks/gpu_tile_compact4.sh
+++ b/systems/Aurora/benchmarks/gpu_tile_compact4.sh
@@ -1,29 +0,0 @@
 #!/bin/bash
 # Per-rank launch wrapper for 8 ranks per node: selects the NUMA binding and GPU tile for this
 # process from $PALS_LOCAL_RANKID, then runs the command given as arguments under numactl.
 # Every table below has 8 entries and is indexed by the local rank id.
 # Memory node handed to "numactl -m".
 export  NUMA_MAP=(2 2 3 3  2 2  3 3  )
 # CPU node handed to "numactl -N".
 export  PROC_MAP=(0 0 1 1  0 0  1 1  )
 export  NIC_MAP=(0 0  4 4  1 1  5 5  )
 export  GPU_MAP=(0 1  3 4  0 1  3 4  )
 export TILE_MAP=(0 0  0 0  1 1  1 1  )
 # Look up this rank's entries.
 export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
 # PROC_MAP must be subscripted: a bare $PROC_MAP expands to element 0 only, which would bind
 # every local rank to CPU node 0 regardless of its memory node.
 export PROC=${PROC_MAP[$PALS_LOCAL_RANKID]}
 export NIC=${NIC_MAP[$PALS_LOCAL_RANKID]}
 export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
 export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}
 #export GRID_MPICH_NIC_BIND=$NIC
 unset EnableWalkerPartition
 export EnableImplicitScaling=0
 export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
 # The affinity mask is "<gpu>.<tile>" for this rank.
 export ZE_AFFINITY_MASK=$gpu_id.$tile_id
 #export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id
 export ONEAPI_DEVICE_FILTER=gpu,level_zero
 export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
 #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
 # NOTE(review): GRID_MPICH_NIC_BIND is not set by this script (its export above is commented
 # out), so the NIC field prints empty unless the caller's environment provides it.
 echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NIC $GRID_MPICH_NIC_BIND ; NUMA domain $NUMA"
 # Run the wrapped command ("$@") with memory bound to $NUMA and CPUs bound to $PROC.
 numactl -m $NUMA -N $PROC  "$@"
--- a/systems/Aurora/config-command
+++ b/systems/Aurora/config-command
@@ -1,16 +0,0 @@
 TOOLS=$HOME/tools
 ../../configure \
 	--enable-simd=GPU \
 	--enable-gen-simd-width=64 \
 	--enable-comms=mpi-auto \
 	--enable-accelerator-cshift \
 	--disable-gparity \
 	--disable-fermion-reps \
 	--enable-shm=nvlink \
 	--enable-accelerator=sycl \
 	--enable-unified=no \
 	MPICXX=mpicxx \
 	CXX=icpx \
 	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/ -L${MKLROOT}/lib -qmkl=parallel " \
 	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include -qmkl=parallel"
--- a/systems/Aurora/proxies.sh
+++ b/systems/Aurora/proxies.sh
@@ -1,9 +0,0 @@
 export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
 export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
 export http_proxy=http://proxy.alcf.anl.gov:3128
 export https_proxy=http://proxy.alcf.anl.gov:3128
 export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
 git config --global http.proxy http://proxy.alcf.anl.gov:3128
 module use /soft/modulefiles
 module load intel_compute_runtime/release/agama-devel-682.22
--- a/systems/Aurora/sourceme.sh
+++ b/systems/Aurora/sourceme.sh
@@ -1,26 +0,0 @@
 #export ONEAPI_DEVICE_SELECTOR=level_zero:0.0
 module use /soft/modulefiles
 module load intel_compute_runtime/release/agama-devel-682.22
 export FI_CXI_DEFAULT_CQ_SIZE=131072
 export FI_CXI_CQ_FILL_PERCENT=20
 export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
 #export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode"
 #
 # -ftarget-register-alloc-mode=pvc:default 
 # -ftarget-register-alloc-mode=pvc:small
 # -ftarget-register-alloc-mode=pvc:large
 # -ftarget-register-alloc-mode=pvc:auto
 #
 export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
 export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
 export http_proxy=http://proxy.alcf.anl.gov:3128
 export https_proxy=http://proxy.alcf.anl.gov:3128
 #export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
 git config --global http.proxy http://proxy.alcf.anl.gov:3128
 export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
--- a/systems/Aurora/tests/repro16.pbs
+++ b/systems/Aurora/tests/repro16.pbs
@@ -1,40 +0,0 @@
 #!/bin/bash
 ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
 #PBS -q EarlyAppAccess
 #PBS -l select=16
 #PBS -l walltime=01:00:00
 #PBS -A LatticeQCD_aesp_CNDA
 #export OMP_PROC_BIND=spread
 #unset OMP_PLACES
 cd $PBS_O_WORKDIR
 source ../sourceme.sh
 cat $PBS_NODEFILE
 export OMP_NUM_THREADS=3
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
 export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 export MPICH_OFI_NIC_POLICY=GPU
 # 12 ppn, 16 nodes, 192 ranks
 CMD="mpiexec -np 192 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Test_dwf_mixedcg_prec --mpi 2.4.4.6 --grid 64.128.128.192 \
 		--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000"
 $CMD 
--- a/systems/Aurora/tests/solver/stag16.pbs
+++ b/systems/Aurora/tests/solver/stag16.pbs
@@ -1,40 +0,0 @@
 #!/bin/bash
 ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
 #PBS -q EarlyAppAccess
 #PBS -l select=16
 #PBS -l walltime=01:00:00
 #PBS -A LatticeQCD_aesp_CNDA
 #export OMP_PROC_BIND=spread
 #unset OMP_PLACES
 cd $PBS_O_WORKDIR
 source ../../sourceme.sh
 cat $PBS_NODEFILE
 export OMP_NUM_THREADS=3
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
 export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 export MPICH_OFI_NIC_POLICY=GPU
 # 12 ppn, 16 nodes, 192 ranks
 CMD="mpiexec -np 192 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Test_staggered_cg_prec --mpi 2.4.4.6 --grid 128.128.128.192 \
 	     --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000"
 $CMD 
--- a/systems/Booster/benchmarks/Benchmark_usqcd.csv
+++ b/systems/Booster/benchmarks/Benchmark_usqcd.csv
@@ -1,70 +0,0 @@
 Memory Bandwidth
 Bytes, GB/s per node
 3145728, 225.900365
 50331648, 2858.859504
 254803968, 4145.556367
 805306368, 4905.772480
 1966080000, 4978.312557
 GEMM
 M, N, K, BATCH, GF/s per rank
 16, 8, 16, 256, 1.713639
 16, 16, 16, 256, 288.268316
 16, 32, 16, 256, 597.053950
 32, 8, 32, 256, 557.382591
 32, 16, 32, 256, 1100.145311
 32, 32, 32, 256, 1885.080449
 64, 8, 64, 256, 1725.163599
 64, 16, 64, 256, 3389.336566
 64, 32, 64, 256, 4168.252422
 16, 8, 256, 256, 1326.262134
 16, 16, 256, 256, 2318.095475
 16, 32, 256, 256, 3555.436503
 32, 8, 256, 256, 1920.139170
 32, 16, 256, 256, 3486.174753
 32, 32, 256, 256, 5320.821724
 64, 8, 256, 256, 2539.597502
 64, 16, 256, 256, 5003.456775
 64, 32, 256, 256, 7837.531562
 8, 256, 16, 256, 1427.848170
 16, 256, 16, 256, 2222.147815
 32, 256, 16, 256, 2877.121715
 8, 256, 32, 256, 1922.890086
 16, 256, 32, 256, 3199.469082
 32, 256, 32, 256, 4845.405343
 8, 256, 64, 256, 2639.483343
 16, 256, 64, 256, 5012.800299
 32, 256, 64, 256, 7216.006882
 Communications
 Packet bytes, direction, GB/s per node
 4718592, 2, 206.570734
 4718592, 3, 207.501847
 4718592, 6, 189.730277
 4718592, 7, 204.301218
 15925248, 2, 307.882997
 15925248, 3, 287.901076
 15925248, 6, 295.603109
 15925248, 7, 300.682033
 37748736, 2, 331.740364
 37748736, 3, 338.610627
 37748736, 6, 332.580657
 37748736, 7, 336.336579
 Per node summary table
 L , Wilson, DWF4, Staggered, GF/s per node
 8 , 16, 1165, 10
 12 , 473, 4901, 163
 16 , 1436, 8464, 442
 24 , 4133, 10139, 1530
 32 , 5726, 11487, 2518
--- a/systems/Booster/config-command
+++ b/systems/Booster/config-command
@@ -5,12 +5,10 @@ LIME=/p/home/jusers/boyle2/juwels/gm2dwf/boyle/
    --enable-gen-simd-width=64 \
    --enable-shm=nvlink \
    --enable-accelerator=cuda \
    --disable-gparity \
    --disable-fermion-reps \
    --with-lime=$LIME \
-    --enable-accelerator-cshift \
+    --disable-accelerator-cshift \
    --disable-unified \
    CXX=nvcc \
    LDFLAGS="-cudart shared " \
-    CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++17 -cudart shared -lcublas"
+    CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++14 -cudart shared"
--- a/systems/Booster/sourceme.sh
+++ b/systems/Booster/sourceme.sh
@@ -1,5 +1,5 @@
-module load GCC
+module load GCC/9.3.0       
-module load GMP
+module load  GMP/6.2.0   
-module load MPFR
+module load MPFR/4.1.0     
-module load OpenMPI
+module load OpenMPI/4.1.0rc1  
-module load CUDA
+module load CUDA/11.3
--- a/systems/Frontier/config-command
+++ b/systems/Frontier/config-command
@@ -1,23 +0,0 @@
 # Configure line for a HIP build with MPI comms.
 # CLIME is taken from the spack listing for c-lime@2-3-9: the matching line with its first
 # 14 characters cut off, which is then passed to --with-lime.
 # The ROCm include directory is written as ${ROCM_PATH}; the earlier spelling {$ROCM_PATH}
 # left literal braces in the -I path.
 # --disable-fermion-reps used to be listed twice; it is given once here.
 CLIME=$(spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-)
 ../../configure --enable-comms=mpi-auto \
 --with-lime=$CLIME \
 --enable-unified=no \
 --enable-shm=nvlink \
 --enable-tracing=timer \
 --enable-accelerator=hip \
 --enable-gen-simd-width=64 \
 --disable-gparity \
 --disable-fermion-reps \
 --enable-simd=GPU \
 --enable-accelerator-cshift \
 --with-gmp=$OLCF_GMP_ROOT \
 --with-fftw=$FFTW_DIR/.. \
 --with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
 CXX=hipcc MPICXX=mpicxx \
 CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 -fgpu-sanitize" \
 LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64  -lhipblas -lrocblas"
--- a/systems/Frontier/sourceme.sh
+++ b/systems/Frontier/sourceme.sh
@@ -1,13 +0,0 @@
 # Environment setup intended to be sourced before configuring/building.
 # Activates a user-specific spack installation (hard-coded absolute path) so
 # that `spack load` is available.
 . /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
 spack load c-lime
 #export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/gcc-11.2.0/gperftools-2.9.1-72ubwtuc5wcz2meqltbfdb76epufgzo2/lib
 # Toolchain and library modules; cray-mpich is pinned to 8.1.23, the others
 # take whatever default version the module system provides.
 module load emacs 
 module load PrgEnv-gnu
 module load rocm
 module load cray-mpich/8.1.23
 module load gmp
 module load cray-fftw
 module load craype-accel-amd-gfx90a
 # Prepend the MPFR 3.1.4 library directory to the runtime search path.
 # NOTE(review): this path (/opt/gcc/mpfr/...) differs from the --with-mpfr
 # prefix used in the companion config-command (/opt/cray/pe/gcc/mpfr/...) --
 # confirm both exist on the target system.
 export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
 #Hack for lib
 #export LD_LIBRARY_PATH=`pwd`:$LD_LIBRARY_PATH
--- a/systems/Lumi/HMC/32cube/fthmc3gev.slurm
+++ b/systems/Lumi/HMC/32cube/fthmc3gev.slurm
@@ -1,57 +0,0 @@
 #!/bin/bash -l
 #SBATCH --job-name=fthmc3ge
 #SBATCH --partition=small-g
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=8
 ##SBATCH --cpus-per-task=8
 #SBATCH --gpus-per-node=8
 #SBATCH --time=2:00:00
 #SBATCH --account=project_465000546
 #SBATCH --gpu-bind=none
 #SBATCH --exclusive
 #SBATCH --mem=0
 # Slurm batch script: one node, 8 tasks, 8 GPUs. Launches ../FTHMC2p1f_3GeV
 # through a generated per-rank wrapper (./select_gpu) that selects a GPU and
 # a NUMA domain from SLURM_LOCALID.
 #
 # The commented line below would resubmit this script as a dependent job.
 #sbatch --dependency=afterany:$SLURM_JOBID fthmc3gev.slurm
 # NOTE(review): CPU_BIND is only echoed and MEM_BIND is never read in this
 # script; neither is passed to srun. The binding that takes effect is the
 # numactl call inside select_gpu.
 CPU_BIND="map_ldom:3,3,1,1,0,0,2,2"
 MEM_BIND="map_mem:3,3,1,1,0,0,2,2"
 echo $CPU_BIND
 # Generate the wrapper. The heredoc delimiter is unquoted, so every `$` that
 # must survive into the generated file is escaped as `\$`; the wrapper then
 # evaluates those expansions per rank at run time.
 cat << EOF > ./select_gpu
 #!/bin/bash
 export GPU_MAP=(0 1 2 3 4 5 6 7)
 export NUMA_MAP=(3 3 1 1 0 0 2 2)
 export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
 export NUM=\${NUMA_MAP[\$SLURM_LOCALID]}
 #export HIP_VISIBLE_DEVICES=\$GPU
 export ROCR_VISIBLE_DEVICES=\$GPU
 echo RANK \$SLURM_LOCALID using GPU \$GPU    
 echo NUMA \$SLURM_LOCALID using NUMA \${NUM}
 echo numactl -m \$NUM -N \$NUM \$*
 exec numactl -m \$NUM -N \$NUM \$*
 EOF
 # Print the generated wrapper into the job log, then make it executable.
 cat ./select_gpu
 chmod +x ./select_gpu
 # Load the site environment from a hard-coded, user-specific checkout.
 root=/scratch/project_465000546/boylepet/Grid/systems/Lumi
 source ${root}/sourceme.sh
 export OMP_NUM_THREADS=7
 export MPICH_SMP_SINGLE_COPY_MODE=CMA
 export MPICH_GPU_SUPPORT_ENABLED=1
 # Disabled: derive the starting trajectory from the newest checkpoint file.
 #cfg=`ls -rt ckpoint_*lat* | tail -n 1  `
 #traj="${cfg#*.}"
 #cfg=`ls -rt ckpoint_*lat* | tail -n 1  `
 traj=0
 # Lattice volume and MPI decomposition; 1*2*2*2 = 8 ranks, matching
 # --ntasks-per-node=8 on the single node requested above.
 vol=32.32.32.64
 mpi=1.2.2.2
 PARAMS="--mpi $mpi --accelerator-threads 16 --comms-sequential --shm 2048 --shm-mpi 0 --grid $vol"
 # Active configuration is a cold start for 20 trajectories; the commented
 # alternative restarts from a checkpoint for 200.
 #HMCPARAMS="--StartingType CheckpointStart --StartingTrajectory $traj --Trajectories 200"
 HMCPARAMS="--StartingType ColdStart --StartingTrajectory $traj --Trajectories 20"
 srun ./select_gpu ../FTHMC2p1f_3GeV $HMCPARAMS $PARAMS
--- a/systems/Lumi/config-command
+++ b/systems/Lumi/config-command
@@ -23,7 +23,7 @@ echo mpfr X$MPFR
 --disable-fermion-reps \
 --disable-gparity \
 CXX=hipcc MPICXX=mpicxx \
-  CXXFLAGS="-fPIC --offload-arch=gfx90a -I/opt/rocm/include/ -std=c++17 -I/opt/cray/pe/mpich/8.1.23/ofi/gnu/9.1/include" \
+  CXXFLAGS="-fPIC --offload-arch=gfx90a -I/opt/rocm/include/ -std=c++14 -I/opt/cray/pe/mpich/8.1.23/ofi/gnu/9.1/include" \
  LDFLAGS="-L/opt/cray/pe/mpich/8.1.23/ofi/gnu/9.1/lib -lmpi -L/opt/cray/pe/mpich/8.1.23/gtl/lib -lmpi_gtl_hsa -lamdhip64 -fopenmp" 
--- a/systems/PVC-OEM/setup.sh
+++ b/systems/PVC-OEM/setup.sh
@@ -1,5 +1,3 @@
 export https_proxy=http://proxy-chain.intel.com:911
 module load intel-release
 module load intel/mpich
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
--- a/systems/SDCC-A100/bench.slurm
+++ b/systems/SDCC-A100/bench.slurm
@@ -1,42 +0,0 @@
 #!/bin/bash
 #SBATCH --partition csi
 #SBATCH --time=00:10:00
 #SBATCH -A csigeneral
 #SBATCH --exclusive
 #SBATCH --nodes=1
 #SBATCH --ntasks=4
 #SBATCH --qos csi
 #SBATCH --gres=gpu:4
 # Slurm batch script: one node, 4 tasks, 4 GPUs. Runs Benchmark_dwf_fp32
 # through a generated per-rank wrapper (./select_gpu) that pins each rank to
 # one GPU via CUDA_VISIBLE_DEVICES.
 source sourceme.sh
 # Generate the wrapper. The heredoc delimiter is unquoted, so `\$` keeps the
 # expansion literal in the generated file; it is evaluated per rank at run time.
 cat << EOF > select_gpu
 #!/bin/bash
 export GPU_MAP=(0 1 2 3)
 export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
 export CUDA_VISIBLE_DEVICES=\$GPU
 unset ROCR_VISIBLE_DEVICES
 echo RANK \$SLURM_LOCALID using GPU \$GPU    
 exec \$*
 EOF
 chmod +x ./select_gpu
 # OpenMP threads per rank. This is the only thread-count setting in the
 # script: a misspelled `OMP_NUM_THREAD=8` export (no trailing S, so ignored by
 # the OpenMP runtime) used to follow the UCX settings and is intentionally not
 # carried over, which keeps the effective value at 4.
 export OMP_NUM_THREADS=4
 # Open MPI / UCX transport settings.
 export OMPI_MCA_btl=^uct,openib
 export UCX_TLS=cuda,gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
 export UCX_RNDV_SCHEME=put_zcopy
 export UCX_RNDV_THRESH=16384
 export UCX_IB_GPU_DIRECT_RDMA=no
 export UCX_MEMTYPE_CACHE=n
 #srun -N1 -n1 nvidia-smi
 #srun -N1 -n1 numactl -H > numa.txt
 # Dump the node topology to a PDF for reference.
 srun -N1 -n1 lstopo A100-topo.pdf
 # 4.35 TF/s
 #srun -N1 -n1 ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 --shm 2048 --shm-mpi 0  --accelerator-threads 16
 # 4-rank run: --mpi 1.1.2.2 gives 1*1*2*2 = 4 ranks, one per GPU.
 srun -N1 -n4 ./select_gpu ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.2.2 --grid 32.32.64.64 --shm 2048 --shm-mpi 0  --accelerator-threads 16
--- a/systems/SDCC-A100/config-command
+++ b/systems/SDCC-A100/config-command
@@ -1,17 +0,0 @@
 # Configure command for a CUDA build compiled with nvcc, using mpicxx as the
 # host compiler (-ccbin) and generating code for compute capability 8.0
 # (arch=compute_80, code=sm_80). It invokes ../../configure, so it must be run
 # from a build directory two levels below the top of the source tree.
 ../../configure \
 --enable-comms=mpi-auto \
 --enable-unified=no \
 --enable-shm=nvlink \
 --enable-accelerator=cuda \
 --enable-gen-simd-width=64 \
 --enable-simd=GPU \
 --disable-accelerator-cshift \
 --disable-fermion-reps \
 --disable-gparity \
 CXX=nvcc \
 MPICXX=mpicxx \
 LDFLAGS="-cudart shared " \
 CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++17 -cudart shared"
--- a/systems/SDCC-A100/sourceme.sh
+++ b/systems/SDCC-A100/sourceme.sh
@@ -1,2 +0,0 @@
 module load cuda/12.2
 module load openmpi
--- a/systems/SDCC-ARM/config-command-mpi
+++ b/systems/SDCC-ARM/config-command-mpi
@@ -1,6 +0,0 @@
 # Configure command for a clang++ build with NEONv8 SIMD and HDF5 support.
 # NOTE(review): despite the "-mpi" in this file's name, the active command
 # passes --enable-comms=none -- confirm which is intended.
 # HDF is the HDF5 install prefix; its lib/ directory is added to LDFLAGS.
 HDF=$HOME/paboyle/install
 LDFLAGS=-L$HDF/lib CXX=clang++ ../../configure --enable-simd=NEONv8 --enable-comms=none --enable-unified=yes --disable-fermion-reps --disable-gparity --disable-debug --with-hdf5=$HDF 
 # Disabled alternative: identical except for --enable-simd=GEN.
 #LDFLAGS=-L$HDF/lib CXX=clang++ ../../configure --enable-simd=GEN --enable-comms=none --enable-unified=yes --disable-fermion-reps --disable-gparity --disable-debug --with-hdf5=$HDF 
--- a/systems/SDCC-ICE/bench.slurm
+++ b/systems/SDCC-ICE/bench.slurm
@@ -1,31 +0,0 @@
 #!/bin/bash
 #SBATCH --partition lqcd
 #SBATCH --time=00:20:00
 #SBATCH -A lqcdtest
 #SBATCH --exclusive
 #SBATCH --nodes=1
 #SBATCH --ntasks=2
 #SBATCH --qos lqcd
 # Slurm batch script: runs Benchmark_dwf_fp32 over a list of lattice volumes,
 # once with 2 ranks and once with 1 rank, writing each run's output to
 # <vol>.2socket.out / <vol>.1socket.out.
 source sourceme.sh
 # OpenMP threads per rank. The variable must be spelled OMP_NUM_THREADS; the
 # singular form is not read by the OpenMP runtime and leaves the count unset.
 export OMP_NUM_THREADS=24
 #srun -N1 -n1 numactl -H > numa.txt
 #srun -N1 -n1 lstopo ice-topo.pdf
 # Generate the per-rank wrapper. The heredoc delimiter is unquoted, so `\$`
 # keeps the expansion literal in the generated file. The array is named
 # NUMA_MAP to match the lookup on the following line (a differently named
 # array would leave NUMA empty). NUMA is exported but not otherwise used by
 # the wrapper; CPU placement comes from `srun --cpu-bind=ldoms` below.
 cat << EOF > select_socket
 #!/bin/bash
 export NUMA_MAP=(0 1)
 export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]}
 exec \$*
 EOF
 chmod +x ./select_socket
 #for vol in 8.8.8.16 8.8.8.32 8.8.8.64
 #for vol in 8.8.16.16 8.8.16.32 8.8.16.64
 for vol in 8.16.16.16 8.16.16.32 8.16.16.64 16.16.16.32 16.16.16.64 24.24.24.64 32.32.32.32
 do
 srun --cpu-bind=ldoms -N1 -n2 ./select_socket ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid $vol --dslash-asm > $vol.2socket.out
 srun --cpu-bind=ldoms -N1 -n1 ./select_socket ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid $vol --dslash-asm > $vol.1socket.out
 done
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Chulwoo Jung	cfa0576ffd	Getting rid of one more non-auto View, comms overlap in Laplace operator	2024-02-25 22:37:48 -05:00
Chulwoo Jung	fe98e9f555	Fixing Laplace flopcount Minor cleanup	2024-02-13 12:06:08 -05:00
Chulwoo Jung	948d16fb06	Laplace benchmark added	2024-02-12 21:23:36 -05:00
Chulwoo Jung	58fbcaa399	Checking in before cleaning up	2024-02-12 21:10:21 -05:00
Chulwoo Jung	9ad6836b0f	Mixed precision for Laplace. Main program with Metric	2024-02-08 17:13:10 -05:00
Chulwoo Jung	026eb8a695	Wilson RMHMC main program	2023-12-12 15:34:03 -05:00
Chulwoo Jung	076580c232	Recovering mixed precision CG for Laplace Checking in to move to aurora	2023-12-12 15:32:00 -05:00
Chulwoo Jung	7af6022a2a	Added midMD checkpointing (for lattice only for now)	2023-12-04 20:05:41 -05:00
Chulwoo Jung	982a60536c	Checking in before forking	2023-11-22 16:33:15 -05:00
Chulwoo Jung	dc36d272ce	Gauge RMHMC conserving dH	2023-11-21 13:48:51 -05:00
Chulwoo Jung	515ff6bf62	Added Laplacian metric, Gauge OpenBC	2023-11-09 21:42:46 -05:00
		`@@ -1,2 +0,0 @@`
			`module load cuda/12.2`
			`module load openmpi`