Don't need the numerical rounding tolerance in multigrid

Synchronise changes
Speed up the coarsened matrix matrix evaluation.
2025-10-31 03:54:33 +00:00 · 2023-12-22 18:10:23 -05:00 · 2023-12-22 18:09:11 -05:00 · 2023-12-22 18:07:03 -05:00 · 2023-12-22 18:06:13 -05:00 · 2023-12-22 18:05:41 -05:00
159 changed files with 5421 additions and 7625 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,3 @@
-# Doxygen stuff
-html/*
-latex/*
-
 # Compiled Object files #
 #########################
 *.slo
--- a/Grid/Grid_Eigen_Dense.h
+++ b/Grid/Grid_Eigen_Dense.h
@@ -34,7 +34,7 @@
 #pragma push_macro("__SYCL_DEVICE_ONLY__")
 #undef __SYCL_DEVICE_ONLY__
 #define EIGEN_DONT_VECTORIZE
-#undef EIGEN_USE_SYCL
+//#undef EIGEN_USE_SYCL
 #define __SYCL__REDEFINE__
 #endif

--- a/Grid/algorithms/Algorithms.h
+++ b/Grid/algorithms/Algorithms.h
@@ -69,7 +69,8 @@ NAMESPACE_CHECK(BiCGSTAB);
 #include <Grid/algorithms/iterative/PowerMethod.h>

 NAMESPACE_CHECK(PowerMethod);
-#include <Grid/algorithms/CoarsenedMatrix.h>
+#include <Grid/algorithms/multigrid/MultiGrid.h>
+
 NAMESPACE_CHECK(CoarsendMatrix);
 #include <Grid/algorithms/FFT.h>

--- a/Grid/algorithms/FFT.h
+++ b/Grid/algorithms/FFT.h
@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #define _GRID_FFT_H_

 #ifdef HAVE_FFTW
-#if defined(USE_MKL) || defined(GRID_SYCL)
+#ifdef USE_MKL
 #include <fftw/fftw3.h>
 #else
 #include <fftw3.h>
--- a/Grid/algorithms/LinearOperator.h
+++ b/Grid/algorithms/LinearOperator.h
@@ -145,6 +145,44 @@ public:
  }
 };

+////////////////////////////////////////////////////////////////////
+// Create a shifted HermOp
+////////////////////////////////////////////////////////////////////
+template<class Field>
+class ShiftedHermOpLinearOperator : public LinearOperatorBase<Field> {
+  LinearOperatorBase<Field> &_Mat;
+  RealD _shift;
+public:
+  ShiftedHermOpLinearOperator(LinearOperatorBase<Field> &Mat,RealD shift): _Mat(Mat), _shift(shift){};
+  // Support for coarsening to a multigrid
+  void OpDiag (const Field &in, Field &out) {
+    assert(0);
+  }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) {
+    assert(0);
+  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){
+    assert(0);
+  };
+  void Op     (const Field &in, Field &out){
+    assert(0);
+  }
+  void AdjOp     (const Field &in, Field &out){
+    assert(0);
+  }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
+    HermOp(in,out);
+    ComplexD dot = innerProduct(in,out);
+    n1=real(dot);
+    n2=norm2(out);
+  }
+  void HermOp(const Field &in, Field &out){
+    _Mat.HermOp(in,out);
+    out = out + _shift*in;
+  }
+};
+
+
 ////////////////////////////////////////////////////////////////////
 // Wrap an already herm matrix
 ////////////////////////////////////////////////////////////////////
--- a/Grid/algorithms/approx/Chebyshev.h
+++ b/Grid/algorithms/approx/Chebyshev.h
@@ -90,9 +90,8 @@ public:
    order=_order;
      
    if(order < 2) exit(-1);
-    Coeffs.resize(order);
-    Coeffs.assign(0.,order);
-    Coeffs[order-1] = 1.;
+    Coeffs.resize(order,0.0);
+    Coeffs[order-1] = 1.0;
  };
  
  // PB - more efficient low pass drops high modes above the low as 1/x uses all Chebyshev's.
--- a/Grid/algorithms/approx/MultiShiftFunction.h
+++ b/Grid/algorithms/approx/MultiShiftFunction.h
@@ -40,7 +40,7 @@ public:
  RealD norm;
  RealD lo,hi;

-  MultiShiftFunction(int n,RealD _lo,RealD _hi): poles(n), residues(n), lo(_lo), hi(_hi) {;};
+  MultiShiftFunction(int n,RealD _lo,RealD _hi): poles(n), residues(n), tolerances(n), lo(_lo), hi(_hi) {;};
  RealD approx(RealD x);
  void csv(std::ostream &out);
  void gnuplot(std::ostream &out);
--- a/Grid/algorithms/approx/Zolotarev.cc
+++ b/Grid/algorithms/approx/Zolotarev.cc
@@ -293,7 +293,7 @@ static void sncndnFK(INTERNAL_PRECISION u, INTERNAL_PRECISION k,
 * Set type = 0 for the Zolotarev approximation, which is zero at x = 0, and
 * type = 1 for the approximation which is infinite at x = 0. */

-zolotarev_data* zolotarev(ZOLO_PRECISION epsilon, int n, int type) {
+zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
  INTERNAL_PRECISION A, c, cp, kp, ksq, sn, cn, dn, Kp, Kj, z, z0, t, M, F,
    l, invlambda, xi, xisq, *tv, s, opl;
  int m, czero, ts;
@@ -375,12 +375,12 @@ zolotarev_data* zolotarev(ZOLO_PRECISION epsilon, int n, int type) {
  construct_partfrac(d);
  construct_contfrac(d);

-  /* Converting everything to ZOLO_PRECISION for external use only */
+  /* Converting everything to PRECISION for external use only */

  zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
-  zd -> A = (ZOLO_PRECISION) d -> A;
-  zd -> Delta = (ZOLO_PRECISION) d -> Delta;
-  zd -> epsilon = (ZOLO_PRECISION) d -> epsilon;
+  zd -> A = (PRECISION) d -> A;
+  zd -> Delta = (PRECISION) d -> Delta;
+  zd -> epsilon = (PRECISION) d -> epsilon;
  zd -> n = d -> n;
  zd -> type = d -> type;
  zd -> dn = d -> dn;
@@ -390,24 +390,24 @@ zolotarev_data* zolotarev(ZOLO_PRECISION epsilon, int n, int type) {
  zd -> deg_num = d -> deg_num;
  zd -> deg_denom = d -> deg_denom;

-  zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION));
-  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m];
+  zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
+  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
  free(d -> a);

-  zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION));
-  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m];
+  zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
+  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
  free(d -> ap);

-  zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION));
-  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m];
+  zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
+  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
  free(d -> alpha);

-  zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION));
-  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m];
+  zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
+  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
  free(d -> beta);

-  zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION));
-  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m];
+  zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
+  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
  free(d -> gamma);

  free(d);
@@ -426,7 +426,7 @@ void zolotarev_free(zolotarev_data *zdata)
 }


-zolotarev_data* higham(ZOLO_PRECISION epsilon, int n) {
+zolotarev_data* higham(PRECISION epsilon, int n) {
  INTERNAL_PRECISION A, M, c, cp, z, z0, t, epssq;
  int m, czero;
  zolotarev_data *zd;
@@ -481,9 +481,9 @@ zolotarev_data* higham(ZOLO_PRECISION epsilon, int n) {
  /* Converting everything to PRECISION for external use only */

  zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
-  zd -> A = (ZOLO_PRECISION) d -> A;
-  zd -> Delta = (ZOLO_PRECISION) d -> Delta;
-  zd -> epsilon = (ZOLO_PRECISION) d -> epsilon;
+  zd -> A = (PRECISION) d -> A;
+  zd -> Delta = (PRECISION) d -> Delta;
+  zd -> epsilon = (PRECISION) d -> epsilon;
  zd -> n = d -> n;
  zd -> type = d -> type;
  zd -> dn = d -> dn;
@@ -493,24 +493,24 @@ zolotarev_data* higham(ZOLO_PRECISION epsilon, int n) {
  zd -> deg_num = d -> deg_num;
  zd -> deg_denom = d -> deg_denom;

-  zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION));
-  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m];
+  zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
+  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
  free(d -> a);

-  zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION));
-  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m];
+  zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
+  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
  free(d -> ap);

-  zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION));
-  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m];
+  zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
+  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
  free(d -> alpha);

-  zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION));
-  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m];
+  zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
+  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
  free(d -> beta);

-  zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION));
-  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m];
+  zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
+  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
  free(d -> gamma);

  free(d);
@@ -523,17 +523,17 @@ NAMESPACE_END(Grid);
 #ifdef TEST

 #undef ZERO
-#define ZERO ((ZOLO_PRECISION) 0)
+#define ZERO ((PRECISION) 0)
 #undef ONE
-#define ONE ((ZOLO_PRECISION) 1)
+#define ONE ((PRECISION) 1)
 #undef TWO
-#define TWO ((ZOLO_PRECISION) 2)
+#define TWO ((PRECISION) 2)

 /* Evaluate the rational approximation R(x) using the factored form */

-static ZOLO_PRECISION zolotarev_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
+static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) {
  int m;
-  ZOLO_PRECISION R;
+  PRECISION R;

  if (rdata -> type == 0) {
    R = rdata -> A * x;
@@ -551,9 +551,9 @@ static ZOLO_PRECISION zolotarev_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {

 /* Evaluate the rational approximation R(x) using the partial fraction form */

-static ZOLO_PRECISION zolotarev_partfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
+static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) {
  int m;
-  ZOLO_PRECISION R = rdata -> alpha[rdata -> da - 1];
+  PRECISION R = rdata -> alpha[rdata -> da - 1];
  for (m = 0; m < rdata -> dd; m++)
    R += rdata -> alpha[m] / (x * x - rdata -> ap[m]);
  if (rdata -> type == 1) R += rdata -> alpha[rdata -> dd] / (x * x);
@@ -568,18 +568,18 @@ static ZOLO_PRECISION zolotarev_partfrac_eval(ZOLO_PRECISION x, zolotarev_data*
 * non-signalling overflow this will work correctly since 1/(1/0) = 1/INF = 0,
 * but with signalling overflow you will get an error message. */

-static ZOLO_PRECISION zolotarev_contfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
+static PRECISION zolotarev_contfrac_eval(PRECISION x, zolotarev_data* rdata) {
  int m;
-  ZOLO_PRECISION R = rdata -> beta[0] * x;
+  PRECISION R = rdata -> beta[0] * x;
  for (m = 1; m < rdata -> db; m++) R = rdata -> beta[m] * x + ONE / R;
  return R;
 }    

 /* Evaluate the rational approximation R(x) using Cayley form */

-static ZOLO_PRECISION zolotarev_cayley_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
+static PRECISION zolotarev_cayley_eval(PRECISION x, zolotarev_data* rdata) {
  int m;
-  ZOLO_PRECISION T;
+  PRECISION T;

  T = rdata -> type == 0 ? ONE : -ONE;
  for (m = 0; m < rdata -> n; m++)
@@ -607,7 +607,7 @@ int main(int argc, char** argv) {
  int m, n, plotpts = 5000, type = 0;
  float eps, x, ypferr, ycferr, ycaylerr, maxypferr, maxycferr, maxycaylerr;
  zolotarev_data *rdata;
-  ZOLO_PRECISION y;
+  PRECISION y;
  FILE *plot_function, *plot_error, 
    *plot_partfrac, *plot_contfrac, *plot_cayley;

@@ -626,13 +626,13 @@ int main(int argc, char** argv) {
  }

  rdata = type == 2 
-    ? higham((ZOLO_PRECISION) eps, n) 
-    : zolotarev((ZOLO_PRECISION) eps, n, type);
+    ? higham((PRECISION) eps, n) 
+    : zolotarev((PRECISION) eps, n, type);

  printf("Zolotarev Test: R(epsilon = %g, n = %d, type = %d)\n\t" 
 	 STRINGIFY(VERSION) "\n\t" STRINGIFY(HVERSION)
 	 "\n\tINTERNAL_PRECISION = " STRINGIFY(INTERNAL_PRECISION)
-	 "\tZOLO_PRECISION = " STRINGIFY(ZOLO_PRECISION)
+	 "\tPRECISION = " STRINGIFY(PRECISION)
 	 "\n\n\tRational approximation of degree (%d,%d), %s at x = 0\n"
 	 "\tDelta = %g (maximum error)\n\n"
 	 "\tA = %g (overall factor)\n",
@@ -681,15 +681,15 @@ int main(int argc, char** argv) {
    x = 2.4 * (float) m / plotpts - 1.2;
    if (rdata -> type == 0 || fabs(x) * (float) plotpts > 1.0) {
      /* skip x = 0 for type 1, as R(0) is singular */
-      y = zolotarev_eval((ZOLO_PRECISION) x, rdata);
+      y = zolotarev_eval((PRECISION) x, rdata);
      fprintf(plot_function, "%g %g\n", x, (float) y);
      fprintf(plot_error, "%g %g\n",
 	      x, (float)((y - ((x > 0.0 ? ONE : -ONE))) / rdata -> Delta));
-      ypferr = (float)((zolotarev_partfrac_eval((ZOLO_PRECISION) x, rdata) - y)
+      ypferr = (float)((zolotarev_partfrac_eval((PRECISION) x, rdata) - y)
 		       / rdata -> Delta);
-      ycferr = (float)((zolotarev_contfrac_eval((ZOLO_PRECISION) x, rdata) - y)
+      ycferr = (float)((zolotarev_contfrac_eval((PRECISION) x, rdata) - y)
 		       / rdata -> Delta);
-      ycaylerr = (float)((zolotarev_cayley_eval((ZOLO_PRECISION) x, rdata) - y)
+      ycaylerr = (float)((zolotarev_cayley_eval((PRECISION) x, rdata) - y)
 		       / rdata -> Delta);
      if (fabs(x) < 1.0 && fabs(x) > rdata -> epsilon) {
 	maxypferr = MAX(maxypferr, fabs(ypferr));
--- a/Grid/algorithms/approx/Zolotarev.h
+++ b/Grid/algorithms/approx/Zolotarev.h
@@ -9,10 +9,10 @@ NAMESPACE_BEGIN(Approx);
 #define HVERSION Header Time-stamp: <14-OCT-2004 09:26:51.00 adk@MISSCONTRARY>

 #ifndef ZOLOTAREV_INTERNAL
-#ifndef ZOLO_PRECISION
-#define ZOLO_PRECISION double
+#ifndef PRECISION
+#define PRECISION double
 #endif
-#define ZPRECISION ZOLO_PRECISION
+#define ZPRECISION PRECISION
 #define ZOLOTAREV_DATA zolotarev_data
 #endif

@@ -77,8 +77,8 @@ typedef struct {
 * zolotarev_data structure. The arguments must satisfy the constraints that
 * epsilon > 0, n > 0, and type = 0 or 1. */

-ZOLOTAREV_DATA* higham(ZOLO_PRECISION epsilon, int n) ;
-ZOLOTAREV_DATA* zolotarev(ZOLO_PRECISION epsilon, int n, int type);
+ZOLOTAREV_DATA* higham(PRECISION epsilon, int n) ;
+ZOLOTAREV_DATA* zolotarev(PRECISION epsilon, int n, int type);
 void zolotarev_free(zolotarev_data *zdata);
 #endif

@@ -86,4 +86,3 @@ void zolotarev_free(zolotarev_data *zdata);
 NAMESPACE_END(Approx);
 NAMESPACE_END(Grid);
 #endif
-
--- a/Grid/algorithms/iterative/AdefGeneric.h
+++ b/Grid/algorithms/iterative/AdefGeneric.h
@@ -33,218 +33,413 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
   * Script A = SolverMatrix 
   * Script P = Preconditioner
   *
-   * Deflation methods considered
-   *      -- Solve P A x = P b        [ like Luscher ]
-   * DEF-1        M P A x = M P b     [i.e. left precon]
-   * DEF-2        P^T M A x = P^T M b
-   * ADEF-1       Preconditioner = M P + Q      [ Q + M + M A Q]
-   * ADEF-2       Preconditioner = P^T M + Q
-   * BNN          Preconditioner = P^T M P + Q
-   * BNN2         Preconditioner = M P + P^TM +Q - M P A M 
-   * 
   * Implement ADEF-2
   *
   * Vstart = P^Tx + Qb
   * M1 = P^TM + Q
   * M2=M3=1
-   * Vout = x
   */
+NAMESPACE_BEGIN(Grid);

-// abstract base
-template<class Field, class CoarseField>
-class TwoLevelFlexiblePcg : public LinearFunction<Field>
+template<class Field>
+class TwoLevelCG : public LinearFunction<Field>
 {
 public:
-  int verbose;
  RealD   Tolerance;
  Integer MaxIterations;
-  const int mmax = 5;
  GridBase *grid;
-  GridBase *coarsegrid;

-  LinearOperatorBase<Field>   *_Linop
-  OperatorFunction<Field>     *_Smoother,
-  LinearFunction<CoarseField> *_CoarseSolver;
-
-  // Need somthing that knows how to get from Coarse to fine and back again
+  // Fine operator, Smoother, CoarseSolver
+  LinearOperatorBase<Field>   &_FineLinop;
+  LinearFunction<Field>   &_Smoother;
  
  // more most opertor functions
-  TwoLevelFlexiblePcg(RealD tol,
-		     Integer maxit,
-		     LinearOperatorBase<Field> *Linop,
-		     LinearOperatorBase<Field> *SmootherLinop,
-		     OperatorFunction<Field>   *Smoother,
-		     OperatorFunction<CoarseField>  CoarseLinop
-		     ) : 
+  TwoLevelCG(RealD tol,
+	     Integer maxit,
+	     LinearOperatorBase<Field>   &FineLinop,
+	     LinearFunction<Field>       &Smoother,
+	     GridBase *fine) : 
      Tolerance(tol), 
      MaxIterations(maxit),
-      _Linop(Linop),
-      _PreconditionerLinop(PrecLinop),
-      _Preconditioner(Preconditioner)
+      _FineLinop(FineLinop),
+      _Smoother(Smoother)
  {
-    verbose=0;
+    grid       = fine;
  };
  
-  // The Pcg routine is common to all, but the various matrices differ from derived 
-  // implementation to derived implmentation
-  void operator() (const Field &src, Field &psi){
-  void operator() (const Field &src, Field &psi){
-
-    psi.Checkerboard() = src.Checkerboard();
-    grid             = src.Grid();
-
+  virtual void operator() (const Field &src, Field &x)
+  {
+#if 0
+    Field resid(grid);
    RealD f;
    RealD rtzp,rtz,a,d,b;
    RealD rptzp;
-    RealD tn;
-    RealD guess = norm2(psi);
-    RealD ssq   = norm2(src);
-    RealD rsq   = ssq*Tolerance*Tolerance;
    
-    /////////////////////////////
-    // Set up history vectors
-    /////////////////////////////
-    std::vector<Field> p  (mmax,grid);
-    std::vector<Field> mmp(mmax,grid);
-    std::vector<RealD> pAp(mmax);
-
-    Field x  (grid); x = psi;
-    Field z  (grid);
+    Field p(grid);
+    Field z(grid);
    Field tmp(grid);
+    Field mmp(grid);
    Field r  (grid);
    Field mu (grid);
+    Field rp (grid);
    
+    //Initial residual computation & set up
+    double tn;
+
+    GridStopWatch HDCGTimer;
+    HDCGTimer.Start();
    //////////////////////////
    // x0 = Vstart -- possibly modify guess
    //////////////////////////
-    x=src;
+    x=Zero();
    Vstart(x,src);

    // r0 = b -A x0
-    HermOp(x,mmp); // Shouldn't this be something else?
-    axpy (r, -1.0,mmp[0], src);    // Recomputes r=src-Ax0
+    _FineLinop.HermOp(x,mmp);
+
+    axpy(r, -1.0, mmp, src);    // Recomputes r=src-Ax0
+    rp=r;

    //////////////////////////////////
    // Compute z = M1 x
    //////////////////////////////////
-    M1(r,z,tmp,mp,SmootherMirs);
+    PcgM1(r,z);
    rtzp =real(innerProduct(r,z));

    ///////////////////////////////////////
-    // Solve for Mss mu = P A z and set p = z-mu
-    // Def2: p = 1 - Q Az = Pright z 
-    // Other algos M2 is trivial
+    // Except Def2, M2 is trivial
    ///////////////////////////////////////
-    M2(z,p[0]);
+    p=z;

-    for (int k=0;k<=MaxIterations;k++){
+    RealD ssq =  norm2(src);
+    RealD rsq =  ssq*Tolerance*Tolerance;

-      int peri_k  = k % mmax;
-      int peri_kp = (k+1) % mmax;
+    GRID_TRACE("MultiGrid TwoLevel ");
+    std::cout<<GridLogMessage<<"HDCG: k=0 residual "<<rtzp<<" target rsq "<<rsq<<" ssq "<<ssq<<std::endl;
+    
+    for (int k=1;k<=MaxIterations;k++){

      rtz=rtzp;
-      d= M3(p[peri_k],mp,mmp[peri_k],tmp);
+      d= PcgM3(p,mmp);
      a = rtz/d;

-      // Memorise this
-      pAp[peri_k] = d;
+      axpy(x,a,p,x);
+      RealD rn = axpy_norm(r,-a,mmp,r);

-      axpy(x,a,p[peri_k],x);
-      RealD rn = axpy_norm(r,-a,mmp[peri_k],r);
-
-      // Compute z = M x
-      M1(r,z,tmp,mp);
+      PcgM1(r,z);

      rtzp =real(innerProduct(r,z));

-      M2(z,mu); // ADEF-2 this is identity. Axpy possible to eliminate
-
-      p[peri_kp]=p[peri_k];
-
-      // Standard search direction  p -> z + b p    ; b = 
-      b = (rtzp)/rtz;
-
-      int northog;
-      //    northog     = (peri_kp==0)?1:peri_kp; // This is the fCG(mmax) algorithm
-      northog     = (k>mmax-1)?(mmax-1):k;        // This is the fCG-Tr(mmax-1) algorithm
-    
-      for(int back=0; back < northog; back++){
-	int peri_back = (k-back)%mmax;
-	RealD pbApk= real(innerProduct(mmp[peri_back],p[peri_kp]));
-	RealD beta = -pbApk/pAp[peri_back];
-	axpy(p[peri_kp],beta,p[peri_back],p[peri_kp]);
+      int ipcg=1; // almost free inexact preconditioned CG
+      if (ipcg) {
+	rptzp =real(innerProduct(rp,z));
+      } else {
+	rptzp =0;
      }
+      b = (rtzp-rptzp)/rtz;
+
+      PcgM2(z,mu); // ADEF-2 this is identity. Axpy possible to eliminate
+
+      axpy(p,b,p,mu);  // p = b*p + mu

      RealD rrn=sqrt(rn/ssq);
-      std::cout<<GridLogMessage<<"TwoLevelfPcg: k= "<<k<<" residual = "<<rrn<<std::endl;
+      RealD rtn=sqrt(rtz/ssq);
+      std::cout<<GridLogMessage<<"HDCG: Pcg k= "<<k<<" residual = "<<rrn<<std::endl;
+
+      if ( ipcg ) {
+	axpy(rp,0.0,r,r);
+      }

      // Stopping condition
      if ( rn <= rsq ) { 

-	HermOp(x,mmp); // Shouldn't this be something else?
-	axpy(tmp,-1.0,src,mmp[0]);
+	HDCGTimer.Stop();
+	std::cout<<GridLogMessage<<"HDCG: Pcg converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;

-	RealD psinorm = sqrt(norm2(x));
-	RealD srcnorm = sqrt(norm2(src));
-	RealD tmpnorm = sqrt(norm2(tmp));
-	RealD true_residual = tmpnorm/srcnorm;
-	std::cout<<GridLogMessage<<"TwoLevelfPcg:   true residual is "<<true_residual<<std::endl;
-	std::cout<<GridLogMessage<<"TwoLevelfPcg: target residual was"<<Tolerance<<std::endl;
-	return k;
+	_FineLinop.HermOp(x,mmp);			  
+	axpy(tmp,-1.0,src,mmp);
+
+	RealD  mmpnorm = sqrt(norm2(mmp));
+	RealD  xnorm   = sqrt(norm2(x));
+	RealD  srcnorm = sqrt(norm2(src));
+	RealD  tmpnorm = sqrt(norm2(tmp));
+	RealD  true_residual = tmpnorm/srcnorm;
+	std::cout<<GridLogMessage
+		 <<"HDCG: true residual is "<<true_residual
+		 <<" solution "<<xnorm
+		 <<" source "<<srcnorm
+		 <<" mmp "<<mmpnorm	  
+		 <<std::endl;
+
+	return;
      }
+
    }
-    // Non-convergence
-    assert(0);
+    std::cout<<GridLogMessage<<"HDCG: not converged"<<std::endl;
+    RealD  xnorm   = sqrt(norm2(x));
+    RealD  srcnorm = sqrt(norm2(src));
+    std::cout<<GridLogMessage<<"HDCG: non-converged solution "<<xnorm<<" source "<<srcnorm<<std::endl;
+    
+    return ;
+#else
+  RealD f;
+  RealD rtzp,rtz,a,d,b;
+  RealD rptzp;
+
+  /////////////////////////////
+  // Set up history vectors
+  /////////////////////////////
+  int mmax = 20;
+  std::vector<Field> p(mmax,grid);
+  std::vector<Field> mmp(mmax,grid);
+  std::vector<RealD> pAp(mmax);
+  Field z(grid);
+  Field tmp(grid);
+  Field  mp (grid);
+  Field  r  (grid);
+  Field  mu (grid);
+
+  //Initial residual computation & set up
+  RealD guess   = norm2(x);
+  RealD src_nrm = norm2(src);
+
+  if ( src_nrm == 0.0 ) {
+    std::cout << GridLogMessage<<"HDCG: fPcg given trivial source norm "<<src_nrm<<std::endl;
+    x=Zero();
  }
+  RealD tn;
+
+  GridStopWatch HDCGTimer;
+  HDCGTimer.Start();
+  //////////////////////////
+  // x0 = Vstart -- possibly modify guess
+  //////////////////////////
+  Vstart(x,src);
+
+  // r0 = b -A x0
+  _FineLinop.HermOp(x,mmp[0]);
+  axpy (r, -1.0,mmp[0], src);    // Recomputes r=src-Ax0
+  {
+    double n1 = norm2(x);
+    double n2 = norm2(mmp[0]);
+    double n3 = norm2(r);
+    std::cout<<GridLogMessage<<"x,vstart,r = "<<n1<<" "<<n2<<" "<<n3<<std::endl;
+  }
+
+  //////////////////////////////////
+  // Compute z = M1 r
+  //////////////////////////////////
+  PcgM1(r,z);
+  rtzp =real(innerProduct(r,z));
+
+  ///////////////////////////////////////
+  // Solve for Mss mu = P A z and set p = z-mu
+  // Def2: p = 1 - Q Az = Pright z 
+  // Other algos M2 is trivial
+  ///////////////////////////////////////
+  PcgM2(z,p[0]);
+
+  RealD ssq =  norm2(src);
+  RealD rsq =  ssq*Tolerance*Tolerance;
+
+  std::cout << GridLogMessage<<"HDCG: k=0 residual "<<rtzp<<" rsq "<<rsq<<"\n";
+
+  Field pp(grid);
+
+  for (int k=0;k<=MaxIterations;k++){
+    
+    int peri_k  = k % mmax;
+    int peri_kp = (k+1) % mmax;
+
+    rtz=rtzp;
+    d= PcgM3(p[peri_k],mmp[peri_k]);
+    a = rtz/d;
+    
+    // Memorise this
+    pAp[peri_k] = d;
+
+    
+    axpy(x,a,p[peri_k],x);
+    RealD rn = axpy_norm(r,-a,mmp[peri_k],r);
+
+    // Compute z = M1 r
+    PcgM1(r,z);
+
+    {
+      RealD n1,n2;
+      n1=norm2(r);
+      n2=norm2(z);
+      std::cout << GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : vector r,z "<<n1<<" "<<n2<<"\n";
+    }
+    rtzp =real(innerProduct(r,z));
+    std::cout << GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : inner rtzp "<<rtzp<<"\n";
+
+    //    PcgM2(z,p[0]);
+    PcgM2(z,mu); // ADEF-2 this is identity. Axpy possible to eliminate
+
+    p[peri_kp]=mu;
+
+    // Standard search direction  p -> z + b p    ; b = 
+    b = (rtzp)/rtz;
+
+    int northog;
+    // k=zero  <=> peri_kp=1;        northog = 1
+    // k=1     <=> peri_kp=2;        northog = 2
+    // ...               ...                  ...
+    // k=mmax-2<=> peri_kp=mmax-1;   northog = mmax-1
+    // k=mmax-1<=> peri_kp=0;        northog = 1
+
+    //    northog     = (peri_kp==0)?1:peri_kp; // This is the fCG(mmax) algorithm
+    northog     = (k>mmax-1)?(mmax-1):k;        // This is the fCG-Tr(mmax-1) algorithm
+    
+    std::cout<<GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : orthogonalising to last "<<northog<<" vectors\n";
+    for(int back=0; back < northog; back++){
+      int peri_back = (k-back)%mmax;
+      RealD pbApk= real(innerProduct(mmp[peri_back],p[peri_kp]));
+      RealD beta = -pbApk/pAp[peri_back];
+      axpy(p[peri_kp],beta,p[peri_back],p[peri_kp]);
+    }
+
+    RealD rrn=sqrt(rn/ssq);
+    RealD rtn=sqrt(rtz/ssq);
+    RealD rtnp=sqrt(rtzp/ssq);
+
+    std::cout<<GridLogMessage<<"HDCG: fPcg k= "<<k<<" residual = "<<rrn<<"\n";
+
+    // Stopping condition
+    if ( rn <= rsq ) { 
+
+      HDCGTimer.Stop();
+      std::cout<<GridLogMessage<<"HDCG: fPcg converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
+      
+      _FineLinop.HermOp(x,mmp[0]);			  
+      axpy(tmp,-1.0,src,mmp[0]);
+      
+      RealD  mmpnorm = sqrt(norm2(mmp[0]));
+      RealD  xnorm   = sqrt(norm2(x));
+      RealD  srcnorm = sqrt(norm2(src));
+      RealD  tmpnorm = sqrt(norm2(tmp));
+      RealD  true_residual = tmpnorm/srcnorm;
+      std::cout<<GridLogMessage
+	       <<"HDCG: true residual is "<<true_residual
+	       <<" solution "<<xnorm
+	       <<" source "<<srcnorm
+	       <<" mmp "<<mmpnorm	  
+	       <<std::endl;
+      
+      return;
+    }
+
+  }
+  HDCGTimer.Stop();
+  std::cout<<GridLogMessage<<"HDCG: not converged "<<HDCGTimer.Elapsed()<<std::endl;
+  RealD  xnorm   = sqrt(norm2(x));
+  RealD  srcnorm = sqrt(norm2(src));
+  std::cout<<GridLogMessage<<"HDCG: non-converged solution "<<xnorm<<" source "<<srcnorm<<std::endl;
+#endif
+  }
+  

 public:

-  virtual void M(Field & in,Field & out,Field & tmp) {
+  virtual void PcgM1(Field & in, Field & out)     =0;
+  virtual void Vstart(Field & x,const Field & src)=0;

+  virtual void PcgM2(const Field & in, Field & out) {
+    out=in;
  }

-  virtual void M1(Field & in, Field & out) {// the smoother
+  virtual RealD PcgM3(const Field & p, Field & mmp){
+    RealD dd;
+    _FineLinop.HermOp(p,mmp);
+    ComplexD dot = innerProduct(p,mmp);
+    dd=real(dot);
+    return dd;
+  }

+  /////////////////////////////////////////////////////////////////////
+  // Only Def1 has non-trivial Vout.
+  /////////////////////////////////////////////////////////////////////
+
+};
+  
+template<class Field, class CoarseField, class Aggregation>
+class TwoLevelADEF2 : public TwoLevelCG<Field>
+{
+ public:
+  ///////////////////////////////////////////////////////////////////////////////////
+  // Need something that knows how to get from Coarse to fine and back again
+  //  void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
+  //  void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
+  ///////////////////////////////////////////////////////////////////////////////////
+  GridBase *coarsegrid;
+  Aggregation &_Aggregates;                    
+  LinearFunction<CoarseField> &_CoarseSolver;
+  LinearFunction<CoarseField> &_CoarseSolverPrecise;
+  ///////////////////////////////////////////////////////////////////////////////////
+  
+  // more most operator functions
+  TwoLevelADEF2(RealD tol,
+		Integer maxit,
+		LinearOperatorBase<Field>    &FineLinop,
+		LinearFunction<Field>        &Smoother,
+		LinearFunction<CoarseField>  &CoarseSolver,
+		LinearFunction<CoarseField>  &CoarseSolverPrecise,
+		Aggregation &Aggregates
+		) :
+      TwoLevelCG<Field>(tol,maxit,FineLinop,Smoother,Aggregates.FineGrid),
+      _CoarseSolver(CoarseSolver),
+      _CoarseSolverPrecise(CoarseSolverPrecise),
+      _Aggregates(Aggregates)
+  {
+    coarsegrid = Aggregates.CoarseGrid;
+  };
+
+  virtual void PcgM1(Field & in, Field & out)
+  {
+    GRID_TRACE("MultiGridPreconditioner ");
    // [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
-    Field tmp(grid);
-    Field Min(grid);

-    PcgM(in,Min); // Smoother call
+    Field tmp(this->grid);
+    Field Min(this->grid);
+    CoarseField PleftProj(this->coarsegrid);
+    CoarseField PleftMss_proj(this->coarsegrid);

-    HermOp(Min,out);
+    GridStopWatch SmootherTimer;
+    GridStopWatch MatrixTimer;
+    SmootherTimer.Start();
+    this->_Smoother(in,Min);
+    SmootherTimer.Stop();
+
+    MatrixTimer.Start();
+    this->_FineLinop.HermOp(Min,out);
+    MatrixTimer.Stop();
    axpy(tmp,-1.0,out,in);          // tmp  = in - A Min

-    ProjectToSubspace(tmp,PleftProj);     
-    ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s
-    PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]  
+    GridStopWatch ProjTimer;
+    GridStopWatch CoarseTimer;
+    GridStopWatch PromTimer;
+    ProjTimer.Start();
+    this->_Aggregates.ProjectToSubspace(PleftProj,tmp);     
+    ProjTimer.Stop();
+    CoarseTimer.Start();
+    this->_CoarseSolver(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s
+    CoarseTimer.Stop();
+    PromTimer.Start();
+    this->_Aggregates.PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]  
+    PromTimer.Stop();
+    std::cout << GridLogPerformance << "PcgM1 breakdown "<<std::endl;
+    std::cout << GridLogPerformance << "\tSmoother   " << SmootherTimer.Elapsed() <<std::endl;
+    std::cout << GridLogPerformance << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
+    std::cout << GridLogPerformance << "\tProj       " << ProjTimer.Elapsed() <<std::endl;
+    std::cout << GridLogPerformance << "\tCoarse     " << CoarseTimer.Elapsed() <<std::endl;
+    std::cout << GridLogPerformance << "\tProm       " << PromTimer.Elapsed() <<std::endl;
+
    axpy(out,1.0,Min,tmp); // Min+tmp
  }

-  virtual void M2(const Field & in, Field & out) {
-    out=in;
-    // Must override for Def2 only
-    //  case PcgDef2:
-    //    Pright(in,out);
-    //    break;
-  }
-
-  virtual RealD M3(const Field & p, Field & mmp){
-    double d,dd;
-    HermOpAndNorm(p,mmp,d,dd);
-    return dd;
-    // Must override for Def1 only
-    //  case PcgDef1:
-    //    d=linop_d->Mprec(p,mmp,tmp,0,1);// Dag no
-    //      linop_d->Mprec(mmp,mp,tmp,1);// Dag yes
-    //    Pleft(mp,mmp);
-    //    d=real(linop_d->inner(p,mmp));
-  }
-
-  virtual void VstartDef2(Field & xconst Field & src){
-    //case PcgDef2:
-    //case PcgAdef2: 
-    //case PcgAdef2f:
-    //case PcgV11f:
+  virtual void Vstart(Field & x,const Field & src)
+  {
    ///////////////////////////////////
    // Choose x_0 such that 
    // x_0 = guess +  (A_ss^inv) r_s = guess + Ass_inv [src -Aguess]
@@ -256,142 +451,74 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
    //                   = src_s - (A guess)_s - src_s  + (A guess)_s 
    //                   = 0 
    ///////////////////////////////////
-    Field r(grid);
-    Field mmp(grid);
+    Field r(this->grid);
+    Field mmp(this->grid);
+    CoarseField PleftProj(this->coarsegrid);
+    CoarseField PleftMss_proj(this->coarsegrid);

-    HermOp(x,mmp);
-    axpy (r, -1.0, mmp, src);        // r_{-1} = src - A x
-    ProjectToSubspace(r,PleftProj);     
-    ApplyInverseCG(PleftProj,PleftMss_proj); // Ass^{-1} r_s
-    PromoteFromSubspace(PleftMss_proj,mmp);  
-    x=x+mmp;
+    this->_Aggregates.ProjectToSubspace(PleftProj,src);     
+    this->_CoarseSolverPrecise(PleftProj,PleftMss_proj); // Ass^{-1} r_s
+    this->_Aggregates.PromoteFromSubspace(PleftMss_proj,x);  

  }

+};
+
+template<class Field>
+class TwoLevelADEF1defl : public TwoLevelCG<Field> // two-level CG whose PcgM1 deflates a supplied set of eigenpairs
+{
+public:
+  const std::vector<Field> &evec; // deflation vectors, held by reference (not copied)
+  const std::vector<RealD> &eval; // eval[i] divides the coefficient of evec[i] in PcgM1
+  
+  TwoLevelADEF1defl(RealD tol,
+		   Integer maxit,
+		   LinearOperatorBase<Field>   &FineLinop,
+		   LinearFunction<Field>   &Smoother,
+		   std::vector<Field> &_evec,
+		   std::vector<RealD> &_eval) : 
+    TwoLevelCG<Field>(tol,maxit,FineLinop,Smoother,_evec[0].Grid()), // grid comes from _evec[0], so _evec must be non-empty
+    evec(_evec),
+    eval(_eval)
+  {};
+
+  // Can just inherit existing M2
+  // Can just inherit existing M3
+
+  // Simple vstart - start the iteration from x = src (Q is not applied)
  virtual void Vstart(Field & x,const Field & src){
-    return;
+    x=src; // Could apply Q
+  };
+
+  // Override PcgM1: out = Smoother[(1-proj) in] + Q in, using the stored eigenpairs
+  virtual void PcgM1(Field & in, Field & out)
+  {
+    GRID_TRACE("EvecPreconditioner ");
+    int N=evec.size();
+    Field Pin(this->grid);
+    Field Qin(this->grid);
+
+    // Deflated preconditioner assembled from (evec[i], eval[i]):
+    //   Q    = Sum_i |phi_i> 1/lambda_i <phi_i|
+    //   proj = Sum_i |phi_i> <phi_i|   ( equals A Q when the phi_i are eigenvectors of A )
+    //   Qin  = Q in ,  Pin = (1-proj) in
+    //   out  = Smoother(Pin) + Qin
+    Qin.Checkerboard()=in.Checkerboard();
+    Qin = Zero();
+    Pin = in;
+    for (int i=0;i<N;i++) {
+      const Field& tmp = evec[i];
+      auto ip = TensorRemove(innerProduct(tmp,in)); // <phi_i|in>
+      axpy(Qin, ip / eval[i],tmp,Qin); // NOTE(review): assumes eval has at least evec.size() entries
+      axpy(Pin, -ip ,tmp,Pin);
+    }
+
+    this->_Smoother(Pin,out);
+
+    out = out + Qin;
  }
+};

-  /////////////////////////////////////////////////////////////////////
-  // Only Def1 has non-trivial Vout. Override in Def1
-  /////////////////////////////////////////////////////////////////////
-  virtual void   Vout  (Field & in, Field & out,Field & src){
-    out = in;
-    //case PcgDef1:
-    //    //Qb + PT x
-    //    ProjectToSubspace(src,PleftProj);     
-    //    ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} r_s
-    //    PromoteFromSubspace(PleftMss_proj,tmp);  
-    //    
-    //    Pright(in,out);
-    //    
-    //    linop_d->axpy(out,tmp,out,1.0);
-    //    break;
-  }
+NAMESPACE_END(Grid);

-  ////////////////////////////////////////////////////////////////////////////////////////////////
-  // Pright and Pleft are common to all implementations
-  ////////////////////////////////////////////////////////////////////////////////////////////////
-  virtual void Pright(Field & in,Field & out){
-    // P_R  = [ 1              0 ] 
-    //        [ -Mss^-1 Msb    0 ] 
-    Field in_sbar(grid);
-
-    ProjectToSubspace(in,PleftProj);     
-    PromoteFromSubspace(PleftProj,out);  
-    axpy(in_sbar,-1.0,out,in);       // in_sbar = in - in_s 
-
-    HermOp(in_sbar,out);
-    ProjectToSubspace(out,PleftProj);           // Mssbar in_sbar  (project)
-
-    ApplyInverse     (PleftProj,PleftMss_proj); // Mss^{-1} Mssbar 
-    PromoteFromSubspace(PleftMss_proj,out);     // 
-
-    axpy(out,-1.0,out,in_sbar);     // in_sbar - Mss^{-1} Mssbar in_sbar
-  }
-  virtual void Pleft (Field & in,Field & out){
-    // P_L  = [ 1  -Mbs Mss^-1] 
-    //        [ 0   0         ] 
-    Field in_sbar(grid);
-    Field    tmp2(grid);
-    Field    Mtmp(grid);
-
-    ProjectToSubspace(in,PleftProj);     
-    PromoteFromSubspace(PleftProj,out);  
-    axpy(in_sbar,-1.0,out,in);      // in_sbar = in - in_s
-
-    ApplyInverse(PleftProj,PleftMss_proj); // Mss^{-1} in_s
-    PromoteFromSubspace(PleftMss_proj,out);
-
-    HermOp(out,Mtmp);
-
-    ProjectToSubspace(Mtmp,PleftProj);      // Msbar s Mss^{-1}
-    PromoteFromSubspace(PleftProj,tmp2);
-
-    axpy(out,-1.0,tmp2,Mtmp);
-    axpy(out,-1.0,out,in_sbar);     // in_sbar - Msbars Mss^{-1} in_s
-  }
-}
-
-template<class Field>
-class TwoLevelFlexiblePcgADef2 : public TwoLevelFlexiblePcg<Field> {
- public:
-  virtual void M(Field & in,Field & out,Field & tmp){
-
-  } 
-  virtual void M1(Field & in, Field & out,Field & tmp,Field & mp){
-
-  }
-  virtual void M2(Field & in, Field & out){
-
-  }
-  virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp){
-
-  }
-  virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp){
-
-  }
-}
-/*
-template<class Field>
-class TwoLevelFlexiblePcgAD : public TwoLevelFlexiblePcg<Field> {
- public:
-  virtual void M(Field & in,Field & out,Field & tmp); 
-  virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
-  virtual void M2(Field & in, Field & out);
-  virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
-  virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
-}
-
-template<class Field>
-class TwoLevelFlexiblePcgDef1 : public TwoLevelFlexiblePcg<Field> {
- public:
-  virtual void M(Field & in,Field & out,Field & tmp); 
-  virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
-  virtual void M2(Field & in, Field & out);
-  virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
-  virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
-  virtual void   Vout  (Field & in, Field & out,Field & src,Field & tmp);
-}
-
-template<class Field>
-class TwoLevelFlexiblePcgDef2 : public TwoLevelFlexiblePcg<Field> {
- public:
-  virtual void M(Field & in,Field & out,Field & tmp); 
-  virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
-  virtual void M2(Field & in, Field & out);
-  virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
-  virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
-}
-
-template<class Field>
-class TwoLevelFlexiblePcgV11: public TwoLevelFlexiblePcg<Field> {
- public:
-  virtual void M(Field & in,Field & out,Field & tmp); 
-  virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
-  virtual void M2(Field & in, Field & out);
-  virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
-  virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
-}
-*/
 #endif
--- a/Grid/algorithms/iterative/ConjugateGradient.h
+++ b/Grid/algorithms/iterative/ConjugateGradient.h
@@ -183,13 +183,13 @@ public:
 		  << "\tTrue residual " << true_residual
 		  << "\tTarget " << Tolerance << std::endl;

-        std::cout << GridLogMessage << "Time breakdown "<<std::endl;
 	std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tInner      " << InnerTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
+        std::cout << GridLogPerformance << "Time breakdown "<<std::endl;
+	std::cout << GridLogPerformance << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
+	std::cout << GridLogPerformance << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
+	std::cout << GridLogPerformance << "\tInner      " << InnerTimer.Elapsed() <<std::endl;
+	std::cout << GridLogPerformance << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl;
+	std::cout << GridLogPerformance << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;

 	std::cout << GridLogDebug << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;

@@ -207,7 +207,8 @@ public:

    TrueResidual = sqrt(norm2(p)/ssq);

-    std::cout << GridLogMessage << "ConjugateGradient did NOT converge "<<k<<" / "<< MaxIterations<< std::endl;
+    std::cout << GridLogMessage << "ConjugateGradient did NOT converge "<<k<<" / "<< MaxIterations
+	      <<" residual "<< TrueResidual<< std::endl;

    if (ErrorOnNoConverge) assert(0);
    IterationsToComplete = k;
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
@@ -144,7 +144,7 @@ public:
    for(int s=0;s<nshift;s++){
      rsq[s] = cp * mresidual[s] * mresidual[s];
      std::cout<<GridLogMessage<<"ConjugateGradientMultiShift: shift "<<s
-	       <<" target resid "<<rsq[s]<<std::endl;
+	       <<" target resid^2 "<<rsq[s]<<std::endl;
      ps[s] = src;
    }
    // r and p for primary
--- a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -79,14 +79,16 @@ template<class Field> class ImplicitlyRestartedLanczosHermOpTester  : public Imp
    RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0);

    std::cout.precision(13);
-    std::cout<<GridLogIRL  << "[" << std::setw(3)<<j<<"] "
-	     <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
-	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
-	     <<std::endl;

    int conv=0;
    if( (vv<eresid*eresid) ) conv = 1;

+    std::cout<<GridLogIRL  << "[" << std::setw(3)<<j<<"] "
+	     <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
+	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
+	     <<" target " << eresid*eresid << " conv " <<conv
+	     <<std::endl;
+
    return conv;
  }
 };
@@ -457,7 +459,7 @@ until convergence
 	    std::vector<Field>& evec,
 	    Field& w,int Nm,int k)
  {
-    std::cout<<GridLogIRL << "Lanczos step " <<k<<std::endl;
+    std::cout<<GridLogDebug << "Lanczos step " <<k<<std::endl;
    const RealD tiny = 1.0e-20;
    assert( k< Nm );

@@ -465,7 +467,7 @@ until convergence

    Field& evec_k = evec[k];

-    _PolyOp(evec_k,w);    std::cout<<GridLogIRL << "PolyOp" <<std::endl;
+    _PolyOp(evec_k,w);    std::cout<<GridLogDebug << "PolyOp" <<std::endl;

    if(k>0) w -= lme[k-1] * evec[k-1];

@@ -480,18 +482,18 @@ until convergence
    lme[k] = beta;

    if ( (k>0) && ( (k % orth_period) == 0 )) {
-      std::cout<<GridLogIRL << "Orthogonalising " <<k<<std::endl;
+      std::cout<<GridLogDebug << "Orthogonalising " <<k<<std::endl;
      orthogonalize(w,evec,k); // orthonormalise
-      std::cout<<GridLogIRL << "Orthogonalised " <<k<<std::endl;
+      std::cout<<GridLogDebug << "Orthogonalised " <<k<<std::endl;
    }

    if(k < Nm-1) evec[k+1] = w;

-    std::cout<<GridLogIRL << "alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl;
+    std::cout<<GridLogIRL << "Lanczos step alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl;
    if ( beta < tiny ) 
      std::cout<<GridLogIRL << " beta is tiny "<<beta<<std::endl;

-    std::cout<<GridLogIRL << "Lanczos step complete " <<k<<std::endl;
+    std::cout<<GridLogDebug << "Lanczos step complete " <<k<<std::endl;
  }

  void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme, 
--- a/Grid/algorithms/iterative/NormalEquations.h
+++ b/Grid/algorithms/iterative/NormalEquations.h
@@ -33,7 +33,7 @@ NAMESPACE_BEGIN(Grid);
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // Take a matrix and form an NE solver calling a Herm solver
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
-template<class Field> class NormalEquations {
+template<class Field> class NormalEquations : public LinearFunction<Field>{
 private:
  SparseMatrixBase<Field> & _Matrix;
  OperatorFunction<Field> & _HermitianSolver;
@@ -60,7 +60,7 @@ public:
  }     
 };

-template<class Field> class HPDSolver {
+template<class Field> class HPDSolver : public LinearFunction<Field> {
 private:
  LinearOperatorBase<Field> & _Matrix;
  OperatorFunction<Field> & _HermitianSolver;
@@ -78,13 +78,13 @@ public:
  void operator() (const Field &in, Field &out){
 
    _Guess(in,out);
-    _HermitianSolver(_Matrix,in,out);  // Mdag M out = Mdag in
+    _HermitianSolver(_Matrix,in,out);  //M out = in

  }     
 };


-template<class Field> class MdagMSolver {
+template<class Field> class MdagMSolver : public LinearFunction<Field> {
 private:
  SparseMatrixBase<Field> & _Matrix;
  OperatorFunction<Field> & _HermitianSolver;
--- a/Grid/algorithms/iterative/PowerMethod.h
+++ b/Grid/algorithms/iterative/PowerMethod.h
@@ -20,7 +20,7 @@ template<class Field> class PowerMethod
    RealD evalMaxApprox = 0.0; 
    auto src_n = src; 
    auto tmp = src; 
-    const int _MAX_ITER_EST_ = 50; 
+    const int _MAX_ITER_EST_ = 100; 

    for (int i=0;i<_MAX_ITER_EST_;i++) { 
      
--- a/Grid/algorithms/multigrid/Aggregates.h
+++ b/Grid/algorithms/multigrid/Aggregates.h
@@ -0,0 +1,381 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./Grid/algorithms/multigrid/Aggregates.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+inline RealD AggregatePowerLaw(RealD x) // weight x^-5; passed to the Chebyshev constructors in Aggregation
+{
+  //  Alternative exponent left disabled: std::pow(x,-4)
+  //  Alternative exponent left disabled: std::pow(x,-3)
+  return std::pow(x,-5);
+}
+
+template<class Fobj,class CComplex,int nbasis>
+class Aggregation { // holds nbasis fine-grid vectors and maps fields between the fine and coarse grids
+public:
+  typedef iVector<CComplex,nbasis >             siteVector; // one coefficient per basis vector
+  typedef Lattice<siteVector>                 CoarseVector;
+  typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
+
+  typedef Lattice< CComplex >   CoarseScalar; // used for inner products on fine field
+  typedef Lattice<Fobj >        FineField;
+
+  GridBase *CoarseGrid; // stored as given to the constructor
+  GridBase *FineGrid; // stored as given to the constructor
+  std::vector<Lattice<Fobj> > subspace; // the nbasis basis vectors, allocated on FineGrid
+  int checkerboard;
+  int Checkerboard(void){return checkerboard;} // returns the value passed at construction
+  Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) : 
+    CoarseGrid(_CoarseGrid),
+    FineGrid(_FineGrid),
+    subspace(nbasis,_FineGrid), // nbasis fine-grid fields; contents are set later by a CreateSubspace* call
+    checkerboard(_checkerboard)
+  {
+  };
+  
+  
+  void Orthogonalise(void){
+    CoarseScalar InnerProd(CoarseGrid); 
+    //    Block Gram-Schmidt pass over the basis vectors (the progress message for this pass is disabled)
+    blockOrthogonalise(InnerProd,subspace); // InnerProd is workspace handed to the helper
+  } 
+  void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){ // fine -> coarse; note the output comes first
+    blockProject(CoarseVec,FineVec,subspace);
+  }
+  void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){ // coarse -> fine; note the input comes first
+    FineVec.Checkerboard() = subspace[0].Checkerboard(); // output takes the checkerboard of the basis vectors
+    blockPromote(CoarseVec,FineVec,subspace);
+  }
+
+  virtual void CreateSubspaceRandom(GridParallelRNG  &RNG) { // fill every basis vector with normalised Gaussian noise
+    int nn=nbasis;
+    RealD scale;
+    FineField noise(FineGrid);
+    for(int b=0;b<nn;b++){
+      subspace[b] = Zero(); // overwritten by the assignment at the end of this iteration
+      gaussian(RNG,noise);
+      scale = std::pow(norm2(noise),-0.5); 
+      noise=noise*scale; // rescaled so that norm2(noise) is 1 up to rounding
+      subspace[b] = noise;
+    }
+  }
+  virtual void CreateSubspace(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) // basis = normalised noise passed through a loose CG solve
+  {
+
+    RealD scale;
+
+    ConjugateGradient<FineField> CG(1.0e-2,100,false); // loose solver: arguments 1e-2, 100, false
+    FineField noise(FineGrid);
+    FineField Mn(FineGrid);
+
+    for(int b=0;b<nn;b++){ // NOTE(review): subspace holds nbasis entries, so nn is assumed <= nbasis
+      
+      subspace[b] = Zero(); // zeroed before it is handed to CG as the solution field
+      gaussian(RNG,noise);
+      scale = std::pow(norm2(noise),-0.5); 
+      noise=noise*scale;
+      
+      hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise   ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
+
+      for(int i=0;i<1;i++){ // a single filtering pass; the bound is hard-coded to 1
+
+	CG(hermop,noise,subspace[b]); // noise is the source, subspace[b] receives the result
+
+	noise = subspace[b];
+	scale = std::pow(norm2(noise),-0.5); 
+	noise=noise*scale;
+
+      }
+
+      hermop.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(Mn)<<std::endl;
+      subspace[b]   = noise; // store the normalised, filtered vector
+
+    }
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////////////////////
+  // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
+  // and this is the best I found
+  ////////////////////////////////////////////////////////////////////////////////////////////////
+
+  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
+				       int nn, // number of vectors that must be produced (asserted at the end)
+				       double hi, // upper end of the Chebyshev interval
+				       double lo, // lower end used by the pass-1 filter
+				       int orderfilter, // order of the pass-1 filter
+				       int ordermin, // first recurrence order eligible for storage in pass 2
+				       int orderstep, // spacing in order between stored pass-2 vectors
+				       double filterlo // replaces lo for the pass-2 recurrence
+				       ) {
+
+    RealD scale;
+
+    FineField noise(FineGrid);
+    FineField Mn(FineGrid);
+    FineField tmp(FineGrid);
+
+    // New normalised noise
+    gaussian(RNG,noise);
+    scale = std::pow(norm2(noise),-0.5); 
+    noise=noise*scale;
+
+    std::cout << GridLogMessage<<" Chebyshev subspace pass-1 : ord "<<orderfilter<<" ["<<lo<<","<<hi<<"]"<<std::endl;
+    std::cout << GridLogMessage<<" Chebyshev subspace pass-2 : nbasis"<<nn<<" min "
+	      <<ordermin<<" step "<<orderstep
+	      <<" lo"<<filterlo<<std::endl;
+
+    // Initial matrix element
+    hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
+
+    int b =0; // index of the next basis vector to fill
+    {
+      // Filter
+      Chebyshev<FineField> Cheb(lo,hi,orderfilter);
+      Cheb(hermop,noise,Mn);
+      // normalise
+      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
+      subspace[b]   = Mn;
+      hermop.Op(Mn,tmp); 
+      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+      b++;
+    }
+
+    // Generate a full sequence of Chebyshevs
+    {
+      lo=filterlo;
+      noise=Mn; // pass 2 starts from the pass-1 filtered vector
+
+      FineField T0(FineGrid); T0 = noise;  
+      FineField T1(FineGrid); 
+      FineField T2(FineGrid);
+      FineField y(FineGrid);
+      
+      FineField *Tnm = &T0; // T_{n-1}
+      FineField *Tn  = &T1; // T_n
+      FineField *Tnp = &T2; // T_{n+1}
+
+      // Tn=T1 = (xscale M + mscale)in
+      RealD xscale = 2.0/(hi-lo);
+      RealD mscale = -(hi+lo)/(hi-lo);
+      hermop.HermOp(T0,y);
+      T1=y*xscale+noise*mscale;
+
+      for(int n=2;n<=ordermin+orderstep*(nn-2);n++){
+	
+	hermop.HermOp(*Tn,y);
+
+	autoView( y_v , y, AcceleratorWrite); // NOTE(review): all four views are opened AcceleratorWrite,
+	autoView( Tn_v , (*Tn), AcceleratorWrite); // although Tn and Tnm are only read in the loop below
+	autoView( Tnp_v , (*Tnp), AcceleratorWrite);
+	autoView( Tnm_v , (*Tnm), AcceleratorWrite);
+	const int Nsimd = CComplex::Nsimd();
+	accelerator_for(ss, FineGrid->oSites(), Nsimd, {
+	  coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss)); // y <- xscale*(M Tn) + mscale*Tn
+	  coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss)); // T_{n+1} = 2 y - T_{n-1}
+        });
+
+	// Possible more fine grained control is needed than a linear sweep,
+	// but huge productivity gain if this is simple algorithm and not a tunable
+	int m =1; // NOTE(review): with orderstep==1 this default also passes the test below for n<ordermin
+	if ( n>=ordermin ) m=n-ordermin;
+	if ( (m%orderstep)==0 ) { 
+	  Mn=*Tnp;
+	  scale = std::pow(norm2(Mn),-0.5);         Mn=Mn*scale;
+	  subspace[b] = Mn; // NOTE(review): no bound check on b here; the assert only runs after the loop
+	  hermop.Op(Mn,tmp); 
+	  std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+	  b++;
+	}
+
+	// Cycle pointers to avoid copies
+	FineField *swizzle = Tnm;
+	Tnm    =Tn;
+	Tn     =Tnp;
+	Tnp    =swizzle;
+	  
+      }
+    }
+    assert(b==nn); // the order schedule must have produced exactly nn vectors
+  }
+  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
+				       int nn, // only printed; the loop below runs over nbasis
+				       double hi, // upper end of the Chebyshev interval
+				       double lo, // lower end of the Chebyshev interval
+				       int orderfilter // order of the first filter
+				       ) {
+
+    RealD scale;
+
+    FineField noise(FineGrid);
+    FineField Mn(FineGrid);
+    FineField tmp(FineGrid);
+
+    // Report the filter settings
+    std::cout << GridLogMessage<<" Chebyshev subspace pure noise : ord "<<orderfilter<<" ["<<lo<<","<<hi<<"]"<<std::endl;
+    std::cout << GridLogMessage<<" Chebyshev subspace pure noise  : nbasis "<<nn<<std::endl;
+
+
+    for(int b =0;b<nbasis;b++) // NOTE(review): fills all nbasis vectors regardless of nn
+    {
+      gaussian(RNG,noise); // independent normalised noise for every basis vector
+      scale = std::pow(norm2(noise),-0.5); 
+      noise=noise*scale;
+
+      // Initial matrix element
+      hermop.Op(noise,Mn);
+      if(b==0) std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
+
+      // Filter
+      Chebyshev<FineField> Cheb(lo,hi,orderfilter);
+      Cheb(hermop,noise,Mn);
+      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
+
+      // Refine: second Chebyshev built from AggregatePowerLaw on [lo,hi] with order 1000
+      Chebyshev<FineField> PowerLaw(lo,hi,1000,AggregatePowerLaw);
+      noise = Mn;
+      PowerLaw(hermop,noise,Mn);
+      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
+
+      // Store the normalised vector and report its matrix element
+      subspace[b]   = Mn;
+      hermop.Op(Mn,tmp); 
+      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+    }
+
+  }
+
+  virtual void CreateSubspaceChebyshevPowerLaw(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
+					       int nn, // only printed; the loop below runs over nbasis
+					       double hi, // upper end of the interval; the lower end is fixed at 0.0
+					       int orderfilter // order of the filter
+					       ) {
+
+    RealD scale;
+
+    FineField noise(FineGrid);
+    FineField Mn(FineGrid);
+    FineField tmp(FineGrid);
+
+    // Report the filter settings
+    std::cout << GridLogMessage<<" Chebyshev subspace pure noise : ord "<<orderfilter<<" [0,"<<hi<<"]"<<std::endl;
+    std::cout << GridLogMessage<<" Chebyshev subspace pure noise  : nbasis "<<nn<<std::endl;
+
+    for(int b =0;b<nbasis;b++) // NOTE(review): fills all nbasis vectors regardless of nn
+    {
+      gaussian(RNG,noise); // independent normalised noise for every basis vector
+      scale = std::pow(norm2(noise),-0.5); 
+      noise=noise*scale;
+
+      // Initial matrix element
+      hermop.Op(noise,Mn);
+      if(b==0) std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
+      // Filter: Chebyshev built from AggregatePowerLaw on [0,hi]
+      Chebyshev<FineField> Cheb(0.0,hi,orderfilter,AggregatePowerLaw);
+      Cheb(hermop,noise,Mn);
+      // normalise
+      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
+      subspace[b]   = Mn;
+      hermop.Op(Mn,tmp); 
+      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+    }
+
+  }
+
+  virtual void CreateSubspaceMultishift(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
+					double Lo,double tol,int maxit) // Lo sets the four poles, tol each pole's tolerance, maxit the solver's iteration argument
+  {
+
+    RealD scale;
+
+    FineField noise(FineGrid);
+    FineField Mn(FineGrid);
+    FineField tmp(FineGrid);
+
+    // Report the shift scale
+    std::cout << GridLogMessage<<" Multishift subspace : Lo "<<Lo<<std::endl;
+
+    // Filter
+    // With poles p_k = Lo + k*epsilon (k=0..3) and residues {1/6,-1/2,1/2,-1/6}:
+    //   Sum_k alpha_k/(x+p_k) = epsilon^3 / [ (x+p_0)(x+p_1)(x+p_2)(x+p_3) ]
+    // NOTE(review): assumes the solver applies each pole as a shift (x+p_k) - confirm
+    double epsilon      = Lo/3;
+    std::vector<RealD> alpha({1.0/6.0,-1.0/2.0,1.0/2.0,-1.0/6.0});
+    std::vector<RealD> shifts({Lo,Lo+epsilon,Lo+2*epsilon,Lo+3*epsilon});
+    std::vector<RealD> tols({tol,tol,tol,tol});
+    std::cout << "sizes "<<alpha.size()<<" "<<shifts.size()<<" "<<tols.size()<<std::endl;
+
+    MultiShiftFunction msf(4,0.0,95.0); // poles, residues, tolerances, norm and order are all overwritten below
+    std::cout << "msf constructed "<<std::endl;
+    msf.poles=shifts;
+    msf.residues=alpha;
+    msf.tolerances=tols;
+    msf.norm=0.0;
+    msf.order=alpha.size();
+    ConjugateGradientMultiShift<FineField> MSCG(maxit,msf);
+    
+    for(int b =0;b<nbasis;b++)
+    {
+      gaussian(RNG,noise); // independent normalised noise for every basis vector
+      scale = std::pow(norm2(noise),-0.5); 
+      noise=noise*scale;
+
+      // Initial matrix element
+      hermop.Op(noise,Mn);
+      if(b==0) std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
+
+      MSCG(hermop,noise,Mn); // Mn receives the multishift result for this noise vector
+      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
+      subspace[b]   = Mn;
+      hermop.Op(Mn,tmp); 
+      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+
+    }
+
+  }
+  // Replace each basis vector by a CG(tol,maxit) solve of hermop against it. NOTE(review): Lo is unused, as before.
+  virtual void RefineSubspace(LinearOperatorBase<FineField> &hermop,
+			      double Lo,double tol,int maxit)
+  {
+    FineField tmp(FineGrid);
+    ConjugateGradient<FineField>  CGsloppy(tol,maxit,false); // same settings for every vector, so built once
+    for(int b =0;b<nbasis;b++)
+    {
+      tmp = Zero(); // solution field handed to CG: was uninitialised on b==0 and the previous result afterwards
+      CGsloppy(hermop,subspace[b],tmp); // the Lo-shifted operator built here before was never applied; dropped
+      subspace[b]=tmp;
+    }
+  }
+
+  
+  
+};
+NAMESPACE_END(Grid);
--- a/Grid/algorithms/multigrid/BatchedBlas.h
+++ b/Grid/algorithms/multigrid/BatchedBlas.h
@@ -31,17 +31,12 @@ Author: Peter Boyle <pboyle@bnl.gov>
 #include <hipblas/hipblas.h>
 #endif
 #ifdef GRID_CUDA
-#include <cublas_v2.h>
+#include <hipblas/hipblas.h>
 #endif
 #ifdef GRID_SYCL
-#include <oneapi/mkl.hpp>
-#endif
-#if 0
-#define GRID_ONE_MKL
-#endif
-#ifdef GRID_ONE_MKL
-#include <oneapi/mkl.hpp>
+#error // need oneMKL version
 #endif
+
 ///////////////////////////////////////////////////////////////////////	  
 // Need to rearrange lattice data to be in the right format for a
 // batched multiply. Might as well make these static, dense packed
@@ -51,24 +46,18 @@ NAMESPACE_BEGIN(Grid);
  typedef hipblasHandle_t gridblasHandle_t;
 #endif
 #ifdef GRID_CUDA
-  typedef cublasHandle_t gridblasHandle_t;
+  typedef cudablasHandle_t gridblasHandle_t;
 #endif
 #ifdef GRID_SYCL
-  typedef cl::sycl::queue *gridblasHandle_t;
+  typedef int32_t gridblasHandle_t;
 #endif
-#ifdef GRID_ONE_MKL
-  typedef cl::sycl::queue *gridblasHandle_t;
-#endif
-#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
  typedef int32_t gridblasHandle_t;
 #endif

-enum GridBLASOperation_t { GridBLAS_OP_N, GridBLAS_OP_T, GridBLAS_OP_C } ;
-
 class GridBLAS {
 public:

-  
  static gridblasHandle_t gridblasHandle;
  static int            gridblasInit;
  
@@ -78,21 +67,13 @@ public:
 #ifdef GRID_CUDA
      std::cout << "cublasCreate"<<std::endl;
      cublasCreate(&gridblasHandle);
-      cublasSetPointerMode(gridblasHandle, CUBLAS_POINTER_MODE_DEVICE);
 #endif
 #ifdef GRID_HIP
      std::cout << "hipblasCreate"<<std::endl;
      hipblasCreate(&gridblasHandle);
 #endif
 #ifdef GRID_SYCL
-      gridblasHandle = theGridAccelerator;
 #endif
-#ifdef GRID_ONE_MKL
-      cl::sycl::cpu_selector selector;
-      cl::sycl::device selectedDevice { selector };
-      gridblasHandle =new sycl::queue (selectedDevice);
-#endif
-      gridblasInit=1;
    }
  }
  
@@ -126,9 +107,35 @@ public:
 #ifdef GRID_SYCL
    accelerator_barrier();
 #endif
-#ifdef GRID_ONE_MKL
-    gridblasHandle->wait();
-#endif
+  }
+  void benchmark(int nbasis, int nrhs, int coarseVol, int nstencil) // times gemmStridedBatched on zero-filled buffers and prints the rates
+  {
+    int32_t N_A = nbasis*nbasis*coarseVol*nstencil; // NOTE(review): int arithmetic; could overflow for large volumes
+    int32_t N_B = nbasis*nrhs*coarseVol*nstencil; // One leg of stencil at a time
+    int32_t N_C = nbasis*nrhs*coarseVol*nstencil; 
+    deviceVector<ComplexD> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD));
+    deviceVector<ComplexD> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD));
+    deviceVector<ComplexD> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD));
+    ComplexD alpha(1.0);
+    ComplexD beta (1.0);
+    for(int i=0;i<10;i++){ // ten timed repetitions, each reported separately
+      RealD t0 = usecond();
+      for(int s=0;s<nstencil;s++){ // every leg passes the same base pointers &A[0], &B[0], &C[0]
+	gemmStridedBatched(nbasis,nrhs,nbasis,
+			   alpha,
+			   &A[0], // m x k 
+			   &B[0], // k x n
+			   beta, 
+			   &C[0], // m x n
+			   coarseVol);
+      }
+      synchronise(); // called before the stop time is read
+      RealD t1 = usecond();
+      RealD flops = 8.0*nbasis*nbasis*nrhs*coarseVol*nstencil;
+      RealD bytes = 1.0*sizeof(ComplexD)*(nbasis*nbasis+nbasis*nrhs*3)*coarseVol*nstencil;
+      std::cout << " batched Blas call "<<i<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
+      std::cout << " batched Blas call "<<i<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
+    }
  }

  void gemmBatched(int m,int n, int k,
@@ -137,102 +144,26 @@ public:
 		   deviceVector<ComplexD*> &Bkn,
 		   ComplexD beta,
 		   deviceVector<ComplexD*> &Cmn)
-  {
-    gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
-		m,n,k,
-		alpha,
-		Amk,
-		Bkn,
-		beta,
-		Cmn);
-  }
-  void gemmBatched(int m,int n, int k,
-		   ComplexF alpha,
-		   deviceVector<ComplexF*> &Amk,  // pointer list to matrices
-		   deviceVector<ComplexF*> &Bkn,
-		   ComplexF beta,
-		   deviceVector<ComplexF*> &Cmn)
-  {
-    gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
-		m,n,k,
-		alpha,
-		Amk,
-		Bkn,
-		beta,
-		Cmn);
-  }
-  void gemmBatched(int m,int n, int k,
-		   RealD alpha,
-		   deviceVector<RealD*> &Amk,  // pointer list to matrices
-		   deviceVector<RealD*> &Bkn,
-		   RealD beta,
-		   deviceVector<RealD*> &Cmn)
-  {
-    gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
-		m,n,k,
-		alpha,
-		Amk,
-		Bkn,
-		beta,
-		Cmn);
-  }
-  void gemmBatched(int m,int n, int k,
-		   RealF alpha,
-		   deviceVector<RealF*> &Amk,  // pointer list to matrices
-		   deviceVector<RealF*> &Bkn,
-		   RealF beta,
-		   deviceVector<RealF*> &Cmn)
-  {
-    gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
-		m,n,k,
-		alpha,
-		Amk,
-		Bkn,
-		beta,
-		Cmn);
-  }
-
-  void gemmBatched(GridBLASOperation_t OpA,
-		   GridBLASOperation_t OpB,
-		   int m,int n, int k,
-		   ComplexD alpha,
-		   deviceVector<ComplexD*> &Amk,  // pointer list to matrices
-		   deviceVector<ComplexD*> &Bkn,
-		   ComplexD beta,
-		   deviceVector<ComplexD*> &Cmn)
  {
    RealD t2=usecond();
    int32_t batchCount = Amk.size();
-    assert(Bkn.size()==batchCount);
-    assert(Cmn.size()==batchCount);
-
+    // Use C-row major storage, so transpose calls
    int lda = m; // m x k column major
    int ldb = k; // k x n column major
    int ldc = m; // m x b column major
-    if(OpA!=GridBLAS_OP_N)
-      lda = k;
-    if(OpB!=GridBLAS_OP_N)
-      ldb = n;
-    
    static deviceVector<ComplexD> alpha_p(1);
    static deviceVector<ComplexD> beta_p(1);
    // can prestore the 1 and the zero on device
    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
    RealD t0=usecond();
-    //    std::cout << "ZgemmBatched mnk  "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
+    //       std::cout << "hipblasZgemmBatched mnk  "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
+    assert(Bkn.size()==batchCount);
+    assert(Cmn.size()==batchCount);
 #ifdef GRID_HIP
-    hipblasOperation_t hOpA;
-    hipblasOperation_t hOpB;
-    if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
-    if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
-    if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
-    if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
-    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
-    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
    auto err = hipblasZgemmBatched(gridblasHandle,
-				   hOpA,
-				   hOpB,
+				   HIPBLAS_OP_N,
+				   HIPBLAS_OP_N,
 				   m,n,k,
 				   (hipblasDoubleComplex *) &alpha_p[0],
 				   (hipblasDoubleComplex **)&Amk[0], lda,
@@ -244,17 +175,9 @@ public:
    assert(err==HIPBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_CUDA
-    cublasOperation_t hOpA;
-    cublasOperation_t hOpB;
-    if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
-    if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
-    if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
-    if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
-    if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
-    if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
    auto err = cublasZgemmBatched(gridblasHandle,
-				  hOpA,
-				  hOpB,
+				  CUBLAS_OP_N,
+				  CUBLAS_OP_N,
 				  m,n,k,
 				  (cuDoubleComplex *) &alpha_p[0],
 				  (cuDoubleComplex **)&Amk[0], lda,
@@ -270,32 +193,26 @@ public:
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
    // Need a default/reference implementation
-    int sda = lda*k;
-    int sdb = ldb*k;
-    int sdc = ldc*n;
    for (int p = 0; p < batchCount; ++p) {
      for (int mm = 0; mm < m; ++mm) {
 	for (int nn = 0; nn < n; ++nn) {
 	  ComplexD c_mn(0.0);
-	  for (int kk = 0; kk < k; ++kk)
-	    c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
-	  Cmn[p][mm + nn*ldc] =  (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
+	  for (int kk = 0; kk < k, ++kk)
+	    c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
+	  Cmn[mm + nn*ldc + p*sdc] =  (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc];
 	}
      }
    }
 #endif
-    //    synchronise();
     RealD t1=usecond();
     RealD flops = 8.0*m*n*k*batchCount;
     RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount;
-     //     std::cout <<GridLogMessage<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
-     //     std::cout <<GridLogMessage<< " batched Blas zGemm call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
-     //     std::cout <<GridLogMessage<< " batched Blas zGemm call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
+     //     std::cout <<GridLogPerformance<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
+     //     std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
+     //     std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
  }

-  void gemmBatched(GridBLASOperation_t OpA,
-		   GridBLASOperation_t OpB,
-		   int m,int n, int k,
+  void gemmBatched(int m,int n, int k,
 		   ComplexF alpha,
 		   deviceVector<ComplexF*> &Amk,  // pointer list to matrices
 		   deviceVector<ComplexF*> &Bkn,
@@ -304,35 +221,23 @@ public:
  {
    RealD t2=usecond();
    int32_t batchCount = Amk.size();
-
+    // Use C-row major storage, so transpose calls
    int lda = m; // m x k column major
    int ldb = k; // k x n column major
    int ldc = m; // m x n column major
-    if(OpA!=GridBLAS_OP_N)
-      lda = k;
-    if(OpB!=GridBLAS_OP_N)
-      ldb = n;
    static deviceVector<ComplexF> alpha_p(1);
    static deviceVector<ComplexF> beta_p(1);
    // can prestore the 1 and the zero on device
    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexF));
    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF));
    RealD t0=usecond();
-
+    //       std::cout << "hipblasZgemmBatched mnk  "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
    assert(Bkn.size()==batchCount);
    assert(Cmn.size()==batchCount);
 #ifdef GRID_HIP
-    hipblasOperation_t hOpA;
-    hipblasOperation_t hOpB;
-    if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
-    if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
-    if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
-    if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
-    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
-    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
    auto err = hipblasCgemmBatched(gridblasHandle,
-				   hOpA,
-				   hOpB,
+				   HIPBLAS_OP_N,
+				   HIPBLAS_OP_N,
 				   m,n,k,
 				   (hipblasComplex *) &alpha_p[0],
 				   (hipblasComplex **)&Amk[0], lda,
@@ -340,21 +245,13 @@ public:
 				   (hipblasComplex *) &beta_p[0],
 				   (hipblasComplex **)&Cmn[0], ldc,
 				   batchCount);
-
+    //	 std::cout << " hipblas return code " <<(int)err<<std::endl;
    assert(err==HIPBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_CUDA
-    cublasOperation_t hOpA;
-    cublasOperation_t hOpB;
-    if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
-    if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
-    if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
-    if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
-    if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
-    if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
    auto err = cublasCgemmBatched(gridblasHandle,
-				  hOpA,
-				  hOpB,
+				  CUBLAS_OP_N,
+				  CUBLAS_OP_N,
 				  m,n,k,
 				  (cuComplex *) &alpha_p[0],
 				  (cuComplex **)&Amk[0], lda,
@@ -369,19 +266,14 @@ public:
 #warning "oneMKL implementation not built "
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
-    int sda = lda*k;
-    int sdb = ldb*k;
-    int sdc = ldc*n;
-    ComplexF alphaf(real(alpha),imag(alpha));
-    ComplexF betaf(real(beta),imag(beta));
    // Need a default/reference implementation
    for (int p = 0; p < batchCount; ++p) {
      for (int mm = 0; mm < m; ++mm) {
 	for (int nn = 0; nn < n; ++nn) {
-	  ComplexF c_mn(0.0);
-	  for (int kk = 0; kk < k; ++kk)
-	    c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
-	  Cmn[p][mm + nn*ldc] =  (alphaf)*c_mn + (betaf)*Cmn[p][mm + nn*ldc ];
+	  ComplexD c_mn(0.0);
+	  for (int kk = 0; kk < k, ++kk)
+	    c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
+	  Cmn[mm + nn*ldc + p*sdc] =  (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc];
 	}
      }
    }
@@ -389,15 +281,16 @@ public:
     RealD t1=usecond();
     RealD flops = 8.0*m*n*k*batchCount;
     RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n)*batchCount;
+     //     std::cout <<GridLogPerformance<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
+     //     std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
+     //     std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
  }
  
  ///////////////////////////////////////////////////////////////////////////
  // Single precision real GEMM
  ///////////////////////////////////////////////////////////////////////////

-  void gemmBatched(GridBLASOperation_t OpA,
-		   GridBLASOperation_t OpB,
-		   int m,int n, int k,
+  void gemmBatched(int m,int n, int k,
 		   RealF alpha,
 		   deviceVector<RealF*> &Amk,  // pointer list to matrices
 		   deviceVector<RealF*> &Bkn,
@@ -406,35 +299,23 @@ public:
  {
    RealD t2=usecond();
    int32_t batchCount = Amk.size();
-
+    // Use C-row major storage, so transpose calls
    int lda = m; // m x k column major
    int ldb = k; // k x n column major
    int ldc = m; // m x n column major
-    if(OpA!=GridBLAS_OP_N)
-      lda = k;
-    if(OpB!=GridBLAS_OP_N)
-      ldb = n;
    static deviceVector<RealF> alpha_p(1);
    static deviceVector<RealF> beta_p(1);
    // can prestore the 1 and the zero on device
    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealF));
    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealF));
    RealD t0=usecond();
-
+    //       std::cout << "hipblasZgemmBatched mnk  "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
    assert(Bkn.size()==batchCount);
    assert(Cmn.size()==batchCount);
 #ifdef GRID_HIP
-    hipblasOperation_t hOpA;
-    hipblasOperation_t hOpB;
-    if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
-    if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
-    if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
-    if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
-    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
-    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
    auto err = hipblasSgemmBatched(gridblasHandle,
-				   hOpA,
-				   hOpB,
+				   HIPBLAS_OP_N,
+				   HIPBLAS_OP_N,
 				   m,n,k,
 				   (float *) &alpha_p[0],
 				   (float **)&Amk[0], lda,
@@ -445,17 +326,9 @@ public:
    assert(err==HIPBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_CUDA
-    cublasOperation_t hOpA;
-    cublasOperation_t hOpB;
-    if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
-    if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
-    if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
-    if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
-    if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
-    if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
    auto err = cublasSgemmBatched(gridblasHandle,
-				  hOpA,
-				  hOpB,
+				  CUBLAS_OP_N,
+				  CUBLAS_OP_N,
 				  m,n,k,
 				  (float *) &alpha_p[0],
 				  (float **)&Amk[0], lda,
@@ -470,17 +343,14 @@ public:
 #warning "oneMKL implementation not built "
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
-    int sda = lda*k;
-    int sdb = ldb*k;
-    int sdc = ldc*n;
    // Need a default/reference implementation
    for (int p = 0; p < batchCount; ++p) {
      for (int mm = 0; mm < m; ++mm) {
 	for (int nn = 0; nn < n; ++nn) {
 	  RealD c_mn(0.0);
-	  for (int kk = 0; kk < k; ++kk)
-	    c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
-	  Cmn[p][mm + nn*ldc] =  (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
+	  for (int kk = 0; kk < k, ++kk)
+	    c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
+	  Cmn[mm + nn*ldc + p*sdc] =  (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc];
 	}
      }
    }
@@ -488,6 +358,9 @@ public:
     RealD t1=usecond();
     RealD flops = 2.0*m*n*k*batchCount;
     RealD bytes = 1.0*sizeof(RealF)*(m*k+k*n+m*n)*batchCount;
+     //     std::cout <<GridLogPerformance<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
+     //     std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
+     //     std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
  }
  
  
@@ -495,9 +368,7 @@ public:
  // Double precision real GEMM
  ///////////////////////////////////////////////////////////////////////////

-  void gemmBatched(GridBLASOperation_t OpA,
-		   GridBLASOperation_t OpB,
-		   int m,int n, int k,
+  void gemmBatched(int m,int n, int k,
 		   RealD alpha,
 		   deviceVector<RealD*> &Amk,  // pointer list to matrices
 		   deviceVector<RealD*> &Bkn,
@@ -506,33 +377,20 @@ public:
  {
    RealD t2=usecond();
    int32_t batchCount = Amk.size();
-
+    // Use C-row major storage, so transpose calls
    int lda = m; // m x k column major
    int ldb = k; // k x n column major
    int ldc = m; // m x n column major
-    if(OpA!=GridBLAS_OP_N)
-      lda = k;
-    if(OpB!=GridBLAS_OP_N)
-      ldb = n;
-    
    static deviceVector<RealD> alpha_p(1);
    static deviceVector<RealD> beta_p(1);
    // can prestore the 1 and the zero on device
    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealD));
    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealD));
    RealD t0=usecond();
-
+    //       std::cout << "hipblasZgemmBatched mnk  "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
    assert(Bkn.size()==batchCount);
    assert(Cmn.size()==batchCount);
 #ifdef GRID_HIP
-    hipblasOperation_t hOpA;
-    hipblasOperation_t hOpB;
-    if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
-    if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
-    if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
-    if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
-    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
-    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
    auto err = hipblasDgemmBatched(gridblasHandle,
 				   HIPBLAS_OP_N,
 				   HIPBLAS_OP_N,
@@ -546,17 +404,9 @@ public:
    assert(err==HIPBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_CUDA
-    cublasOperation_t hOpA;
-    cublasOperation_t hOpB;
-    if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
-    if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
-    if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
-    if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
-    if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
-    if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
    auto err = cublasDgemmBatched(gridblasHandle,
-				  hOpA,
-				  hOpB,
+				  CUBLAS_OP_N,
+				  CUBLAS_OP_N,
 				  m,n,k,
 				  (double *) &alpha_p[0],
 				  (double **)&Amk[0], lda,
@@ -587,17 +437,14 @@ public:
 #warning "oneMKL implementation not built "
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
-    int sda = lda*k;
-    int sdb = ldb*k;
-    int sdc = ldc*n;
    // Need a default/reference implementation
    for (int p = 0; p < batchCount; ++p) {
      for (int mm = 0; mm < m; ++mm) {
 	for (int nn = 0; nn < n; ++nn) {
 	  RealD c_mn(0.0);
-	  for (int kk = 0; kk < k; ++kk)
-	    c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
-	  Cmn[p][mm + nn*ldc] =  (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
+	  for (int kk = 0; kk < k, ++kk)
+	    c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
+	  Cmn[mm + nn*ldc + p*sdc] =  (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc];
 	}
      }
    }
@@ -605,6 +452,9 @@ public:
     RealD t1=usecond();
     RealD flops = 2.0*m*n*k*batchCount;
     RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
+     //     std::cout <<GridLogPerformance<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
+     //     std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
+     //     std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
  }
  

@@ -633,10 +483,9 @@ public:
    deviceVector<ComplexD> beta_p(1);
    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
-
-    //    std::cout << "blasZgemmStridedBatched mnk  "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
-    //    std::cout << "blasZgemmStridedBatched ld   "<<lda<<","<<ldb<<","<<ldc<<std::endl;
-    //    std::cout << "blasZgemmStridedBatched sd   "<<sda<<","<<sdb<<","<<sdc<<std::endl;
+    std::cout << "blasZgemmStridedBatched mnk  "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
+    std::cout << "blasZgemmStridedBatched ld   "<<lda<<","<<ldb<<","<<ldc<<std::endl;
+    std::cout << "blasZgemmStridedBatched sd   "<<sda<<","<<sdb<<","<<sdc<<std::endl;
 #ifdef GRID_HIP
    auto err = hipblasZgemmStridedBatched(gridblasHandle,
 					  HIPBLAS_OP_N,
@@ -662,63 +511,24 @@ public:
 			      (cuDoubleComplex *) Cmn, ldc, sdc,
 			      batchCount);
 #endif
-#if defined(GRID_SYCL) || defined(GRID_ONE_MKL)
-    oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
-						oneapi::mkl::transpose::N,
-						oneapi::mkl::transpose::N,
-						m,n,k,
-						alpha,
-						(const ComplexD *)Amk,lda,sda,
-						(const ComplexD *)Bkn,ldb,sdb,
-						beta,
-						(ComplexD *)Cmn,ldc,sdc,
-						batchCount);
+#ifdef GRID_SYCL
+     #warning "oneMKL implementation not made "
 #endif
-#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
     // Need a default/reference implementation
     for (int p = 0; p < batchCount; ++p) {
       for (int mm = 0; mm < m; ++mm) {
 	 for (int nn = 0; nn < n; ++nn) {
 	   ComplexD c_mn(0.0);
-	   for (int kk = 0; kk < k; ++kk)
+	   for (int kk = 0; kk < k, ++kk)
 	     c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
-	   Cmn[mm + nn*ldc + p*sdc] =  (alpha)*c_mn + (beta)*Cmn[mm + nn*ldc + p*sdc];
+	   Cmn[mm + nn*ldc + p*sdc] =  (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc];
 	 }
       }
     }
 #endif
  }

-  double benchmark(int M, int N, int K, int BATCH)
-  {
-    int32_t N_A = M*K*BATCH;
-    int32_t N_B = K*N*BATCH;
-    int32_t N_C = M*N*BATCH;
-    deviceVector<ComplexD> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD));
-    deviceVector<ComplexD> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD));
-    deviceVector<ComplexD> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD));
-    ComplexD alpha(1.0);
-    ComplexD beta (1.0);
-    RealD flops = 8.0*M*N*K*BATCH;
-    int ncall=10;
-    RealD t0 = usecond();
-    for(int i=0;i<ncall;i++){
-      gemmStridedBatched(M,N,K,
-			 alpha,
-			 &A[0], // m x k 
-			 &B[0], // k x n
-			 beta, 
-			 &C[0], // m x n
-			 BATCH);
-    }
-    synchronise();
-    RealD t1 = usecond();
-    RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K)*BATCH;
-    flops = 8.0*M*N*K*BATCH*ncall;
-    flops = flops/(t1-t0)/1.e3;
-    return flops; // Returns gigaflops
-  }
-



--- a/Grid/algorithms/multigrid/CoarsenedMatrix.h
+++ b/Grid/algorithms/multigrid/CoarsenedMatrix.h
@@ -56,243 +56,6 @@ inline void blockMaskedInnerProduct(Lattice<CComplex> &CoarseInner,
  blockSum(CoarseInner,fine_inner_msk);
 }

-
-class Geometry {
-public:
-  int npoint;
-  int base;
-  std::vector<int> directions   ;
-  std::vector<int> displacements;
-  std::vector<int> points_dagger;
-
-  Geometry(int _d)  {
-    
-    base = (_d==5) ? 1:0;
-
-    // make coarse grid stencil for 4d , not 5d
-    if ( _d==5 ) _d=4;
-
-    npoint = 2*_d+1;
-    directions.resize(npoint);
-    displacements.resize(npoint);
-    points_dagger.resize(npoint);
-    for(int d=0;d<_d;d++){
-      directions[d   ] = d+base;
-      directions[d+_d] = d+base;
-      displacements[d  ] = +1;
-      displacements[d+_d]= -1;
-      points_dagger[d   ] = d+_d;
-      points_dagger[d+_d] = d;
-    }
-    directions   [2*_d]=0;
-    displacements[2*_d]=0;
-    points_dagger[2*_d]=2*_d;
-  }
-
-  int point(int dir, int disp) {
-    assert(disp == -1 || disp == 0 || disp == 1);
-    assert(base+0 <= dir && dir < base+4);
-
-    // directions faster index = new indexing
-    // 4d (base = 0):
-    // point 0  1  2  3  4  5  6  7  8
-    // dir   0  1  2  3  0  1  2  3  0
-    // disp +1 +1 +1 +1 -1 -1 -1 -1  0
-    // 5d (base = 1):
-    // point 0  1  2  3  4  5  6  7  8
-    // dir   1  2  3  4  1  2  3  4  0
-    // disp +1 +1 +1 +1 -1 -1 -1 -1  0
-
-    // displacements faster index = old indexing
-    // 4d (base = 0):
-    // point 0  1  2  3  4  5  6  7  8
-    // dir   0  0  1  1  2  2  3  3  0
-    // disp +1 -1 +1 -1 +1 -1 +1 -1  0
-    // 5d (base = 1):
-    // point 0  1  2  3  4  5  6  7  8
-    // dir   1  1  2  2  3  3  4  4  0
-    // disp +1 -1 +1 -1 +1 -1 +1 -1  0
-
-    if(dir == 0 and disp == 0)
-      return 8;
-    else // New indexing
-      return (1 - disp) / 2 * 4 + dir - base;
-    // else // Old indexing
-    //   return (4 * (dir - base) + 1 - disp) / 2;
-  }
-};
-  
-template<class Fobj,class CComplex,int nbasis>
-class Aggregation   {
-public:
-  typedef iVector<CComplex,nbasis >             siteVector;
-  typedef Lattice<siteVector>                 CoarseVector;
-  typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
-
-  typedef Lattice< CComplex >   CoarseScalar; // used for inner products on fine field
-  typedef Lattice<Fobj >        FineField;
-
-  GridBase *CoarseGrid;
-  GridBase *FineGrid;
-  std::vector<Lattice<Fobj> > subspace;
-  int checkerboard;
-  int Checkerboard(void){return checkerboard;}
-  Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) : 
-    CoarseGrid(_CoarseGrid),
-    FineGrid(_FineGrid),
-    subspace(nbasis,_FineGrid),
-    checkerboard(_checkerboard)
-  {
-  };
-  
-  void Orthogonalise(void){
-    CoarseScalar InnerProd(CoarseGrid); 
-    std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"<<std::endl;
-    blockOrthogonalise(InnerProd,subspace);
-  } 
-  void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
-    blockProject(CoarseVec,FineVec,subspace);
-  }
-  void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
-    FineVec.Checkerboard() = subspace[0].Checkerboard();
-    blockPromote(CoarseVec,FineVec,subspace);
-  }
-
-  virtual void CreateSubspace(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
-
-    RealD scale;
-
-    ConjugateGradient<FineField> CG(1.0e-2,100,false);
-    FineField noise(FineGrid);
-    FineField Mn(FineGrid);
-
-    for(int b=0;b<nn;b++){
-      
-      subspace[b] = Zero();
-      gaussian(RNG,noise);
-      scale = std::pow(norm2(noise),-0.5); 
-      noise=noise*scale;
-      
-      hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise   ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
-
-      for(int i=0;i<1;i++){
-
-	CG(hermop,noise,subspace[b]);
-
-	noise = subspace[b];
-	scale = std::pow(norm2(noise),-0.5); 
-	noise=noise*scale;
-
-      }
-
-      hermop.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(Mn)<<std::endl;
-      subspace[b]   = noise;
-
-    }
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////////////////////
-  // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
-  // and this is the best I found
-  ////////////////////////////////////////////////////////////////////////////////////////////////
-
-  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
-				       int nn,
-				       double hi,
-				       double lo,
-				       int orderfilter,
-				       int ordermin,
-				       int orderstep,
-				       double filterlo
-				       ) {
-
-    RealD scale;
-
-    FineField noise(FineGrid);
-    FineField Mn(FineGrid);
-    FineField tmp(FineGrid);
-
-    // New normalised noise
-    gaussian(RNG,noise);
-    scale = std::pow(norm2(noise),-0.5); 
-    noise=noise*scale;
-
-    // Initial matrix element
-    hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
-
-    int b =0;
-    {
-      // Filter
-      Chebyshev<FineField> Cheb(lo,hi,orderfilter);
-      Cheb(hermop,noise,Mn);
-      // normalise
-      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
-      subspace[b]   = Mn;
-      hermop.Op(Mn,tmp); 
-      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
-      b++;
-    }
-
-    // Generate a full sequence of Chebyshevs
-    {
-      lo=filterlo;
-      noise=Mn;
-
-      FineField T0(FineGrid); T0 = noise;  
-      FineField T1(FineGrid); 
-      FineField T2(FineGrid);
-      FineField y(FineGrid);
-      
-      FineField *Tnm = &T0;
-      FineField *Tn  = &T1;
-      FineField *Tnp = &T2;
-
-      // Tn=T1 = (xscale M + mscale)in
-      RealD xscale = 2.0/(hi-lo);
-      RealD mscale = -(hi+lo)/(hi-lo);
-      hermop.HermOp(T0,y);
-      T1=y*xscale+noise*mscale;
-
-      for(int n=2;n<=ordermin+orderstep*(nn-2);n++){
-	
-	hermop.HermOp(*Tn,y);
-
-	autoView( y_v , y, AcceleratorWrite);
-	autoView( Tn_v , (*Tn), AcceleratorWrite);
-	autoView( Tnp_v , (*Tnp), AcceleratorWrite);
-	autoView( Tnm_v , (*Tnm), AcceleratorWrite);
-	const int Nsimd = CComplex::Nsimd();
-	accelerator_for(ss, FineGrid->oSites(), Nsimd, {
-	  coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
-	  coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
-        });
-
-	// Possible more fine grained control is needed than a linear sweep,
-	// but huge productivity gain if this is simple algorithm and not a tunable
-	int m =1;
-	if ( n>=ordermin ) m=n-ordermin;
-	if ( (m%orderstep)==0 ) { 
-	  Mn=*Tnp;
-	  scale = std::pow(norm2(Mn),-0.5);         Mn=Mn*scale;
-	  subspace[b] = Mn;
-	  hermop.Op(Mn,tmp); 
-	  std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
-	  b++;
-	}
-
-	// Cycle pointers to avoid copies
-	FineField *swizzle = Tnm;
-	Tnm    =Tn;
-	Tn     =Tnp;
-	Tnp    =swizzle;
-	  
-      }
-    }
-    assert(b==nn);
-  }
-
-};
-
 // Fine Object == (per site) type of fine field
 // nbasis      == number of deflation vectors
 template<class Fobj,class CComplex,int nbasis>
--- a/Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h
+++ b/Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h
@@ -0,0 +1,467 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pboyle@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+#include <Grid/qcd/QCD.h> // needed for Dagger(Yes|No), Inverse(Yes|No)
+
+#include <Grid/lattice/PaddedCell.h>
+#include <Grid/stencil/GeneralLocalStencil.h>
+
+NAMESPACE_BEGIN(Grid);
+
+// Fine Object == (per site) type of fine field
+// nbasis      == number of deflation vectors
+template<class Fobj,class CComplex,int nbasis>
+class GeneralCoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > >  {
+public:
+
+  typedef GeneralCoarsenedMatrix<Fobj,CComplex,nbasis> GeneralCoarseOp;
+  typedef iVector<CComplex,nbasis >           siteVector;
+  typedef iMatrix<CComplex,nbasis >           siteMatrix;
+  typedef Lattice<iScalar<CComplex> >         CoarseComplexField;
+  typedef Lattice<siteVector>                 CoarseVector;
+  typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
+  typedef iMatrix<CComplex,nbasis >  Cobj;
+  typedef iVector<CComplex,nbasis >  Cvec;
+  typedef Lattice< CComplex >   CoarseScalar; // used for inner products on fine field
+  typedef Lattice<Fobj >        FineField;
+  typedef Lattice<CComplex >    FineComplexField;
+  typedef CoarseVector Field;
+  ////////////////////
+  // Data members
+  ////////////////////
+  int hermitian;
+  GridBase      *       _FineGrid; 
+  GridCartesian *       _CoarseGrid; 
+  NonLocalStencilGeometry &geom;
+  PaddedCell Cell;
+  GeneralLocalStencil Stencil;
+  
+  std::vector<CoarseMatrix> _A;
+  std::vector<CoarseMatrix> _Adag;
+  std::vector<CoarseVector> MultTemporaries;
+
+  ///////////////////////
+  // Interface
+  ///////////////////////
+  GridBase      * Grid(void)           { return _CoarseGrid; };   // this is all the linalg routines need to know
+  GridBase      * FineGrid(void)       { return _FineGrid; };   // this is all the linalg routines need to know
+  GridCartesian * CoarseGrid(void)     { return _CoarseGrid; };   // this is all the linalg routines need to know
+
+  void ShiftMatrix(RealD shift)
+  {
+    int Nd=_FineGrid->Nd(); 
+    Coordinate zero_shift(Nd,0);
+    for(int p=0;p<geom.npoint;p++){
+      if ( zero_shift==geom.shifts[p] ) {
+	_A[p] = _A[p]+shift;
+	_Adag[p] = _Adag[p]+shift;
+      }
+    }    
+  }
+  void ProjectNearestNeighbour(RealD shift, GeneralCoarseOp &CopyMe)
+  {
+    int nfound=0;
+    std::cout << GridLogMessage <<"GeneralCoarsenedMatrix::ProjectNearestNeighbour "<< CopyMe._A[0].Grid()<<std::endl;
+    for(int p=0;p<geom.npoint;p++){
+      for(int pp=0;pp<CopyMe.geom.npoint;pp++){
+ 	// Search for the same relative shift
+	// Avoids brutal handling of Grid pointers
+	if ( CopyMe.geom.shifts[pp]==geom.shifts[p] ) {
+	  _A[p] = CopyMe.Cell.Extract(CopyMe._A[pp]);
+	  _Adag[p] = CopyMe.Cell.Extract(CopyMe._Adag[pp]);
+	  nfound++;
+	}
+      }
+    }
+    assert(nfound==geom.npoint);
+    ExchangeCoarseLinks();
+  }
+  
+  GeneralCoarsenedMatrix(NonLocalStencilGeometry &_geom,GridBase *FineGrid, GridCartesian * CoarseGrid)
+    : geom(_geom),
+      _FineGrid(FineGrid),
+      _CoarseGrid(CoarseGrid),
+      hermitian(1),
+      Cell(_geom.Depth(),_CoarseGrid),
+      Stencil(Cell.grids.back(),geom.shifts)
+  {
+    {
+      int npoint = _geom.npoint;
+    }
+    _A.resize(geom.npoint,CoarseGrid);
+    _Adag.resize(geom.npoint,CoarseGrid);
+  }
+  void M (const CoarseVector &in, CoarseVector &out)
+  {
+    Mult(_A,in,out);
+  }
+  void Mdag (const CoarseVector &in, CoarseVector &out)
+  {
+    if ( hermitian ) M(in,out);
+    else Mult(_Adag,in,out);
+  }
+  void Mult (std::vector<CoarseMatrix> &A,const CoarseVector &in, CoarseVector &out)
+  {
+    RealD tviews=0;    RealD ttot=0;    RealD tmult=0;   RealD texch=0;    RealD text=0; RealD ttemps=0; RealD tcopy=0;
+    RealD tmult2=0;
+
+    ttot=-usecond();
+    conformable(CoarseGrid(),in.Grid());
+    conformable(in.Grid(),out.Grid());
+    out.Checkerboard() = in.Checkerboard();
+    CoarseVector tin=in;
+
+    texch-=usecond();
+    CoarseVector pin = Cell.ExchangePeriodic(tin);
+    texch+=usecond();
+
+    CoarseVector pout(pin.Grid());
+
+    int npoint = geom.npoint;
+    typedef LatticeView<Cobj> Aview;
+    typedef LatticeView<Cvec> Vview;
+      
+    const int Nsimd = CComplex::Nsimd();
+    
+    int64_t osites=pin.Grid()->oSites();
+
+    RealD flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd();
+    RealD bytes = 1.0*osites*sizeof(siteMatrix)*npoint
+                + 2.0*osites*sizeof(siteVector)*npoint;
+      
+    {
+      tviews-=usecond();
+      autoView( in_v , pin, AcceleratorRead);
+      autoView( out_v , pout, AcceleratorWriteDiscard);
+      autoView( Stencil_v  , Stencil, AcceleratorRead);
+      tviews+=usecond();
+
+      // Static and prereserve to keep UVM region live and not resized across multiple calls
+      ttemps-=usecond();
+      MultTemporaries.resize(npoint,pin.Grid());       
+      ttemps+=usecond();
+      std::vector<Aview> AcceleratorViewContainer_h;
+      std::vector<Vview> AcceleratorVecViewContainer_h; 
+
+      tviews-=usecond();
+      for(int p=0;p<npoint;p++) {
+	AcceleratorViewContainer_h.push_back(      A[p].View(AcceleratorRead));
+	AcceleratorVecViewContainer_h.push_back(MultTemporaries[p].View(AcceleratorWrite));
+      }
+      tviews+=usecond();
+
+      static deviceVector<Aview> AcceleratorViewContainer; AcceleratorViewContainer.resize(npoint);
+      static deviceVector<Vview> AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(npoint); 
+      
+      auto Aview_p = &AcceleratorViewContainer[0];
+      auto Vview_p = &AcceleratorVecViewContainer[0];
+      tcopy-=usecond();
+      acceleratorCopyToDevice(&AcceleratorViewContainer_h[0],&AcceleratorViewContainer[0],npoint *sizeof(Aview));
+      acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],npoint *sizeof(Vview));
+      tcopy+=usecond();
+
+      tmult-=usecond();
+      accelerator_for(spb, osites*nbasis*npoint, Nsimd, {
+	  typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
+	  int32_t ss   = spb/(nbasis*npoint);
+	  int32_t bp   = spb%(nbasis*npoint);
+	  int32_t point= bp/nbasis;
+	  int32_t b    = bp%nbasis;
+	  auto SE  = Stencil_v.GetEntry(point,ss);
+	  auto nbr = coalescedReadGeneralPermute(in_v[SE->_offset],SE->_permute,Nd);
+	  auto res = coalescedRead(Aview_p[point][ss](0,b))*nbr(0);
+	  for(int bb=1;bb<nbasis;bb++) {
+	    res = res + coalescedRead(Aview_p[point][ss](bb,b))*nbr(bb);
+	  }
+	  coalescedWrite(Vview_p[point][ss](b),res);
+      });
+      tmult2-=usecond();
+      accelerator_for(sb, osites*nbasis, Nsimd, {
+	  int ss = sb/nbasis;
+	  int b  = sb%nbasis;
+	  auto res = coalescedRead(Vview_p[0][ss](b));
+	  for(int point=1;point<npoint;point++){
+	    res = res + coalescedRead(Vview_p[point][ss](b));
+	  }
+	  coalescedWrite(out_v[ss](b),res);
+      });
+      tmult2+=usecond();
+      tmult+=usecond();
+      for(int p=0;p<npoint;p++) {
+	AcceleratorViewContainer_h[p].ViewClose();
+	AcceleratorVecViewContainer_h[p].ViewClose();
+      }
+    }
+
+    text-=usecond();
+    out = Cell.Extract(pout);
+    text+=usecond();
+    ttot+=usecond();
+    
+    std::cout << GridLogPerformance<<"Coarse 1rhs Mult Aviews "<<tviews<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Mult exch "<<texch<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Mult mult "<<tmult<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<" of which mult2  "<<tmult2<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Mult ext  "<<text<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Mult temps "<<ttemps<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Mult copy  "<<tcopy<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Mult tot  "<<ttot<<" us"<<std::endl;
+    //    std::cout << GridLogPerformance<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Kernel flops "<< flops<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Kernel flop/s "<< flops/tmult<<" mflop/s"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Kernel bytes/s "<< bytes/tmult<<" MB/s"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse overall flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse total bytes   "<< bytes/1e6<<" MB"<<std::endl;
+
+  };
+  
+  // Fill _Adag: _Adag[Reverse(p)] at site x = adj( _A[p] at (x - shifts[p]) mod L ).
+  void PopulateAdag(void)
+  {
+    for(int64_t gidx=0;gidx<CoarseGrid()->gSites();gidx++){
+      Coordinate here;
+      CoarseGrid()->GlobalIndexToGlobalCoor(gidx,here);
+      for(int p=0;p<geom.npoint;p++){
+        Coordinate src = here;
+        for(int mu=0;mu<here.size();mu++){
+          const int extent = CoarseGrid()->GlobalDimensions()[mu];
+          src[mu] = (here[mu] - geom.shifts[p][mu] + extent) % extent; // periodic wrap
+        }
+        // Original note: flip to poke/peekLocalSite and not too bad
+        auto link = peekSite(_A[p],src);
+        const int rp = geom.Reverse(p);
+        pokeSite(adj(link),_Adag[rp],here);
+      }
+    }
+  }
+  /////////////////////////////////////////////////////////////
+  // 
+  // A) Only reduced flops option is to use a padded cell of depth 4
+  // and apply MpcDagMpc in the padded cell.
+  //
+  // Makes for ONE application of MpcDagMpc per vector instead of 30 or 80.
+  // With the effective cell size around (B+8)^4 perhaps 12^4/4^4 ratio
+  // Cost is 81x more, same as stencil size.
+  //
+  // But: can eliminate comms and do as local dirichlet.
+  //
+  // Local exchange gauge field once.
+  // Apply to all vectors, local only computation.
+  // Must exchange ghost subcells in reverse process of PaddedCell to take inner products
+  //
+  // B) Can reduce cost: pad by 1, apply Deo      (4^4+6^4+8^4+8^4 )/ (4x 4^4)
+  //                     pad by 2, apply Doe
+  //                     pad by 3, apply Deo
+  //                     then break out 8x directions; cost is ~10x MpcDagMpc per vector
+  //
+  // => almost factor of 10 in setup cost, excluding data rearrangement
+  //
+  // Intermediates -- ignore the corner terms, leave approximate and force Hermitian
+  // Intermediates -- pad by 2 and apply 1+8+24 = 33 times.
+  /////////////////////////////////////////////////////////////
+
+    //////////////////////////////////////////////////////////
+    // BFM HDCG style approach: Solve a system of equations to get Aij
+    //////////////////////////////////////////////////////////
+    /*
+     *     Here, k,l index which possible shift within the 3^Nd "ball" connected by MdagM.
+     *
+     *     conj(phases[block]) proj[k][ block*Nvec+j ] =  \sum_ball  e^{i q_k . delta} < phi_{block,j} | MdagM | phi_{(block+delta),i} > 
+     *                                                 =  \sum_ball e^{iqk.delta} A_ji
+     *
+     *     Must invert matrix M_k,l = e^[i q_k . delta_l]
+     *
+     *     Where q_k = delta_k . (2*M_PI/global_nb[mu])
+     */
+  void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
+		       Aggregation<Fobj,CComplex,nbasis> & Subspace)
+  { // Fills _A[k] (and _Adag when !hermitian) from linop and the block-orthogonalised subspace, then calls ExchangeCoarseLinks()
+    std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
+    GridBase *grid = FineGrid();
+    // Timers in microseconds (usecond()), accumulated per phase and printed at the end
+    RealD tproj=0.0;
+    RealD teigen=0.0;
+    RealD tmat=0.0;
+    RealD tphase=0.0;
+    RealD tphaseBZ=0.0;
+    RealD tinv=0.0;
+
+    /////////////////////////////////////////////////////////////
+    // Orthogonalise the subblocks over the basis
+    /////////////////////////////////////////////////////////////
+    CoarseScalar InnerProd(CoarseGrid()); 
+    blockOrthogonalise(InnerProd,Subspace.subspace);
+
+    const int npoint = geom.npoint;
+      
+    Coordinate clatt = CoarseGrid()->GlobalDimensions();
+    int Nd = CoarseGrid()->Nd();
+
+      /*
+       *     Here, k,l index which possible momentum/shift within the N-points connected by MdagM.
+       *     Matrix index i is mapped to this shift via 
+       *               geom.shifts[i]
+       *
+       *     conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block] 
+       *       =  \sum_{l in ball}  e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} > 
+       *       =  \sum_{l in ball} e^{iqk.delta_l} A_ji^{b.b+l}
+       *       = M_{kl} A_ji^{b.b+l}
+       *
+       *     Must assemble and invert matrix M_k,l = e^[i q_k . delta_l]
+       *  
+       *     Where q_k = delta_k . (2*M_PI/global_nb[mu])
+       *
+       *     Then A{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j}
+       */
+    teigen-=usecond();
+    Eigen::MatrixXcd Mkl    = Eigen::MatrixXcd::Zero(npoint,npoint);
+    Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint);
+    ComplexD ci(0.0,1.0);
+    for(int k=0;k<npoint;k++){ // Loop over momenta
+
+      for(int l=0;l<npoint;l++){ // Loop over nbr relative
+	ComplexD phase(0.0,0.0);
+	for(int mu=0;mu<Nd;mu++){
+	  RealD TwoPiL =  M_PI * 2.0/ clatt[mu];
+	  phase=phase+TwoPiL*geom.shifts[k][mu]*geom.shifts[l][mu];
+	}
+	phase=exp(phase*ci);
+	Mkl(k,l) = phase;
+      }
+    }
+    invMkl = Mkl.inverse(); // dense npoint x npoint inverse of the phase matrix
+    teigen+=usecond();
+
+    ///////////////////////////////////////////////////////////////////////
+    // Now compute the matrix elements of linop between the orthonormal
+    // set of vectors.
+    ///////////////////////////////////////////////////////////////////////
+    FineField phaV(grid); // Phased block basis vector
+    FineField MphaV(grid);// Matrix applied
+    std::vector<FineComplexField> phaF(npoint,grid);
+    std::vector<CoarseComplexField> pha(npoint,CoarseGrid());
+    // pha[p] lives on the coarse grid; phaF[p] is produced from it on the fine grid by blockZAXPY below
+    CoarseVector coarseInner(CoarseGrid());
+    
+    typedef typename CComplex::scalar_type SComplex;
+    FineComplexField one(grid); one=SComplex(1.0);
+    FineComplexField zz(grid); zz = Zero();
+    tphase=-usecond();
+    for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
+      /////////////////////////////////////////////////////
+      // Stick a phase on every block
+      /////////////////////////////////////////////////////
+      CoarseComplexField coor(CoarseGrid());
+      pha[p]=Zero();
+      for(int mu=0;mu<Nd;mu++){
+	LatticeCoordinate(coor,mu);
+	RealD TwoPiL =  M_PI * 2.0/ clatt[mu];
+	pha[p] = pha[p] + (TwoPiL * geom.shifts[p][mu]) * coor;
+      }
+      pha[p]  =exp(pha[p]*ci);
+
+      blockZAXPY(phaF[p],pha[p],one,zz);
+      
+    }
+    tphase+=usecond();
+    
+    std::vector<CoarseVector> ComputeProj(npoint,CoarseGrid());
+    std::vector<CoarseVector>          FT(npoint,CoarseGrid());
+    for(int i=0;i<nbasis;i++){// Loop over basis vectors
+      std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
+      for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
+	tphaseBZ-=usecond();
+	phaV = phaF[p]*Subspace.subspace[i];
+	tphaseBZ+=usecond();
+
+	/////////////////////////////////////////////////////////////////////
+	// Multiply phased subspace vector by matrix and project to subspace
+	// Remove local bulk phase to leave relative phases
+	/////////////////////////////////////////////////////////////////////
+	tmat-=usecond();
+	linop.Op(phaV,MphaV);
+	tmat+=usecond();
+
+	tproj-=usecond();
+	blockProjectFast(coarseInner,MphaV,Subspace.subspace);
+	coarseInner = conjugate(pha[p]) * coarseInner;
+
+	ComputeProj[p] = coarseInner;
+	tproj+=usecond();
+
+      }
+
+      tinv-=usecond();
+      for(int k=0;k<npoint;k++){
+	FT[k] = Zero();
+	for(int l=0;l<npoint;l++){
+	  FT[k]= FT[k]+ invMkl(l,k)*ComputeProj[l]; // FT[k] = sum_l invMkl(l,k) * ComputeProj[l]
+	}
+      
+	int osites=CoarseGrid()->oSites();
+	autoView( A_v  , _A[k], AcceleratorWrite);
+	autoView( FT_v  , FT[k], AcceleratorRead);
+	accelerator_for(sss, osites, 1, {
+	    for(int j=0;j<nbasis;j++){
+	      A_v[sss](i,j) = FT_v[sss](j);
+	    }
+        });
+      }
+      tinv+=usecond();
+    }
+
+    // Only needed if nonhermitian
+    if ( ! hermitian ) {
+      std::cout << GridLogMessage<<"PopulateAdag  "<<std::endl;
+      PopulateAdag();
+    }
+
+    // Pass _A and _Adag through Cell.ExchangePeriodic (Adag was populated above when non-hermitian)
+    ExchangeCoarseLinks();
+    std::cout << GridLogMessage<<"CoarsenOperator eigen  "<<teigen<<" us"<<std::endl;
+    std::cout << GridLogMessage<<"CoarsenOperator phase  "<<tphase<<" us"<<std::endl;
+    std::cout << GridLogMessage<<"CoarsenOperator phaseBZ "<<tphaseBZ<<" us"<<std::endl;
+    std::cout << GridLogMessage<<"CoarsenOperator mat    "<<tmat <<" us"<<std::endl;
+    std::cout << GridLogMessage<<"CoarsenOperator proj   "<<tproj<<" us"<<std::endl;
+    std::cout << GridLogMessage<<"CoarsenOperator inv    "<<tinv<<" us"<<std::endl;
+  }
+  void ExchangeCoarseLinks(void){ // replace every link field by its Cell.ExchangePeriodic image
+    for(int point=0;point<geom.npoint;++point){
+      auto &fwd = _A[point];    fwd = Cell.ExchangePeriodic(fwd);
+      auto &bwd = _Adag[point]; bwd = Cell.ExchangePeriodic(bwd);
+    }
+  }
+  virtual  void Mdiag    (const Field &in, Field &out){ assert(0);}; // not implemented: trips assert(0)
+  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp){assert(0);}; // not implemented: trips assert(0)
+  virtual  void MdirAll  (const Field &in, std::vector<Field> &out){assert(0);}; // not implemented: trips assert(0)
+};
+
+
+  
+NAMESPACE_END(Grid);
--- a/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h
+++ b/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h
@@ -0,0 +1,402 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/GeneralCoarsenedMatrixMultiRHS.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pboyle@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+#include <Grid/algorithms/multigrid/BatchedBlas.h>
+
+NAMESPACE_BEGIN(Grid);
+
+
+// Move this to accelerator.h
+// Also give a copy device.
+// Rename acceleratorPut
+// Rename acceleratorGet
+// Copy sizeof(T) bytes from the host object `host` into `dev` via acceleratorCopyToDevice.
+template<class T> void deviceSet(T& dev,T&host) {
+  acceleratorCopyToDevice(&host,&dev,sizeof(T));
+}
+// Return a host copy of `dev`, fetched with acceleratorCopyFromDevice (sizeof(T) bytes).
+template<class T> T deviceGet(T& dev) {
+  T fetched;
+  acceleratorCopyFromDevice(&dev,&fetched,sizeof(T));
+  return fetched;
+}
+
+// Fine Object == (per site) type of fine field
+// nbasis      == number of deflation vectors
+template<class Fobj,class CComplex,int nbasis>
+class MultiGeneralCoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > >  {
+public:
+  typedef typename CComplex::scalar_object SComplex;
+  typedef GeneralCoarsenedMatrix<Fobj,CComplex,nbasis> GeneralCoarseOp;
+  typedef MultiGeneralCoarsenedMatrix<Fobj,CComplex,nbasis> MultiGeneralCoarseOp;
+
+  typedef iVector<CComplex,nbasis >           siteVector;
+  typedef iMatrix<CComplex,nbasis >           siteMatrix;
+  typedef iVector<SComplex,nbasis >           calcVector;
+  typedef iMatrix<SComplex,nbasis >           calcMatrix;
+  typedef Lattice<iScalar<CComplex> >         CoarseComplexField;
+  typedef Lattice<siteVector>                 CoarseVector;
+  typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
+  typedef iMatrix<CComplex,nbasis >  Cobj;
+  typedef iVector<CComplex,nbasis >  Cvec;
+  typedef Lattice< CComplex >   CoarseScalar; // used for inner products on fine field
+  typedef Lattice<Fobj >        FineField;
+  typedef CoarseVector Field;
+
+  ////////////////////
+  // Data members
+  ////////////////////
+  GridCartesian *       _CoarseGridMulti; 
+  GridCartesian *       _CoarseGrid;
+  GeneralCoarseOp &     _Op;
+  NonLocalStencilGeometry geom;
+  PaddedCell Cell;
+  GeneralLocalStencil Stencil;
+
+  deviceVector<calcVector> BLAS_B;
+  deviceVector<calcVector> BLAS_C;
+  std::vector<deviceVector<calcMatrix> > BLAS_A;
+
+  std::vector<deviceVector<ComplexD *> > BLAS_AP;
+  std::vector<deviceVector<ComplexD *> > BLAS_BP;
+  deviceVector<ComplexD *>               BLAS_CP;
+
+  ///////////////////////
+  // Interface
+  ///////////////////////
+  GridBase      * Grid(void)           { return _CoarseGridMulti; };   // this is all the linalg routines need to know
+  GridCartesian * CoarseGrid(void)     { return _CoarseGridMulti; };   // same grid as Grid(), typed GridCartesian; note this is _CoarseGridMulti, not _CoarseGrid
+
+  // Build the batched-BLAS form of the coarse operator Op on the multi-RHS grid CoarseGridMulti:
+  // size the device buffers, fill the per-site pointer tables handed to gemmBatched, and copy
+  // the matrix entries to the device (CopyMatrix).
+  MultiGeneralCoarsenedMatrix(GeneralCoarseOp & Op,GridCartesian *CoarseGridMulti) :
+    // written in declaration order, the order in which members are actually initialised
+    _CoarseGridMulti(CoarseGridMulti),
+    _CoarseGrid(Op.CoarseGrid()),
+    _Op(Op),
+    geom(_CoarseGridMulti,Op.geom.hops,Op.geom.skip+1),
+    Cell(Op.geom.Depth(),_CoarseGridMulti),
+    Stencil(Cell.grids.back(),geom.shifts) // padded cell stencil
+  {
+    int32_t padded_sites   = _Op._A[0].Grid()->lSites();
+    int32_t unpadded_sites = _CoarseGrid->lSites();
+
+    int32_t nrhs  = CoarseGridMulti->FullDimensions()[0];  // # RHS
+    int32_t orhs  = nrhs/CComplex::Nsimd();
+
+    /////////////////////////////////////////////////
+    // Device data vector storage
+    /////////////////////////////////////////////////
+    BLAS_A.resize(geom.npoint);
+    for(int p=0;p<geom.npoint;p++){
+      BLAS_A[p].resize (unpadded_sites); // no ghost zone, npoint elements
+    }
+    BLAS_B.resize(size_t(nrhs)*padded_sites);   // includes ghost zone; product formed in size_t
+    BLAS_C.resize(size_t(nrhs)*unpadded_sites); // no ghost zone
+
+    // One device pointer per unpadded site for each of A (per point), B (per point) and C
+    BLAS_AP.resize(geom.npoint);
+    BLAS_BP.resize(geom.npoint);
+    for(int p=0;p<geom.npoint;p++){
+      BLAS_AP[p].resize(unpadded_sites);
+      BLAS_BP[p].resize(unpadded_sites);
+    }
+    BLAS_CP.resize(unpadded_sites);
+
+    /////////////////////////////////////////////////
+    // Pointers to data
+    /////////////////////////////////////////////////
+    // Site identity mapping for A, C
+    for(int p=0;p<geom.npoint;p++){
+      for(int ss=0;ss<unpadded_sites;ss++){
+        ComplexD *ptr = (ComplexD *)&BLAS_A[p][ss];
+        deviceSet(BLAS_AP[p][ss],ptr);
+      }
+    }
+    for(int ss=0;ss<unpadded_sites;ss++){
+      ComplexD *ptr = (ComplexD *)&BLAS_C[size_t(ss)*nrhs]; // nrhs consecutive vectors per site
+      deviceSet(BLAS_CP[ss],ptr);
+    }
+
+    /////////////////////////////////////////////////
+    // Neighbour table is more complicated
+    /////////////////////////////////////////////////
+    // Padded sites whose stencil wraps for any point are skipped; the remaining sites are
+    // numbered consecutively by j, and the final assert requires exactly unpadded_sites of them.
+    int32_t j=0; // Interior point counter (unpadded)
+    for(int32_t s=0;s<padded_sites;s++){ // 4 volume, padded
+      int ghost_zone=0;
+      for(int32_t point = 0 ; point < geom.npoint; point++){
+        int64_t i=int64_t(s)*orhs*geom.npoint+point;
+        if( Stencil._entries[i]._wrap ) { // stencil is indexed by the oSite of the CoarseGridMulti, hence orhs factor
+          ghost_zone=1; // If general stencil wrapped in any direction, wrap=1
+        }
+      }
+      if( ghost_zone==0) {
+        assert(j<unpadded_sites); // guard the pointer-table write below
+        for(int32_t point = 0 ; point < geom.npoint; point++){
+          int64_t i=int64_t(s)*orhs*geom.npoint+point;
+          int32_t nbr = Stencil._entries[i]._offset*CComplex::Nsimd(); // oSite -> lSite
+          assert(nbr>=0 && size_t(nbr)<BLAS_B.size());
+          ComplexD * ptr = (ComplexD *)&BLAS_B[nbr];
+          deviceSet(BLAS_BP[point][j],ptr); // neighbour indexing in ghost zone volume
+          //	  auto tmp = deviceGet(*BLAS_BP[point][j]);  // debug trigger SEGV if bad ptr
+        }
+        j++;
+      }
+    }
+    assert(j==unpadded_sites);
+    CopyMatrix();
+  }
+  template<class vobj> void GridtoBLAS(const Lattice<vobj> &from,deviceVector<typename vobj::scalar_object> &to) // unpack `from` into `to`, one scalar object per local site in lexicographic order
+  {
+#if 0
+    std::vector<typename vobj::scalar_object> tmp;
+    unvectorizeToLexOrdArray(tmp,from);
+    assert(tmp.size()==from.Grid()->lSites());
+    assert(tmp.size()==to.size());
+    to.resize(tmp.size());
+    acceleratorCopyToDevice(&tmp[0],&to[0],sizeof(typename vobj::scalar_object)*tmp.size());
+#else
+  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+
+  GridBase *Fg = from.Grid();
+  assert(!Fg->_isCheckerBoarded);
+  int nd = Fg->_ndimension;
+  // `to` always ends up with exactly lSites() entries
+  to.resize(Fg->lSites());
+
+  Coordinate LocalLatt = Fg->LocalDimensions();
+  size_t nsite = 1;
+  for(int i=0;i<nd;i++) nsite *= LocalLatt[i];
+
+  ////////////////////////////////////////////////////////////////////////////////////////////////
+  // do the index calc on the GPU
+  ////////////////////////////////////////////////////////////////////////////////////////////////
+  Coordinate f_ostride = Fg->_ostride;
+  Coordinate f_istride = Fg->_istride;
+  Coordinate f_rdimensions = Fg->_rdimensions;
+
+  autoView(from_v,from,AcceleratorRead);
+  auto to_v = &to[0];
+  // number of vector_type words in one site object
+  const int words=sizeof(vobj)/sizeof(vector_type);
+  accelerator_for(idx,nsite,1,{
+      // idx is the lexicographic local site index; decode it into a coordinate
+      Coordinate from_coor, base;
+      Lexicographic::CoorFromIndex(base,idx,LocalLatt);
+      for(int i=0;i<nd;i++){
+	from_coor[i] = base[i];
+      }
+      int from_oidx = 0; for(int d=0;d<nd;d++) from_oidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
+      int from_lane = 0; for(int d=0;d<nd;d++) from_lane+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
+      // the locals `from` and `to` below shadow the function parameters inside the kernel
+      const vector_type* from = (const vector_type *)&from_v[from_oidx];
+      scalar_type* to = (scalar_type *)&to_v[idx];
+      
+      scalar_type stmp;
+      for(int w=0;w<words;w++){
+	stmp = getlane(from[w], from_lane);
+	to[w] = stmp;
+      }
+    });
+#endif
+  }    
+  template<class vobj> void BLAStoGrid(Lattice<vobj> &grid,deviceVector<typename vobj::scalar_object> &in) // pack `in` (one scalar object per local site, lexicographic order) into the lattice `grid`
+  {
+#if 0
+    std::vector<typename vobj::scalar_object> tmp;
+    tmp.resize(in.size());
+    //    std::cout << "BLAStoGrid volume " <<tmp.size()<<" "<< grid.Grid()->lSites()<<std::endl;
+    assert(in.size()==grid.Grid()->lSites());
+    acceleratorCopyFromDevice(&in[0],&tmp[0],sizeof(typename vobj::scalar_object)*in.size());
+    vectorizeFromLexOrdArray(tmp,grid);
+#else
+  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+
+  GridBase *Tg = grid.Grid();
+  assert(!Tg->_isCheckerBoarded);
+  int nd = Tg->_ndimension;
+  // unlike GridtoBLAS nothing is resized here: the input must already hold lSites() entries
+  assert(in.size()==Tg->lSites());
+
+  Coordinate LocalLatt = Tg->LocalDimensions();
+  size_t nsite = 1;
+  for(int i=0;i<nd;i++) nsite *= LocalLatt[i];
+
+  ////////////////////////////////////////////////////////////////////////////////////////////////
+  // do the index calc on the GPU
+  ////////////////////////////////////////////////////////////////////////////////////////////////
+  Coordinate t_ostride = Tg->_ostride;
+  Coordinate t_istride = Tg->_istride;
+  Coordinate t_rdimensions = Tg->_rdimensions;
+
+  autoView(to_v,grid,AcceleratorWrite);
+  auto from_v = &in[0];
+  // number of vector_type words in one site object
+  const int words=sizeof(vobj)/sizeof(vector_type);
+  accelerator_for(idx,nsite,1,{
+      // idx is the lexicographic local site index; decode it into a coordinate
+      Coordinate to_coor, base;
+      Lexicographic::CoorFromIndex(base,idx,LocalLatt);
+      for(int i=0;i<nd;i++){
+	to_coor[i] = base[i];
+      }
+      int to_oidx = 0; for(int d=0;d<nd;d++) to_oidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
+      int to_lane = 0; for(int d=0;d<nd;d++) to_lane+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);
+      // word w of the scalar object goes into lane to_lane of word w of the site object
+      vector_type* to = (vector_type *)&to_v[to_oidx];
+      scalar_type* from = (scalar_type *)&from_v[idx];
+      
+      scalar_type stmp;
+      for(int w=0;w<words;w++){
+	stmp=from[w];
+	putlane(to[w], stmp, to_lane);
+      }
+    });
+#endif
+  }
+  // Refresh the device copies BLAS_A[p] from the operator's link fields _Op._A[p].
+  void CopyMatrix (void)
+  {
+    const int npoint = geom.npoint;
+    int point = 0;
+    while ( point < npoint ) {
+      // Extract the unpadded field with the operator's own cell, then hand it to
+      // GridtoBLAS, which lays the sites out lexicographically in BLAS_A[point]
+      auto unpadded = _Op.Cell.Extract(_Op._A[point]);
+      GridtoBLAS(unpadded,BLAS_A[point]);
+      // Debug aids kept from the original:
+      //      Coordinate coor({0,0,0,0,0});
+      //      std::cout << "CopyMatrix: p "<<p<<" Aup[0] :"<<peekSite(Aup,coor)<<std::endl;
+      //      std::cout << "Copy Matrix p "<<p<<" "<< deviceGet(BLAS_A[p][0])<<std::endl;
+      point++;
+    }
+  }
+  void Mdag(const CoarseVector &in, CoarseVector &out)
+  {
+    this->M(in,out); // NOTE(review): forwards to M unchanged; this equals the adjoint only for a Hermitian operator -- confirm with callers
+  }
+  // Multi-RHS application of the coarse operator: pad `in` with Cell.ExchangePeriodic, copy it
+  // to BLAS_B, run one gemmBatched per stencil point into BLAS_C, then copy BLAS_C into `out`.
+  // Timings and a flop/byte estimate are written to GridLogMessage on every call.
+  void M (const CoarseVector &in, CoarseVector &out)
+  {
+    std::cout << GridLogMessage << "New Mrhs coarse"<<std::endl;
+    conformable(CoarseGrid(),in.Grid());
+    conformable(in.Grid(),out.Grid());
+    out.Checkerboard() = in.Checkerboard();
+
+    // Timers in microseconds: each is set to -start and later has the stop time added
+    RealD t_tot;
+    RealD t_exch;
+    RealD t_GtoB;
+    RealD t_BtoG;
+    RealD t_mult;
+
+    t_tot=-usecond();
+    CoarseVector tin=in;
+    t_exch=-usecond();
+    CoarseVector pin = Cell.ExchangePeriodic(tin); //padded input
+    t_exch+=usecond();
+
+    // No padded output lattice is needed: BLAStoGrid below writes BLAS_C
+    // directly into `out`, so nothing else is allocated per call.
+    const int npoint = geom.npoint;
+
+    // Performance estimate only; it feeds the log lines below, not the computation
+    RealD flops,bytes;
+    int64_t osites=in.Grid()->oSites(); // unpadded
+
+    flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd();
+    bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0]
+          + 2.0*osites*sizeof(siteVector)*npoint;
+
+    int64_t nrhs  =pin.Grid()->GlobalDimensions()[0];
+    assert(nrhs>=1);
+
+    std::cout << GridLogMessage << "New Mrhs GridtoBLAS in sizes "<<in.Grid()->lSites()<<" "<<pin.Grid()->lSites()<<std::endl;
+    t_GtoB=-usecond();
+    GridtoBLAS(pin,BLAS_B);
+    //    out = Zero();
+    //    GridtoBLAS(out,BLAS_C);
+    t_GtoB+=usecond();
+
+    GridBLAS BLAS;
+
+    t_mult=-usecond();
+    for(int p=0;p<npoint;p++){
+      // beta=0 on the first point so BLAS_C needs no zeroing; beta=1 afterwards so the
+      // remaining points add to it (assuming the usual GEMM meaning of alpha/beta)
+      const ComplexD alpha(1.0);
+      const ComplexD beta( (p==0) ? 0.0 : 1.0 );
+      BLAS.gemmBatched(nbasis,nrhs,nbasis,
+                       alpha,
+                       BLAS_AP[p],
+                       BLAS_BP[p],
+                       beta,
+                       BLAS_CP);
+    }
+    BLAS.synchronise();
+    t_mult+=usecond();
+
+    t_BtoG=-usecond();
+    BLAStoGrid(out,BLAS_C);
+    t_BtoG+=usecond();
+    t_tot+=usecond();
+    // Debug aid:
+    //    auto check =deviceGet(BLAS_C[0]);
+    //    std::cout << "C[0] "<<check<<std::endl;
+
+    // usecond() timers: flops per microsecond is Mflop/s, bytes per microsecond is MB/s
+    std::cout << GridLogMessage << "New Mrhs coarse DONE "<<std::endl;
+    std::cout << GridLogMessage<<"Coarse Mult exch "<<t_exch<<" us"<<std::endl;
+    std::cout << GridLogMessage<<"Coarse Mult mult "<<t_mult<<" us"<<std::endl;
+    std::cout << GridLogMessage<<"Coarse Mult GtoB  "<<t_GtoB<<" us"<<std::endl;
+    std::cout << GridLogMessage<<"Coarse Mult BtoG  "<<t_BtoG<<" us"<<std::endl;
+    std::cout << GridLogMessage<<"Coarse Mult tot  "<<t_tot<<" us"<<std::endl;
+    std::cout << GridLogMessage<<std::endl;
+    std::cout << GridLogMessage<<"Coarse Kernel flops "<< flops<<std::endl;
+    std::cout << GridLogMessage<<"Coarse Kernel flop/s "<< flops/t_mult<<" mflop/s"<<std::endl;
+    std::cout << GridLogMessage<<"Coarse Kernel bytes/s "<< bytes/t_mult/1000<<" GB/s"<<std::endl;
+    std::cout << GridLogMessage<<"Coarse overall flops/s "<< flops/t_tot<<" mflop/s"<<std::endl;
+    std::cout << GridLogMessage<<"Coarse total bytes   "<< bytes/1e6<<" MB"<<std::endl;
+  };
+  virtual  void Mdiag    (const Field &in, Field &out){ assert(0);}; // not implemented: trips assert(0)
+  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp){assert(0);}; // not implemented: trips assert(0)
+  virtual  void MdirAll  (const Field &in, std::vector<Field> &out){assert(0);}; // not implemented: trips assert(0)
+
+};
+  
+NAMESPACE_END(Grid);
--- a/Grid/algorithms/multigrid/Geometry.h
+++ b/Grid/algorithms/multigrid/Geometry.h
@@ -0,0 +1,238 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: Grid/algorithms/multigrid/Geometry.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pboyle@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+
+/////////////////////////////////////////////////////////////////
+// Geometry class in cartesian case
+/////////////////////////////////////////////////////////////////
+
+class Geometry {
+public:
+  int npoint;                      // number of stencil points: 2*nd + 1, the last one being the self point
+  int base;                        // first stencil direction: 1 when constructed with _d==5, otherwise 0
+  std::vector<int> directions   ;  // direction of each point
+  std::vector<int> displacements;  // displacement of each point: +1, -1, or 0 for the self point
+  std::vector<int> points_dagger;  // index of the point with the opposite displacement
+
+  Geometry(int _d)  {
+    // A 5d request builds the 4d stencil over directions 1..4 (base=1)
+    base = (_d==5) ? 1:0;
+
+    // make coarse grid stencil for 4d , not 5d
+    if ( _d==5 ) _d=4;
+
+    npoint = 2*_d+1;
+    directions.resize(npoint);
+    displacements.resize(npoint);
+    points_dagger.resize(npoint);
+    for(int d=0;d<_d;d++){
+      directions[d   ] = d+base;
+      directions[d+_d] = d+base;
+      displacements[d  ] = +1;
+      displacements[d+_d]= -1;
+      points_dagger[d   ] = d+_d;
+      points_dagger[d+_d] = d;
+    }
+    directions   [2*_d]=0;
+    displacements[2*_d]=0;
+    points_dagger[2*_d]=2*_d;
+  }
+
+  int point(int dir, int disp) {
+    assert(disp == -1 || disp == 0 || disp == 1);
+    // The self point is stored as (dir 0, disp 0) even when base==1, so test it before the range assert
+    if(dir == 0 && disp == 0) return npoint-1;
+    const int nd = (npoint-1)/2; // number of stencil directions (4 in the tables below)
+    assert(base+0 <= dir && dir < base+nd);
+
+    // directions faster index = new indexing
+    // 4d (base = 0):
+    // point 0  1  2  3  4  5  6  7  8
+    // dir   0  1  2  3  0  1  2  3  0
+    // disp +1 +1 +1 +1 -1 -1 -1 -1  0
+    // 5d (base = 1):
+    // point 0  1  2  3  4  5  6  7  8
+    // dir   1  2  3  4  1  2  3  4  0
+    // disp +1 +1 +1 +1 -1 -1 -1 -1  0
+
+    // displacements faster index = old indexing
+    // 4d (base = 0):
+    // point 0  1  2  3  4  5  6  7  8
+    // dir   0  0  1  1  2  2  3  3  0
+    // disp +1 -1 +1 -1 +1 -1 +1 -1  0
+    // 5d (base = 1):
+    // point 0  1  2  3  4  5  6  7  8
+    // dir   1  1  2  2  3  3  4  4  0
+    // disp +1 -1 +1 -1 +1 -1 +1 -1  0
+    // New indexing
+    return (1 - disp) / 2 * nd + dir - base;
+    // Old indexing
+    //   return (4 * (dir - base) + 1 - disp) / 2;
+  }
+};
+
+/////////////////////////////////////////////////////////////////
+// Less local equivalent of Geometry class in cartesian case
+/////////////////////////////////////////////////////////////////
+class NonLocalStencilGeometry {
+public:
+  //  int depth;
+  int skip;                        // index of the first dimension spanned by the stencil
+  int hops;                        // largest |s0|+|s1|+|s2|+|s3| kept by BuildShifts
+  int npoint;                      // shifts.size() after BuildShifts
+  std::vector<Coordinate> shifts;  // one displacement vector per stencil point
+  Coordinate stencil_size;
+  Coordinate stencil_lo;
+  Coordinate stencil_hi;
+  GridCartesian *grid;
+  GridCartesian *Grid() {return grid;};
+  int Depth(void){return 1;};   // Ghost zone depth
+  int Hops(void){return hops;}; // # of hops=> level of corner fill in in stencil
+  int DimSkip(void){return skip;};
+
+  virtual ~NonLocalStencilGeometry() {};
+
+  // Returns the index p with shifts[p] == -shifts[point]; asserts when no such point exists.
+  int  Reverse(int point)
+  {
+    const int Nd = Grid()->Nd();
+    Coordinate fwd = shifts[point];
+    Coordinate negated(Nd);
+    for(int mu=0;mu<Nd;mu++) negated[mu] = -fwd[mu];
+    // linear search for the first matching shift
+    for(int p=0;p<npoint;p++){
+      if(negated==shifts[p]) return p;
+    }
+    assert(0);
+    return -1;
+  }
+  // Enumerate every shift inside [stencil_lo,stencil_hi] over the four dimensions starting at
+  // DimSkip() and keep those whose total hop count does not exceed hops.
+  void BuildShifts(void)
+  {
+    shifts.resize(0);
+    const int Nd    = grid->Nd();
+    const int first = DimSkip();
+    for(int a=stencil_lo[first+0];a<=stencil_hi[first+0];a++){
+    for(int b=stencil_lo[first+1];b<=stencil_hi[first+1];b++){
+    for(int c=stencil_lo[first+2];c<=stencil_hi[first+2];c++){
+    for(int d=stencil_lo[first+3];d<=stencil_hi[first+3];d++){
+      const int nhops = abs(a)+abs(b)+abs(c)+abs(d);
+      if(nhops>hops) continue;
+      Coordinate sft(Nd,0);
+      sft[first+0] = a; sft[first+1] = b;
+      sft[first+2] = c; sft[first+3] = d;
+      shifts.push_back(sft);
+    }}}}
+    npoint = shifts.size();
+    std::cout << GridLogMessage << "NonLocalStencilGeometry has "<< this->npoint << " terms in stencil "<<std::endl;
+  }
+
+  // Stencil range per dimension: {0} for extent 1, {-1,0} for extent 2, {-1,0,+1} for larger
+  // extents. The shift table is built from these ranges before the constructor returns.
+  NonLocalStencilGeometry(GridCartesian *_coarse_grid,int _hops,int _skip)
+    : skip(_skip), hops(_hops), grid(_coarse_grid)
+  {
+    const int Nd = grid->Nd();
+    Coordinate latt = grid->GlobalDimensions();
+
+    stencil_size.resize(Nd);
+    stencil_lo.resize(Nd);
+    stencil_hi.resize(Nd);
+
+    for(int d=0;d<Nd;d++){
+      const int extent = latt[d];
+      if ( extent < 1 ) continue; // nothing is assigned for such a dimension
+
+      stencil_lo[d]   = (extent == 1) ? 0 : -1;
+      stencil_hi[d]   = (extent >  2) ? 1 :  0;
+      stencil_size[d] = (extent >  2) ? 3 : extent;
+    }
+
+    BuildShifts();
+  };
+
+};
+
+// Need to worry about red-black now
+class NonLocalStencilGeometry4D : public NonLocalStencilGeometry { // base built with skip=0
+public:
+  NonLocalStencilGeometry4D(GridCartesian *Coarse,int _hops) : NonLocalStencilGeometry(Coarse,_hops,0) {}
+  virtual int DerivedDimSkip(void) { return 0; }
+  virtual ~NonLocalStencilGeometry4D() {}
+};
+class NonLocalStencilGeometry5D : public NonLocalStencilGeometry { // base built with skip=1
+public:
+  NonLocalStencilGeometry5D(GridCartesian *Coarse,int _hops) : NonLocalStencilGeometry(Coarse,_hops,1) {}
+  virtual int DerivedDimSkip(void) { return 1; }
+  virtual ~NonLocalStencilGeometry5D() {}
+};
+/*
+ * Bunch of different options classes
+ */
+// 4D stencil built with hops=4
+class NextToNextToNextToNearestStencilGeometry4D : public NonLocalStencilGeometry4D {
+public:
+  NextToNextToNextToNearestStencilGeometry4D(GridCartesian *Coarse)
+    : NonLocalStencilGeometry4D(Coarse,4) {}
+};
+// 5D stencil built with hops=4
+class NextToNextToNextToNearestStencilGeometry5D : public  NonLocalStencilGeometry5D {
+public:
+  NextToNextToNextToNearestStencilGeometry5D(GridCartesian *Coarse)
+    : NonLocalStencilGeometry5D(Coarse,4) {}
+};
+// 4D stencil built with hops=2
+class NextToNearestStencilGeometry4D : public  NonLocalStencilGeometry4D {
+public:
+  NextToNearestStencilGeometry4D(GridCartesian *Coarse)
+    : NonLocalStencilGeometry4D(Coarse,2) {}
+};
+// 5D stencil built with hops=2
+class NextToNearestStencilGeometry5D : public  NonLocalStencilGeometry5D {
+public:
+  NextToNearestStencilGeometry5D(GridCartesian *Coarse)
+    : NonLocalStencilGeometry5D(Coarse,2) {}
+};
+// 4D stencil built with hops=1
+class NearestStencilGeometry4D : public  NonLocalStencilGeometry4D {
+public:
+  NearestStencilGeometry4D(GridCartesian *Coarse)
+    : NonLocalStencilGeometry4D(Coarse,1) {}
+};
+// 5D stencil built with hops=1
+class NearestStencilGeometry5D : public  NonLocalStencilGeometry5D {
+public:
+  NearestStencilGeometry5D(GridCartesian *Coarse)
+    : NonLocalStencilGeometry5D(Coarse,1) {}
+};
+
+NAMESPACE_END(Grid);
--- a/Grid/algorithms/multigrid/MultiGrid.h
+++ b/Grid/algorithms/multigrid/MultiGrid.h
@@ -1,8 +1,8 @@
-/*************************************************************************************
+    /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

-    Source file: BatchedBlas.h
+    Source file: Grid/algorithms/multigrid/MultiGrid.h

    Copyright (C) 2023

@@ -23,12 +23,13 @@ Author: Peter Boyle <pboyle@bnl.gov>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/GridCore.h>
-#include <Grid/algorithms/blas/BatchedBlas.h>
-NAMESPACE_BEGIN(Grid);
-gridblasHandle_t GridBLAS::gridblasHandle;
-int              GridBLAS::gridblasInit;
-NAMESPACE_END(Grid);
+    *************************************************************************************/
+    /*  END LEGAL */
+#pragma once

+#include <Grid/algorithms/multigrid/Aggregates.h>
+#include <Grid/algorithms/multigrid/Geometry.h>
+#include <Grid/algorithms/multigrid/BatchedBlas.h>
+#include <Grid/algorithms/multigrid/CoarsenedMatrix.h>
+#include <Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h>
+#include <Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h>
--- a/Grid/allocator/AlignedAllocator.h
+++ b/Grid/allocator/AlignedAllocator.h
@@ -175,9 +175,56 @@ template<class T> using cshiftAllocator = std::allocator<T>;

 template<class T> using Vector        = std::vector<T,uvmAllocator<T> >;           
 template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;           
-template<class T> using commVector = std::vector<T,devAllocator<T> >;
+template<class T> using commVector    = std::vector<T,devAllocator<T> >;
 template<class T> using deviceVector  = std::vector<T,devAllocator<T> >;
-template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;
+template<class T> using cshiftVector  = std::vector<T,cshiftAllocator<T> >;
+
+/*
+template<class T> class vecView
+{
+ protected:
+  T * data;
+  uint64_t size;
+  ViewMode mode;
+  void * cpu_ptr;
+ public:
+  accelerator_inline T & operator[](size_t i) const { return this->data[i]; };
+  vecView(std::vector<T> &refer_to_me,ViewMode _mode)
+  {
+    cpu_ptr = &refer_to_me[0];
+    size = refer_to_me.size();
+    mode = _mode;
+    data =(T *) MemoryManager::ViewOpen(cpu_ptr,
+					size*sizeof(T),
+					mode,
+					AdviseDefault);
+  }
+  void ViewClose(void)
+  { // Inform the manager
+    MemoryManager::ViewClose(this->cpu_ptr,this->mode);    
+  }
+};
+
+template<class T> vecView<T> VectorView(std::vector<T> &vec,ViewMode _mode)
+{
+  vecView<T> ret(vec,_mode); // does the open
+  return ret;                // must be closed
+}
+
+// Little autoscope assister
+template<class View> 
+class VectorViewCloser
+{
+  View v;  // Take a copy of view and call view close when I go out of scope automatically
+ public:
+  VectorViewCloser(View &_v) : v(_v) {};
+  ~VectorViewCloser() { auto ptr = v.cpu_ptr; v.ViewClose();  MemoryManager::NotifyDeletion(ptr);}
+};
+
+#define autoVecView(v_v,v,mode)					\
+  auto v_v = VectorView(v,mode);				\
+  ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
+*/

 NAMESPACE_END(Grid);

--- a/Grid/allocator/MemoryManager.h
+++ b/Grid/allocator/MemoryManager.h
@@ -209,9 +209,9 @@ private:
  static void     CpuViewClose(uint64_t Ptr);
  static uint64_t CpuViewOpen(uint64_t  CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
 #endif
-  static void NotifyDeletion(void * CpuPtr);

 public:
+  static void NotifyDeletion(void * CpuPtr);
  static void Print(void);
  static void PrintAll(void);
  static void PrintState( void* CpuPtr);
--- a/Grid/allocator/MemoryManagerCache.cc
+++ b/Grid/allocator/MemoryManagerCache.cc
@@ -8,7 +8,7 @@ NAMESPACE_BEGIN(Grid);
 static char print_buffer [ MAXLINE ];

 #define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer;
-#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer;
+#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer;
 //#define dprintf(...) 


@@ -111,7 +111,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
  ///////////////////////////////////////////////////////////
  assert(AccCache.state!=Empty);
  
-  mprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
+  dprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
  assert(AccCache.accLock==0);
  assert(AccCache.cpuLock==0);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
@@ -141,7 +141,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
  ///////////////////////////////////////////////////////////////////////////
  assert(AccCache.state!=Empty);
  
-  mprintf("MemoryManager: Evict cpu %lx acc %lx cpuLock %ld accLock %ld\n",
+  mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld\n",
 	  (uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
 	  (uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock); 
  if (AccCache.accLock!=0) return;
@@ -155,7 +155,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
    AccCache.AccPtr=(uint64_t)NULL;
    AccCache.state=CpuDirty; // CPU primary now
    DeviceBytes   -=AccCache.bytes;
-    dprintf("MemoryManager: Free(%lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);  
+    dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);  
  }
  //  uint64_t CpuPtr = AccCache.CpuPtr;
  DeviceEvictions++;
@@ -169,7 +169,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
  assert(AccCache.AccPtr!=(uint64_t)NULL);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
  acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
-  mprintf("MemoryManager: Flush  %lx -> %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  mprintf("MemoryManager: acceleratorCopyFromDevice Flush AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
  DeviceToHostBytes+=AccCache.bytes;
  DeviceToHostXfer++;
  AccCache.state=Consistent;
@@ -184,7 +184,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
    AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
    DeviceBytes+=AccCache.bytes;
  }
-  mprintf("MemoryManager: Clone %lx <- %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  mprintf("MemoryManager: acceleratorCopyToDevice   Clone AccPtr %lx <- CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
  acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
  HostToDeviceBytes+=AccCache.bytes;
  HostToDeviceXfer++;
--- a/Grid/cartesian/Cartesian_base.h
+++ b/Grid/cartesian/Cartesian_base.h
@@ -70,8 +70,8 @@ public:
  Coordinate _istride;    // Inner stride i.e. within simd lane
  int _osites;                  // _isites*_osites = product(dimensions).
  int _isites;
-  int _fsites;                  // _isites*_osites = product(dimensions).
-  int _gsites;
+  int64_t _fsites;                  // _isites*_osites = product(dimensions).
+  int64_t _gsites;
  Coordinate _slice_block;// subslice information
  Coordinate _slice_stride;
  Coordinate _slice_nblock;
@@ -183,7 +183,7 @@ public:
  inline int Nsimd(void)  const { return _isites; };// Synonymous with iSites
  inline int oSites(void) const { return _osites; };
  inline int lSites(void) const { return _isites*_osites; }; 
-  inline int gSites(void) const { return _isites*_osites*_Nprocessors; }; 
+  inline int64_t gSites(void) const { return (int64_t)_isites*(int64_t)_osites*(int64_t)_Nprocessors; }; 
  inline int Nd    (void) const { return _ndimension;};

  inline const Coordinate LocalStarts(void)             { return _lstart;    };
@@ -214,7 +214,7 @@ public:
  ////////////////////////////////////////////////////////////////
  // Global addressing
  ////////////////////////////////////////////////////////////////
-  void GlobalIndexToGlobalCoor(int gidx,Coordinate &gcoor){
+  void GlobalIndexToGlobalCoor(int64_t gidx,Coordinate &gcoor){
    assert(gidx< gSites());
    Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
  }
@@ -222,7 +222,7 @@ public:
    assert(lidx<lSites());
    Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
  }
-  void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int & gidx){
+  void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int64_t & gidx){
    gidx=0;
    int mult=1;
    for(int mu=0;mu<_ndimension;mu++) {
--- a/Grid/communicator/Communicator_base.h
+++ b/Grid/communicator/Communicator_base.h
@@ -138,6 +138,14 @@ public:
  ////////////////////////////////////////////////////////////
  // Face exchange, buffer swap in translational invariant way
  ////////////////////////////////////////////////////////////
+  void CommsComplete(std::vector<CommsRequest_t> &list); // completes the requests previously appended to list by SendToRecvFromBegin
+  void SendToRecvFromBegin(std::vector<CommsRequest_t> &list, // starts a send/receive pair; the pending requests are appended to list
+			   void *xmit, // buffer to send from
+			   int dest, // rank to send to
+			   void *recv, // buffer to receive into
+			   int from, // rank to receive from
+			   int bytes,int dir); // bytes: message size in each direction; dir: folded into the message tag by the MPI implementation
+  
  void SendToRecvFrom(void *xmit,
 		      int xmit_to_rank,
 		      void *recv,
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -306,6 +306,44 @@ void CartesianCommunicator::GlobalSumVector(double *d,int N)
  int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
  assert(ierr==0);
 }
+
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list, // posts a nonblocking receive and a nonblocking send; both requests are appended to list
+						void *xmit, // buffer handed to MPI_Isend
+						int dest, // destination rank; asserted to differ from _processor
+						void *recv, // buffer handed to MPI_Irecv
+						int from, // source rank; asserted to differ from _processor
+						int bytes,int dir) // bytes: count of MPI_CHAR in each direction; dir: folded into the tag
+{
+  MPI_Request xrq;
+  MPI_Request rrq;
+
+  assert(dest != _processor); // sending to or receiving from _processor itself is rejected
+  assert(from != _processor);
+
+  int tag;
+
+  tag= dir+from*32; // receive tag is built from the sender's rank, mirroring the send tag below
+  int ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator,&rrq);
+  assert(ierr==0);
+  list.push_back(rrq); // receive request is queued first
+  
+  tag= dir+_processor*32; // send tag uses _processor where the receive side uses from
+  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator,&xrq);
+  assert(ierr==0);
+  list.push_back(xrq); // NOTE(review): both buffers are still in use by MPI until the requests complete (see CommsComplete)
+}
+void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list) // waits for every request in list, then empties the list
+{
+  int nreq=list.size();
+
+  if (nreq==0) return; // nothing outstanding
+
+  std::vector<MPI_Status> status(nreq); // one status slot per request
+  int ierr = MPI_Waitall(nreq,&list[0],&status[0]); // returns once all requests have completed
+  assert(ierr==0);
+  list.resize(0); // list is empty on return
+}
+
 // Basic Halo comms primitive
 void CartesianCommunicator::SendToRecvFrom(void *xmit,
 					   int dest,
@@ -348,7 +386,6 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
  return offbytes;
 }

-#undef NVLINK_GET // Define to use get instead of put DMA
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
 							 int dest,int dox,
@@ -381,15 +418,9 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
      list.push_back(rrq);
      off_node_bytes+=rbytes;
    }
-#ifdef NVLINK_GET
-      void *shm = (void *) this->ShmBufferTranslate(from,xmit);
-      assert(shm!=NULL);
-      acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
-#endif
  }
  
  if (dox) {
-    //  rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
      tag= dir+_processor*32;
      ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
@@ -397,12 +428,9 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
      list.push_back(xrq);
      off_node_bytes+=xbytes;
    } else {
-#ifndef NVLINK_GET
      void *shm = (void *) this->ShmBufferTranslate(dest,recv);
      assert(shm!=NULL);
      acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
-#endif
-      
    }
  }

@@ -412,8 +440,6 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
 {
  int nreq=list.size();

-  acceleratorCopySynchronise();
-
  if (nreq==0) return;

  std::vector<MPI_Status> status(nreq);
--- a/Grid/communicator/Communicator_none.cc
+++ b/Grid/communicator/Communicator_none.cc
@@ -91,6 +91,17 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 {
  assert(0);
 }
+void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(0);} // not supported in this implementation: fails the assertion whenever it is called
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list, // not supported in this implementation; all arguments are ignored
+						void *xmit,
+						int dest,
+						void *recv,
+						int from,
+						int bytes,int dir)
+{
+  assert(0); // fails the assertion whenever it is called
+}
+
 void CartesianCommunicator::AllToAll(int dim,void  *in,void *out,uint64_t words,uint64_t bytes)
 {
  bcopy(in,out,bytes*words);
--- a/Grid/communicator/SharedMemory.cc
+++ b/Grid/communicator/SharedMemory.cc
@@ -40,9 +40,6 @@ int                 GlobalSharedMemory::_ShmAlloc;
 uint64_t            GlobalSharedMemory::_ShmAllocBytes;

 std::vector<void *> GlobalSharedMemory::WorldShmCommBufs;
-#ifndef ACCELERATOR_AWARE_MPI
-void * GlobalSharedMemory::HostCommBuf;
-#endif

 Grid_MPI_Comm       GlobalSharedMemory::WorldShmComm;
 int                 GlobalSharedMemory::WorldShmRank;
@@ -69,26 +66,6 @@ void GlobalSharedMemory::SharedMemoryFree(void)
 /////////////////////////////////
 // Alloc, free shmem region
 /////////////////////////////////
-#ifndef ACCELERATOR_AWARE_MPI
-void *SharedMemory::HostBufferMalloc(size_t bytes){
-  void *ptr = (void *)host_heap_top;
-  host_heap_top  += bytes;
-  host_heap_bytes+= bytes;
-  if (host_heap_bytes >= host_heap_size) {
-    std::cout<< " HostBufferMalloc exceeded heap size -- try increasing with --shm <MB> flag" <<std::endl;
-    std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
-    std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl;
-    std::cout<< " Current bytes is " << (host_heap_bytes/(1024*1024)) <<"MB"<<std::endl;
-    std::cout<< " Current heap  is " << (host_heap_size/(1024*1024)) <<"MB"<<std::endl;
-    assert(host_heap_bytes<host_heap_size);
-  }
-  return ptr;
-}
-void SharedMemory::HostBufferFreeAll(void) { 
-  host_heap_top  =(size_t)HostCommBuf;
-  host_heap_bytes=0;
-}
-#endif
 void *SharedMemory::ShmBufferMalloc(size_t bytes){
  //  bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes
  void *ptr = (void *)heap_top;
--- a/Grid/communicator/SharedMemory.h
+++ b/Grid/communicator/SharedMemory.h
@@ -75,9 +75,7 @@ public:
  static int           Hugepages;

  static std::vector<void *> WorldShmCommBufs;
-#ifndef ACCELERATOR_AWARE_MPI
-  static void *HostCommBuf;
-#endif
+
  static Grid_MPI_Comm WorldComm;
  static int           WorldRank;
  static int           WorldSize;
@@ -122,13 +120,6 @@ private:
  size_t heap_bytes;
  size_t heap_size;

-#ifndef ACCELERATOR_AWARE_MPI
-  size_t host_heap_top;  // set in free all
-  size_t host_heap_bytes;// set in free all
-  void *HostCommBuf;     // set in SetCommunicator
-  size_t host_heap_size; // set in SetCommunicator
-#endif
-  
 protected:

  Grid_MPI_Comm    ShmComm; // for barriers
@@ -160,10 +151,7 @@ public:
  void *ShmBufferTranslate(int rank,void * local_p);
  void *ShmBufferMalloc(size_t bytes);
  void  ShmBufferFreeAll(void) ;
-#ifndef ACCELERATOR_AWARE_MPI
-  void *HostBufferMalloc(size_t bytes);
-  void HostBufferFreeAll(void);
-#endif  
+  
  //////////////////////////////////////////////////////////////////////////
  // Make info on Nodes & ranks and Shared memory available
  //////////////////////////////////////////////////////////////////////////
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -39,11 +39,9 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #include <hip/hip_runtime_api.h>
 #endif
 #ifdef GRID_SYCL
-#ifdef ACCELERATOR_AWARE_MPI
 #define GRID_SYCL_LEVEL_ZERO_IPC
-#define SHM_SOCKETS
-#endif 
 #include <syscall.h>
+#define SHM_SOCKETS 
 #endif

 #include <sys/socket.h>
@@ -514,6 +512,46 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 // Hugetlbfs mapping intended
 ////////////////////////////////////////////////////////////////////////////////////////////
 #if defined(GRID_CUDA) ||defined(GRID_HIP)  || defined(GRID_SYCL)
+
+//if defined(GRID_SYCL)
+#if 0
+void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) // disabled variant (enclosed in #if 0): allocates one device buffer of the given size; flags is not used
+{
+  void * ShmCommBuf ; 
+  assert(_ShmSetup==1); // requires _ShmSetup to be set already
+  assert(_ShmAlloc==0); // and no allocation to have been recorded yet
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // allocate the pointer array for shared windows for our group
+  //////////////////////////////////////////////////////////////////////////////////////////////////////////
+  MPI_Barrier(WorldShmComm);
+  WorldShmCommBufs.resize(WorldShmSize);
+
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Each MPI rank should allocate our own buffer
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
+  ShmCommBuf = acceleratorAllocDevice(bytes);
+
+  if (ShmCommBuf == (void *)NULL ) {
+    std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
+    exit(EXIT_FAILURE);  
+  }
+
+  std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes 
+	    << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
+
+  SharedMemoryZero(ShmCommBuf,bytes);
+
+  assert(WorldShmSize == 1); // this variant only handles a shared-memory group of exactly one rank
+  for(int r=0;r<WorldShmSize;r++){
+    WorldShmCommBufs[r] = ShmCommBuf; // every slot refers to the locally allocated buffer
+  }
+  _ShmAllocBytes=bytes;
+  _ShmAlloc=1; // record that allocation has been done
+}
+#endif
+
+#if defined(GRID_CUDA) ||defined(GRID_HIP) ||defined(GRID_SYCL)  
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
  void * ShmCommBuf ; 
@@ -536,9 +574,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Each MPI rank should allocate our own buffer
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
-#ifndef ACCELERATOR_AWARE_MPI
-  HostCommBuf= malloc(bytes);
-#endif  
  ShmCommBuf = acceleratorAllocDevice(bytes);
  if (ShmCommBuf == (void *)NULL ) {
    std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
@@ -703,6 +738,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
  _ShmAllocBytes=bytes;
  _ShmAlloc=1;
 }
+#endif

 #else 
 #ifdef GRID_MPI3_SHMMMAP
@@ -926,12 +962,6 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
  }
  ShmBufferFreeAll();

-#ifndef ACCELERATOR_AWARE_MPI
-  host_heap_size = heap_size;
-  HostCommBuf= GlobalSharedMemory::HostCommBuf;
-  HostBufferFreeAll();
-#endif  
-
  /////////////////////////////////////////////////////////////////////
  // find comm ranks in our SHM group (i.e. which ranks are on our node)
  /////////////////////////////////////////////////////////////////////
--- a/Grid/cshift/Cshift_common.h
+++ b/Grid/cshift/Cshift_common.h
@@ -29,27 +29,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

-extern std::vector<std::pair<int,int> > Cshift_table; 
-extern commVector<std::pair<int,int> > Cshift_table_device; 
+extern Vector<std::pair<int,int> > Cshift_table; 

-inline std::pair<int,int> *MapCshiftTable(void)
-{
-  // GPU version
-#ifdef ACCELERATOR_CSHIFT    
-  uint64_t sz=Cshift_table.size();
-  if (Cshift_table_device.size()!=sz )    {
-    Cshift_table_device.resize(sz);
-  }
-  acceleratorCopyToDevice((void *)&Cshift_table[0],
-			  (void *)&Cshift_table_device[0],
-			  sizeof(Cshift_table[0])*sz);
-
-  return &Cshift_table_device[0];
-#else 
-  return &Cshift_table[0];
-#endif
-  // CPU version use identify map
-}
 ///////////////////////////////////////////////////////////////////
 // Gather for when there is no need to SIMD split 
 ///////////////////////////////////////////////////////////////////
@@ -93,7 +74,7 @@ Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dim
  }
  {
    auto buffer_p = & buffer[0];
-    auto table = MapCshiftTable();
+    auto table = &Cshift_table[0];
 #ifdef ACCELERATOR_CSHIFT    
    autoView(rhs_v , rhs, AcceleratorRead);
    accelerator_for(i,ent,vobj::Nsimd(),{
@@ -244,7 +225,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<
  
  {
    auto buffer_p = & buffer[0];
-    auto table = MapCshiftTable();
+    auto table = &Cshift_table[0];
 #ifdef ACCELERATOR_CSHIFT    
    autoView( rhs_v, rhs, AcceleratorWrite);
    accelerator_for(i,ent,vobj::Nsimd(),{
@@ -316,6 +297,30 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
  }
 }

+#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
+
+template <typename T>
+T iDivUp(T a, T b) // Divide a by b, adding one when there is a remainder (ceiling of a/b for non-negative a and positive b)
+{ return (a % b != 0) ? (a / b + 1) : (a / b); }
+
+template <typename T>
+__global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride) // kernel: each thread with idx < e1*e2 writes the two values (lo+o, ro+o) for entry idx into vector
+{
+    int idx = blockIdx.x*blockDim.x + threadIdx.x; // flat thread index, used as the entry number
+    if (idx >= e1*e2) return; // threads beyond the last entry do nothing
+
+    int n, b, o;
+
+    n = idx / e2; // outer index, 0 .. e1-1
+    b = idx % e2; // inner index, 0 .. e2-1
+    o = n*stride + b; // offset shared by both values of the entry
+
+    vector[2*idx + 0] = lo + o; // entries occupy two consecutive elements of vector
+    vector[2*idx + 1] = ro + o;
+}
+
+#endif
+
 //////////////////////////////////////////////////////
 // local to node block strided copies
 //////////////////////////////////////////////////////
@@ -340,12 +345,20 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
  int ent=0;

  if(cbmask == 0x3 ){
+#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
+    ent = e1*e2;
+    dim3 blockSize(acceleratorThreads());
+    dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
+    populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
+    accelerator_barrier();
+#else
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
        int o =n*stride+b;
 	Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
      }
    }
+#endif
  } else { 
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
@@ -359,7 +372,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
  }

  {
-    auto table = MapCshiftTable();
+    auto table = &Cshift_table[0];
 #ifdef ACCELERATOR_CSHIFT    
    autoView(rhs_v , rhs, AcceleratorRead);
    autoView(lhs_v , lhs, AcceleratorWrite);
@@ -396,11 +409,19 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
  int ent=0;

  if ( cbmask == 0x3 ) {
+#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
+    ent = e1*e2;
+    dim3 blockSize(acceleratorThreads());
+    dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
+    populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
+    accelerator_barrier();
+#else
    for(int n=0;n<e1;n++){
    for(int b=0;b<e2;b++){
      int o  =n*stride;
      Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
    }}
+#endif
  } else {
    for(int n=0;n<e1;n++){
    for(int b=0;b<e2;b++){
@@ -411,7 +432,7 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
  }

  {
-    auto table = MapCshiftTable();
+    auto table = &Cshift_table[0];
 #ifdef ACCELERATOR_CSHIFT    
    autoView( rhs_v, rhs, AcceleratorRead);
    autoView( lhs_v, lhs, AcceleratorWrite);
--- a/Grid/cshift/Cshift_mpi.h
+++ b/Grid/cshift/Cshift_mpi.h
@@ -52,8 +52,7 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
  int comm_dim        = rhs.Grid()->_processors[dimension] >1 ;
  int splice_dim      = rhs.Grid()->_simd_layout[dimension]>1 && (comm_dim);

-  RealD t1,t0;
-  t0=usecond();
+
  if ( !comm_dim ) {
    //std::cout << "CSHIFT: Cshift_local" <<std::endl;
    Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
@@ -64,8 +63,6 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
    //std::cout << "CSHIFT: Cshift_comms" <<std::endl;
    Cshift_comms(ret,rhs,dimension,shift);
  }
-  t1=usecond();
-  //  std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
  return ret;
 }

@@ -130,20 +127,16 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
    
  int cb= (cbmask==0x2)? Odd : Even;
  int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
-  RealD tcopy=0.0;
-  RealD tgather=0.0;
-  RealD tscatter=0.0;
-  RealD tcomms=0.0;
-  uint64_t xbytes=0;
+
  for(int x=0;x<rd;x++){       

    int sx        =  (x+sshift)%rd;
    int comm_proc = ((x+sshift)/rd)%pd;
    
    if (comm_proc==0) {
-      tcopy-=usecond();
+
      Copy_plane(ret,rhs,dimension,x,sx,cbmask); 
-      tcopy+=usecond();
+
    } else {

      int words = buffer_size;
@@ -151,39 +144,26 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r

      int bytes = words * sizeof(vobj);

-      tgather-=usecond();
      Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
-      tgather+=usecond();

      //      int rank           = grid->_processor;
      int recv_from_rank;
      int xmit_to_rank;
      grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);

-      tcomms-=usecond();
-      //      grid->Barrier();
+      grid->Barrier();

      grid->SendToRecvFrom((void *)&send_buf[0],
 			   xmit_to_rank,
 			   (void *)&recv_buf[0],
 			   recv_from_rank,
 			   bytes);
-      xbytes+=bytes;
-      //      grid->Barrier();
-      tcomms+=usecond();

-      tscatter-=usecond();
+      grid->Barrier();
+
      Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
-      tscatter+=usecond();
    }
  }
-  /*
-  std::cout << GridLogPerformance << " Cshift copy    "<<tcopy/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift gather  "<<tgather/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift comm    "<<tcomms/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
-  */
 }

 template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@@ -210,12 +190,6 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
  assert(shift>=0);
  assert(shift<fd);

-  RealD tcopy=0.0;
-  RealD tgather=0.0;
-  RealD tscatter=0.0;
-  RealD tcomms=0.0;
-  uint64_t xbytes=0;
-  
  int permute_type=grid->PermuteType(dimension);

  ///////////////////////////////////////////////
@@ -253,9 +227,7 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
      pointers[i] = &send_buf_extract[i][0];
    }
    int sx   = (x+sshift)%rd;
-    tgather-=usecond();
    Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
-    tgather+=usecond();

    for(int i=0;i<Nsimd;i++){
      
@@ -280,8 +252,7 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
      if(nbr_proc){
 	grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 

-	tcomms-=usecond();
-	//	grid->Barrier();
+	grid->Barrier();

 	send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
 	recv_buf_extract_mpi = &recv_buf_extract[i][0];
@@ -291,9 +262,7 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
 			     recv_from_rank,
 			     bytes);

-	xbytes+=bytes;
-	//	grid->Barrier();
-	tcomms+=usecond();
+	grid->Barrier();

 	rpointers[i] = &recv_buf_extract[i][0];
      } else { 
@@ -301,17 +270,9 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
      }

    }
-    tscatter-=usecond();
    Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
-    tscatter+=usecond();
  }
-  /*
-  std::cout << GridLogPerformance << " Cshift (s) copy    "<<tcopy/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift (s) gather  "<<tgather/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift (s) comm    "<<tcomms/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
-  */
+
 }
 #else 
 template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@@ -331,11 +292,6 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
  assert(comm_dim==1);
  assert(shift>=0);
  assert(shift<fd);
-  RealD tcopy=0.0;
-  RealD tgather=0.0;
-  RealD tscatter=0.0;
-  RealD tcomms=0.0;
-  uint64_t xbytes=0;
  
  int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
  static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
@@ -359,9 +315,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
    
    if (comm_proc==0) {

-      tcopy-=usecond();
      Copy_plane(ret,rhs,dimension,x,sx,cbmask); 
-      tcopy+=usecond();

    } else {

@@ -370,9 +324,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r

      int bytes = words * sizeof(vobj);

-      tgather-=usecond();
      Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
-      tgather+=usecond();

      //      int rank           = grid->_processor;
      int recv_from_rank;
@@ -380,8 +332,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
      grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);


-      tcomms-=usecond();
-      //      grid->Barrier();
+      grid->Barrier();

      acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
      grid->SendToRecvFrom((void *)&send_buf[0],
@@ -389,24 +340,13 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
 			   (void *)&recv_buf[0],
 			   recv_from_rank,
 			   bytes);
-      xbytes+=bytes;
      acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);

-      //      grid->Barrier();
-      tcomms+=usecond();
+      grid->Barrier();

-      tscatter-=usecond();
      Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
-      tscatter+=usecond();
    }
  }
-  /*
-  std::cout << GridLogPerformance << " Cshift copy    "<<tcopy/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift gather  "<<tgather/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift comm    "<<tcomms/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
-  */
 }

 template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@@ -432,11 +372,6 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
  assert(simd_layout==2);
  assert(shift>=0);
  assert(shift<fd);
-  RealD tcopy=0.0;
-  RealD tgather=0.0;
-  RealD tscatter=0.0;
-  RealD tcomms=0.0;
-  uint64_t xbytes=0;

  int permute_type=grid->PermuteType(dimension);

@@ -479,10 +414,8 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
    for(int i=0;i<Nsimd;i++){       
      pointers[i] = &send_buf_extract[i][0];
    }
-    tgather-=usecond();
    int sx   = (x+sshift)%rd;
    Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
-    tgather+=usecond();

    for(int i=0;i<Nsimd;i++){
      
@@ -507,8 +440,7 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
      if(nbr_proc){
 	grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 

-	tcomms-=usecond();
-	//	grid->Barrier();
+	grid->Barrier();

 	acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
 	grid->SendToRecvFrom((void *)send_buf_extract_mpi,
@@ -517,28 +449,17 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
 			     recv_from_rank,
 			     bytes);
 	acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
-	xbytes+=bytes;

-	//	grid->Barrier();
-	tcomms+=usecond();
+	grid->Barrier();
 	rpointers[i] = &recv_buf_extract[i][0];
      } else { 
 	rpointers[i] = &send_buf_extract[nbr_lane][0];
      }

    }
-    tscatter-=usecond();
    Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
-    tscatter+=usecond();
-
  }
-  /*
-  std::cout << GridLogPerformance << " Cshift (s) copy    "<<tcopy/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift (s) gather  "<<tgather/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift (s) comm    "<<tcomms/1e3<<" ms"<<std::endl;
-  std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl;
-  */
+
 }
 #endif
 NAMESPACE_END(Grid); 
--- a/Grid/cshift/Cshift_table.cc
+++ b/Grid/cshift/Cshift_table.cc
@@ -1,5 +1,4 @@
 #include <Grid/GridCore.h>       
 NAMESPACE_BEGIN(Grid);
-std::vector<std::pair<int,int> > Cshift_table; 
-commVector<std::pair<int,int> > Cshift_table_device; 
+Vector<std::pair<int,int> > Cshift_table; 
 NAMESPACE_END(Grid);
--- a/Grid/lattice/Lattice.h
+++ b/Grid/lattice/Lattice.h
@@ -35,7 +35,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_transpose.h>
 #include <Grid/lattice/Lattice_local.h>
 #include <Grid/lattice/Lattice_reduction.h>
-#include <Grid/lattice/Lattice_crc.h>
 #include <Grid/lattice/Lattice_peekpoke.h>
 #include <Grid/lattice/Lattice_reality.h>
 #include <Grid/lattice/Lattice_real_imag.h>
@@ -47,4 +46,5 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_unary.h>
 #include <Grid/lattice/Lattice_transfer.h>
 #include <Grid/lattice/Lattice_basis.h>
+#include <Grid/lattice/Lattice_crc.h>
 #include <Grid/lattice/PaddedCell.h>
--- a/Grid/lattice/Lattice_arith.h
+++ b/Grid/lattice/Lattice_arith.h
@@ -270,42 +270,5 @@ RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const L
    return axpby_norm_fast(ret,a,b,x,y);
 }

-/// Trace product
-template<class obj> auto traceProduct(const Lattice<obj> &rhs_1,const Lattice<obj> &rhs_2)
-  -> Lattice<decltype(trace(obj()))>
-{
-  typedef decltype(trace(obj())) robj;
-  Lattice<robj> ret_i(rhs_1.Grid());
-  autoView( rhs1 , rhs_1, AcceleratorRead);
-  autoView( rhs2 , rhs_2, AcceleratorRead);
-  autoView( ret , ret_i, AcceleratorWrite);
-  ret.Checkerboard() = rhs_1.Checkerboard();
-  accelerator_for(ss,rhs1.size(),obj::Nsimd(),{
-      coalescedWrite(ret[ss],traceProduct(rhs1(ss),rhs2(ss)));
-  });
-  return ret_i;
-}
-
-template<class obj1,class obj2> auto traceProduct(const Lattice<obj1> &rhs_1,const obj2 &rhs2)
-  -> Lattice<decltype(trace(obj1()))>
-{
-  typedef decltype(trace(obj1())) robj;
-  Lattice<robj> ret_i(rhs_1.Grid());
-  autoView( rhs1 , rhs_1, AcceleratorRead);
-  autoView( ret , ret_i, AcceleratorWrite);
-  ret.Checkerboard() = rhs_1.Checkerboard();
-  accelerator_for(ss,rhs1.size(),obj1::Nsimd(),{
-      coalescedWrite(ret[ss],traceProduct(rhs1(ss),rhs2));
-  });
-  return ret_i;
-}
-template<class obj1,class obj2> auto traceProduct(const obj2 &rhs_2,const Lattice<obj1> &rhs_1)
-  -> Lattice<decltype(trace(obj1()))>
-{
-  return traceProduct(rhs_1,rhs_2);
-}
-
-
-
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -360,7 +360,7 @@ public:

 template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
  typedef typename vobj::scalar_object sobj;
-  for(int g=0;g<o.Grid()->_gsites;g++){
+  for(int64_t g=0;g<o.Grid()->_gsites;g++){

    Coordinate gcoor;
    o.Grid()->GlobalIndexToGlobalCoor(g,gcoor);
--- a/Grid/lattice/Lattice_basis.h
+++ b/Grid/lattice/Lattice_basis.h
@@ -62,7 +62,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
    basis_v.push_back(basis[k].View(AcceleratorWrite));
  }

-#if ( !(defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)) )
+#if ( (!defined(GRID_CUDA)) )
  int max_threads = thread_max();
  Vector < vobj > Bt(Nm * max_threads);
  thread_region
--- a/Grid/lattice/Lattice_crc.h
+++ b/Grid/lattice/Lattice_crc.h
@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

-template<class vobj> void DumpSliceNorm(std::string s,Lattice<vobj> &f,int mu=-1)
+template<class vobj> void DumpSliceNorm(std::string s,const Lattice<vobj> &f,int mu=-1)
 {
  auto ff = localNorm2(f);
  if ( mu==-1 ) mu = f.Grid()->Nd()-1;
@@ -42,13 +42,13 @@ template<class vobj> void DumpSliceNorm(std::string s,Lattice<vobj> &f,int mu=-1
  }
 }

-template<class vobj> uint32_t crc(const Lattice<vobj> & buf)
+template<class vobj> uint32_t crc(Lattice<vobj> & buf)
 {
  autoView( buf_v , buf, CpuRead);
  return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites());
 }

-#define CRC(U) std::cerr << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
+#define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;

 NAMESPACE_END(Grid);

--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -31,7 +31,6 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #if defined(GRID_SYCL)
 #include <Grid/lattice/Lattice_reduction_sycl.h>
 #endif
-#include <Grid/lattice/Lattice_slicesum_core.h>

 NAMESPACE_BEGIN(Grid);

@@ -204,6 +203,27 @@ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
  return real(nrm); 
 }

+// norm2 overloads for lattice expression templates: each forwards to norm2(closure(expr)).
+template<class Op,class T1>
+inline auto norm2(const LatticeUnaryExpression<Op,T1> & expr)  ->RealD
+{
+  return norm2(closure(expr));
+}
+// Binary expression node; closure() is defined elsewhere (presumably evaluates the expression - confirm).
+template<class Op,class T1,class T2>
+inline auto norm2(const LatticeBinaryExpression<Op,T1,T2> & expr)      ->RealD
+{
+  return norm2(closure(expr));
+}
+
+// Trinary expression node; same forwarding as the unary and binary overloads.
+template<class Op,class T1,class T2,class T3>
+inline auto norm2(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)      ->RealD
+{
+  return norm2(closure(expr));
+}
+
+
 //The global maximum of the site norm2
 template<class vobj> inline RealD maxLocalNorm2(const Lattice<vobj> &arg)
 {
@@ -281,29 +301,11 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
  return nrm;
 }

-
 template<class vobj>
 inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
  GridBase *grid = left.Grid();
-
-#ifdef GRID_SYCL
-  uint64_t csum=0;
-  if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
-  {
-    // Hack
-    // Fast integer xor checksum. Can also be used in comms now.
-    autoView(l_v,left,AcceleratorRead);
-    Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
-    uint64_t *base= (uint64_t *)&l_v[0];
-    csum=svm_xor(base,words);
-  }
-  FlightRecorder::CsumLog(csum);
-#endif
  ComplexD nrm = rankInnerProduct(left,right);
-  RealD local = real(nrm);
-  FlightRecorder::NormLog(real(nrm)); 
  grid->GlobalSum(nrm);
-  FlightRecorder::ReductionLog(local,real(nrm)); 
  return nrm;
 }

@@ -467,10 +469,19 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
  int e1=    grid->_slice_nblock[orthogdim];
  int e2=    grid->_slice_block [orthogdim];
  int stride=grid->_slice_stride[orthogdim];
-  int ostride=grid->_ostride[orthogdim];

-  //Reduce Data down to lvSum
-  sliceSumReduction(Data,lvSum,rd, e1,e2,stride,ostride,Nsimd);
+  // sum over reduced dimension planes, breaking out orthog dir
+  // Parallel over orthog direction
+  autoView( Data_v, Data, CpuRead);
+  thread_for( r,rd, {
+    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+	int ss= so+n*stride+b;
+	lvSum[r]=lvSum[r]+Data_v[ss];
+      }
+    }
+  });

  // Sum across simd lanes in the plane, breaking out orthog dir.
  Coordinate icoor(Nd);
@@ -514,7 +525,6 @@ sliceSum(const Lattice<vobj> &Data,int orthogdim)
  return result;
 }

-
 template<class vobj>
 static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim) 
 {
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -30,7 +30,7 @@ int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &
  cudaGetDevice(&device);
 #endif
 #ifdef GRID_HIP
-  auto r=hipGetDevice(&device);
+  auto discard=hipGetDevice(&device);
 #endif
  
  Iterator warpSize            = gpu_props[device].warpSize;
--- a/Grid/lattice/Lattice_reduction_sycl.h
+++ b/Grid/lattice/Lattice_reduction_sycl.h
@@ -69,29 +69,28 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
  return result;
 }

-
-template<class Word> Word svm_xor(Word *vec,uint64_t L)
-{
-  Word xorResult; xorResult = 0;
-  Word *d_sum =(Word *)cl::sycl::malloc_shared(sizeof(Word),*theGridAccelerator);
-  Word identity;  identity=0;
-  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
-     auto Reduction = cl::sycl::reduction(d_sum,identity,std::bit_xor<>());
-     cgh.parallel_for(cl::sycl::range<1>{L},
-		      Reduction,
-		      [=] (cl::sycl::id<1> index, auto &sum) {
-	 sum ^=vec[index];
-     });
-   });
-  theGridAccelerator->wait();
-  Word ret = d_sum[0];
-  free(d_sum,*theGridAccelerator);
-  return ret;
-}
-
 NAMESPACE_END(Grid);

 /*
+template<class Double> Double svm_reduce(Double *vec,uint64_t L)
+{
+  Double sumResult; zeroit(sumResult);
+  Double *d_sum =(Double *)cl::sycl::malloc_shared(sizeof(Double),*theGridAccelerator);
+  Double identity;  zeroit(identity);
+  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
+     auto Reduction = cl::sycl::reduction(d_sum,identity,std::plus<>());
+     cgh.parallel_for(cl::sycl::range<1>{L},
+		      Reduction,
+		      [=] (cl::sycl::id<1> index, auto &sum) {
+	 sum +=vec[index];
+     });
+   });
+  theGridAccelerator->wait();
+  Double ret = d_sum[0];
+  free(d_sum,*theGridAccelerator);
+  std::cout << " svm_reduce finished "<<L<<" sites sum = " << ret <<std::endl;
+  return ret;
+}

 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
--- a/Grid/lattice/Lattice_rng.h
+++ b/Grid/lattice/Lattice_rng.h
@@ -152,7 +152,6 @@ public:
 #ifdef RNG_FAST_DISCARD
  static void Skip(RngEngine &eng,uint64_t site)
  {
-#if 0
    /////////////////////////////////////////////////////////////////////////////////////
    // Skip by 2^40 elements between successive lattice sites
    // This goes by 10^12.
@@ -163,9 +162,9 @@ public:
    // tens of seconds per trajectory so this is clean in all reasonable cases,
    // and margin of safety is orders of magnitude.
    // We could hack Sitmo to skip in the higher order words of state if necessary
-    //
-    // Replace with 2^30 ; avoid problem on large volumes
-    //
+      //
+      // Replace with 2^30 ; avoid problem on large volumes
+      //
    /////////////////////////////////////////////////////////////////////////////////////
    //      uint64_t skip = site+1;  //   Old init Skipped then drew.  Checked compat with faster init
    const int shift = 30;
@@ -180,9 +179,6 @@ public:
    assert((skip >> shift)==site); // check for overflow

    eng.discard(skip);
-#else
-    eng.discardhi(site);
-#endif
    //      std::cout << " Engine  " <<site << " state " <<eng<<std::endl;
  } 
 #endif
@@ -365,9 +361,14 @@ public:
    _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
    _uid.resize(_vol,std::uniform_int_distribution<uint32_t>() );
  }
-
-  template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist){
-
+  template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist)
+  {
+    if ( l.Grid()->_isCheckerBoarded ) {
+      Lattice<vobj> tmp(_grid);
+      fill(tmp,dist);
+      pickCheckerboard(l.Checkerboard(),l,tmp);
+      return;
+    }
    typedef typename vobj::scalar_object scalar_object;
    typedef typename vobj::scalar_type scalar_type;
    typedef typename vobj::vector_type vector_type;
@@ -411,7 +412,7 @@ public:
      std::cout << GridLogMessage << "Seed SHA256: " << GridChecksum::sha256_string(seeds) << std::endl;
      SeedFixedIntegers(seeds);
    }
-  void SeedFixedIntegers(const std::vector<int> &seeds, int britney=0){
+  void SeedFixedIntegers(const std::vector<int> &seeds){

    // Everyone generates the same seed_seq based on input seeds
    CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
@@ -428,9 +429,10 @@ public:
    // MT implementation does not implement fast discard even though
    // in principle this is possible
    ////////////////////////////////////////////////
+#if 1
    thread_for( lidx, _grid->lSites(), {

-	int gidx;
+	int64_t gidx;
 	int o_idx;
 	int i_idx;
 	int rank;
@@ -448,12 +450,29 @@ public:
 	
 	int l_idx=generator_idx(o_idx,i_idx);
 	_generators[l_idx] = master_engine;
-	if ( britney ) { 
-	  Skip(_generators[l_idx],l_idx); // Skip to next RNG sequence
-	} else { 	
+	Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
+    });
+#else
+    // Everybody loops over global volume.
+    thread_for( gidx, _grid->_gsites, {
+
+	// Where is it?
+	int rank;
+	int o_idx;
+	int i_idx;
+
+	Coordinate gcoor;
+	_grid->GlobalIndexToGlobalCoor(gidx,gcoor);
+	_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
+	
+	// If this is one of mine we take it
+	if( rank == _grid->ThisRank() ){
+	  int l_idx=generator_idx(o_idx,i_idx);
+	  _generators[l_idx] = master_engine;
 	  Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
 	}
    });
+#endif
 #else 
    ////////////////////////////////////////////////////////////////
    // Machine and thread decomposition dependent seeding is efficient
--- a/Grid/lattice/Lattice_slicesum_core.h
+++ b/Grid/lattice/Lattice_slicesum_core.h
@@ -1,213 +0,0 @@
-#pragma once
-#include <type_traits>
-#if defined(GRID_CUDA)
-
-#include <cub/cub.cuh>
-#define gpucub cub
-#define gpuError_t cudaError_t
-#define gpuSuccess cudaSuccess
-
-#elif defined(GRID_HIP)
-
-#include <hipcub/hipcub.hpp>
-#define gpucub hipcub
-#define gpuError_t hipError_t
-#define gpuSuccess hipSuccess
-
-#endif
-
-
-NAMESPACE_BEGIN(Grid);
-
-
-#if defined(GRID_CUDA) || defined(GRID_HIP)
-template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
-  size_t subvol_size = e1*e2;
-  commVector<vobj> reduction_buffer(rd*subvol_size);
-  auto rb_p = &reduction_buffer[0];
-  vobj zero_init;
-  zeroit(zero_init);
-
-  
-  void *temp_storage_array = NULL;
-  size_t temp_storage_bytes = 0;
-  vobj *d_out;
-  int* d_offsets;
-
-  std::vector<int> offsets(rd+1,0);
-
-  for (int i = 0; i < offsets.size(); i++) {
-    offsets[i] = i*subvol_size;
-  }
-  
-  //Allocate memory for output and offset arrays on device
-  d_out = static_cast<vobj*>(acceleratorAllocDevice(rd*sizeof(vobj)));
-  
-  d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));
-  
-  //copy offsets to device
-  acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
-  
-  
-  gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
-  if (gpuErr!=gpuSuccess) {
-    std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce (setup)! Error: " << gpuErr <<std::endl;
-    exit(EXIT_FAILURE);
-  }
-
-  //allocate memory for temp_storage_array  
-  temp_storage_array = acceleratorAllocDevice(temp_storage_bytes);
-  
-  //prepare buffer for reduction
-  //use non-blocking accelerator_for to avoid syncs (ok because we submit to same computeStream)
-  //use 2d accelerator_for to avoid launch latencies found when serially looping over rd 
-  accelerator_for2dNB( s,subvol_size, r,rd, Nsimd,{ 
-  
-    int n = s / e2;
-    int b = s % e2;
-    int so=r*ostride; // base offset for start of plane 
-    int ss= so+n*stride+b;
-
-    coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss]));
-
-  });
-  
-  //issue segmented reductions in computeStream
-  gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p, d_out, rd, d_offsets, d_offsets+1,::gpucub::Sum(), zero_init, computeStream);
-  if (gpuErr!=gpuSuccess) {
-    std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce! Error: " << gpuErr <<std::endl;
-    exit(EXIT_FAILURE);
-  }
-  
-  acceleratorCopyFromDeviceAsync(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
-  
-  //sync after copy
-  accelerator_barrier();
- 
-  acceleratorFreeDevice(temp_storage_array);
-  acceleratorFreeDevice(d_out);
-  acceleratorFreeDevice(d_offsets);
-  
-
-}
-
-template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
-  typedef typename vobj::vector_type vector;
-  const int words = sizeof(vobj)/sizeof(vector);
-  const int osites = rd*e1*e2;
-  commVector<vector>buffer(osites);
-  vector *dat = (vector *)Data;
-  vector *buf = &buffer[0];
-  Vector<vector> lvSum_small(rd);
-  vector *lvSum_ptr = (vector *)&lvSum[0];
-
-  for (int w = 0; w < words; w++) {
-    accelerator_for(ss,osites,1,{
-	    buf[ss] = dat[ss*words+w];
-    });
-
-    sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
-      
-    for (int r = 0; r < rd; r++) {
-      lvSum_ptr[w+words*r]=lvSum_small[r];
-    }
-
-  }
-
-  
-}
-
-template<class vobj> inline void sliceSumReduction_cub(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
-{
-  autoView(Data_v, Data, AcceleratorRead); //hipcub/cub cannot deal with large vobjs so we split into small/large case.
-    if constexpr (sizeof(vobj) <= 256) { 
-      sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
-    }
-    else {
-      sliceSumReduction_cub_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
-    }
-}
-#endif
-
-
-#if defined(GRID_SYCL)
-template<class vobj> inline void sliceSumReduction_sycl(const Lattice<vobj> &Data, Vector <vobj> &lvSum, const int  &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
-{
-  typedef typename vobj::scalar_object sobj;
-  size_t subvol_size = e1*e2;
-
-  vobj *mysum = (vobj *) malloc_shared(sizeof(vobj),*theGridAccelerator);
-  vobj vobj_zero;
-  zeroit(vobj_zero);
-    
-  commVector<vobj> reduction_buffer(rd*subvol_size);    
-
-  auto rb_p = &reduction_buffer[0];
-
-  autoView(Data_v, Data, AcceleratorRead);
-
-  //prepare reduction buffer 
-  accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{ 
-  
-      int n = s / e2;
-      int b = s % e2;
-      int so=r*ostride; // base offset for start of plane 
-      int ss= so+n*stride+b;
-
-      coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data_v[ss]));
-
-  });
-
-  for (int r = 0; r < rd; r++) {
-      mysum[0] = vobj_zero; //dirty hack: cannot pass vobj_zero as identity to sycl::reduction as its not device_copyable
-      theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
-          auto Reduction = cl::sycl::reduction(mysum,std::plus<>());
-          cgh.parallel_for(cl::sycl::range<1>{subvol_size},
-          Reduction,
-          [=](cl::sycl::id<1> item, auto &sum) {
-              auto s = item[0];
-              sum += rb_p[r*subvol_size+s];
-          });
-      });
-      theGridAccelerator->wait();
-      lvSum[r] = mysum[0];
-  }
-  
-  free(mysum,*theGridAccelerator);
-}
-#endif
-
-template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
-{
-  // sum over reduced dimension planes, breaking out orthog dir
-  // Parallel over orthog direction
-  autoView( Data_v, Data, CpuRead);
-  thread_for( r,rd, {
-    int so=r*ostride; // base offset for start of plane 
-    for(int n=0;n<e1;n++){
-      for(int b=0;b<e2;b++){
-        int ss= so+n*stride+b;
-        lvSum[r]=lvSum[r]+Data_v[ss];
-      }
-    }
-  });
-}
-
-template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) 
-{
-  #if defined(GRID_CUDA) || defined(GRID_HIP)
-  
-  sliceSumReduction_cub(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
-  
-  #elif defined(GRID_SYCL)
-  
-  sliceSumReduction_sycl(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
-  
-  #else
-  sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
-
-  #endif
-}
-
-
-NAMESPACE_END(Grid);
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -265,8 +265,8 @@ inline auto localInnerProductD(const Lattice<vobj> &lhs,const Lattice<vobj> &rhs
 ////////////////////////////////////////////////////////////////////////////////////////////
 template<class vobj,class CComplex,int nbasis,class VLattice>
 inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
-			   const             Lattice<vobj>   &fineData,
-			   const VLattice &Basis)
+			 const             Lattice<vobj>   &fineData,
+			 const VLattice &Basis)
 {
  GridBase * fine  = fineData.Grid();
  GridBase * coarse= coarseData.Grid();
@@ -276,18 +276,65 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,

  autoView( coarseData_ , coarseData, AcceleratorWrite);
  autoView( ip_         , ip,         AcceleratorWrite);
+  RealD t_IP=0;
+  RealD t_co=0;
+  RealD t_za=0;
  for(int v=0;v<nbasis;v++) {
+    t_IP-=usecond();
    blockInnerProductD(ip,Basis[v],fineDataRed); // ip = <basis|fine>
+    t_IP+=usecond();
+    t_co-=usecond();
    accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
 	convertType(coarseData_[sc](v),ip_[sc]);
    });
+    t_co+=usecond();

    // improve numerical stability of projection
    // |fine> = |fine> - <basis|fine> |basis>
    ip=-ip;
+    t_za-=usecond();
    blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed); 
+    t_za+=usecond();
  }
+  //  std::cout << GridLogPerformance << " blockProject : blockInnerProduct :  "<<t_IP<<" us"<<std::endl;
+  //  std::cout << GridLogPerformance << " blockProject : conv              :  "<<t_co<<" us"<<std::endl;
+  //  std::cout << GridLogPerformance << " blockProject : blockZaxpy        :  "<<t_za<<" us"<<std::endl;
 }
+
+
+template<class vobj,class CComplex,int nbasis,class VLattice>
+inline void blockProjectFast(Lattice<iVector<CComplex,nbasis > > &coarseData,
+			     const             Lattice<vobj>   &fineData,
+			     const VLattice &Basis)
+{
+  // For each basis vector v: coarseData(v) = <Basis[v]|fineData>, computed blockwise.
+  // Unlike blockProject, fineData is not updated between basis vectors, so the
+  // inner products read fineData directly and no working copy of the fine field is made.
+  GridBase * coarse= coarseData.Grid();
+  Lattice<iScalar<CComplex>> ip(coarse);
+
+  autoView( coarseData_ , coarseData, AcceleratorWrite);
+  autoView( ip_         , ip,         AcceleratorWrite);
+  RealD t_IP=0;
+  RealD t_co=0;
+  for(int v=0;v<nbasis;v++) {
+    t_IP-=usecond();
+    blockInnerProductD(ip,Basis[v],fineData); // ip = <basis|fine>
+    t_IP+=usecond();
+    t_co-=usecond();
+    accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
+	convertType(coarseData_[sc](v),ip_[sc]);
+    });
+    t_co+=usecond();
+  }
+  //  std::cout << GridLogPerformance << " blockProjectFast : blockInnerProduct :  "<<t_IP<<" us"<<std::endl;
+  //  std::cout << GridLogPerformance << " blockProjectFast : conv              :  "<<t_co<<" us"<<std::endl;
+}
+
+
+// This only minimises data motion from CPU to GPU.
+// There is scope for a better implementation that does a vxk loop of inner products so that data
+// is shared at the GPU thread level.
 template<class vobj,class CComplex,int nbasis,class VLattice>
 inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
                               const std::vector<Lattice<vobj>> &fineData,
@@ -393,8 +440,15 @@ template<class vobj,class CComplex>
  Lattice<dotp> coarse_inner(coarse);

  // Precision promotion
+  RealD t;
+  t=-usecond();
  fine_inner = localInnerProductD<vobj>(fineX,fineY);
+  //  t+=usecond(); std::cout << GridLogPerformance << " blockInnerProduct : localInnerProductD "<<t<<" us"<<std::endl;
+  
+  t=-usecond();
  blockSum(coarse_inner,fine_inner);
+  //  t+=usecond(); std::cout << GridLogPerformance << " blockInnerProduct : blockSum "<<t<<" us"<<std::endl;
+  t=-usecond();
  {
    autoView( CoarseInner_  , CoarseInner,AcceleratorWrite);
    autoView( coarse_inner_ , coarse_inner,AcceleratorRead);
@@ -402,6 +456,7 @@ template<class vobj,class CComplex>
      convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss]));
    });
  }
+  //  t+=usecond(); std::cout << GridLogPerformance << " blockInnerProduct : convertType "<<t<<" us"<<std::endl;
 
 }

@@ -444,6 +499,9 @@ inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
 template<class vobj>
 inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData) 
 {
+  const int maxsubsec=256;
+  typedef iVector<vobj,maxsubsec> vSubsec;
+
  GridBase * fine  = fineData.Grid();
  GridBase * coarse= coarseData.Grid();

@@ -463,22 +521,40 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
  autoView( coarseData_ , coarseData, AcceleratorWrite);
  autoView( fineData_   , fineData, AcceleratorRead);

-  auto coarseData_p = &coarseData_[0];
-  auto fineData_p = &fineData_[0];
+  auto coarseData_p  = &coarseData_[0];
+  auto fineData_p    = &fineData_[0];
  
  Coordinate fine_rdimensions = fine->_rdimensions;
  Coordinate coarse_rdimensions = coarse->_rdimensions;

-  accelerator_for(sc,coarse->oSites(),1,{
+  vobj zz = Zero();
+
+  // Somewhat lazy calculation
+  // Find the biggest power of two subsection divisor less than or equal to maxsubsec
+  int subsec=maxsubsec;
+  int subvol;
+  subvol=blockVol/subsec;
+  while(subvol*subsec!=blockVol){
+    subsec = subsec/2;
+    subvol=blockVol/subsec;
+  };
+
+  Lattice<vSubsec> coarseTmp(coarse);
+  autoView( coarseTmp_, coarseTmp, AcceleratorWriteDiscard);
+  auto coarseTmp_p= &coarseTmp_[0];
+  
+  // Sum within subsecs in a first kernel
+  accelerator_for(sce,subsec*coarse->oSites(),vobj::Nsimd(),{
+
+      int sc=sce/subsec;
+      int e=sce%subsec;
      
      // One thread per sub block
      Coordinate coor_c(_ndimension);
      Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions);  // Block coordinate

-      vobj cd = Zero();
-      
-      for(int sb=0;sb<blockVol;sb++){
-
+      auto cd = coalescedRead(zz);
+      for(int sb=e*subvol;sb<MIN((e+1)*subvol,blockVol);sb++){
 	int sf;
 	Coordinate coor_b(_ndimension);
 	Coordinate coor_f(_ndimension);
@@ -486,12 +562,21 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
 	for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
 	Lexicographic::IndexFromCoor(coor_f,sf,fine_rdimensions);
 	
-	cd=cd+fineData_p[sf];
+	cd=cd+coalescedRead(fineData_p[sf]);
      }

-      coarseData_p[sc] = cd;
+      coalescedWrite(coarseTmp_[sc](e),cd);

    });
+   // Sum across subsecs in a second kernel
+   accelerator_for(sc,coarse->oSites(),vobj::Nsimd(),{
+      auto cd = coalescedRead(coarseTmp_p[sc](0));
+      for(int e=1;e<subsec;e++){
+	cd=cd+coalescedRead(coarseTmp_p[sc](e));
+      }
+      coalescedWrite(coarseData_p[sc],cd);
+   });
+
  return;
 }

@@ -548,7 +633,7 @@ inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> >
  blockOrthonormalize(ip,Basis);
 }

-#if 0
+#ifdef GRID_ACCELERATED
 // TODO: CPU optimized version here
 template<class vobj,class CComplex,int nbasis>
 inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
@@ -574,26 +659,37 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
  autoView( fineData_   , fineData, AcceleratorWrite);
  autoView( coarseData_ , coarseData, AcceleratorRead);

+  typedef LatticeView<vobj> Vview;
+  std::vector<Vview> AcceleratorVecViewContainer_h; 
+  for(int v=0;v<nbasis;v++) {
+    AcceleratorVecViewContainer_h.push_back(Basis[v].View(AcceleratorRead));
+  }
+  static deviceVector<Vview> AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(nbasis); 
+  acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],nbasis *sizeof(Vview));
+  auto Basis_p = &AcceleratorVecViewContainer[0];
  // Loop with a cache friendly loop ordering
-  accelerator_for(sf,fine->oSites(),1,{
+  Coordinate frdimensions=fine->_rdimensions;
+  Coordinate crdimensions=coarse->_rdimensions;
+  accelerator_for(sf,fine->oSites(),vobj::Nsimd(),{
    int sc;
    Coordinate coor_c(_ndimension);
    Coordinate coor_f(_ndimension);

-    Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
+    Lexicographic::CoorFromIndex(coor_f,sf,frdimensions);
    for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
-    Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
+    Lexicographic::IndexFromCoor(coor_c,sc,crdimensions);

-    for(int i=0;i<nbasis;i++) {
-      /*      auto basis_ = Basis[i],  );*/
-      if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf]);
-      else     fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf]);
-    }
+    auto sum= coarseData_(sc)(0) *Basis_p[0](sf);
+    for(int i=1;i<nbasis;i++) sum = sum + coarseData_(sc)(i)*Basis_p[i](sf);
+    coalescedWrite(fineData_[sf],sum);
  });
+  for(int v=0;v<nbasis;v++) {
+    AcceleratorVecViewContainer_h[v].ViewClose();
+  }
  return;
-  
 }
 #else
+// CPU version
 template<class vobj,class CComplex,int nbasis,class VLattice>
 inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
 			 Lattice<vobj>   &fineData,
@@ -680,8 +776,9 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;

-  static const int words=sizeof(vobj)/sizeof(vector_type);
-
+  ////////////////////////////////////////////////////////////////////////////////////////////////
+  // the checks should guarantee that the operations are local
+  ////////////////////////////////////////////////////////////////////////////////////////////////
  GridBase *Fg = From.Grid();
  GridBase *Tg = To.Grid();
  assert(!Fg->_isCheckerBoarded);
@@ -695,38 +792,18 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
  for(int d=0;d<nd;d++){
    assert(Fg->_processors[d]  == Tg->_processors[d]);
  }
-  // the above should guarantee that the operations are local
-  
-#if 1
-
  size_t nsite = 1;
  for(int i=0;i<nd;i++) nsite *= RegionSize[i];

-  size_t tbytes = 4*nsite*sizeof(int);
-  int *table = (int*)malloc(tbytes);
- 
-  thread_for(idx, nsite, {
-      Coordinate from_coor, to_coor;
-      size_t rem = idx;
-      for(int i=0;i<nd;i++){
-	size_t base_i  = rem % RegionSize[i]; rem /= RegionSize[i];
-	from_coor[i] = base_i + FromLowerLeft[i];
-	to_coor[i] = base_i + ToLowerLeft[i];
-      }
-      
-      int foidx = Fg->oIndex(from_coor);
-      int fiidx = Fg->iIndex(from_coor);
-      int toidx = Tg->oIndex(to_coor);
-      int tiidx = Tg->iIndex(to_coor);
-      int* tt = table + 4*idx;
-      tt[0] = foidx;
-      tt[1] = fiidx;
-      tt[2] = toidx;
-      tt[3] = tiidx;
-    });
-  
-  int* table_d = (int*)acceleratorAllocDevice(tbytes);
-  acceleratorCopyToDevice(table,table_d,tbytes);
+  ////////////////////////////////////////////////////////////////////////////////////////////////
+  // do the index calc on the GPU
+  ////////////////////////////////////////////////////////////////////////////////////////////////
+  Coordinate f_ostride = Fg->_ostride;
+  Coordinate f_istride = Fg->_istride;
+  Coordinate f_rdimensions = Fg->_rdimensions;
+  Coordinate t_ostride = Tg->_ostride;
+  Coordinate t_istride = Tg->_istride;
+  Coordinate t_rdimensions = Tg->_rdimensions;

  typedef typename vobj::vector_type vector_type;
  typedef typename vobj::scalar_type scalar_type;
@@ -734,13 +811,19 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
  autoView(from_v,From,AcceleratorRead);
  autoView(to_v,To,AcceleratorWrite);

+  const int words=sizeof(vobj)/sizeof(vector_type);
  accelerator_for(idx,nsite,1,{
-      static const int words=sizeof(vobj)/sizeof(vector_type);
-      int* tt = table_d + 4*idx;
-      int from_oidx = *tt++;
-      int from_lane = *tt++;
-      int to_oidx = *tt++;
-      int to_lane = *tt;
+      
+      Coordinate from_coor, to_coor, base;
+      Lexicographic::CoorFromIndex(base,idx,RegionSize);
+      for(int i=0;i<nd;i++){
+	from_coor[i] = base[i] + FromLowerLeft[i];
+	to_coor[i] = base[i] + ToLowerLeft[i];
+      }
+      int from_oidx = 0; for(int d=0;d<nd;d++) from_oidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
+      int from_lane = 0; for(int d=0;d<nd;d++) from_lane+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
+      int to_oidx   = 0; for(int d=0;d<nd;d++) to_oidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
+      int to_lane   = 0; for(int d=0;d<nd;d++) to_lane+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);

      const vector_type* from = (const vector_type *)&from_v[from_oidx];
      vector_type* to = (vector_type *)&to_v[to_oidx];
@@ -751,53 +834,6 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
 	putlane(to[w], stmp, to_lane);
      }
    });
-  
-  acceleratorFreeDevice(table_d);    
-  free(table);
-  
-
-#else  
-  Coordinate ldf = Fg->_ldimensions;
-  Coordinate rdf = Fg->_rdimensions;
-  Coordinate isf = Fg->_istride;
-  Coordinate osf = Fg->_ostride;
-  Coordinate rdt = Tg->_rdimensions;
-  Coordinate ist = Tg->_istride;
-  Coordinate ost = Tg->_ostride;
-
-  autoView( t_v , To, CpuWrite);
-  autoView( f_v , From, CpuRead);
-  thread_for(idx,Fg->lSites(),{
-    sobj s;
-    Coordinate Fcoor(nd);
-    Coordinate Tcoor(nd);
-    Lexicographic::CoorFromIndex(Fcoor,idx,ldf);
-    int in_region=1;
-    for(int d=0;d<nd;d++){
-      if ( (Fcoor[d] < FromLowerLeft[d]) || (Fcoor[d]>=FromLowerLeft[d]+RegionSize[d]) ){ 
-	in_region=0;
-      }
-      Tcoor[d] = ToLowerLeft[d]+ Fcoor[d]-FromLowerLeft[d];
-    }
-    if (in_region) {
-#if 0      
-      Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]); // inner index from
-      Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]); // inner index to
-      Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]); // outer index from
-      Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]); // outer index to
-      scalar_type * fp = (scalar_type *)&f_v[odx_f];
-      scalar_type * tp = (scalar_type *)&t_v[odx_t];
-      for(int w=0;w<words;w++){
-	tp[w].putlane(fp[w].getlane(idx_f),idx_t);
-      }
-#else
-    peekLocalSite(s,f_v,Fcoor);
-    pokeLocalSite(s,t_v,Tcoor);
-#endif
-    }
-  });
-
-#endif
 }


@@ -889,7 +925,7 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic

 }

-
+//FIXME: make this run entirely on GPU
 //Insert subvolume orthogonal to direction 'orthog' with slice index 'slice_lo' from 'lowDim' onto slice index 'slice_hi' of higherDim
 //The local dimensions of both 'lowDim' and 'higherDim' orthogonal to 'orthog' should be the same
 template<class vobj>
@@ -1052,7 +1088,7 @@ void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine)

  Coordinate fcoor(nd);
  Coordinate ccoor(nd);
-  for(int g=0;g<fg->gSites();g++){
+  for(int64_t g=0;g<fg->gSites();g++){

    fg->GlobalIndexToGlobalCoor(g,fcoor);
    for(int d=0;d<nd;d++){
@@ -1738,5 +1774,32 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj>   & split)
  }
 }

+//////////////////////////////////////////////////////
+// MultiRHS interface support for coarse space
+// -- Simplest possible implementation to begin with: one blockProject + one slice copy per entry of fineData
+//////////////////////////////////////////////////////
+template<class vobj,class CComplex,int nbasis,class VLattice>
+inline void blockProjectMany(Lattice<iVector<CComplex,nbasis > > &coarseIP,  // output: receives one slice per entry of fineData
+			     Lattice<iVector<CComplex,nbasis > > &coarseTMP, // scratch: overwritten by every blockProject call
+			     const VLattice &fineData, // Basis and fineData necessarily same type
+			     const VLattice &Basis)
+{
+  for(int r=0;r<fineData.size();r++){
+    blockProject(coarseTMP,fineData[r],Basis);   // project right-hand side r into the scratch field
+    InsertSliceLocal(coarseTMP, coarseIP,r,r,0); // presumably: copy scratch slice r into slice r of coarseIP, orthogonal to dimension 0 -- TODO confirm
+  }
+}
+template<class vobj,class CComplex,int nbasis,class VLattice>
+inline void blockPromoteMany(Lattice<iVector<CComplex,nbasis > > &coarseIP,  // input: one slice is read per entry of fineData
+			     Lattice<iVector<CComplex,nbasis > > &coarseTMP, // scratch: overwritten by every ExtractSliceLocal call
+			     VLattice &fineData, // output, written by blockPromote, hence not const; same container type as Basis
+			     const VLattice &Basis)
+{
+  for(int r=0;r<fineData.size();r++){
+    ExtractSliceLocal(coarseTMP, coarseIP,r,r,0); // presumably: pull slice r (orthogonal to dimension 0) of coarseIP into the scratch field -- TODO confirm
+    blockPromote(coarseTMP,fineData[r],Basis);    // takes fineData[r] by non-const reference, so fineData must be mutable
+  }
+}
+
 NAMESPACE_END(Grid);

--- a/Grid/lattice/Lattice_view.h
+++ b/Grid/lattice/Lattice_view.h
@@ -45,7 +45,6 @@ public:
  };
  // Host only
  GridBase * getGrid(void) const { return _grid; };
-  vobj* getHostPointer(void) const { return _odata; };
 };

 /////////////////////////////////////////////////////////////////////////////////////////
--- a/Grid/lattice/PaddedCell.h
+++ b/Grid/lattice/PaddedCell.h
@@ -45,6 +45,188 @@ struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::ve
  typename Gimpl::GaugeLinkField Cshift(const typename Gimpl::GaugeLinkField &in, int dir, int shift) const override{ return Gimpl::CshiftLink(in,dir,shift); }
 };  

+
+/*
+ *
+ * TODO: 
+ *  -- address elements of vobj via thread block in Scatter/Gather
+ *  -- overlap comms with motion in Face_exchange
+ *
+ */
+// ScatterSlice: unpack buf (read from element 'offset' onwards) into the sites/lanes of lat selected by index x along dim.
+template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
+					      Lattice<vobj> &lat,
+					      int x,
+					      int dim,
+					      int offset=0)
+{
+  const int Nsimd=vobj::Nsimd();
+  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+
+  GridBase *grid = lat.Grid();
+  Coordinate simd = grid->_simd_layout;
+  int Nd          = grid->Nd();
+  int block       = grid->_slice_block[dim];
+  int stride      = grid->_slice_stride[dim];
+  int nblock      = grid->_slice_nblock[dim];
+  int rd          = grid->_rdimensions[dim];
+
+  int ox = x%rd; // remainder: enters the outer site index (osite) below
+  int ix = x/rd; // quotient: becomes the lane coordinate along dim below
+
+  int isites = 1; for(int d=0;d<Nd;d++) if( d!=dim) isites*=simd[d]; // not referenced again in this function
+
+  Coordinate rsimd= simd;  rsimd[dim]=1; // maybe reduce Nsimd
+
+  int rNsimd = 1; for(int d=0;d<Nd;d++) rNsimd*=rsimd[d];
+  int rNsimda= Nsimd/simd[dim]; // should be equal
+  assert(rNsimda==rNsimd);
+  int face_ovol=block*nblock;
+
+  //  assert(buf.size()==face_ovol*rNsimd);
+
+  /*This will work GPU ONLY unless rNsimd is put in the lexico index*/
+  //Let's make it work on GPU and then make a special accelerator_for that
+  //doesn't hide the SIMD direction and keeps explicit in the threadIdx
+  //for cross platform
+  // FIXME -- can put internal indices into thread loop
+  auto buf_p = & buf[0];
+  autoView(lat_v, lat, AcceleratorWrite);
+  accelerator_for(ss, face_ovol/simd[dim],Nsimd,{
+
+    // scalar layout won't coalesce
+#ifdef GRID_SIMT
+      {
+	int blane=acceleratorSIMTlane(Nsimd); // buffer lane
+#else
+      for(int blane=0;blane<Nsimd;blane++) {
+#endif
+	int olane=blane%rNsimd;               // reduced lattice lane
+	int obit =blane/rNsimd;               // remaining part of the buffer lane; selects the outer site via ssp
+
+	///////////////////////////////////////////////////////////////
+	// osite -- potentially one bit from simd in the buffer: (ss<<1)|obit
+	///////////////////////////////////////////////////////////////
+	int ssp = ss*simd[dim]+obit;
+	int b    = ssp%block;
+	int n    = ssp/block;
+	int osite= b+n*stride + ox*block;
+	
+	////////////////////////////////////////////
+	// isite -- map lane within buffer to lane within lattice
+	////////////////////////////////////////////
+	Coordinate icoor;
+	int lane;
+	Lexicographic::CoorFromIndex(icoor,olane,rsimd);
+	icoor[dim]=ix;
+	Lexicographic::IndexFromCoor(icoor,lane,simd);
+	
+	///////////////////////////////////////////
+	// Transfer into lattice - will coalesce
+	///////////////////////////////////////////
+	//	sobj obj = extractLane(blane,buf_p[ss+offset]);
+	//	insertLane(lane,lat_v[osite],obj);
+	const int words=sizeof(vobj)/sizeof(vector_type); // number of vector_type words making up one vobj
+	vector_type * from = (vector_type *)&buf_p[ss+offset];
+	vector_type * to   = (vector_type *)&lat_v[osite];
+	scalar_type stmp;
+	for(int w=0;w<words;w++){
+	  stmp = getlane(from[w], blane); // read buffer lane blane ...
+	  putlane(to[w], stmp, lane);     // ... and store it in lattice lane 'lane'
+	}
+      }
+  });
+}
+// GatherSlice: pack the sites/lanes of lat selected by index x along dim into buf (written from element 'offset' onwards).
+template<class vobj> inline void GatherSlice(cshiftVector<vobj> &buf,
+					     const Lattice<vobj> &lat,
+					     int x,
+					     int dim,
+					     int offset=0)
+{
+  const int Nsimd=vobj::Nsimd();
+  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+
+  autoView(lat_v, lat, AcceleratorRead);
+
+  GridBase *grid = lat.Grid();
+  Coordinate simd = grid->_simd_layout;
+  int Nd          = grid->Nd();
+  int block       = grid->_slice_block[dim];
+  int stride      = grid->_slice_stride[dim];
+  int nblock      = grid->_slice_nblock[dim];
+  int rd          = grid->_rdimensions[dim];
+
+  int ox = x%rd; // remainder: enters the outer site index (osite) below
+  int ix = x/rd; // quotient: becomes the lane coordinate along dim below
+
+  int isites = 1; for(int d=0;d<Nd;d++) if( d!=dim) isites*=simd[d]; // not referenced again in this function
+
+  Coordinate rsimd= simd;  rsimd[dim]=1; // maybe reduce Nsimd
+
+  int rNsimd = 1; for(int d=0;d<Nd;d++) rNsimd*=rsimd[d];
+  
+  int face_ovol=block*nblock;
+
+  //  assert(buf.size()==face_ovol*rNsimd);
+
+  /*This will work GPU ONLY unless rNsimd is put in the lexico index*/
+  //Let's make it work on GPU and then make a special accelerator_for that
+  //doesn't hide the SIMD direction and keeps explicit in the threadIdx
+  //for cross platform
+  //For CPU perhaps just run a loop over Nsimd
+  auto buf_p = & buf[0];
+  accelerator_for(ss, face_ovol/simd[dim],Nsimd,{
+
+    // scalar layout won't coalesce
+#ifdef GRID_SIMT
+      {
+	int blane=acceleratorSIMTlane(Nsimd); // buffer lane
+#else
+      for(int blane=0;blane<Nsimd;blane++) {
+#endif
+	int olane=blane%rNsimd;               // reduced lattice lane
+	int obit =blane/rNsimd;               // remaining part of the buffer lane; selects the outer site via ssp
+	
+	////////////////////////////////////////////
+	// osite
+	////////////////////////////////////////////
+	int ssp = ss*simd[dim]+obit;
+	int b    = ssp%block;
+	int n    = ssp/block;
+	int osite= b+n*stride + ox*block;
+
+	////////////////////////////////////////////
+	// isite -- map lane within buffer to lane within lattice
+	////////////////////////////////////////////
+	Coordinate icoor;
+	int lane;
+	Lexicographic::CoorFromIndex(icoor,olane,rsimd);
+	icoor[dim]=ix;
+	Lexicographic::IndexFromCoor(icoor,lane,simd);
+	
+	///////////////////////////////////////////
+	// Take out of lattice
+	///////////////////////////////////////////
+	//	sobj obj = extractLane(lane,lat_v[osite]);
+	//	insertLane(blane,buf_p[ss+offset],obj);
+	const int words=sizeof(vobj)/sizeof(vector_type); // number of vector_type words making up one vobj
+	vector_type * to    = (vector_type *)&buf_p[ss+offset];
+	vector_type * from  = (vector_type *)&lat_v[osite];
+	scalar_type stmp;
+	for(int w=0;w<words;w++){
+	  stmp = getlane(from[w], lane); // read lattice lane 'lane' ...
+	  putlane(to[w], stmp, blane);   // ... and store it in buffer lane blane
+	}
+      }
+  });
+}
+
+
 class PaddedCell {
 public:
  GridCartesian * unpadded_grid;
@@ -63,14 +245,18 @@ public:
    dims=_grid->Nd();
    AllocateGrids();
    Coordinate local     =unpadded_grid->LocalDimensions();
+    Coordinate procs     =unpadded_grid->ProcessorGrid();
    for(int d=0;d<dims;d++){
-      assert(local[d]>=depth);
+      if ( procs[d] > 1 ) assert(local[d]>=depth);
    }
  }
  void DeleteGrids(void)
  {
+    Coordinate processors=unpadded_grid->_processors;
    for(int d=0;d<grids.size();d++){
-      delete grids[d];
+      if ( processors[d] > 1 ) { 
+	delete grids[d];
+      }
    }
    grids.resize(0);
  };
@@ -81,27 +267,36 @@ public:
    Coordinate processors=unpadded_grid->_processors;
    Coordinate plocal    =unpadded_grid->LocalDimensions();
    Coordinate global(dims);
-
+    GridCartesian *old_grid = unpadded_grid;
    // expand up one dim at a time
    for(int d=0;d<dims;d++){

-      plocal[d] += 2*depth; 
+      if ( processors[d] > 1 ) { 
+	plocal[d] += 2*depth; 
      
-      for(int d=0;d<dims;d++){
-	global[d] = plocal[d]*processors[d];
+	for(int d=0;d<dims;d++){
+	  global[d] = plocal[d]*processors[d];
+	}
+
+	old_grid = new GridCartesian(global,simd,processors);
      }
-
-      grids.push_back(new GridCartesian(global,simd,processors));
+      grids.push_back(old_grid);
    }
  };
  template<class vobj>
  inline Lattice<vobj> Extract(const Lattice<vobj> &in) const
  {
+    Coordinate processors=unpadded_grid->_processors;
+
    Lattice<vobj> out(unpadded_grid);

    Coordinate local     =unpadded_grid->LocalDimensions();
-    Coordinate fll(dims,depth); // depends on the MPI spread
+    // depends on the MPI spread      
+    Coordinate fll(dims,depth);
    Coordinate tll(dims,0); // depends on the MPI spread
+    for(int d=0;d<dims;d++){
+      if( processors[d]==1 ) fll[d]=0;
+    }
    localCopyRegion(in,out,fll,tll,local);
    return out;
  }
@@ -116,10 +311,22 @@ public:
    }
    return tmp;
  }
+  template<class vobj>
+  inline Lattice<vobj> ExchangePeriodic(const Lattice<vobj> &in) const // returns 'in' after ExpandPeriodic has been applied in every dimension
+  {
+    GridBase *old_grid = in.Grid();
+    int dims = old_grid->Nd();       // local; shadows the class member of the same name
+    Lattice<vobj> tmp = in;          // working copy, replaced after each dimension
+    for(int d=0;d<dims;d++){         // dimensions are expanded in increasing order
+      tmp = ExpandPeriodic(d,tmp); // rvalue && assignment
+    }
+    return tmp;
+  }
  // expand up one dim at a time
  template<class vobj>
  inline Lattice<vobj> Expand(int dim, const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
  {
+    Coordinate processors=unpadded_grid->_processors;
    GridBase *old_grid = in.Grid();
    GridCartesian *new_grid = grids[dim];//These are new grids
    Lattice<vobj>  padded(new_grid);
@@ -129,46 +336,236 @@ public:
    if(dim==0) conformable(old_grid,unpadded_grid);
    else       conformable(old_grid,grids[dim-1]);

-    std::cout << " dim "<<dim<<" local "<<local << " padding to "<<plocal<<std::endl;
-
    double tins=0, tshift=0;

-    // Middle bit
-    double t = usecond();
-    for(int x=0;x<local[dim];x++){
-      InsertSliceLocal(in,padded,x,depth+x,dim);
+    int islocal = 0 ;
+    if ( processors[dim] == 1 ) islocal = 1;
+
+    if ( islocal ) {
+
+      // replace with a copy and maybe grid swizzle
+      // return in;??
+      double t = usecond();
+      padded = in;
+      tins += usecond() - t;
+      
+    } else {
+
+      //////////////////////////////////////////////
+      // Replace sequence with
+      // ---------------------
+      // (i) Gather high face(s); start comms
+      // (ii) Gather low  face(s); start comms
+      // (iii) Copy middle bit with localCopyRegion
+      // (iv) Complete high face(s), insert slice(s)
+      // (v) Complete low  face(s), insert slice(s)
+      //////////////////////////////////////////////
+      // Middle bit
+      double t = usecond();
+      for(int x=0;x<local[dim];x++){
+	InsertSliceLocal(in,padded,x,depth+x,dim);
+      }
+      tins += usecond() - t;
+    
+      // High bit
+      t = usecond();
+      shifted = cshift.Cshift(in,dim,depth);
+      tshift += usecond() - t;
+
+      t=usecond();
+      for(int x=0;x<depth;x++){
+	InsertSliceLocal(shifted,padded,local[dim]-depth+x,depth+local[dim]+x,dim);
+      }
+      tins += usecond() - t;
+    
+      // Low bit
+      t = usecond();
+      shifted = cshift.Cshift(in,dim,-depth);
+      tshift += usecond() - t;
+    
+      t = usecond();
+      for(int x=0;x<depth;x++){
+	InsertSliceLocal(shifted,padded,x,x,dim);
+      }
+      tins += usecond() - t;
+
    }
-    tins += usecond() - t;
-    
-    // High bit
-    t = usecond();
-    shifted = cshift.Cshift(in,dim,depth);
-    tshift += usecond() - t;
-
-    t=usecond();
-    for(int x=0;x<depth;x++){
-      InsertSliceLocal(shifted,padded,local[dim]-depth+x,depth+local[dim]+x,dim);
-    }
-    tins += usecond() - t;
-    
-    // Low bit
-    t = usecond();
-    shifted = cshift.Cshift(in,dim,-depth);
-    tshift += usecond() - t;
-    
-    t = usecond();
-    for(int x=0;x<depth;x++){
-      InsertSliceLocal(shifted,padded,x,x,dim);
-    }
-    tins += usecond() - t;
-
    std::cout << GridLogPerformance << "PaddedCell::Expand timings: cshift:" << tshift/1000 << "ms, insert-slice:" << tins/1000 << "ms" << std::endl;
    
    return padded;
  }

+  template<class vobj>
+  inline Lattice<vobj> ExpandPeriodic(int dim, const Lattice<vobj> &in) const
+  {
+    // Expand 'in' along dimension 'dim' and return the result, which lives on grids[dim].
+    //  - dimension not split across processors: the result is a plain copy of 'in'
+    //  - otherwise: Face_exchange copies the interior and fills 'depth' halo planes
+    //    on either side of it
+    Coordinate processors=unpadded_grid->_processors;
+    GridBase *old_grid = in.Grid();
+    GridCartesian *new_grid = grids[dim];//These are new grids
+    Lattice<vobj>  padded(new_grid);
+
+    // Dimensions must be expanded in order: the input has to live on the grid of
+    // the previous dimension (or on the unpadded grid when dim==0).
+    if(dim==0) conformable(old_grid,unpadded_grid);
+    else       conformable(old_grid,grids[dim-1]);
+
+    // Unused locals of the first draft (local/plocal dimensions, timers, islocal flag)
+    // have been dropped; the branch below tests the processor count directly.
+    if ( processors[dim] == 1 ) {
+      padded=in; // slightly different interface could avoid a copy operation
+    } else {
+      // interior copy + halo communication, see Face_exchange
+      Face_exchange(in,padded,dim,depth);
+    }
+    return padded;
+  }
+  template<class vobj>
+  void Face_exchange(const Lattice<vobj> &from, // source field
+		     Lattice<vobj> &to,         // destination; local extent along 'dimension' must be ld+2*depth (asserted)
+		     int dimension,int depth) const // direction being padded, number of halo planes per side
+  {
+    typedef typename vobj::vector_type vector_type;
+    typedef typename vobj::scalar_type scalar_type;
+    typedef typename vobj::scalar_object sobj;
+    // Copies 'from' into the interior of 'to' and fills the 2*depth halo planes of 'to' with planes received from the neighbouring ranks.
+    RealD t_gather=0.0;
+    RealD t_scatter=0.0;
+    RealD t_comms=0.0;
+    RealD t_copy=0.0;
+    
+    //    std::cout << GridLogMessage << "dimension " <<dimension<<std::endl;
+    //    DumpSliceNorm(std::string("Face_exchange from"),from,dimension);
+    GridBase *grid=from.Grid();
+    GridBase *new_grid=to.Grid();
+
+    Coordinate lds = from.Grid()->_ldimensions;
+    Coordinate nlds=   to.Grid()->_ldimensions;
+    Coordinate simd= from.Grid()->_simd_layout;
+    int ld    = lds[dimension];
+    int nld   = to.Grid()->_ldimensions[dimension];
+    const int Nsimd = vobj::Nsimd();
+
+    assert(depth<=lds[dimension]); // A must be on neighbouring node
+    assert(depth>0);   // A caller bug if zero
+    assert(ld+2*depth==nld);
+    ////////////////////////////////////////////////////////////////////////////
+    // Face size and byte calculations
+    ////////////////////////////////////////////////////////////////////////////
+    int buffer_size = 1;
+    for(int d=0;d<lds.size();d++){
+      if ( d!= dimension) buffer_size=buffer_size*lds[d];
+    }
+    buffer_size = buffer_size  / Nsimd; // one plane, counted in vobj elements
+    int rNsimd = Nsimd / simd[dimension];
+    assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]);
+
+    static cshiftVector<vobj> send_buf; // static: storage is kept between calls
+    static cshiftVector<vobj> recv_buf;
+    send_buf.resize(buffer_size*2*depth); // 'depth' planes for each of the two directions
+    recv_buf.resize(buffer_size*2*depth);
+
+    std::vector<CommsRequest_t> fwd_req;   
+    std::vector<CommsRequest_t> bwd_req;   
+
+    int words = buffer_size;
+    int bytes = words * sizeof(vobj); // size of one plane / one message
+
+    ////////////////////////////////////////////////////////////////////////////
+    // Communication coords
+    ////////////////////////////////////////////////////////////////////////////
+    int comm_proc = 1;
+    int xmit_to_rank;
+    int recv_from_rank;
+    grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
+
+    ////////////////////////////////////////////////////////////////////////////
+    // Gather all surface terms up to depth "d"
+    ////////////////////////////////////////////////////////////////////////////
+    RealD t;
+    RealD t_tot=-usecond();
+    int plane=0; // running plane index into send_buf: low-face planes first, then high-face planes
+    for ( int d=0;d < depth ; d ++ ) {
+      int tag = d*1024 + dimension*2+0;
+
+      t=usecond();
+      GatherSlice(send_buf,from,d,dimension,plane*buffer_size); plane++; // low face: planes 0..depth-1
+      t_gather+=usecond()-t;
+
+      t=usecond();
+      grid->SendToRecvFromBegin(fwd_req,
+				(void *)&send_buf[d*buffer_size], xmit_to_rank,
+				(void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
+      t_comms+=usecond()-t;
+     }
+    for ( int d=0;d < depth ; d ++ ) {
+      int tag = d*1024 + dimension*2+1;
+
+      t=usecond();
+      GatherSlice(send_buf,from,ld-depth+d,dimension,plane*buffer_size); plane++; // high face: planes ld-depth..ld-1
+      t_gather+= usecond() - t;
+
+      t=usecond();
+      grid->SendToRecvFromBegin(bwd_req,
+				(void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
+				(void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
+      t_comms+=usecond()-t;
+    }
+
+    ////////////////////////////////////////////////////////////////////////////
+    // Copy interior -- overlap this with comms
+    ////////////////////////////////////////////////////////////////////////////
+    int Nd = new_grid->Nd();
+    Coordinate LL(Nd,0);
+    Coordinate sz = grid->_ldimensions;
+    Coordinate toLL(Nd,0);
+    toLL[dimension]=depth; // the interior of 'to' starts 'depth' planes in along 'dimension'
+    t=usecond();
+    localCopyRegion(from,to,LL,toLL,sz);
+    t_copy= usecond() - t;
+    
+    ////////////////////////////////////////////////////////////////////////////
+    // Scatter all faces
+    ////////////////////////////////////////////////////////////////////////////
+    plane=0; // recv_buf uses the same plane ordering as send_buf
+
+    t=usecond();
+    grid->CommsComplete(fwd_req);
+    t_comms+= usecond() - t;
+
+    t=usecond();
+    for ( int d=0;d < depth ; d ++ ) {
+      ScatterSlice(recv_buf,to,nld-depth+d,dimension,plane*buffer_size); plane++; // fwd_req data fills the high halo
+    }
+    t_scatter= usecond() - t;
+
+    t=usecond();
+    grid->CommsComplete(bwd_req);
+    t_comms+= usecond() - t;
+    
+    t=usecond();
+    for ( int d=0;d < depth ; d ++ ) {
+      ScatterSlice(recv_buf,to,d,dimension,plane*buffer_size); plane++; // bwd_req data fills the low halo
+    }
+    t_scatter+= usecond() - t;
+    t_tot+=usecond();
+    // Timers are in microseconds, so bytes/time below is numerically MB/s.
+    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << t_gather/1000  << "ms"<<std::endl;
+    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << t_scatter/1000   << "ms"<<std::endl;
+    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: copy   :" << t_copy/1000      << "ms"<<std::endl;
+    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms  :" << t_comms/1000     << "ms"<<std::endl;
+    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: total  :" << t_tot/1000     << "ms"<<std::endl;
+    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << depth*4.0*bytes/t_gather << "MB/s"<<std::endl;
+    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << depth*4.0*bytes/t_scatter<< "MB/s"<<std::endl;
+    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms  :" << depth*4.0*bytes/t_comms   << "MB/s"<<std::endl; // t_comms spans 2*depth send+recv pairs, so scale by depth like gather/scatter
+    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: face bytes  :" << depth*bytes/1e6 << "MB"<<std::endl;
+  }
+  
 };
 

 NAMESPACE_END(Grid);

+
--- a/Grid/log/Log.h
+++ b/Grid/log/Log.h
@@ -179,11 +179,11 @@ extern GridLogger GridLogSolver;
 extern GridLogger GridLogError;
 extern GridLogger GridLogWarning;
 extern GridLogger GridLogMessage;
-extern GridLogger GridLogDebug;
+extern GridLogger GridLogDebug  ;
 extern GridLogger GridLogPerformance;
 extern GridLogger GridLogDslash;
-extern GridLogger GridLogIterative;
-extern GridLogger GridLogIntegrator;
+extern GridLogger GridLogIterative  ;
+extern GridLogger GridLogIntegrator  ;
 extern GridLogger GridLogHMC;
 extern GridLogger GridLogMemory;
 extern GridLogger GridLogTracing;
@@ -191,41 +191,6 @@ extern Colours    GridLogColours;

 std::string demangle(const char* name) ;

-template<typename... Args>
-inline std::string sjoin(Args&&... args) noexcept {
-    std::ostringstream msg;
-    (msg << ... << args);
-    return msg.str();
-}
-
-/*!  @brief make log messages work like python print */
-template <typename... Args>
-inline void Grid_log(Args&&... args) {
-    std::string msg = sjoin(std::forward<Args>(args)...);
-    std::cout << GridLogMessage << msg << std::endl;
-}
-
-/*!  @brief make warning messages work like python print */
-template <typename... Args>
-inline void Grid_warn(Args&&... args) {
-    std::string msg = sjoin(std::forward<Args>(args)...);
-    std::cout << "\033[33m" << GridLogWarning << msg << "\033[0m" << std::endl;
-}
-
-/*!  @brief make error messages work like python print */
-template <typename... Args>
-inline void Grid_error(Args&&... args) {
-    std::string msg = sjoin(std::forward<Args>(args)...);
-    std::cout << "\033[31m" << GridLogError << msg << "\033[0m" << std::endl;
-}
-
-/*!  @brief make pass messages work like python print */
-template <typename... Args>
-inline void Grid_pass(Args&&... args) {
-    std::string msg = sjoin(std::forward<Args>(args)...);
-    std::cout << "\033[32m" << GridLogMessage << msg << "\033[0m" << std::endl;
-}
-
 #define _NBACKTRACE (256)
 extern void * Grid_backtrace_buffer[_NBACKTRACE];

--- a/Grid/parallelIO/BinaryIO.h
+++ b/Grid/parallelIO/BinaryIO.h
@@ -165,7 +165,7 @@ class BinaryIO {
 	 * FIXME -- 128^3 x 256 x 16 will overflow.
 	 */
 	
-	int global_site;
+	int64_t global_site;

 	Lexicographic::CoorFromIndex(coor,local_site,local_vol);

@@ -175,8 +175,8 @@ class BinaryIO {

 	Lexicographic::IndexFromCoor(coor,global_site,global_vol);

-	uint32_t gsite29   = global_site%29;
-	uint32_t gsite31   = global_site%31;
+	uint64_t gsite29   = global_site%29;
+	uint64_t gsite31   = global_site%31;
 	
 	site_crc = crc32(0,(unsigned char *)site_buf,sizeof(fobj));
 	//	std::cout << "Site "<<local_site << " crc "<<std::hex<<site_crc<<std::dec<<std::endl;
@@ -545,7 +545,9 @@ class BinaryIO {
 				       const std::string &format,
 				       uint32_t &nersc_csum,
 				       uint32_t &scidac_csuma,
-				       uint32_t &scidac_csumb)
+				       uint32_t &scidac_csumb,
+				       int control=BINARYIO_LEXICOGRAPHIC
+				       )
  {
    typedef typename vobj::scalar_object sobj;
    typedef typename vobj::Realified::scalar_type word;    word w=0;
@@ -556,7 +558,7 @@ class BinaryIO {
    std::vector<sobj> scalardata(lsites); 
    std::vector<fobj>     iodata(lsites); // Munge, checksum, byte order in here
    
-    IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
+    IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|control,
 	     nersc_csum,scidac_csuma,scidac_csumb);

    GridStopWatch timer; 
@@ -582,7 +584,8 @@ class BinaryIO {
 					  const std::string &format,
 					  uint32_t &nersc_csum,
 					  uint32_t &scidac_csuma,
-					  uint32_t &scidac_csumb)
+					  uint32_t &scidac_csumb,
+					  int control=BINARYIO_LEXICOGRAPHIC)
  {
    typedef typename vobj::scalar_object sobj;
    typedef typename vobj::Realified::scalar_type word;    word w=0;
@@ -607,7 +610,7 @@ class BinaryIO {
    while (attemptsLeft >= 0)
    {
      grid->Barrier();
-      IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
+      IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|control,
 	             nersc_csum,scidac_csuma,scidac_csumb);
      if (checkWrite)
      {
@@ -617,7 +620,7 @@ class BinaryIO {

        std::cout << GridLogMessage << "writeLatticeObject: read back object" << std::endl;
        grid->Barrier();
-        IOobject(w,grid,ckiodata,file,ckoffset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
+        IOobject(w,grid,ckiodata,file,ckoffset,format,BINARYIO_READ|control,
 	               cknersc_csum,ckscidac_csuma,ckscidac_csumb);
        if ((cknersc_csum != nersc_csum) or (ckscidac_csuma != scidac_csuma) or (ckscidac_csumb != scidac_csumb))
        {
--- a/Grid/parallelIO/IldgIO.h
+++ b/Grid/parallelIO/IldgIO.h
@@ -206,7 +206,7 @@ class GridLimeReader : public BinaryIO {
  // Read a generic lattice field and verify checksum
  ////////////////////////////////////////////
  template<class vobj>
-  void readLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
+  void readLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name,int control=BINARYIO_LEXICOGRAPHIC)
  {
    typedef typename vobj::scalar_object sobj;
    scidacChecksum scidacChecksum_;
@@ -238,7 +238,7 @@ class GridLimeReader : public BinaryIO {
 	uint64_t offset= ftello(File);
 	//	std::cout << " ReadLatticeObject from offset "<<offset << std::endl;
 	BinarySimpleMunger<sobj,sobj> munge;
-	BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
+	BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb,control);
 	std::cout << GridLogMessage << "SciDAC checksum A " << std::hex << scidac_csuma << std::dec << std::endl;
 	std::cout << GridLogMessage << "SciDAC checksum B " << std::hex << scidac_csumb << std::dec << std::endl;
 	/////////////////////////////////////////////
@@ -408,7 +408,7 @@ class GridLimeWriter : public BinaryIO
  // in communicator used by the field.Grid()
  ////////////////////////////////////////////////////
  template<class vobj>
-  void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
+  void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name,int control=BINARYIO_LEXICOGRAPHIC)
  {
    ////////////////////////////////////////////////////////////////////
    // NB: FILE and iostream are jointly writing disjoint sequences in the
@@ -459,7 +459,7 @@ class GridLimeWriter : public BinaryIO
    ///////////////////////////////////////////
    std::string format = getFormatString<vobj>();
    BinarySimpleMunger<sobj,sobj> munge;
-    BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset1, format,nersc_csum,scidac_csuma,scidac_csumb);
+    BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset1, format,nersc_csum,scidac_csuma,scidac_csumb,control);

    ///////////////////////////////////////////
    // Wind forward and close the record
@@ -512,7 +512,8 @@ class ScidacWriter : public GridLimeWriter {
  ////////////////////////////////////////////////
  template <class vobj, class userRecord>
  void writeScidacFieldRecord(Lattice<vobj> &field,userRecord _userRecord,
-                              const unsigned int recordScientificPrec = 0) 
+                              const unsigned int recordScientificPrec = 0,
+			      int control=BINARYIO_LEXICOGRAPHIC)
  {
    GridBase * grid = field.Grid();

@@ -534,7 +535,7 @@ class ScidacWriter : public GridLimeWriter {
      writeLimeObject(0,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
    }
    // Collective call
-    writeLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA));      // Closes message with checksum
+    writeLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA),control);      // Closes message with checksum
  }
 };

@@ -553,7 +554,8 @@ class ScidacReader : public GridLimeReader {
  // Write generic lattice field in scidac format
  ////////////////////////////////////////////////
  template <class vobj, class userRecord>
-  void readScidacFieldRecord(Lattice<vobj> &field,userRecord &_userRecord) 
+  void readScidacFieldRecord(Lattice<vobj> &field,userRecord &_userRecord,
+			     int control=BINARYIO_LEXICOGRAPHIC) 
  {
    typedef typename vobj::scalar_object sobj;
    GridBase * grid = field.Grid();
@@ -571,7 +573,7 @@ class ScidacReader : public GridLimeReader {
    readLimeObject(header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message 
    readLimeObject(_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML));
    readLimeObject(_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
-    readLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA));
+    readLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA),control);
  }
  void skipPastBinaryRecord(void) {
    std::string rec_name(ILDG_BINARY_DATA);
--- a/Grid/perfmon/Tracing.h
+++ b/Grid/perfmon/Tracing.h
@@ -34,7 +34,7 @@ class GridTracer {
 };
 inline void tracePush(const char *name) { roctxRangePushA(name); }
 inline void tracePop(const char *name) { roctxRangePop(); }
-inline int  traceStart(const char *name) { return roctxRangeStart(name); }
+inline int  traceStart(const char *name) { return roctxRangeStart(name); } // hand the range id back to the caller: traceStop(ID) needs it, and flowing off the end of a non-void function is undefined behaviour
 inline void traceStop(int ID) { roctxRangeStop(ID); }
 #endif

--- a/Grid/qcd/action/ActionBase.h
+++ b/Grid/qcd/action/ActionBase.h
@@ -129,22 +129,6 @@ public:
  virtual ~Action(){}
 };

-template <class GaugeField >
-class EmptyAction : public Action <GaugeField>
-{
-  virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { assert(0);}; // refresh pseudofermions
-  virtual RealD S(const GaugeField& U) { return 0.0;};                             // evaluate the action
-  virtual void deriv(const GaugeField& U, GaugeField& dSdU) { assert(0); };        // evaluate the action derivative
-
-  ///////////////////////////////
-  // Logging
-  ///////////////////////////////
-  virtual std::string action_name()    { return std::string("Level Force Log"); };
-  virtual std::string LogParameters()  { return std::string("No parameters");};
-};
-
-
-
 NAMESPACE_END(Grid);

 #endif // ACTION_BASE_H
--- a/Grid/qcd/action/fermion/WilsonTMFermion.h
+++ b/Grid/qcd/action/fermion/WilsonTMFermion.h
@@ -63,8 +63,6 @@ public:
  virtual void MooeeDag(const FermionField &in, FermionField &out) ;
  virtual void MooeeInv(const FermionField &in, FermionField &out) ;
  virtual void MooeeInvDag(const FermionField &in, FermionField &out) ;
-  virtual void M(const FermionField &in, FermionField &out) ;
-  virtual void Mdag(const FermionField &in, FermionField &out) ;

 private:
  RealD mu; // TwistedMass parameter
--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
@@ -280,16 +280,20 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,

  if( interior && exterior ) { 
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGeneric,1); return;}
-    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1);    return;}
 #ifndef GRID_CUDA
+    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1);    return;}
    if (Opt == OptInlineAsm  ) {  ASM_CALL(DhopSiteAsm);     return;}
 #endif
  } else if( interior ) {
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericInt,1); return;}
+#ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,1);    return;}
+#endif
  } else if( exterior ) { 
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericExt,1); return;}
+#ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,1);    return;}
+#endif
  }
  assert(0 && " Kernel optimisation case not covered ");
 }
@@ -318,13 +322,19 @@ void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
  
  if( interior && exterior ) { 
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGeneric,0); return;}
+#ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,0);    return;}
+#endif
  } else if( interior ) {
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericInt,0); return;}
+#ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,0);    return;}
+#endif
  } else if( exterior ) { 
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericExt,0); return;}
+#ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,0);    return;}
+#endif
  }
 }

--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@@ -462,7 +462,6 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
    autoView(st_v , st,AcceleratorRead);

   if( interior && exterior ) {
-     acceleratorFenceComputeStream();
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;}
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;}
 #ifndef GRID_CUDA
@@ -496,7 +495,6 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
    autoView(st_v ,st,AcceleratorRead);

   if( interior && exterior ) {
-     acceleratorFenceComputeStream();
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDag); return;}
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag);    return;}
 #ifndef GRID_CUDA
--- a/Grid/qcd/action/fermion/implementation/WilsonTMFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonTMFermionImplementation.h
@@ -93,25 +93,5 @@ void WilsonTMFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &ou
  RealD b    = tm /sq;
  axpibg5x(out,in,a,b);
 }
-template<class Impl>
-void WilsonTMFermion<Impl>::M(const FermionField &in, FermionField &out) {
-  out.Checkerboard() = in.Checkerboard();
-  this->Dhop(in, out, DaggerNo);
-  FermionField tmp(out.Grid());
-  RealD a = 4.0+this->mass;
-  RealD b = this->mu;
-  axpibg5x(tmp,in,a,b);
-  axpy(out, 1.0, tmp, out);
-}
-template<class Impl>
-void WilsonTMFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
-  out.Checkerboard() = in.Checkerboard();
-  this->Dhop(in, out, DaggerYes);
-  FermionField tmp(out.Grid());
-  RealD a = 4.0+this->mass;
-  RealD b = -this->mu;
-  axpibg5x(tmp,in,a,b);
-  axpy(out, 1.0, tmp, out);
-}

 NAMESPACE_END(Grid);
--- a/Grid/qcd/hmc/integrators/Integrator.h
+++ b/Grid/qcd/hmc/integrators/Integrator.h
@@ -87,8 +87,6 @@ public:

  const ActionSet<Field, RepresentationPolicy> as;

-  ActionSet<Field,RepresentationPolicy> LevelForces;
-  
  //Get a pointer to a shared static instance of the "do-nothing" momentum filter to serve as a default
  static MomentumFilterBase<MomentaField> const* getDefaultMomFilter(){ 
    static MomentumFilterNone<MomentaField> filter;
@@ -126,9 +124,6 @@ public:
    // input U actually not used in the fundamental case
    // Fundamental updates, include smearing

-    assert(as.size()==LevelForces.size());
-    
-    Field level_force(U.Grid()); level_force =Zero();
    for (int a = 0; a < as[level].actions.size(); ++a) {

      double start_full = usecond();
@@ -150,9 +145,6 @@ public:

      std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["<<a <<"] "<<name<<" dt "<<ep<<  std::endl;
      
-      // track the total
-      level_force = level_force+force;
-
      Real force_abs   = std::sqrt(norm2(force)/U.Grid()->gSites()); //average per-site norm.  nb. norm2(latt) = \sum_x norm2(latt[x]) 
      Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;    

@@ -175,16 +167,6 @@ public:

    }

-    {
-      // total force
-      Real force_abs   = std::sqrt(norm2(level_force)/U.Grid()->gSites()); //average per-site norm.  nb. norm2(latt) = \sum_x norm2(latt[x]) 
-      Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;    
-
-      Real force_max   = std::sqrt(maxLocalNorm2(level_force));
-      Real impulse_max = force_max * ep * HMC_MOMENTUM_DENOMINATOR;    
-      LevelForces[level].actions.at(0)->deriv_log(force_abs,force_max,impulse_abs,impulse_max);
-    }
-
    // Force from the other representations
    as[level].apply(update_P_hireps, Representations, Mom, U, ep);

@@ -234,16 +216,6 @@ public:

    //Default the momentum filter to "do-nothing"
    MomFilter = getDefaultMomFilter();
-
-    for (int level = 0; level < as.size(); ++level) {
-      int multiplier = as.at(level).multiplier;
-      ActionLevel<Field, RepresentationPolicy> * Level = new ActionLevel<Field, RepresentationPolicy>(multiplier);
-      Level->push_back(new EmptyAction<Field>); 
-      LevelForces.push_back(*Level);
-      // does it copy by value or reference??
-      // - answer it copies by value, BUT the action level contains a reference that is NOT updated.
-      // Unsafe code in Guido's area
-    }
  };

  virtual ~Integrator() {}
@@ -261,14 +233,10 @@ public:

  void reset_timer(void)
  {
-    assert(as.size()==LevelForces.size());
    for (int level = 0; level < as.size(); ++level) {
      for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
        as[level].actions.at(actionID)->reset_timer();
      }
-      int actionID=0;
-      assert(LevelForces.at(level).actions.size()==1);
-      LevelForces.at(level).actions.at(actionID)->reset_timer();
    }
  }
  void print_timer(void)
@@ -330,16 +298,6 @@ public:
 		  <<" calls "     << as[level].actions.at(actionID)->deriv_num
 		  << std::endl;
      }
-      int actionID=0;
-      std::cout << GridLogMessage 
-		  << LevelForces[level].actions.at(actionID)->action_name()
-		  <<"["<<level<<"]["<< actionID<<"] :\n\t\t "
-		  <<" force max " << LevelForces[level].actions.at(actionID)->deriv_max_average()
-		  <<" norm "      << LevelForces[level].actions.at(actionID)->deriv_norm_average()
-		  <<" Fdt max  "  << LevelForces[level].actions.at(actionID)->Fdt_max_average()
-		  <<" Fdt norm "  << LevelForces[level].actions.at(actionID)->Fdt_norm_average()
-		  <<" calls "     << LevelForces[level].actions.at(actionID)->deriv_num
-		  << std::endl;
    }
    std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl;
  }
@@ -361,13 +319,6 @@ public:
 	std::cout << as[level].actions.at(actionID)->LogParameters();
      }
    }
-    std::cout << " [Integrator] Total Force loggers: "<< LevelForces.size() <<std::endl;
-    for (int level = 0; level < LevelForces.size(); ++level) {
-      std::cout << GridLogMessage << "[Integrator] ---- Level: "<< level << std::endl;
-      for (int actionID = 0; actionID < LevelForces[level].actions.size(); ++actionID) {
-	std::cout << GridLogMessage << "["<< LevelForces[level].actions.at(actionID)->action_name() << "] ID: " << actionID << std::endl;
-      }
-    }
    std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl;
  }

@@ -449,7 +400,6 @@ public:
  RealD S(Field& U) 
  {  // here also U not used

-    assert(as.size()==LevelForces.size());
    std::cout << GridLogIntegrator << "Integrator action\n";

    RealD H = - FieldImplementation::FieldSquareNorm(P)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom
--- a/Grid/qcd/smearing/GaugeConfigurationMasked.h
+++ b/Grid/qcd/smearing/GaugeConfigurationMasked.h
@@ -1,4 +1,3 @@
-
 /*!
  @file GaugeConfiguration.h
  @brief Declares the GaugeConfiguration class
@@ -7,15 +6,6 @@

 NAMESPACE_BEGIN(Grid);

-
-template<class T> void Dump(const Lattice<T> & lat,
-			    std::string s,
-			    Coordinate site = Coordinate({0,0,0,0}))
-{
-  typename T::scalar_object tmp;
-  peekSite(tmp,lat,site);
-  std::cout << " Dump "<<s<<" "<<tmp<<std::endl;
-}
 /*!
  @brief Smeared configuration masked container
  Modified for a multi-subset smearing (aka Luscher Flowed HMC)
@@ -38,101 +28,6 @@ private:
  typedef typename SU3Adjoint::LatticeAdjMatrix  AdjMatrixField;
  typedef typename SU3Adjoint::LatticeAdjVector  AdjVectorField;

-  void BaseSmearDerivative(GaugeField& SigmaTerm,
-			   const GaugeField& iLambda,
-			   const GaugeField& U,
-			   int mmu, RealD rho)
-  {
-    // Reference
-    // Morningstar, Peardon, Phys.Rev.D69,054501(2004)
-    // Equation 75
-    // Computing Sigma_mu, derivative of S[fat links] with respect to the thin links
-    // Output SigmaTerm
-
-    GridBase *grid = U.Grid();
-
-    WilsonLoops<Gimpl> WL;
-    GaugeLinkField staple(grid), u_tmp(grid);
-    GaugeLinkField iLambda_mu(grid), iLambda_nu(grid);
-    GaugeLinkField U_mu(grid), U_nu(grid);
-    GaugeLinkField sh_field(grid), temp_Sigma(grid);
-    Real rho_munu, rho_numu;
-
-    rho_munu = rho;
-    rho_numu = rho;
-    for(int mu = 0; mu < Nd; ++mu){
-      U_mu       = peekLorentz(      U, mu);
-      iLambda_mu = peekLorentz(iLambda, mu);
-
-      for(int nu = 0; nu < Nd; ++nu){
-	if(nu==mu) continue;
-
-	U_nu       = peekLorentz(      U, nu);
-
-	// Nd(nd-1) = 12 staples normally.
-	// We must compute 6 of these
-	// in FTHMC case
-	if ( (mu==mmu)||(nu==mmu) )
-	  WL.StapleUpper(staple, U, mu, nu);
-	
-	if(nu==mmu) {
-	  iLambda_nu = peekLorentz(iLambda, nu);
-
-	  temp_Sigma = -rho_numu*staple*iLambda_nu;  //ok
-	  //-r_numu*U_nu(x+mu)*Udag_mu(x+nu)*Udag_nu(x)*Lambda_nu(x)
-	  Gimpl::AddLink(SigmaTerm, temp_Sigma, mu);
-
-	  sh_field = Cshift(iLambda_nu, mu, 1);// general also for Gparity?
-
-	  temp_Sigma = rho_numu*sh_field*staple; //ok
-	  //r_numu*Lambda_nu(mu)*U_nu(x+mu)*Udag_mu(x+nu)*Udag_nu(x)
-	  Gimpl::AddLink(SigmaTerm, temp_Sigma, mu);
-	}
-
-	if ( mu == mmu ) { 
-	  sh_field = Cshift(iLambda_mu, nu, 1);
-
-	  temp_Sigma = -rho_munu*staple*U_nu*sh_field*adj(U_nu); //ok
-	  //-r_munu*U_nu(x+mu)*Udag_mu(x+nu)*Lambda_mu(x+nu)*Udag_nu(x)
-	  Gimpl::AddLink(SigmaTerm, temp_Sigma, mu);
-	}
-
-	//	staple = Zero();
-	sh_field = Cshift(U_nu, mu, 1);
-
-	temp_Sigma = Zero();
-
-	if ( mu == mmu )
-	  temp_Sigma = -rho_munu*adj(sh_field)*adj(U_mu)*iLambda_mu*U_nu;
-
-	if ( nu == mmu ) {
-	  temp_Sigma += rho_numu*adj(sh_field)*adj(U_mu)*iLambda_nu*U_nu;
-
-	  u_tmp = adj(U_nu)*iLambda_nu;
-	  sh_field = Cshift(u_tmp, mu, 1);
-	  temp_Sigma += -rho_numu*sh_field*adj(U_mu)*U_nu;
-	}
-	
-	sh_field = Cshift(temp_Sigma, nu, -1);
-	Gimpl::AddLink(SigmaTerm, sh_field, mu);
-
-      }
-    }
-  }
-  
-  void BaseSmear(GaugeLinkField& Cup, const GaugeField& U,int mu,RealD rho) {
-    GridBase *grid = U.Grid();
-    GaugeLinkField tmp_stpl(grid);
-    WilsonLoops<Gimpl> WL;
-    Cup = Zero();
-    for(int nu=0; nu<Nd; ++nu){
-      if (nu != mu) {
-	// get the staple in direction mu, nu
-	WL.Staple(tmp_stpl, U, mu, nu);  //nb staple conventions of IroIro and Grid differ by a dagger
-	Cup += adj(tmp_stpl*rho);
-      }
-    }
-  }
  // Adjoint vector to GaugeField force
  void InsertForce(GaugeField &Fdet,AdjVectorField &Fdet_nu,int nu)
  {
@@ -152,54 +47,27 @@ private:
    GaugeLinkField UtaU(PlaqL.Grid());
    GaugeLinkField D(PlaqL.Grid());
    AdjMatrixField Dbc(PlaqL.Grid());
-    AdjMatrixField Dbc_opt(PlaqL.Grid());
    LatticeComplex tmp(PlaqL.Grid());
    const int Ngen = SU3Adjoint::Dimension;
    Complex ci(0,1);
    ColourMatrix   ta,tb,tc;
-    RealD t=0;
-    RealD tp=0;
-    RealD tta=0;
-    RealD tpk=0;
-    t-=usecond();
+    
    for(int a=0;a<Ngen;a++) {
-      tta-=usecond();
      SU3::generator(a, ta);
-      ta = 2.0 * ci * ta;
      // Qlat Tb = 2i Tb^Grid
-      UtaU= adj(PlaqL)*ta*PlaqR; // 6ms
-      tta+=usecond();
-      ////////////////////////////////////////////
-      // Could add this entire C-loop to a projection routine
-      // for performance. Could also pick checkerboard on UtaU
-      // and set checkerboard on result for 2x perf
-      ////////////////////////////////////////////
+      UtaU= 2.0*ci*adj(PlaqL)*ta*PlaqR;
      for(int c=0;c<Ngen;c++) {
 	SU3::generator(c, tc);
-	tc = 2.0*ci*tc;
-	tp-=usecond(); 
-	D = Ta( tc *UtaU); // 2ms
-#if 1
-	SU3::LieAlgebraProject(Dbc_opt,D,c); // 5.5ms
-#else
+	D = Ta( (2.0)*ci*tc *UtaU);
 	for(int b=0;b<Ngen;b++){
 	  SU3::generator(b, tb);
 	  tmp =-trace(ci*tb*D); 
 	  PokeIndex<ColourIndex>(Dbc,tmp,b,c);  // Adjoint rep
 	}
-#endif
-	tp+=usecond();
      }
-      //      Dump(Dbc_opt,"Dbc_opt");
-      //      Dump(Dbc,"Dbc");
-      tpk-=usecond();
-      tmp = trace(MpInvJx * Dbc_opt);
+      tmp = trace(MpInvJx * Dbc);
      PokeIndex<ColourIndex>(Fdet2,tmp,a);
-      tpk+=usecond();
    }
-    t+=usecond();
-    std::cout << GridLogPerformance << " Compute_MpInvJx_dNxxdSy " << t/1e3 << " ms  proj "<<tp/1e3<< " ms"
-	      << " ta "<<tta/1e3<<" ms" << " poke "<<tpk/1e3<< " ms"<<std::endl;
  }
  
  void ComputeNxy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR,AdjMatrixField &NxAd)
@@ -211,17 +79,12 @@ private:
    ColourMatrix   tc;
    for(int b=0;b<Ngen;b++) {
      SU3::generator(b, tb);
-      tb = 2.0 * ci * tb;
-      Nx = Ta( adj(PlaqL)*tb * PlaqR );
-#if 1
-      SU3::LieAlgebraProject(NxAd,Nx,b);
-#else
+      Nx = (2.0)*Ta( adj(PlaqL)*ci*tb * PlaqR );
      for(int c=0;c<Ngen;c++) {
 	SU3::generator(c, tc);
 	auto tmp =closure( -trace(ci*tc*Nx)); 
 	PokeIndex<ColourIndex>(NxAd,tmp,c,b); 
      }
-#endif
    }
  }
  void ApplyMask(GaugeField &U,int smr)
@@ -301,7 +164,8 @@ public:
    // Computes ALL the staples -- could compute one only and do it here
    RealD time;
    time=-usecond();
-    BaseSmear(Cmu, U,mu,rho);
+    this->StoutSmearing->BaseSmear(C, U);
+    Cmu = peekLorentz(C, mu);

    //////////////////////////////////////////////////////////////////
    // Assemble Luscher exp diff map J matrix 
@@ -345,36 +209,6 @@ public:
    // dJ(x)/dxe
    //////////////////////////////////////
    time=-usecond();
-#if 1
-    std::vector<AdjMatrixField>  dJdX;    dJdX.resize(8,grid);
-    std::vector<AdjMatrix> TRb_s; TRb_s.resize(8);
-    AdjMatrixField tbXn(grid);
-    AdjMatrixField sumXtbX(grid);
-    AdjMatrixField t2(grid);
-    AdjMatrixField dt2(grid);
-    AdjMatrixField t3(grid);
-    AdjMatrixField dt3(grid);
-    AdjMatrixField aunit(grid);
-
-    for(int b=0;b<8;b++){
-      SU3Adjoint::generator(b, TRb_s[b]);
-      dJdX[b] = TRb_s[b];
-    }
-    aunit = ComplexD(1.0);
-    // Could put into an accelerator_for
-    X  = (-1.0)*ZxAd; 
-    t2 = X;
-    for (int j = 12; j > 1; --j) {
-      t3  = t2*(1.0 / (j + 1))  + aunit;
-      t2  = X * t3;
-      for(int b=0;b<8;b++){
-	dJdX[b]= TRb_s[b] * t3 + X * dJdX[b]*(1.0 / (j + 1));
-      }
-    }
-    for(int b=0;b<8;b++){
-      dJdX[b] = -dJdX[b];
-    }
-#else
    std::vector<AdjMatrixField>  dJdX;    dJdX.resize(8,grid);
    AdjMatrixField tbXn(grid);
    AdjMatrixField sumXtbX(grid);
@@ -390,15 +224,14 @@ public:
      X  = (-1.0)*ZxAd; 
      t2 = X;
      dt2 = TRb;
-      for (int j = 12; j > 1; --j) {
-	t3  = t2*(1.0 / (j + 1))  + aunit;
+      for (int j = 20; j > 1; --j) {
+	t3 = t2*(1.0 / (j + 1))  + aunit;
 	dt3 = dt2*(1.0 / (j + 1));
 	t2 = X * t3;
 	dt2 = TRb * t3 + X * dt3;
      }
      dJdX[b] = -dt2; 
    }
-#endif  
    time+=usecond();
    std::cout << GridLogMessage << "dJx took "<<time<< " us"<<std::endl;
    /////////////////////////////////////////////////////////////////
@@ -448,8 +281,8 @@ public:
    
    for(int e =0 ; e<8 ; e++){
      LatticeComplexD tr(grid);
-      //      ColourMatrix te;
-      //      SU3::generator(e, te);
+      ColourMatrix te;
+      SU3::generator(e, te);
      tr = trace(dJdX[e] * nMpInv);
      pokeColour(dJdXe_nMpInv,tr,e);
    }
@@ -660,25 +493,20 @@ public:
    //////////////////////////////////////////////////////////////////
    // Assemble the N matrix
    //////////////////////////////////////////////////////////////////
-    double rho=this->StoutSmearing->SmearRho[1];
-    BaseSmear(Cmu, U,mu,rho);
-
+    // Computes ALL the staples -- could compute one only here
+    this->StoutSmearing->BaseSmear(C, U);
+    Cmu = peekLorentz(C, mu);
    Umu = peekLorentz(U, mu);
    Complex ci(0,1);
    for(int b=0;b<Ngen;b++) {
      SU3::generator(b, Tb);
      // Qlat Tb = 2i Tb^Grid
      Nb = (2.0)*Ta( ci*Tb * Umu * adj(Cmu));
-      // FIXME -- replace this with LieAlgebraProject
-#if 0
-      SU3::LieAlgebraProject(Ncb,tmp,b);
-#else
      for(int c=0;c<Ngen;c++) {
 	SU3::generator(c, Tc);
 	auto tmp = -trace(ci*Tc*Nb); // Luchang's norm: (2Tc) (2Td) N^db = -2 delta cd N^db // - was important
 	PokeIndex<ColourIndex>(Ncb,tmp,c,b); 
      }
-#endif
    }      

    //////////////////////////////////////////////////////////////////
@@ -865,7 +693,7 @@ private:
 					  const GaugeField& GaugeK,int level) 
  {
    GridBase* grid = GaugeK.Grid();
-    GaugeField SigmaK(grid), iLambda(grid);
+    GaugeField C(grid), SigmaK(grid), iLambda(grid);
    GaugeField SigmaKPrimeA(grid);
    GaugeField SigmaKPrimeB(grid);
    GaugeLinkField iLambda_mu(grid);
@@ -873,11 +701,7 @@ private:
    GaugeLinkField SigmaKPrime_mu(grid);
    GaugeLinkField GaugeKmu(grid), Cmu(grid);
    
-    int mmu= (level/2) %Nd;
-    int cb= (level%2);
-    double rho=this->StoutSmearing->SmearRho[1];
-
-    // Can override this to do one direction only.
+    this->StoutSmearing->BaseSmear(C, GaugeK);
    SigmaK = Zero();
    iLambda = Zero();

@@ -888,38 +712,18 @@ private:
    // Could get away with computing only one polarisation here
    // int mu= (smr/2) %Nd;
    // SigmaKprime_A has only one component
-#if 0
-    BaseSmear(Cmu, GaugeK,mu,rho);
-    GaugeKmu = peekLorentz(GaugeK, mu);
-    SigmaKPrime_mu = peekLorentz(SigmaKPrimeA, mu);
-    iQ = Ta(Cmu * adj(GaugeKmu));
-    this->set_iLambda(iLambda_mu, e_iQ, iQ, SigmaKPrime_mu, GaugeKmu);
-    pokeLorentz(SigmaK, SigmaKPrime_mu * e_iQ + adj(Cmu) * iLambda_mu, mu);
-    pokeLorentz(iLambda, iLambda_mu, mu);
-    BaseSmearDerivative(SigmaK, iLambda,GaugeK,mu,rho);  // derivative of SmearBase
-#else
-    //    GaugeField C(grid);
-    //    this->StoutSmearing->BaseSmear(C, GaugeK);
-    //    for (int mu = 0; mu < Nd; mu++)
-    int mu =mmu;
-    BaseSmear(Cmu, GaugeK,mu,rho);
+    for (int mu = 0; mu < Nd; mu++)
    {
-      // Cmu = peekLorentz(C, mu);
+      Cmu = peekLorentz(C, mu);
      GaugeKmu = peekLorentz(GaugeK, mu);
      SigmaKPrime_mu = peekLorentz(SigmaKPrimeA, mu);
      iQ = Ta(Cmu * adj(GaugeKmu));
      this->set_iLambda(iLambda_mu, e_iQ, iQ, SigmaKPrime_mu, GaugeKmu);
      pokeLorentz(SigmaK, SigmaKPrime_mu * e_iQ + adj(Cmu) * iLambda_mu, mu);
      pokeLorentz(iLambda, iLambda_mu, mu);
-      std::cout << " mu "<<mu<<" SigmaKPrime_mu"<<norm2(SigmaKPrime_mu)<< " iLambda_mu " <<norm2(iLambda_mu)<<std::endl;
    }
-    //    GaugeField SigmaKcopy(grid);
-    //    SigmaKcopy = SigmaK;
-    BaseSmearDerivative(SigmaK, iLambda,GaugeK,mu,rho);  // derivative of SmearBase
-    //    this->StoutSmearing->derivative(SigmaK, iLambda,GaugeK);  // derivative of SmearBase
-    //    SigmaKcopy = SigmaKcopy - SigmaK;
-    //    std::cout << " BaseSmearDerivative fast path error" <<norm2(SigmaKcopy)<<std::endl;
-#endif
+    this->StoutSmearing->derivative(SigmaK, iLambda,GaugeK);  // derivative of SmearBase
+
    ////////////////////////////////////////////////////////////////////////////////////
    // propagate the rest of the force as identity map, just add back
    ////////////////////////////////////////////////////////////////////////////////////
--- a/Grid/qcd/smearing/HISQSmearing.h
+++ b/Grid/qcd/smearing/HISQSmearing.h
@@ -1,389 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/smearing/HISQSmearing.h
-
-Copyright (C) 2023
-
-Author: D. A. Clarke <clarke.davida@gmail.com> 
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*
-    @file HISQSmearing.h
-    @brief Declares classes related to HISQ smearing 
-*/
-
-
-#pragma once
-#include <Grid/Grid.h>
-#include <Grid/lattice/PaddedCell.h>
-#include <Grid/stencil/GeneralLocalStencil.h>
-
-
-NAMESPACE_BEGIN(Grid);
-
-
-// TODO: find a way to fold this into the stencil header. need to access grid to get
-// Nd, since you don't want to inherit from QCD.h
-/*!  @brief append arbitrary shift path to shifts */
-template<typename... Args>
-void appendShift(std::vector<Coordinate>& shifts, int dir, Args... args) {
-    Coordinate shift(Nd,0);
-    generalShift(shift, dir, args...); 
-    // push_back creates an element at the end of shifts and
-    // assigns the data in the argument to it.
-    shifts.push_back(shift);
-}
-
-
-/*!  @brief figure out the stencil index from mu and nu */
-accelerator_inline int stencilIndex(int mu, int nu) {
-    // Nshifts depends on how you built the stencil
-    int Nshifts = 6;
-    return Nshifts*nu + Nd*Nshifts*mu;
-}
-
-
-/*!  @brief structure holding the link treatment */
-struct SmearingParameters{
-    SmearingParameters(){}
-    Real c_1;               // 1 link
-    Real c_naik;            // Naik term
-    Real c_3;               // 3 link
-    Real c_5;               // 5 link
-    Real c_7;               // 7 link
-    Real c_lp;              // 5 link Lepage
-    SmearingParameters(Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp) 
-        : c_1(c1),
-          c_naik(cnaik),
-          c_3(c3),
-          c_5(c5),
-          c_7(c7),
-          c_lp(clp){}
-};
-
-
-/*!  @brief create fat links from link variables */
-template<class Gimpl> 
-class Smear_HISQ : public Gimpl {
-
-private:
-    GridCartesian* const _grid;
-    SmearingParameters _linkTreatment;
-
-public:
-
-    INHERIT_GIMPL_TYPES(Gimpl);
-    typedef typename Gimpl::GaugeField     GF;
-    typedef typename Gimpl::GaugeLinkField LF;
-    typedef typename Gimpl::ComplexField   CF;
-
-    // Don't allow default values here.
-    Smear_HISQ(GridCartesian* grid, Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp) 
-        : _grid(grid), 
-          _linkTreatment(c1,cnaik,c3,c5,c7,clp) {
-        assert(Nc == 3 && "HISQ smearing currently implemented only for Nc==3");
-        assert(Nd == 4 && "HISQ smearing only defined for Nd==4");
-    }
-
-    // Allow to pass a pointer to a C-style, double array for MILC convenience
-    Smear_HISQ(GridCartesian* grid, double* coeff) 
-        : _grid(grid), 
-          _linkTreatment(coeff[0],coeff[1],coeff[2],coeff[3],coeff[4],coeff[5]) {
-        assert(Nc == 3 && "HISQ smearing currently implemented only for Nc==3");
-        assert(Nd == 4 && "HISQ smearing only defined for Nd==4");
-    }
-
-    ~Smear_HISQ() {}
-
-    // Intent: OUT--u_smr, u_naik
-    //          IN--u_thin
-    void smear(GF& u_smr, GF& u_naik, GF& u_thin) const {
-
-        SmearingParameters lt = this->_linkTreatment;
-        auto grid = this->_grid;
-
-        // Create a padded cell of extra padding depth=1 and fill the padding.
-        int depth = 1;
-        PaddedCell Ghost(depth,grid);
-        GF Ughost = Ghost.Exchange(u_thin);
-
-        // This is where auxiliary N-link fields and the final smear will be stored. 
-        GF Ughost_fat(Ughost.Grid());
-        GF Ughost_3link(Ughost.Grid());
-        GF Ughost_5linkA(Ughost.Grid());
-        GF Ughost_5linkB(Ughost.Grid());
-
-        // mu-nu plane stencil. We allow mu==nu to make indexing the stencil easier,
-        // but these entries will not be used. 
-        std::vector<Coordinate> shifts;
-        for(int mu=0;mu<Nd;mu++)
-        for(int nu=0;nu<Nd;nu++) {
-            appendShift(shifts,mu);
-            appendShift(shifts,nu);
-            appendShift(shifts,shiftSignal::NO_SHIFT);
-            appendShift(shifts,mu,Back(nu));
-            appendShift(shifts,Back(nu));
-            appendShift(shifts,Back(mu));
-        }
-
-        // A GeneralLocalStencil has two indices: a site and stencil index 
-        GeneralLocalStencil gStencil(Ughost.Grid(),shifts);
-
-        // This is where contributions from the smearing get added together
-        Ughost_fat=Zero();
-
-        // This loop handles 3-, 5-, and 7-link constructs, minus Lepage and Naik.
-        for(int mu=0;mu<Nd;mu++) {
-
-            // TODO: This approach is slightly memory inefficient. It uses 25% extra memory 
-            Ughost_3link =Zero();
-            Ughost_5linkA=Zero();
-            Ughost_5linkB=Zero();
-
-            // Create the accessors
-            autoView(U_v       , Ughost       , AcceleratorRead);
-            autoView(U_fat_v   , Ughost_fat   , AcceleratorWrite);
-            autoView(U_3link_v , Ughost_3link , AcceleratorWrite);
-            autoView(U_5linkA_v, Ughost_5linkA, AcceleratorWrite);
-            autoView(U_5linkB_v, Ughost_5linkB, AcceleratorWrite);
-
-            // We infer some types that will be needed in the calculation.
-            typedef decltype(gStencil.GetEntry(0,0)) stencilElement;
-            typedef decltype(coalescedReadGeneralPermute(U_v[0](0),gStencil.GetEntry(0,0)->_permute,Nd)) U3matrix;
-
-            int Nsites = U_v.size();
-            auto gStencil_v = gStencil.View(); 
-
-            accelerator_for(site,Nsites,Simd::Nsimd(),{ // ----------- 3-link constructs
-                stencilElement SE0, SE1, SE2, SE3, SE4, SE5;
-                U3matrix U0, U1, U2, U3, U4, U5, W;
-                for(int nu=0;nu<Nd;nu++) {
-                    if(nu==mu) continue;
-                    int s = stencilIndex(mu,nu);
-
-                    // The stencil gives us support points in the mu-nu plane that we will use to
-                    // grab the links we need.
-                    SE0 = gStencil_v.GetEntry(s+0,site); int x_p_mu      = SE0->_offset;
-                    SE1 = gStencil_v.GetEntry(s+1,site); int x_p_nu      = SE1->_offset;
-                    SE2 = gStencil_v.GetEntry(s+2,site); int x           = SE2->_offset;
-                    SE3 = gStencil_v.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset;
-                    SE4 = gStencil_v.GetEntry(s+4,site); int x_m_nu      = SE4->_offset;
-                    SE5 = gStencil_v.GetEntry(s+5,site); int x_m_mu      = SE5->_offset;
-
-                    // When you're deciding whether to take an adjoint, the question is: how is the
-                    // stored link oriented compared to the one you want? If I imagine myself travelling
-                    // with the to-be-updated link, I have two possible, alternative 3-link paths I can
-                    // take, one starting by going to the left, the other starting by going to the right.
-                    U0 = coalescedReadGeneralPermute(U_v[x_p_mu     ](nu),SE0->_permute,Nd);
-                    U1 = coalescedReadGeneralPermute(U_v[x_p_nu     ](mu),SE1->_permute,Nd);
-                    U2 = coalescedReadGeneralPermute(U_v[x          ](nu),SE2->_permute,Nd);
-                    U3 = coalescedReadGeneralPermute(U_v[x_p_mu_m_nu](nu),SE3->_permute,Nd);
-                    U4 = coalescedReadGeneralPermute(U_v[x_m_nu     ](mu),SE4->_permute,Nd);
-                    U5 = coalescedReadGeneralPermute(U_v[x_m_nu     ](nu),SE4->_permute,Nd);
-
-                    //  "left"          "right"
-                    W = U2*U1*adj(U0) + adj(U5)*U4*U3;
-
-                    // Save 3-link construct for later and add to smeared field.
-                    coalescedWrite(U_3link_v[x](nu), W);
-
-                    // The index operator (x) returns the coalesced read on GPU. The view [] index returns 
-                    // a reference to the vector object. The [x](mu) returns a reference to the densely 
-                    // packed (contiguous in memory) mu-th element of the vector object. On CPU, 
-                    // coalescedRead/Write is the identity mapping assigning vector object to vector object.
-                    // But on GPU it's non-trivial and maps scalar object to vector object and vice versa.
-                    coalescedWrite(U_fat_v[x](mu), U_fat_v(x)(mu) + lt.c_3*W);
-                }
-            })
-
-            accelerator_for(site,Nsites,Simd::Nsimd(),{ // ----------- 5-link 
-                stencilElement SE0, SE1, SE2, SE3, SE4, SE5;
-                U3matrix U0, U1, U2, U3, U4, U5, W;
-                int sigmaIndex = 0;
-                for(int nu=0;nu<Nd;nu++) {
-                    if(nu==mu) continue;
-                    int s = stencilIndex(mu,nu);
-                    for(int rho=0;rho<Nd;rho++) {
-                        if (rho == mu || rho == nu) continue;
-
-                        SE0 = gStencil_v.GetEntry(s+0,site); int x_p_mu      = SE0->_offset;
-                        SE1 = gStencil_v.GetEntry(s+1,site); int x_p_nu      = SE1->_offset;
-                        SE2 = gStencil_v.GetEntry(s+2,site); int x           = SE2->_offset;
-                        SE3 = gStencil_v.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset;
-                        SE4 = gStencil_v.GetEntry(s+4,site); int x_m_nu      = SE4->_offset;
-
-                        U0 = coalescedReadGeneralPermute(      U_v[x_p_mu     ](nu ),SE0->_permute,Nd);
-                        U1 = coalescedReadGeneralPermute(U_3link_v[x_p_nu     ](rho),SE1->_permute,Nd);
-                        U2 = coalescedReadGeneralPermute(      U_v[x          ](nu ),SE2->_permute,Nd);
-                        U3 = coalescedReadGeneralPermute(      U_v[x_p_mu_m_nu](nu ),SE3->_permute,Nd);
-                        U4 = coalescedReadGeneralPermute(U_3link_v[x_m_nu     ](rho),SE4->_permute,Nd);
-                        U5 = coalescedReadGeneralPermute(      U_v[x_m_nu     ](nu ),SE4->_permute,Nd);
-
-                        W  = U2*U1*adj(U0) + adj(U5)*U4*U3;
-
-                        if(sigmaIndex<3) {
-                            coalescedWrite(U_5linkA_v[x](rho), W);
-                        } else {
-                            coalescedWrite(U_5linkB_v[x](rho), W);
-                        }    
-
-                        coalescedWrite(U_fat_v[x](mu), U_fat_v(x)(mu) + lt.c_5*W);
-                        sigmaIndex++;
-                    }
-                }
-            })
-
-            accelerator_for(site,Nsites,Simd::Nsimd(),{ // ----------- 7-link
-                stencilElement SE0, SE1, SE2, SE3, SE4, SE5;
-                U3matrix U0, U1, U2, U3, U4, U5, W;
-                int sigmaIndex = 0;
-                for(int nu=0;nu<Nd;nu++) {
-                    if(nu==mu) continue;
-                    int s = stencilIndex(mu,nu);
-                    for(int rho=0;rho<Nd;rho++) {
-                        if (rho == mu || rho == nu) continue;
-
-                        SE0 = gStencil_v.GetEntry(s+0,site); int x_p_mu      = SE0->_offset;
-                        SE1 = gStencil_v.GetEntry(s+1,site); int x_p_nu      = SE1->_offset;
-                        SE2 = gStencil_v.GetEntry(s+2,site); int x           = SE2->_offset;
-                        SE3 = gStencil_v.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset;
-                        SE4 = gStencil_v.GetEntry(s+4,site); int x_m_nu      = SE4->_offset;
-
-                        U0 = coalescedReadGeneralPermute(U_v[x_p_mu](nu),SE0->_permute,Nd);
-                        if(sigmaIndex<3) {
-                            U1 = coalescedReadGeneralPermute(U_5linkB_v[x_p_nu](rho),SE1->_permute,Nd);
-                        } else {
-                            U1 = coalescedReadGeneralPermute(U_5linkA_v[x_p_nu](rho),SE1->_permute,Nd);
-                        }  
-                        U2 = coalescedReadGeneralPermute(U_v[x](nu),SE2->_permute,Nd);
-                        U3 = coalescedReadGeneralPermute(U_v[x_p_mu_m_nu](nu),SE3->_permute,Nd);
-                        if(sigmaIndex<3) {
-                            U4 = coalescedReadGeneralPermute(U_5linkB_v[x_m_nu](rho),SE4->_permute,Nd);
-                        } else {
-                            U4 = coalescedReadGeneralPermute(U_5linkA_v[x_m_nu](rho),SE4->_permute,Nd);
-                        }  
-                        U5 = coalescedReadGeneralPermute(U_v[x_m_nu](nu),SE4->_permute,Nd);
-
-                        W  = U2*U1*adj(U0) + adj(U5)*U4*U3;
-
-                        coalescedWrite(U_fat_v[x](mu), U_fat_v(x)(mu) + lt.c_7*W);
-                        sigmaIndex++;
-                    }
-                }
-            })
-
-        } // end mu loop
-
-        // c1, c3, c5, c7 construct contributions
-        u_smr = Ghost.Extract(Ughost_fat) + lt.c_1*u_thin;
-
-        // Load up U and V std::vectors to access thin and smeared links.
-        std::vector<LF> U(Nd, grid);
-        std::vector<LF> V(Nd, grid);
-        std::vector<LF> Vnaik(Nd, grid);
-        for (int mu = 0; mu < Nd; mu++) {
-            U[mu] = PeekIndex<LorentzIndex>(u_thin, mu);
-            V[mu] = PeekIndex<LorentzIndex>(u_smr, mu);
-        }
-
-        for(int mu=0;mu<Nd;mu++) {
-
-            // Naik
-            Vnaik[mu] = lt.c_naik*Gimpl::CovShiftForward(U[mu],mu,
-                                    Gimpl::CovShiftForward(U[mu],mu,
-                                      Gimpl::CovShiftIdentityForward(U[mu],mu)));
-
-            // LePage
-            for (int nu_h=1;nu_h<Nd;nu_h++) {
-                int nu=(mu+nu_h)%Nd;
-                                // nu, nu, mu, Back(nu), Back(nu)
-                V[mu] = V[mu] + lt.c_lp*Gimpl::CovShiftForward(U[nu],nu,
-                                          Gimpl::CovShiftForward(U[nu],nu,
-                                            Gimpl::CovShiftForward(U[mu],mu,
-                                              Gimpl::CovShiftBackward(U[nu],nu,
-                                                Gimpl::CovShiftIdentityBackward(U[nu],nu)))))
-                                // Back(nu), Back(nu), mu, nu, nu
-                              + lt.c_lp*Gimpl::CovShiftBackward(U[nu],nu,
-                                          Gimpl::CovShiftBackward(U[nu],nu,
-                                            Gimpl::CovShiftForward(U[mu],mu,
-                                              Gimpl::CovShiftForward(U[nu],nu,
-                                                Gimpl::CovShiftIdentityForward(U[nu],nu)))));
-            }
-        }
-
-        // Put V back into u_smr.
-        for (int mu = 0; mu < Nd; mu++) {
-            PokeIndex<LorentzIndex>(u_smr , V[mu]    , mu);
-            PokeIndex<LorentzIndex>(u_naik, Vnaik[mu], mu);
-        }
-    };
-
-
-    // Intent: OUT--u_proj
-    //          IN--u_mu
-    void projectU3(GF& u_proj, GF& u_mu) const {
-
-        auto grid = this->_grid;
-
-        LF V(grid), Q(grid), sqrtQinv(grid), id_3(grid), diff(grid);
-        CF c0(grid), c1(grid), c2(grid), g0(grid), g1(grid), g2(grid), S(grid), R(grid), theta(grid), 
-           u(grid), v(grid), w(grid), den(grid), f0(grid), f1(grid), f2(grid);
-
-        // Follow MILC 10.1103/PhysRevD.82.074501, eqs (B2-B3) and (C1-C8)
-        for (int mu = 0; mu < Nd; mu++) {
-            V  = PeekIndex<LorentzIndex>(u_mu, mu);
-            Q  = adj(V)*V;
-            c0 =        real(trace(Q));
-            c1 = (1/2.)*real(trace(Q*Q));
-            c2 = (1/3.)*real(trace(Q*Q*Q));
-            S  = (1/3.)*c1-(1/18.)*c0*c0;
-            if (norm2(S)<1e-28) {
-                g0 = (1/3.)*c0; g1 = g0; g2 = g1;
-            } else {
-                R     = (1/2.)*c2-(1/3. )*c0*c1+(1/27.)*c0*c0*c0;
-                theta = acos(R*pow(S,-1.5));
-                g0    = (1/3.)*c0+2.*sqrt(S)*cos((1/3.)*theta-2*M_PI/3.);
-                g1    = (1/3.)*c0+2.*sqrt(S)*cos((1/3.)*theta          );
-                g2    = (1/3.)*c0+2.*sqrt(S)*cos((1/3.)*theta+2*M_PI/3.);
-            }
-//            if (fabs(Q.determinant()/(g0*g1*g2)-1.0) > 1e-5) { SVD }
-            u     = sqrt(g0) + sqrt(g1) + sqrt(g2);
-            v     = sqrt(g0*g1) + sqrt(g0*g2) + sqrt(g1*g2);
-            w     = sqrt(g0*g1*g2);
-            den   = w*(u*v-w);
-            f0    = (-w*(u*u+v)+u*v*v)/den;
-            f1    = (-w-u*u*u+2.*u*v)/den;
-            f2    = u/den;
-            id_3  = 1.;
-
-            sqrtQinv = f0*id_3 + f1*Q + f2*Q*Q;
-
-            PokeIndex<LorentzIndex>(u_proj, V*sqrtQinv, mu);
-        }
-    };
-
-
-//    void derivative(const GaugeField& Gauge) const {
-//    };
-};
-
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/smearing/Smearing.h
+++ b/Grid/qcd/smearing/Smearing.h
@@ -5,5 +5,4 @@
 #include <Grid/qcd/smearing/StoutSmearing.h>
 #include <Grid/qcd/smearing/GaugeConfiguration.h>
 #include <Grid/qcd/smearing/WilsonFlow.h>
-#include <Grid/qcd/smearing/HISQSmearing.h>

--- a/Grid/qcd/smearing/StoutSmearing.h
+++ b/Grid/qcd/smearing/StoutSmearing.h
@@ -69,7 +69,7 @@ public:
  /*! Construct stout smearing object from explicitly specified rho matrix */
  Smear_Stout(const std::vector<double>& rho_)
    : OwnedBase{new Smear_APE<Gimpl>(rho_)}, SmearBase{OwnedBase.get()} {
-    std::cout << GridLogDebug << "Stout smearing constructor : Smear_Stout(const std::vector<double>& " << rho_ << " )" << std::endl;
+    std::cout << GridLogDebug << "Stout smearing constructor : Smear_Stout(const std::vector<double>& " << rho_ << " )" << std::endl;
    assert(Nc == 3 && "Stout smearing currently implemented only for Nc==3");
    }

--- a/Grid/qcd/utils/GaugeGroup.h
+++ b/Grid/qcd/utils/GaugeGroup.h
@@ -100,9 +100,6 @@ class GaugeGroup {
  using iGroupMatrix = iScalar<iScalar<iMatrix<vtype, ncolour> > >;
  template <typename vtype>
  using iAlgebraVector = iScalar<iScalar<iVector<vtype, AdjointDimension> > >;
-  template <typename vtype>
-  using iSUnAlgebraMatrix =
-    iScalar<iScalar<iMatrix<vtype, AdjointDimension> > >;
  static int su2subgroups(void) { return su2subgroups(group_name()); }

  //////////////////////////////////////////////////////////////////////////////////////////////////
@@ -136,15 +133,6 @@ class GaugeGroup {
  typedef Lattice<vAlgebraVectorF> LatticeAlgebraVectorF;
  typedef Lattice<vAlgebraVectorD> LatticeAlgebraVectorD;

-  typedef iSUnAlgebraMatrix<vComplex>  vAlgebraMatrix;
-  typedef iSUnAlgebraMatrix<vComplexF> vAlgebraMatrixF;
-  typedef iSUnAlgebraMatrix<vComplexD> vAlgebraMatrixD;
-
-  typedef Lattice<vAlgebraMatrix>  LatticeAlgebraMatrix;
-  typedef Lattice<vAlgebraMatrixF> LatticeAlgebraMatrixF;
-  typedef Lattice<vAlgebraMatrixD> LatticeAlgebraMatrixD;
-  
-
  typedef iSU2Matrix<Complex> SU2Matrix;
  typedef iSU2Matrix<ComplexF> SU2MatrixF;
  typedef iSU2Matrix<ComplexD> SU2MatrixD;
@@ -172,7 +160,7 @@ class GaugeGroup {
    return generator(lieIndex, ta, group_name());
  }

-  static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index) {
+  static void su2SubGroupIndex(int &i1, int &i2, int su2_index) {
    return su2SubGroupIndex(i1, i2, su2_index, group_name());
  }

@@ -401,52 +389,6 @@ class GaugeGroup {
    }
  }

-// Ta are hermitian (?)
-// Anti herm is i Ta basis
-static void LieAlgebraProject(LatticeAlgebraMatrix &out,const LatticeMatrix &in, int b)
-{
-  conformable(in, out);
-  GridBase *grid = out.Grid();
-  LatticeComplex tmp(grid);
-  Matrix ta;
-  // Using Luchang's projection convention
-  //  2 Tr{Ta Tb} A_b= 2/2 delta ab A_b = A_a
-  autoView(out_v,out,AcceleratorWrite);
-  autoView(in_v,in,AcceleratorRead);
-  int N = ncolour;
-  int NNm1 = N * (N - 1);
-  int hNNm1= NNm1/2;
-  RealD sqrt_2 = sqrt(2.0);
-  Complex ci(0.0,1.0);
-  for(int su2Index=0;su2Index<hNNm1;su2Index++){
-    int i1, i2;
-    su2SubGroupIndex(i1, i2, su2Index);
-    int ax = su2Index*2;
-    int ay = su2Index*2+1;
-    accelerator_for(ss,grid->oSites(),1,{
-	// in is traceless ANTI-hermitian whereas Grid generators are Hermitian.
-	// trace( Ta x Ci in)
-	// Bet I need to move to real part with mult by -i
-	out_v[ss]()()(ax,b) = 0.5*(real(in_v[ss]()()(i2,i1)) - real(in_v[ss]()()(i1,i2)));
-	out_v[ss]()()(ay,b) = 0.5*(imag(in_v[ss]()()(i1,i2)) + imag(in_v[ss]()()(i2,i1)));
-      });
-  }
-  for(int diagIndex=0;diagIndex<N-1;diagIndex++){
-    int k = diagIndex + 1; // diagIndex starts from 0
-    int a = NNm1+diagIndex;
-    RealD scale = 1.0/sqrt(2.0*k*(k+1));
-    accelerator_for(ss,grid->oSites(),vComplex::Nsimd(),{
-	auto tmp = in_v[ss]()()(0,0);
-	for(int i=1;i<k;i++){
-	  tmp=tmp+in_v[ss]()()(i,i);
-	}
-	tmp = tmp - in_v[ss]()()(k,k)*k;
-	out_v[ss]()()(a,b) =imag(tmp) * scale;
-      });
-    }
-}
-
-  
 };
    
 template <int ncolour>
--- a/Grid/qcd/utils/SUn.impl.h
+++ b/Grid/qcd/utils/SUn.impl.h
@@ -10,7 +10,6 @@
 // doesn't get found by the scripts/filelist during bootstrapping.

 private:
-
 template <ONLY_IF_SU>
 static int su2subgroups(GroupName::SU) { return (ncolour * (ncolour - 1)) / 2; }
 ////////////////////////////////////////////////////////////////////////
@@ -577,4 +576,3 @@ static void RandomGaugeTransform(GridParallelRNG &pRNG, typename Gimpl::GaugeFie
  LieRandomize(pRNG,g,1.0);
  GaugeTransform<Gimpl>(Umu,g);
 }
-
--- a/Grid/qcd/utils/WilsonLoops.h
+++ b/Grid/qcd/utils/WilsonLoops.h
@@ -464,8 +464,7 @@ public:
  //U_padded: the gauge link fields padded out using the PaddedCell class
  //Cell: the padded cell class
  //gStencil: the precomputed generalized local stencil for the staple
-  static void StaplePaddedAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U_padded, const PaddedCell &Cell, const GeneralLocalStencil &gStencil)
-  {
+  static void StaplePaddedAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U_padded, const PaddedCell &Cell, const GeneralLocalStencil &gStencil) {
    double t0 = usecond();
    assert(U_padded.size() == Nd); assert(staple.size() == Nd);
    assert(U_padded[0].Grid() == (GridBase*)Cell.grids.back());
@@ -488,9 +487,9 @@ public:
    for(int mu=0;mu<Nd;mu++){
      { //view scope
 	autoView( gStaple_v , gStaple, AcceleratorWrite);
-	auto gStencil_v = gStencil.View();
+	auto gStencil_v = gStencil.View(AcceleratorRead);
 	
-	accelerator_for(ss, ggrid->oSites(), (size_t)ggrid->Nsimd(), {
+	accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), {
 	    decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss;
 	    stencil_ss = Zero();
 	    int off = outer_off;
@@ -1200,9 +1199,9 @@ public:

      { //view scope
 	autoView( gStaple_v , gStaple, AcceleratorWrite);
-	auto gStencil_v = gStencil.View();
+	auto gStencil_v = gStencil.View(AcceleratorRead);

-	accelerator_for(ss, ggrid->oSites(), (size_t)ggrid->Nsimd(), {
+	accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), {
 	    decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss;
 	    stencil_ss = Zero();
 	    int s=offset;
--- a/Grid/simd/Grid_vector_types.h
+++ b/Grid/simd/Grid_vector_types.h
@@ -1130,16 +1130,15 @@ static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths inc
 #endif
 #endif

+// gpermute: for each set bit k (0x1,0x2,0x4,0x8) of perm, apply permute(inout,tmp,k) in turn, in place. Fixme: need coalesced read gpermute
+template<class vobj> void gpermute(vobj & inout,int perm){
+  vobj tmp=inout;
+  if (perm & 0x1 ) { permute(inout,tmp,0); tmp=inout;}
+  if (perm & 0x2 ) { permute(inout,tmp,1); tmp=inout;}
+  if (perm & 0x4 ) { permute(inout,tmp,2); tmp=inout;}
+  if (perm & 0x8 ) { permute(inout,tmp,3); tmp=inout;}
+}

 NAMESPACE_END(Grid);

-#ifdef GRID_SYCL
-template<> struct sycl::is_device_copyable<Grid::vComplexF> : public std::true_type {};
-template<> struct sycl::is_device_copyable<Grid::vComplexD> : public std::true_type {};
-template<> struct sycl::is_device_copyable<Grid::vRealF   > : public std::true_type {};
-template<> struct sycl::is_device_copyable<Grid::vRealD   > : public std::true_type {};
-template<> struct sycl::is_device_copyable<Grid::vInteger > : public std::true_type {};
-#endif
-
-
 #endif
--- a/Grid/sitmo_rng/sitmo_prng_engine.hpp
+++ b/Grid/sitmo_rng/sitmo_prng_engine.hpp
@@ -218,10 +218,6 @@ public:
    // -------------------------------------------------
    // misc
    // -------------------------------------------------
-    void discardhi(uint64_t z) {
-      _s[3] += z;
-      encrypt_counter();
-    }
    
    // req: 26.5.1.4 Random number engine requirements, p.908 table 117, row 9
    // Advances e’s state ei to ei+z by any means equivalent to z
--- a/Grid/stencil/GeneralLocalStencil.h
+++ b/Grid/stencil/GeneralLocalStencil.h
@@ -32,7 +32,12 @@ NAMESPACE_BEGIN(Grid);
 struct GeneralStencilEntry { 
  uint64_t _offset;            // 4 bytes 
  uint8_t _permute;            // 1 bytes // Horrible alignment properties
+  uint8_t _wrap;               // 1 bytes // Horrible alignment properties
 };
+struct GeneralStencilEntryReordered : public GeneralStencilEntry {
+  uint64_t _input;
+};
+
 // Could pack to 8 + 4 + 4 = 128 bit and use 

 class GeneralLocalStencilView {
@@ -43,10 +48,10 @@ class GeneralLocalStencilView {
  int                               _npoints; // Move to template param?
  GeneralStencilEntry*  _entries_p;

-  accelerator_inline GeneralStencilEntry * GetEntry(int point,int osite) const { 
+  accelerator_inline GeneralStencilEntry * GetEntry(int point,int osite) { 
    return & this->_entries_p[point+this->_npoints*osite]; 
  }
-
+  void ViewClose(void){};
 };
 ////////////////////////////////////////
 // The Stencil Class itself
@@ -61,7 +66,7 @@ protected:
 public: 
  GridBase *Grid(void) const { return _grid; }

-  View_type View(void) const {
+  View_type View(int mode) const {
    View_type accessor(*( (View_type *) this));
    return accessor;
  }
@@ -101,17 +106,23 @@ public:
 	  // Simpler version using icoor calculation
 	  ////////////////////////////////////////////////
 	  SE._permute =0;
+	  SE._wrap=0;
 	  for(int d=0;d<Coor.size();d++){

 	    int fd = grid->_fdimensions[d];
 	    int rd = grid->_rdimensions[d];
+	    int ld = grid->_ldimensions[d];
 	    int ly = grid->_simd_layout[d];

-	    assert((ly==1)||(ly==2));
+	    assert((ly==1)||(ly==2)||(ly==grid->Nsimd()));

 	    int shift = (shifts[ii][d]+fd)%fd;  // make it strictly positive 0.. L-1
 	    int x = Coor[d];                // x in [0... rd-1] as an oSite 

+	    if ( (x + shift)%fd != (x+shift)%ld ){
+	      SE._wrap = 1;
+	    }
+	    
 	    int permute_dim  = grid->PermuteDim(d);
 	    int permute_slice=0;
 	    if(permute_dim){    
@@ -137,55 +148,5 @@ public:
  
 };

-
-////////////////////////////////////////////////
-// Some machinery to streamline making a stencil 
-////////////////////////////////////////////////
-
-class shiftSignal {
-public:
-    enum {
-        BACKWARD_CONST = 16,
-        NO_SHIFT       = -1
-    };
-};
-
-// TODO: put a check somewhere that BACKWARD_CONST > Nd!
-
-/*!  @brief signals that you want to go backwards in direction dir */
-inline int Back(const int dir) {
-    // generalShift will use BACKWARD_CONST to determine whether we step forward or 
-    // backward. Trick inspired by SIMULATeQCD. 
-    return dir + shiftSignal::BACKWARD_CONST;
-}
-
-/*!  @brief shift one unit in direction dir */
-template<typename... Args>
-void generalShift(Coordinate& shift, int dir) {
-    if (dir >= shiftSignal::BACKWARD_CONST) {
-        dir -= shiftSignal::BACKWARD_CONST;
-        shift[dir]+=-1;
-    } else if (dir == shiftSignal::NO_SHIFT) {
-        ; // do nothing
-    } else {
-        shift[dir]+=1;
-    }
-}
-
-/*!  @brief follow a path of directions, shifting one unit in each direction */
-template<typename... Args>
-void generalShift(Coordinate& shift, int dir, Args... args) {
-    if (dir >= shiftSignal::BACKWARD_CONST) {
-        dir -= shiftSignal::BACKWARD_CONST;
-        shift[dir]+=-1;
-    } else if (dir == shiftSignal::NO_SHIFT) {
-        ; // do nothing
-    } else {
-        shift[dir]+=1;
-    }
-    generalShift(shift, args...);
-}
-
-
 NAMESPACE_END(Grid);

--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -70,6 +70,57 @@ struct DefaultImplParams {
 void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
 				 int off,std::vector<std::pair<int,int> > & table);

+/*
+template<class vobj,class cobj,class compressor>
+void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so)   __attribute__((noinline));
+
+template<class vobj,class cobj,class compressor>
+void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so)
+{
+  int num=table.size();
+  std::pair<int,int> *table_v = & table[0];
+
+  auto rhs_v = rhs.View(AcceleratorRead);
+  accelerator_forNB( i,num, vobj::Nsimd(), {
+    compress.Compress(buffer[off+table_v[i].first],rhs_v[so+table_v[i].second]);
+  });
+  rhs_v.ViewClose();
+}
+
+///////////////////////////////////////////////////////////////////
+// Gather for when there *is* need to SIMD split with compression
+///////////////////////////////////////////////////////////////////
+template<class cobj,class vobj,class compressor>
+void Gather_plane_exchange_table(const Lattice<vobj> &rhs,
+				 commVector<cobj *> pointers,
+				 int dimension,int plane,
+				 int cbmask,compressor &compress,int type) __attribute__((noinline));
+
+template<class cobj,class vobj,class compressor>
+void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,
+				 const Lattice<vobj> &rhs,
+				 std::vector<cobj *> &pointers,int dimension,int plane,int cbmask,
+				 compressor &compress,int type)
+{
+  assert( (table.size()&0x1)==0);
+  int num=table.size()/2;
+  int so  = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
+
+  auto rhs_v = rhs.View(AcceleratorRead);
+  auto rhs_p = &rhs_v[0];
+  auto p0=&pointers[0][0];
+  auto p1=&pointers[1][0];
+  auto tp=&table[0];
+  accelerator_forNB(j, num, vobj::Nsimd(), {
+      compress.CompressExchange(p0,p1, rhs_p, j,
+				so+tp[2*j  ].second,
+				so+tp[2*j+1].second,
+				type);
+  });
+  rhs_v.ViewClose();
+}
+*/
+
 void DslashResetCounts(void);
 void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full);
 void DslashLogFull(void);
@@ -207,10 +258,6 @@ public:
  struct Packet {
    void * send_buf;
    void * recv_buf;
-#ifndef ACCELERATOR_AWARE_MPI
-    void * host_send_buf; // Allocate this if not MPI_CUDA_AWARE
-    void * host_recv_buf; // Allocate this if not MPI_CUDA_AWARE
-#endif
    Integer to_rank;
    Integer from_rank;
    Integer do_send;
@@ -277,7 +324,7 @@ public:
  Vector<int> surface_list;

  stencilVector<StencilEntry>  _entries; // Resident in managed memory
-  commVector<StencilEntry>     _entries_device; // Resident in device memory
+  commVector<StencilEntry>     _entries_device; // Resident in managed memory
  std::vector<Packet> Packets;
  std::vector<Merge> Mergers;
  std::vector<Merge> MergersSHM;
@@ -361,16 +408,33 @@ public:
  // Use OpenMP Tasks for cleaner ???
  // must be called *inside* parallel region
  //////////////////////////////////////////
+  /*
+  void CommunicateThreaded()
+  {
+#ifdef GRID_OMP
+    int mythread = omp_get_thread_num();
+    int nthreads = CartesianCommunicator::nCommThreads;
+#else
+    int mythread = 0;
+    int nthreads = 1;
+#endif
+    if (nthreads == -1) nthreads = 1;
+    if (mythread < nthreads) {
+      for (int i = mythread; i < Packets.size(); i += nthreads) {
+	uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf,
+						      Packets[i].to_rank,
+						      Packets[i].recv_buf,
+						      Packets[i].from_rank,
+						      Packets[i].bytes,i);
+      }
+    }
+  }
+  */
  ////////////////////////////////////////////////////////////////////////
  // Non blocking send and receive. Necessarily parallel.
  ////////////////////////////////////////////////////////////////////////
  void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
-    // All GPU kernel tasks must complete
-    //    accelerator_barrier();     // All kernels should ALREADY be complete
-    //    _grid->StencilBarrier();   // Everyone is here, so noone running slow and still using receive buffer
-                               // But the HaloGather had a barrier too.
-#ifdef ACCELERATOR_AWARE_MPI
    for(int i=0;i<Packets.size();i++){
      _grid->StencilSendToRecvFromBegin(MpiReqs,
 					Packets[i].send_buf,
@@ -379,54 +443,16 @@ public:
 					Packets[i].from_rank,Packets[i].do_recv,
 					Packets[i].xbytes,Packets[i].rbytes,i);
    }
-#else
-#warning "Using COPY VIA HOST BUFFERS IN STENCIL"
-    for(int i=0;i<Packets.size();i++){
-      // Introduce a host buffer with a cheap slab allocator and zero cost wipe all
-      Packets[i].host_send_buf = _grid->HostBufferMalloc(Packets[i].xbytes);
-      Packets[i].host_recv_buf = _grid->HostBufferMalloc(Packets[i].rbytes);
-      if ( Packets[i].do_send ) {
-	acceleratorCopyFromDevice(Packets[i].send_buf, Packets[i].host_send_buf,Packets[i].xbytes);
-      }
-      _grid->StencilSendToRecvFromBegin(MpiReqs,
-					Packets[i].host_send_buf,
-					Packets[i].to_rank,Packets[i].do_send,
-					Packets[i].host_recv_buf,
-					Packets[i].from_rank,Packets[i].do_recv,
-					Packets[i].xbytes,Packets[i].rbytes,i);
-    }
-#endif
-    // Get comms started then run checksums
-    // Having this PRIOR to the dslash seems to make Sunspot work... (!)
-    for(int i=0;i<Packets.size();i++){
-      if ( Packets[i].do_send )
-	FlightRecorder::xmitLog(Packets[i].send_buf,Packets[i].xbytes);
-    }
  }

  void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
-    _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
+    _grid->StencilSendToRecvFromComplete(MpiReqs,0);
    if   ( this->partialDirichlet ) DslashLogPartial();
    else if ( this->fullDirichlet ) DslashLogDirichlet();
    else DslashLogFull();
-    // acceleratorCopySynchronise() is in the StencilSendToRecvFromComplete
-    //    accelerator_barrier(); 
+    acceleratorCopySynchronise();
    _grid->StencilBarrier(); 
-#ifndef ACCELERATOR_AWARE_MPI
-#warning "Using COPY VIA HOST BUFFERS IN STENCIL"
-    for(int i=0;i<Packets.size();i++){
-      if ( Packets[i].do_recv ) {
-	acceleratorCopyToDevice(Packets[i].host_recv_buf, Packets[i].recv_buf,Packets[i].rbytes);
-      }
-    }
-    _grid->HostBufferFreeAll();
-#endif
-    // run any checksums
-    for(int i=0;i<Packets.size();i++){
-      if ( Packets[i].do_recv )
-	FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
-    }
  }
  ////////////////////////////////////////////////////////////////////////
  // Blocking send and receive. Either sequential or parallel.
@@ -502,7 +528,6 @@ public:
  template<class compressor>
  void HaloGather(const Lattice<vobj> &source,compressor &compress)
  {
-    //    accelerator_barrier();
    _grid->StencilBarrier();// Synch shared memory on a single nodes

    assert(source.Grid()==_grid);
@@ -515,9 +540,10 @@ public:
      compress.Point(point);
      HaloGatherDir(source,compress,point,face_idx);
    }
-    accelerator_barrier(); // All my local gathers are complete
+    accelerator_barrier();
    face_table_computed=1;
    assert(u_comm_offset==_unified_buffer_size);
+
  }

  /////////////////////////
@@ -553,7 +579,6 @@ public:
      accelerator_forNB(j, words, cobj::Nsimd(), {
 	  coalescedWrite(to[j] ,coalescedRead(from [j]));
      });
-      acceleratorFenceComputeStream();
    }
  }
  
@@ -644,7 +669,6 @@ public:
    for(int i=0;i<dd.size();i++){
      decompressor::DecompressFace(decompress,dd[i]);
    }
-    acceleratorFenceComputeStream(); // dependent kernels
  }
  ////////////////////////////////////////
  // Set up routines
@@ -682,7 +706,7 @@ public:
 	}
      }
    }
-    //std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
+    std::cout << GridLogDebug << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
  }
  /// Introduce a block structure and switch off comms on boundaries
  void DirichletBlock(const Coordinate &dirichlet_block)
@@ -737,8 +761,7 @@ public:
 		   int checkerboard,
 		   const std::vector<int> &directions,
 		   const std::vector<int> &distances,
-		   Parameters p=Parameters(),
-		   bool preserve_shm=false)
+		   Parameters p=Parameters())
  {
    face_table_computed=0;
    _grid    = grid;
@@ -832,9 +855,7 @@ public:
    /////////////////////////////////////////////////////////////////////////////////
    const int Nsimd = grid->Nsimd();

-    // Allow for multiple stencils to exist simultaneously
-    if (!preserve_shm)
-      _grid->ShmBufferFreeAll();
+    _grid->ShmBufferFreeAll();

    int maxl=2;
    u_simd_send_buf.resize(maxl);
@@ -1200,6 +1221,7 @@ public:
 	  ///////////////////////////////////////////////////////////
 	  int do_send = (comms_send|comms_partial_send) && (!shm_send );
 	  int do_recv = (comms_send|comms_partial_send) && (!shm_recv );
+	  
 	  AddPacket((void *)&send_buf[comm_off],
 		    (void *)&recv_buf[comm_off],
 		    xmit_to_rank, do_send,
--- a/Grid/tensors/Tensor_trace.h
+++ b/Grid/tensors/Tensor_trace.h
@@ -69,35 +69,6 @@ accelerator_inline auto trace(const iVector<vtype,N> &arg) -> iVector<decltype(t
  }
  return ret;
 }
-////////////////////////////
-// Fast path traceProduct
-////////////////////////////
-template<class S1 , class S2, IfNotGridTensor<S1> = 0, IfNotGridTensor<S2> = 0>
-accelerator_inline auto traceProduct( const S1 &arg1,const S2 &arg2)
-  -> decltype(arg1*arg2)
-{
-  return arg1*arg2;
-}
-
-template<class vtype,class rtype,int N >
-accelerator_inline auto traceProduct(const iMatrix<vtype,N> &arg1,const iMatrix<rtype,N> &arg2) -> iScalar<decltype(trace(arg1._internal[0][0]*arg2._internal[0][0]))>
-{
-  iScalar<decltype( trace(arg1._internal[0][0]*arg2._internal[0][0] )) > ret;
-  zeroit(ret._internal);
-  for(int i=0;i<N;i++){
-  for(int j=0;j<N;j++){
-    ret._internal=ret._internal+traceProduct(arg1._internal[i][j],arg2._internal[j][i]);
-  }}
-  return ret;
-}
-
-template<class vtype,class rtype >
-accelerator_inline auto traceProduct(const iScalar<vtype> &arg1,const iScalar<rtype> &arg2) -> iScalar<decltype(trace(arg1._internal*arg2._internal))>
-{
-  iScalar<decltype(trace(arg1._internal*arg2._internal))> ret;
-  ret._internal=traceProduct(arg1._internal,arg2._internal);
-  return ret;
-}

 NAMESPACE_END(Grid);

--- a/Grid/tensors/Tensor_traits.h
+++ b/Grid/tensors/Tensor_traits.h
@@ -34,12 +34,9 @@ NAMESPACE_BEGIN(Grid);

  // These are the Grid tensors
  template<typename T>     struct isGridTensor                : public std::false_type { static constexpr bool notvalue = true; };
-  template<class T>        struct isGridTensor<iScalar<T> >   : public std::true_type  { static constexpr bool notvalue = false; };
-  template<class T, int N> struct isGridTensor<iVector<T, N> >: public std::true_type  { static constexpr bool notvalue = false; };
-  template<class T, int N> struct isGridTensor<iMatrix<T, N> >: public std::true_type  { static constexpr bool notvalue = false; };
-
-  template <typename T>  using IfGridTensor    = Invoke<std::enable_if<isGridTensor<T>::value, int> >;
-  template <typename T>  using IfNotGridTensor = Invoke<std::enable_if<!isGridTensor<T>::value, int> >;
+  template<class T>        struct isGridTensor<iScalar<T>>    : public std::true_type  { static constexpr bool notvalue = false; };
+  template<class T, int N> struct isGridTensor<iVector<T, N>> : public std::true_type  { static constexpr bool notvalue = false; };
+  template<class T, int N> struct isGridTensor<iMatrix<T, N>> : public std::true_type  { static constexpr bool notvalue = false; };

  // Traits to identify scalars
  template<typename T>     struct isGridScalar                : public std::false_type { static constexpr bool notvalue = true; };
@@ -404,12 +401,3 @@ NAMESPACE_BEGIN(Grid);
  };
 NAMESPACE_END(Grid);

-
-#ifdef GRID_SYCL
-template<typename T> struct
-sycl::is_device_copyable<T, typename std::enable_if<
-			      Grid::isGridTensor<T>::value  && (!std::is_trivially_copyable<T>::value),
-			      void>::type>
-  : public std::true_type {};
-#endif
-
--- a/Grid/threads/Accelerator.cc
+++ b/Grid/threads/Accelerator.cc
@@ -7,8 +7,6 @@ uint32_t accelerator_threads=2;
 uint32_t acceleratorThreads(void)       {return accelerator_threads;};
 void     acceleratorThreads(uint32_t t) {accelerator_threads = t;};

-#define ENV_LOCAL_RANK_PALS    "PALS_LOCAL_RANKID"
-#define ENV_RANK_PALS          "PALS_RANKID"
 #define ENV_LOCAL_RANK_OMPI    "OMPI_COMM_WORLD_LOCAL_RANK"
 #define ENV_RANK_OMPI          "OMPI_COMM_WORLD_RANK"
 #define ENV_LOCAL_RANK_SLURM   "SLURM_LOCALID"
@@ -149,7 +147,7 @@ void acceleratorInit(void)
 #define GPU_PROP_FMT(canMapHostMemory,FMT)     printf("AcceleratorHipInit:   " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
 #define GPU_PROP(canMapHostMemory)             GPU_PROP_FMT(canMapHostMemory,"%d");
    
-    auto r=hipGetDeviceProperties(&gpu_props[i], i);
+    hipGetDeviceProperties(&gpu_props[i], i);
    hipDeviceProp_t prop; 
    prop = gpu_props[i];
    totalDeviceMem = prop.totalGlobalMem;
@@ -230,17 +228,8 @@ void acceleratorInit(void)
  {
    rank = atoi(localRankStr);		
  }
-  if ((localRankStr = getenv(ENV_LOCAL_RANK_PALS)) != NULL)
-  {
-    rank = atoi(localRankStr);		
-  }
  if ((localRankStr = getenv(ENV_RANK_OMPI   )) != NULL) { world_rank = atoi(localRankStr);}
  if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
-  if ((localRankStr = getenv(ENV_RANK_PALS   )) != NULL) { world_rank = atoi(localRankStr);}
-
-  char hostname[HOST_NAME_MAX+1];
-  gethostname(hostname, HOST_NAME_MAX+1);
-  if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname);

  auto devices = cl::sycl::device::get_devices();
  for(int d = 0;d<devices.size();d++){
@@ -252,10 +241,9 @@ void acceleratorInit(void)
    printf("AcceleratorSyclInit:   " #prop ": " FMT" \n",devices[d].get_info<cl::sycl::info::device::prop>());

 #define GPU_PROP(prop)             GPU_PROP_FMT(prop,"%ld");
-    if ( world_rank == 0) {

-      GPU_PROP_STR(vendor);
-      GPU_PROP_STR(version);
+    GPU_PROP_STR(vendor);
+    GPU_PROP_STR(version);
    //    GPU_PROP_STR(device_type);
    /*
    GPU_PROP(max_compute_units);
@@ -271,8 +259,7 @@ void acceleratorInit(void)
    GPU_PROP(single_fp_config);
    */
    //    GPU_PROP(double_fp_config);
-      GPU_PROP(global_mem_size);
-    }
+    GPU_PROP(global_mem_size);

  }
  if ( world_rank == 0 ) {
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -137,6 +137,18 @@ inline void cuda_mem(void)
    dim3 cu_blocks ((num1+nt-1)/nt,num2,1);				\
    LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda);	\
  }
+#define prof_accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
+  { /* Wrap the loop body in a device lambda and launch it through ProfileLambdaApply; no synchronisation is done here */ \
+    int nt=acceleratorThreads();					\
+    typedef uint64_t Iterator;						\
+    auto lambda = [=] accelerator					\
+      (Iterator iter1,Iterator iter2,Iterator lane) mutable {		\
+      __VA_ARGS__;							\
+    };									\
+    dim3 cu_threads(nsimd,acceleratorThreads(),1); /* block = nsimd x acceleratorThreads() x 1 */ \
+    dim3 cu_blocks ((num1+nt-1)/nt,num2,1); /* grid = ceil(num1/nt) x num2 x 1 */ \
+    ProfileLambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
+  }

 #define accelerator_for6dNB(iter1, num1,				\
                            iter2, num2,				\
@@ -157,6 +169,20 @@ inline void cuda_mem(void)
    Lambda6Apply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,num3,num4,num5,num6,lambda); \
  }

+
+#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
+  { /* Wrap the loop body in a device lambda and launch it through LambdaApply; no synchronisation is done here */ \
+    int nt=acceleratorThreads();					\
+    typedef uint64_t Iterator;						\
+    auto lambda = [=] accelerator					\
+      (Iterator iter1,Iterator iter2,Iterator lane) mutable {		\
+      __VA_ARGS__;							\
+    };									\
+    dim3 cu_threads(nsimd,acceleratorThreads(),1); /* block = nsimd x acceleratorThreads() x 1 */ \
+    dim3 cu_blocks ((num1+nt-1)/nt,num2,1); /* grid = ceil(num1/nt) x num2 x 1 */ \
+    LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda);	\
+  }
+
 template<typename lambda>  __global__
 void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
 {
@@ -168,6 +194,17 @@ void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
    Lambda(x,y,z);
  }
 }
+template<typename lambda>  __global__
+void ProfileLambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
+{
+  // Permuted index mapping: the lane index comes from threadIdx.x (to make lanes coalesce for large blocks)
+  const uint64_t lane  = threadIdx.x;
+  const uint64_t outer = threadIdx.y + blockDim.y*blockIdx.x;
+  const uint64_t inner = threadIdx.z + blockDim.z*blockIdx.y;
+  // Threads that fall outside the (num1,num2,num3) iteration space do nothing
+  if ( (outer >= num1) || (inner >= num2) || (lane >= num3) ) return;
+  Lambda(outer,inner,lane);
+}

 template<typename lambda>  __global__
 void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
@@ -208,6 +245,7 @@ inline void *acceleratorAllocShared(size_t bytes)
  if( err != cudaSuccess ) {
    ptr = (void *) NULL;
    printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
+    assert(0);
  }
  return ptr;
 };
@@ -225,8 +263,6 @@ inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
-inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
-inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
 {
@@ -255,13 +291,17 @@ inline int  acceleratorIsCommunicable(void *ptr)
 #define GRID_SYCL_LEVEL_ZERO_IPC

 NAMESPACE_END(Grid);
-
-// Force deterministic reductions
-#define SYCL_REDUCTION_DETERMINISTIC
+#if 0
+#include <CL/sycl.hpp>
+#include <CL/sycl/usm.hpp>
+#include <level_zero/ze_api.h>
+#include <CL/sycl/backend/level_zero.hpp>
+#else
 #include <sycl/CL/sycl.hpp>
 #include <sycl/usm.hpp>
 #include <level_zero/ze_api.h>
 #include <sycl/ext/oneapi/backend/level_zero.hpp>
+#endif

 NAMESPACE_BEGIN(Grid);

@@ -285,24 +325,23 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {

 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {		\
-    unsigned long nt=acceleratorThreads();				\
-    if(nt < 8)nt=8;							\
-    unsigned long unum1 = num1;						\
-    unsigned long unum2 = num2;						\
-    unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt;	\
-    cl::sycl::range<3> local {nt,1,nsimd};				\
-    cl::sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd};	\
-    cgh.parallel_for(							\
-		     cl::sycl::nd_range<3>(global,local),		\
-		     [=] (cl::sycl::nd_item<3> item) /*mutable*/	\
-		     [[intel::reqd_sub_group_size(16)]]			\
-		     {							\
-		       auto iter1    = item.get_global_id(0);		\
-		       auto iter2    = item.get_global_id(1);		\
-		       auto lane     = item.get_global_id(2);		\
-		       { if (iter1 < unum1){ __VA_ARGS__ } };		\
-		     });						\
-  });
+      unsigned long nt=acceleratorThreads();				\
+      unsigned long unum1 = num1;					\
+      unsigned long unum2 = num2;					\
+      if(nt < 8)nt=8;							\
+      cl::sycl::range<3> local {nt,1,nsimd};				\
+      /* SYCL nd_range needs global % local == 0: pad dimension 0 up to a multiple of nt */ \
+      cl::sycl::range<3> global{((unum1+nt-1)/nt)*nt,unum2,nsimd};	\
+      cgh.parallel_for(cl::sycl::nd_range<3>(global,local),		\
+      [=] (cl::sycl::nd_item<3> item) /*mutable*/     \
+      [[intel::reqd_sub_group_size(16)]]	      \
+      {						      \
+      auto iter1    = item.get_global_id(0);	      \
+      auto iter2    = item.get_global_id(1);	      \
+      auto lane     = item.get_global_id(2);	      \
+      { if (iter1 < unum1){ __VA_ARGS__ } }; /* padding work-items beyond num1 do nothing */ \
+     });	   			              \
+    });

 #define accelerator_barrier(dummy) { theGridAccelerator->wait(); }

@@ -404,7 +443,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)

 #define accelerator_barrier(dummy)				\
  {								\
-    auto r=hipStreamSynchronize(computeStream);			\
+    auto tmp=hipStreamSynchronize(computeStream);		\
    auto err = hipGetLastError();				\
    if ( err != hipSuccess ) {					\
      printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \
@@ -437,21 +476,19 @@ inline void *acceleratorAllocDevice(size_t bytes)
  return ptr;
 };

-inline void acceleratorFreeShared(void *ptr){ auto r=hipFree(ptr);};
-inline void acceleratorFreeDevice(void *ptr){ auto r=hipFree(ptr);};
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { auto r=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto r=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
-inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream);}
-inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream);}
+inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);};
+inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
 //inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
 //inline void acceleratorCopySynchronise(void) {  }
-inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto r=hipMemset(base,value,bytes);}
+inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}

 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
 {
-  auto r=hipMemcpyDtoDAsync(to,from,bytes, copyStream);
+  auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream);
 }
-inline void acceleratorCopySynchronise(void) { auto r=hipStreamSynchronize(copyStream); };
+inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); };

 #endif

@@ -461,6 +498,9 @@ inline void acceleratorCopySynchronise(void) { auto r=hipStreamSynchronize(copyS
 #if defined(GRID_SYCL) || defined(GRID_CUDA) || defined(GRID_HIP)
 // FIXME -- the non-blocking nature got broken March 30 2023 by PAB
 #define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );  
+#define prof_accelerator_for( iter1, num1, nsimd, ... ) /* 1-D form: prof_accelerator_for2dNB with num2=1, followed by accelerator_barrier */ \
+  prof_accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );\
+  accelerator_barrier(dummy);

 #define accelerator_for( iter, num, nsimd, ... )		\
  accelerator_forNB(iter, num, nsimd, { __VA_ARGS__ } );	\
@@ -576,11 +616,4 @@ accelerator_inline void acceleratorFence(void)
  return;
 }

-inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
-{
-  acceleratorCopyDeviceToDeviceAsynch(from,to,bytes);
-  acceleratorCopySynchronise();
-}
-
-
 NAMESPACE_END(Grid);
--- a/Grid/util/Coordinate.h
+++ b/Grid/util/Coordinate.h
@@ -94,6 +94,13 @@ static constexpr int MaxDims = GRID_MAX_LATTICE_DIMENSION;

 typedef AcceleratorVector<int,MaxDims> Coordinate;

+// Element-wise equality of two AcceleratorVectors: sizes must match and no entry may differ.
+template<class T,int _ndim>
+inline bool operator==(const AcceleratorVector<T,_ndim> &v,const AcceleratorVector<T,_ndim> &w) {
+  bool same = !(v.size()!=w.size());
+  for(int idx=0; same && idx<v.size(); idx++) same = !(v[idx]!=w[idx]);
+  return same;
+}
 template<class T,int _ndim>
 inline std::ostream & operator<<(std::ostream &os, const AcceleratorVector<T,_ndim> &v)
 {
--- a/Grid/util/FlightRecorder.cc
+++ b/Grid/util/FlightRecorder.cc
@@ -1,339 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: ./lib/Init.cc
-
-    Copyright (C) 2015
-
-Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@MacBook-Pro.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/Grid.h>
-
-NAMESPACE_BEGIN(Grid);
-///////////////////////////////////////////////////////
-// Grid Norm logging for repro testing
-///////////////////////////////////////////////////////
-int FlightRecorder::PrintEntireLog;
-int FlightRecorder::ContinueOnFail;
-int FlightRecorder::LoggingMode;
-int FlightRecorder::ChecksumComms;
-int FlightRecorder::ChecksumCommsSend;
-int32_t  FlightRecorder::XmitLoggingCounter;
-int32_t  FlightRecorder::RecvLoggingCounter;
-int32_t  FlightRecorder::CsumLoggingCounter;
-int32_t  FlightRecorder::NormLoggingCounter;
-int32_t  FlightRecorder::ReductionLoggingCounter;
-uint64_t FlightRecorder::ErrorCounter;
-std::vector<double> FlightRecorder::NormLogVector;
-std::vector<double> FlightRecorder::ReductionLogVector;
-std::vector<uint64_t> FlightRecorder::CsumLogVector;
-std::vector<uint64_t> FlightRecorder::XmitLogVector;
-std::vector<uint64_t> FlightRecorder::RecvLogVector;
-
-void FlightRecorder::ResetCounters(void)
-{
-  XmitLoggingCounter=0;
-  RecvLoggingCounter=0;
-  CsumLoggingCounter=0;
-  NormLoggingCounter=0;
-  ReductionLoggingCounter=0;
-}
-void FlightRecorder::Truncate(void)
-{
-  ResetCounters();
-  XmitLogVector.resize(0);
-  RecvLogVector.resize(0);
-  NormLogVector.resize(0);
-  CsumLogVector.resize(0);
-  ReductionLogVector.resize(0);
-}
-void FlightRecorder::SetLoggingMode(FlightRecorder::LoggingMode_t mode)
-{
-  switch ( mode ) {
-  case LoggingModePrint:
-    SetLoggingModePrint();
-    break;
-  case LoggingModeRecord:
-    SetLoggingModeRecord();
-    break;
-  case LoggingModeVerify:
-    SetLoggingModeVerify();
-    break;
-  case LoggingModeNone:
-    LoggingMode = mode;
-    Truncate();
-    break;
-  default:
-    assert(0);
-  }
-}
-
-void FlightRecorder::SetLoggingModePrint(void)
-{
-  std::cout << " FlightRecorder: set to print output " <<std::endl;
-  Truncate();
-  LoggingMode = LoggingModePrint;
-}
-void FlightRecorder::SetLoggingModeRecord(void)
-{
-  std::cout << " FlightRecorder: set to RECORD " <<std::endl;
-  Truncate();
-  LoggingMode = LoggingModeRecord;
-}
-void FlightRecorder::SetLoggingModeVerify(void)
-{
-  std::cout << " FlightRecorder: set to VERIFY " << NormLogVector.size()<< " log entries "<<std::endl;
-  ResetCounters();
-  LoggingMode = LoggingModeVerify;
-}
-uint64_t FlightRecorder::ErrorCount(void)
-{
-  return ErrorCounter;
-}
-void FlightRecorder::NormLog(double value)
-{
-  uint64_t hex = * ( (uint64_t *)&value );
-  if(LoggingMode == LoggingModePrint) {
-    std::cerr<<"FlightRecorder::NormLog : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
-    NormLoggingCounter++;
-  }
-  if(LoggingMode == LoggingModeRecord) {
-    std::cerr<<"FlightRecorder::NormLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
-    NormLogVector.push_back(value);
-    NormLoggingCounter++;
-  }
-  if(LoggingMode == LoggingModeVerify) {
-
-    if(NormLoggingCounter < NormLogVector.size()){
-      uint64_t hexref  = * ( (uint64_t *)&NormLogVector[NormLoggingCounter] );
-
-      if ( (value != NormLogVector[NormLoggingCounter]) || std::isnan(value) ) {
-
-	std::cerr<<"FlightRecorder::NormLog Oops, I did it again "<< NormLoggingCounter
-		 <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<" "
-		 <<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::endl;
-
-	std::cerr << " Oops got norm "<< std::hexfloat<<value<<" expect "<<NormLogVector[NormLoggingCounter] <<std::endl;
-
-	fprintf(stderr,"%s:%d Oops, I did it again! Reproduce failure for norm %d/%zu %.16e expect %.16e\n",
-		GridHostname(),
-		GlobalSharedMemory::WorldShmRank,
-		NormLoggingCounter,NormLogVector.size(),
-		value, NormLogVector[NormLoggingCounter]); fflush(stderr);
-
-	if(!ContinueOnFail)assert(0); // Force takedown of job
-	  
-	ErrorCounter++;
-      } else {
-	if ( PrintEntireLog ) { 
-	  std::cerr<<"FlightRecorder::NormLog VALID "<< NormLoggingCounter << std::hex
-		   <<" "<<hex<<" "<<hexref
-		   <<" "<<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::dec<<std::endl;
-	}
-      }
-       
-    }
-    if ( NormLogVector.size()==NormLoggingCounter ) {
-      std::cout << "FlightRecorder:: Verified entire sequence of "<<NormLoggingCounter<<" norms "<<std::endl;
-    }
-    NormLoggingCounter++;
-  }
-}
-void FlightRecorder::CsumLog(uint64_t hex)
-{
-  if(LoggingMode == LoggingModePrint) {
-    std::cerr<<"FlightRecorder::CsumLog : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
-    CsumLoggingCounter++;
-  }
-
-  if(LoggingMode == LoggingModeRecord) {
-    std::cerr<<"FlightRecorder::CsumLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
-    CsumLogVector.push_back(hex);
-    CsumLoggingCounter++;
-  }
-
-  if(LoggingMode == LoggingModeVerify) {
-    
-    if(CsumLoggingCounter < CsumLogVector.size()) {
-
-      uint64_t hexref  = CsumLogVector[CsumLoggingCounter] ;
-
-      if ( hex != hexref ) {
-
-        std::cerr<<"FlightRecorder::CsumLog Oops, I did it again "<< CsumLoggingCounter
-		 <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<std::endl;
-
-	fprintf(stderr,"%s:%d Oops, I did it again! Reproduce failure for csum %d %lx expect %lx\n",
-		GridHostname(),
-		GlobalSharedMemory::WorldShmRank,
-		CsumLoggingCounter,hex, hexref);
-	fflush(stderr);
-
-	if(!ContinueOnFail) assert(0); // Force takedown of job
-	  
-	ErrorCounter++;
-
-      } else {
-
-	if ( PrintEntireLog ) { 
-	  std::cerr<<"FlightRecorder::CsumLog VALID "<< CsumLoggingCounter << std::hex
-		   <<" "<<hex<<" "<<hexref<<std::dec<<std::endl;
-	}
-      }
-    }  
-    if ( CsumLogVector.size()==CsumLoggingCounter ) {
-      std::cout << "FlightRecorder:: Verified entire sequence of "<<CsumLoggingCounter<<" checksums "<<std::endl;
-    }
-    CsumLoggingCounter++;
-  }
-}
-void FlightRecorder::ReductionLog(double local,double global)
-{
-  uint64_t hex_l = * ( (uint64_t *)&local );
-  uint64_t hex_g = * ( (uint64_t *)&global );
-  if(LoggingMode == LoggingModePrint) {
-    std::cerr<<"FlightRecorder::ReductionLog : "<< ReductionLoggingCounter <<" "<< std::hex << hex_l << " -> " <<hex_g<<std::dec <<std::endl;
-    ReductionLoggingCounter++;
-  }
-  if(LoggingMode == LoggingModeRecord) {
-    std::cerr<<"FlightRecorder::ReductionLog RECORDING : "<< ReductionLoggingCounter <<" "<< std::hex << hex_l << " -> " <<hex_g<<std::dec <<std::endl;
-    ReductionLogVector.push_back(global);
-    ReductionLoggingCounter++;
-  }
-  if(LoggingMode == LoggingModeVerify) {
-    if(ReductionLoggingCounter < ReductionLogVector.size()){
-      if ( global != ReductionLogVector[ReductionLoggingCounter] ) {
-	fprintf(stderr,"%s:%d Oops, MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e expect glb %.16e\n",
-		GridHostname(),
-		GlobalSharedMemory::WorldShmRank,
-		ReductionLoggingCounter,ReductionLogVector.size(),
-		global, local, ReductionLogVector[ReductionLoggingCounter]); fflush(stderr);
-	
-	if ( !ContinueOnFail ) assert(0);
-
-	ErrorCounter++;
-      } else {
-	if ( PrintEntireLog ) { 
-	  std::cerr<<"FlightRecorder::ReductionLog : VALID "<< ReductionLoggingCounter <<" "<< std::hexfloat << local << "-> "<< global <<std::endl;
-	}
-      }
-    }
-    if ( ReductionLogVector.size()==ReductionLoggingCounter ) {
-      std::cout << "FlightRecorder::ReductionLog : Verified entire sequence of "<<ReductionLoggingCounter<<" norms "<<std::endl;
-    }
-    ReductionLoggingCounter++;
-  }
-}
-void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
-{
-  if ( ChecksumCommsSend ){
-  uint64_t *ubuf = (uint64_t *)buf;
-  if(LoggingMode == LoggingModeNone) return;
-#ifdef GRID_SYCL
-  uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t));
-  if(LoggingMode == LoggingModePrint) {
-    std::cerr<<"FlightRecorder::xmitLog : "<< XmitLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
-    XmitLoggingCounter++;
-  }
-  if(LoggingMode == LoggingModeRecord) {
-    std::cerr<<"FlightRecorder::xmitLog RECORD : "<< XmitLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
-    XmitLogVector.push_back(_xor);
-    XmitLoggingCounter++;
-  }
-  if(LoggingMode == LoggingModeVerify) {
-    if(XmitLoggingCounter < XmitLogVector.size()){
-      if ( _xor != XmitLogVector[XmitLoggingCounter] ) {
-	fprintf(stderr,"%s:%d Oops, send buf difference! Reproduce failure for xmit %d/%zu  %lx expect glb %lx\n",
-		GridHostname(),
-		GlobalSharedMemory::WorldShmRank,
-		XmitLoggingCounter,XmitLogVector.size(),
-		_xor, XmitLogVector[XmitLoggingCounter]); fflush(stderr);
-	
-	if ( !ContinueOnFail ) assert(0);
-
-	ErrorCounter++;
-      } else {
-	if ( PrintEntireLog ) { 
-	  std::cerr<<"FlightRecorder::XmitLog : VALID "<< XmitLoggingCounter <<" "<< std::hexfloat << _xor << " "<<  XmitLogVector[XmitLoggingCounter] <<std::endl;
-	}
-      }
-    }
-    if ( XmitLogVector.size()==XmitLoggingCounter ) {
-      std::cout << "FlightRecorder::ReductionLog : Verified entire sequence of "<<XmitLoggingCounter<<" sends "<<std::endl;
-    }
-    XmitLoggingCounter++;
-  }
-#endif
-  } else {
-    uint64_t word = 1;
-    deviceVector<uint64_t> dev(1);
-    acceleratorCopyToDevice(&word,&dev[0],sizeof(uint64_t));
-    acceleratorCopySynchronise();
-    MPI_Barrier(MPI_COMM_WORLD);
-  }
-}
-void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
-{
-  if ( ChecksumComms ){
-  uint64_t *ubuf = (uint64_t *)buf;
-  if(LoggingMode == LoggingModeNone) return;
-#ifdef GRID_SYCL
-  uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t));
-  if(LoggingMode == LoggingModePrint) {
-    std::cerr<<"FlightRecorder::recvLog : "<< RecvLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
-    RecvLoggingCounter++;
-  }
-  if(LoggingMode == LoggingModeRecord) {
-    std::cerr<<"FlightRecorder::recvLog RECORD : "<< RecvLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
-    RecvLogVector.push_back(_xor);
-    RecvLoggingCounter++;
-  }
-  if(LoggingMode == LoggingModeVerify) {
-    if(RecvLoggingCounter < RecvLogVector.size()){
-      if ( _xor != RecvLogVector[RecvLoggingCounter] ) {
-	fprintf(stderr,"%s:%d Oops, recv buf difference! Reproduce failure for recv %d/%zu  %lx expect glb %lx from MPI rank %d\n",
-		GridHostname(),
-		GlobalSharedMemory::WorldShmRank,
-		RecvLoggingCounter,RecvLogVector.size(),
-		_xor, RecvLogVector[RecvLoggingCounter],rank); fflush(stderr);
-	
-	if ( !ContinueOnFail ) assert(0);
-
-	ErrorCounter++;
-      } else {
-	if ( PrintEntireLog ) { 
-	  std::cerr<<"FlightRecorder::RecvLog : VALID "<< RecvLoggingCounter <<" "<< std::hexfloat << _xor << " "<<  RecvLogVector[RecvLoggingCounter] <<std::endl;
-	}
-      }
-    }
-    if ( RecvLogVector.size()==RecvLoggingCounter ) {
-      std::cout << "FlightRecorder::ReductionLog : Verified entire sequence of "<<RecvLoggingCounter<<" sends "<<std::endl;
-    }
-    RecvLoggingCounter++;
-  }
-#endif
-  }
-}
-
-NAMESPACE_END(Grid);
--- a/Grid/util/FlightRecorder.h
+++ b/Grid/util/FlightRecorder.h
@@ -1,43 +0,0 @@
-#pragma once
-
-NAMESPACE_BEGIN(Grid);
-class FlightRecorder {
- public:
-  enum LoggingMode_t {
-    LoggingModeNone,
-    LoggingModePrint,
-    LoggingModeRecord,
-    LoggingModeVerify
-  };
-  
-  static int                   LoggingMode;
-  static uint64_t              ErrorCounter;
-  static int32_t               XmitLoggingCounter;
-  static int32_t               RecvLoggingCounter;
-  static int32_t               CsumLoggingCounter;
-  static int32_t               NormLoggingCounter;
-  static int32_t               ReductionLoggingCounter;
-  static std::vector<uint64_t> XmitLogVector;
-  static std::vector<uint64_t> RecvLogVector;
-  static std::vector<uint64_t> CsumLogVector;
-  static std::vector<double>   NormLogVector;
-  static std::vector<double>   ReductionLogVector;
-  static int ContinueOnFail;
-  static int PrintEntireLog;
-  static int ChecksumComms;
-  static int ChecksumCommsSend;
-  static void SetLoggingModePrint(void);
-  static void SetLoggingModeRecord(void);
-  static void SetLoggingModeVerify(void);
-  static void SetLoggingMode(LoggingMode_t mode);
-  static void NormLog(double value);
-  static void CsumLog(uint64_t csum);
-  static void ReductionLog(double lcl, double glbl);
-  static void Truncate(void);
-  static void ResetCounters(void);
-  static uint64_t ErrorCount(void);
-  static void xmitLog(void *,uint64_t bytes);
-  static void recvLog(void *,uint64_t bytes,int rank);
-};
-NAMESPACE_END(Grid);
-
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -77,10 +77,6 @@ feenableexcept (unsigned int excepts)
 }
 #endif

-#ifndef HOST_NAME_MAX
-#define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
-#endif
-
 NAMESPACE_BEGIN(Grid);

 //////////////////////////////////////////////////////
@@ -94,12 +90,7 @@ int GridThread::_threads =1;
 int GridThread::_hyperthreads=1;
 int GridThread::_cores=1;

-char hostname[HOST_NAME_MAX+1];

-char *GridHostname(void)
-{
-  return hostname;
-}
 const Coordinate &GridDefaultLatt(void)     {return Grid_default_latt;};
 const Coordinate &GridDefaultMpi(void)      {return Grid_default_mpi;};
 const Coordinate GridDefaultSimd(int dims,int nsimd)
@@ -292,6 +283,7 @@ void GridBanner(void)
    std::cout << "Build " << GRID_BUILD_STR(GRID_BUILD_REF) << std::endl;
 #endif
    std::cout << std::endl;
+    std::cout << std::setprecision(9);
 }

 void Grid_init(int *argc,char ***argv)
@@ -402,8 +394,6 @@ void Grid_init(int *argc,char ***argv)
  std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl;
  std::cout << GridLogMessage << "================================================ "<<std::endl;

-  gethostname(hostname, HOST_NAME_MAX+1);
-  std::cout << GridLogMessage << "This rank is running on host "<< hostname<<std::endl;

  /////////////////////////////////////////////////////////
  // Reporting
@@ -424,7 +414,7 @@ void Grid_init(int *argc,char ***argv)
  // Logging
  ////////////////////////////////////
  std::vector<std::string> logstreams;
-  std::string defaultLog("Error,Warning,Message,Performance");
+  std::string defaultLog("Error,Warning,Message,Memory");
  GridCmdOptionCSL(defaultLog,logstreams);
  GridLogConfigure(logstreams);

@@ -548,6 +538,10 @@ void Grid_init(int *argc,char ***argv)

 void Grid_finalize(void)
 {
+  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
+  std::cout<<GridLogMessage<<"******* Grid Finalize                ******"<<std::endl;
+  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
+
 #if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Finalize();
--- a/Grid/util/Init.h
+++ b/Grid/util/Init.h
@@ -34,8 +34,6 @@ NAMESPACE_BEGIN(Grid);
 void Grid_init(int *argc,char ***argv);
 void Grid_finalize(void);

-char * GridHostname(void);
-
 // internal, controled with --handle
 void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
 void Grid_debug_handler_init(void);
@@ -70,6 +68,5 @@ void GridParseLayout(char **argv,int argc,
 void printHash(void);


-
 NAMESPACE_END(Grid);

--- a/Grid/util/Lexicographic.h
+++ b/Grid/util/Lexicographic.h
@@ -8,7 +8,7 @@ namespace Grid{
  public:

    template<class coor_t>
-    static accelerator_inline void CoorFromIndex (coor_t& coor,int index,const coor_t &dims){
+    static accelerator_inline void CoorFromIndex (coor_t& coor,int64_t index,const coor_t &dims){
      int nd= dims.size();
      coor.resize(nd);
      for(int d=0;d<nd;d++){
@@ -18,28 +18,45 @@ namespace Grid{
    }

    template<class coor_t>
-    static accelerator_inline void IndexFromCoor (const coor_t& coor,int &index,const coor_t &dims){
+    static accelerator_inline void IndexFromCoor (const coor_t& coor,int64_t &index,const coor_t &dims){
      int nd=dims.size();
      int stride=1;
      index=0;
      for(int d=0;d<nd;d++){
-	index = index+stride*coor[d];
+	index = index+(int64_t)stride*coor[d];
 	stride=stride*dims[d];
      }
    }
+    // 32-bit wrapper: evaluate the 64-bit overload, assert the result is below 2^31, then narrow to int
+    template<class coor_t>
+    static accelerator_inline void IndexFromCoor (const coor_t& coor,int &index,const coor_t &dims){
+      int64_t index64; IndexFromCoor(coor,index64,dims);
+      assert(index64<2*1024*1024*1024LL);
+      index = static_cast<int>(index64);
+    }

    template<class coor_t>
-    static inline void IndexFromCoorReversed (const coor_t& coor,int &index,const coor_t &dims){
+    static inline void IndexFromCoorReversed (const coor_t& coor,int64_t &index,const coor_t &dims){
      int nd=dims.size();
      int stride=1;
      index=0;
      for(int d=nd-1;d>=0;d--){
-	index = index+stride*coor[d];
+	index = index+(int64_t)stride*coor[d];
 	stride=stride*dims[d];
      }
    }
    template<class coor_t>
-    static inline void CoorFromIndexReversed (coor_t& coor,int index,const coor_t &dims){
+    static inline void IndexFromCoorReversed (const coor_t& coor,int &index,const coor_t &dims){
+      // 32-bit wrapper around the 64-bit overload: prints a diagnostic and asserts if the index is not below 2^31
+      int64_t index64;
+      IndexFromCoorReversed(coor,index64,dims);
+      const bool fits = !(index64>=2*1024*1024*1024LL);
+      if ( !fits ) std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
+      assert(index64<2*1024*1024*1024LL);
+      index = static_cast<int>(index64);
+    }
+    template<class coor_t>
+    static inline void CoorFromIndexReversed (coor_t& coor,int64_t index,const coor_t &dims){
      int nd= dims.size();
      coor.resize(nd);
      for(int d=nd-1;d>=0;d--){
--- a/Grid/util/Util.h
+++ b/Grid/util/Util.h
@@ -1,6 +1,6 @@
-#pragma once
+#ifndef GRID_UTIL_H
+#define GRID_UTIL_H
 #include <Grid/util/Coordinate.h>
 #include <Grid/util/Lexicographic.h>
 #include <Grid/util/Init.h>
-#include <Grid/util/FlightRecorder.h>
-
+#endif
--- a/HMC/FTHMC2p1f.cc
+++ b/HMC/FTHMC2p1f.cc
@@ -54,16 +54,15 @@ int main(int argc, char **argv)
  //  MD.name    = std::string("Force Gradient");
  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
  MD.name    = std::string("MinimumNorm2");
-  MD.MDsteps = 24;
+  MD.MDsteps = 12;
  MD.trajL   = 1.0;

  HMCparameters HMCparams;
-  HMCparams.StartTrajectory  = 104;
+  HMCparams.StartTrajectory  = 0;
  HMCparams.Trajectories     = 200;
  HMCparams.NoMetropolisUntil=  20;
  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
-  //  HMCparams.StartingType     =std::string("HotStart");
-  HMCparams.StartingType     =std::string("CheckpointStart");
+  HMCparams.StartingType     =std::string("HotStart");
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);

@@ -88,7 +87,6 @@ int main(int argc, char **argv)
  // here there is too much indirection
  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
-
  //////////////////////////////////////////////

  const int Ls      = 16;
@@ -136,6 +134,7 @@ int main(int argc, char **argv)
  ////////////////////////////////////
  ActionLevel<HMCWrapper::Field> Level1(1);
  ActionLevel<HMCWrapper::Field> Level2(2);
+  ActionLevel<HMCWrapper::Field> Level3(4);

  ////////////////////////////////////
  // Strange action
@@ -192,7 +191,7 @@ int main(int argc, char **argv)
  Smear_Stout<HMCWrapper::ImplPolicy> Stout(rho);
  SmearedConfigurationMasked<HMCWrapper::ImplPolicy> SmearingPolicy(GridPtr, Nstep, Stout);
  JacobianAction<HMCWrapper::ImplPolicy> Jacobian(&SmearingPolicy);
-  if( ApplySmearing ) Level1.push_back(&Jacobian);
+  if( ApplySmearing ) Level2.push_back(&Jacobian);
  std::cout << GridLogMessage << " Built the Jacobian "<< std::endl;


@@ -201,7 +200,7 @@ int main(int argc, char **argv)
  /////////////////////////////////////////////////////////////
  //  GaugeAction.is_smeared = ApplySmearing;
  GaugeAction.is_smeared = true;
-  Level2.push_back(&GaugeAction);
+  Level3.push_back(&GaugeAction);

  std::cout << GridLogMessage << " ************************************************"<< std::endl;
  std::cout << GridLogMessage << " Action complete -- NO FERMIONS FOR NOW -- FIXME"<< std::endl;
@@ -211,11 +210,10 @@ int main(int argc, char **argv)


  std::cout << GridLogMessage << " Running the FT HMC "<< std::endl;
+
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
-
-  TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file
-  TheHMC.initializeGaugeFieldAndRNGs(U);
+  TheHMC.TheAction.push_back(Level3);

  TheHMC.Run(SmearingPolicy); // for smearing

--- a/HMC/FTHMC2p1f_3GeV.cc
+++ b/HMC/FTHMC2p1f_3GeV.cc
@@ -1,226 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Copyright (C) 2023
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/Grid.h>
-#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
-#include <Grid/qcd/smearing/JacobianAction.h>
-
-using namespace Grid;
-
-int main(int argc, char **argv)
-{
-  std::cout << std::setprecision(12);
-  
-  Grid_init(&argc, &argv);
-  int threads = GridThread::GetThreads();
-  // here make a routine to print all the relevant information on the run
-  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
-
-   // Typedefs to simplify notation
-  typedef WilsonImplR FermionImplPolicy;
-  typedef MobiusFermionD FermionAction;
-  typedef typename FermionAction::FermionField FermionField;
-
-  typedef Grid::XmlReader       Serialiser;
-
-  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
-  IntegratorParameters MD;
-  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;
-  //  MD.name    = std::string("Leap Frog");
-  //  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
-  //  MD.name    = std::string("Force Gradient");
-  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
-  MD.name    = std::string("MinimumNorm2");
-  MD.MDsteps = 24;
-  MD.trajL   = 1.0;
-
-  HMCparameters HMCparams;
-  HMCparams.StartTrajectory  = 0;
-  HMCparams.Trajectories     = 200;
-  HMCparams.NoMetropolisUntil=  20;
-  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
-  //  HMCparams.StartingType     =std::string("HotStart");
-  HMCparams.StartingType     =std::string("ColdStart");
-  //  HMCparams.StartingType     =std::string("CheckpointStart");
-  HMCparams.MD = MD;
-  HMCWrapper TheHMC(HMCparams);
-
-  // Grid from the command line arguments --grid and --mpi
-  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
-
-  CheckpointerParameters CPparams;
-  CPparams.config_prefix = "ckpoint_EODWF_lat";
-  CPparams.smeared_prefix = "ckpoint_EODWF_lat_smr";
-  CPparams.rng_prefix    = "ckpoint_EODWF_rng";
-  CPparams.saveInterval  = 1;
-  CPparams.saveSmeared   = true;
-  CPparams.format        = "IEEE64BIG";
-  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
-
-  RNGModuleParameters RNGpar;
-  RNGpar.serial_seeds = "1 2 3 4 5";
-  RNGpar.parallel_seeds = "6 7 8 9 10";
-  TheHMC.Resources.SetRNGSeeds(RNGpar);
-
-  // Construct observables
-  // here there is too much indirection
-  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
-  TheHMC.Resources.AddObservable<PlaqObs>();
-
-  //////////////////////////////////////////////
-
-  const int Ls      = 12;
-  Real beta         = 2.37;
-  Real light_mass   = 0.0047;
-  Real strange_mass = 0.0186;
-  Real pv_mass      = 1.0;
-  RealD M5  = 1.8;
-  RealD b   = 1.0; // Scale factor one, Shamir
-  RealD c   = 0.0;
-
-  OneFlavourRationalParams OFRp;
-  OFRp.lo       = 1.0e-2;
-  OFRp.hi       = 64;
-  OFRp.MaxIter  = 10000;
-  OFRp.tolerance= 1.0e-10;
-  OFRp.degree   = 14;
-  OFRp.precision= 40;
-
-  std::vector<Real> hasenbusch({ 0.05, 0.1, 0.25, 0.5 });
-
-  auto GridPtr   = TheHMC.Resources.GetCartesian();
-  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
-  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
-  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
-
-  IwasakiGaugeActionR GaugeAction(beta);
-
-  // temporarily need a gauge field
-  LatticeGaugeField U(GridPtr);
-  LatticeGaugeField Uhot(GridPtr);
-
-  // These lines are unecessary if BC are all periodic
-  std::vector<Complex> boundary = {1,1,1,-1};
-  FermionAction::ImplParams Params(boundary);
-
-  double StoppingCondition = 1e-10;
-  double MaxCGIterations = 30000;
-  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
-
-  bool ApplySmearing = true;
-  
-  ////////////////////////////////////
-  // Collect actions
-  ////////////////////////////////////
-  ActionLevel<HMCWrapper::Field> Level1(1);
-  ActionLevel<HMCWrapper::Field> Level2(2);
-
-  ////////////////////////////////////
-  // Strange action
-  ////////////////////////////////////
-
-  MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
-  MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c);
-  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
-    EOFA(Strange_Op_L, Strange_Op_R, 
-	 CG,
-	 CG, CG,
-	 CG, CG, 
-	 OFRp, false);
-
-  EOFA.is_smeared = ApplySmearing;
-  Level1.push_back(&EOFA);
-
-  ////////////////////////////////////
-  // up down action
-  ////////////////////////////////////
-  std::vector<Real> light_den;
-  std::vector<Real> light_num;
-
-  int n_hasenbusch = hasenbusch.size();
-  light_den.push_back(light_mass);
-  for(int h=0;h<n_hasenbusch;h++){
-    light_den.push_back(hasenbusch[h]);
-    light_num.push_back(hasenbusch[h]);
-  }
-  light_num.push_back(pv_mass);
-
-  std::vector<FermionAction *> Numerators;
-  std::vector<FermionAction *> Denominators;
-  std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
-
-  for(int h=0;h<n_hasenbusch+1;h++){
-    std::cout << GridLogMessage << " 2f quotient Action  "<< light_num[h] << " / " << light_den[h]<< std::endl;
-    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params));
-    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params));
-    Quotients.push_back   (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],CG,CG));
-  }
-
-  for(int h=0;h<n_hasenbusch+1;h++){
-    Quotients[h]->is_smeared = ApplySmearing;
-    Level1.push_back(Quotients[h]);
-  }
-
-  /////////////////////////////////////////////////////////////
-  // lnDetJacobianAction
-  /////////////////////////////////////////////////////////////
-  double rho = 0.1;  // smearing parameter
-  int Nsmear = 1;    // number of smearing levels - must be multiple of 2Nd
-  int Nstep  = 8*Nsmear;    // number of smearing levels - must be multiple of 2Nd
-  Smear_Stout<HMCWrapper::ImplPolicy> Stout(rho);
-  SmearedConfigurationMasked<HMCWrapper::ImplPolicy> SmearingPolicy(GridPtr, Nstep, Stout);
-  JacobianAction<HMCWrapper::ImplPolicy> Jacobian(&SmearingPolicy);
-  if( ApplySmearing ) Level1.push_back(&Jacobian);
-  std::cout << GridLogMessage << " Built the Jacobian "<< std::endl;
-
-
-  /////////////////////////////////////////////////////////////
-  // Gauge action
-  /////////////////////////////////////////////////////////////
-  GaugeAction.is_smeared = ApplySmearing;
-  Level2.push_back(&GaugeAction);
-
-  std::cout << GridLogMessage << " ************************************************"<< std::endl;
-  std::cout << GridLogMessage << " Action complete -- NO FERMIONS FOR NOW -- FIXME"<< std::endl;
-  std::cout << GridLogMessage << " ************************************************"<< std::endl;
-  std::cout << GridLogMessage <<  std::endl;
-  std::cout << GridLogMessage <<  std::endl;
-
-
-  std::cout << GridLogMessage << " Running the FT HMC "<< std::endl;
-  TheHMC.TheAction.push_back(Level1);
-  TheHMC.TheAction.push_back(Level2);
-
-  TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file
-  TheHMC.initializeGaugeFieldAndRNGs(U);
-
-  TheHMC.Run(SmearingPolicy); // for smearing
-
-  Grid_finalize();
-} // main
-
-
-
--- a/HMC/HMC2p1f_3GeV.cc
+++ b/HMC/HMC2p1f_3GeV.cc
@@ -1,226 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Copyright (C) 2023
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/Grid.h>
-#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
-#include <Grid/qcd/smearing/JacobianAction.h>
-
-using namespace Grid;
-
-int main(int argc, char **argv)
-{
-  std::cout << std::setprecision(12);
-  
-  Grid_init(&argc, &argv);
-  int threads = GridThread::GetThreads();
-  // here make a routine to print all the relevant information on the run
-  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
-
-   // Typedefs to simplify notation
-  typedef WilsonImplR FermionImplPolicy;
-  typedef MobiusFermionD FermionAction;
-  typedef typename FermionAction::FermionField FermionField;
-
-  typedef Grid::XmlReader       Serialiser;
-
-  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
-  IntegratorParameters MD;
-  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;
-  //  MD.name    = std::string("Leap Frog");
-  //  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
-  //  MD.name    = std::string("Force Gradient");
-  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
-  MD.name    = std::string("MinimumNorm2");
-  MD.MDsteps = 24;
-  MD.trajL   = 1.0;
-
-  HMCparameters HMCparams;
-  HMCparams.StartTrajectory  = 0;
-  HMCparams.Trajectories     = 200;
-  HMCparams.NoMetropolisUntil=  20;
-  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
-  //  HMCparams.StartingType     =std::string("HotStart");
-  HMCparams.StartingType     =std::string("ColdStart");
-  //  HMCparams.StartingType     =std::string("CheckpointStart");
-  HMCparams.MD = MD;
-  HMCWrapper TheHMC(HMCparams);
-
-  // Grid from the command line arguments --grid and --mpi
-  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
-
-  CheckpointerParameters CPparams;
-  CPparams.config_prefix = "ckpoint_EODWF_lat";
-  CPparams.smeared_prefix = "ckpoint_EODWF_lat_smr";
-  CPparams.rng_prefix    = "ckpoint_EODWF_rng";
-  CPparams.saveInterval  = 1;
-  CPparams.saveSmeared   = true;
-  CPparams.format        = "IEEE64BIG";
-  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
-
-  RNGModuleParameters RNGpar;
-  RNGpar.serial_seeds = "1 2 3 4 5";
-  RNGpar.parallel_seeds = "6 7 8 9 10";
-  TheHMC.Resources.SetRNGSeeds(RNGpar);
-
-  // Construct observables
-  // here there is too much indirection
-  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
-  TheHMC.Resources.AddObservable<PlaqObs>();
-
-  //////////////////////////////////////////////
-
-  const int Ls      = 12;
-  Real beta         = 2.37;
-  Real light_mass   = 0.0047;
-  Real strange_mass = 0.0186;
-  Real pv_mass      = 1.0;
-  RealD M5  = 1.8;
-  RealD b   = 1.0; // Scale factor one, Shamir
-  RealD c   = 0.0;
-
-  OneFlavourRationalParams OFRp;
-  OFRp.lo       = 1.0e-2;
-  OFRp.hi       = 64;
-  OFRp.MaxIter  = 10000;
-  OFRp.tolerance= 1.0e-10;
-  OFRp.degree   = 14;
-  OFRp.precision= 40;
-
-  std::vector<Real> hasenbusch({ 0.05, 0.1, 0.25, 0.5 });
-
-  auto GridPtr   = TheHMC.Resources.GetCartesian();
-  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
-  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
-  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
-
-  IwasakiGaugeActionR GaugeAction(beta);
-
-  // temporarily need a gauge field
-  LatticeGaugeField U(GridPtr);
-  LatticeGaugeField Uhot(GridPtr);
-
-  // These lines are unecessary if BC are all periodic
-  std::vector<Complex> boundary = {1,1,1,-1};
-  FermionAction::ImplParams Params(boundary);
-
-  double StoppingCondition = 1e-10;
-  double MaxCGIterations = 30000;
-  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
-
-  bool ApplySmearing = false;
-  
-  ////////////////////////////////////
-  // Collect actions
-  ////////////////////////////////////
-  ActionLevel<HMCWrapper::Field> Level1(1);
-  ActionLevel<HMCWrapper::Field> Level2(2);
-
-  ////////////////////////////////////
-  // Strange action
-  ////////////////////////////////////
-
-  MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
-  MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c);
-  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
-    EOFA(Strange_Op_L, Strange_Op_R, 
-	 CG,
-	 CG, CG,
-	 CG, CG, 
-	 OFRp, false);
-
-  EOFA.is_smeared = ApplySmearing;
-  Level1.push_back(&EOFA);
-
-  ////////////////////////////////////
-  // up down action
-  ////////////////////////////////////
-  std::vector<Real> light_den;
-  std::vector<Real> light_num;
-
-  int n_hasenbusch = hasenbusch.size();
-  light_den.push_back(light_mass);
-  for(int h=0;h<n_hasenbusch;h++){
-    light_den.push_back(hasenbusch[h]);
-    light_num.push_back(hasenbusch[h]);
-  }
-  light_num.push_back(pv_mass);
-
-  std::vector<FermionAction *> Numerators;
-  std::vector<FermionAction *> Denominators;
-  std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
-
-  for(int h=0;h<n_hasenbusch+1;h++){
-    std::cout << GridLogMessage << " 2f quotient Action  "<< light_num[h] << " / " << light_den[h]<< std::endl;
-    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params));
-    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params));
-    Quotients.push_back   (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],CG,CG));
-  }
-
-  for(int h=0;h<n_hasenbusch+1;h++){
-    Quotients[h]->is_smeared = ApplySmearing;
-    Level1.push_back(Quotients[h]);
-  }
-
-  /////////////////////////////////////////////////////////////
-  // lnDetJacobianAction
-  /////////////////////////////////////////////////////////////
-  double rho = 0.1;  // smearing parameter
-  int Nsmear = 1;    // number of smearing levels - must be multiple of 2Nd
-  int Nstep  = 8*Nsmear;    // number of smearing levels - must be multiple of 2Nd
-  Smear_Stout<HMCWrapper::ImplPolicy> Stout(rho);
-  SmearedConfigurationMasked<HMCWrapper::ImplPolicy> SmearingPolicy(GridPtr, Nstep, Stout);
-  JacobianAction<HMCWrapper::ImplPolicy> Jacobian(&SmearingPolicy);
-  if( ApplySmearing ) Level1.push_back(&Jacobian);
-  std::cout << GridLogMessage << " Built the Jacobian "<< std::endl;
-
-
-  /////////////////////////////////////////////////////////////
-  // Gauge action
-  /////////////////////////////////////////////////////////////
-  GaugeAction.is_smeared = ApplySmearing;
-  Level2.push_back(&GaugeAction);
-
-  std::cout << GridLogMessage << " ************************************************"<< std::endl;
-  std::cout << GridLogMessage << " Action complete -- NO FERMIONS FOR NOW -- FIXME"<< std::endl;
-  std::cout << GridLogMessage << " ************************************************"<< std::endl;
-  std::cout << GridLogMessage <<  std::endl;
-  std::cout << GridLogMessage <<  std::endl;
-
-
-  std::cout << GridLogMessage << " Running the FT HMC "<< std::endl;
-  TheHMC.TheAction.push_back(Level1);
-  TheHMC.TheAction.push_back(Level2);
-
-  TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file
-  TheHMC.initializeGaugeFieldAndRNGs(U);
-
-  TheHMC.Run(SmearingPolicy); // for smearing
-
-  Grid_finalize();
-} // main
-
-
-
--- a/HMC/Mobius2p1f_DD_EOFA_96I_double.cc
+++ b/HMC/Mobius2p1f_DD_EOFA_96I_double.cc
@@ -1,350 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./tests/Test_hmc_EODWFRatio.cc
-
-Copyright (C) 2015-2016
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/Grid.h>
-
-int main(int argc, char **argv) {
-  using namespace Grid;
-
-  Grid_init(&argc, &argv);
-
-  CartesianCommunicator::BarrierWorld();
-  std::cout << GridLogMessage << " Clock skew check" <<std::endl;
-  
-  int threads = GridThread::GetThreads();
-
-   // Typedefs to simplify notation
-  typedef WilsonImplD FermionImplPolicy;
-  typedef MobiusFermionD FermionAction;
-  typedef MobiusEOFAFermionD FermionEOFAAction;
-  typedef typename FermionAction::FermionField FermionField;
-
-  typedef Grid::XmlReader       Serialiser;
-
-  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
-  IntegratorParameters MD;
-  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;
-  //  MD.name    = std::string("Leap Frog");
-  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
-  MD.name    = std::string("Force Gradient");
-  //typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
-  // MD.name    = std::string("MinimumNorm2");
-  // TrajL = 2
-  // 4/2 => 0.6 dH
-  // 3/3 => 0.8 dH .. depth 3, slower
-  //MD.MDsteps =  4;
-  MD.MDsteps =  3;
-  MD.trajL   = 0.5;
-
-  HMCparameters HMCparams;
-  HMCparams.StartTrajectory  = 1077;
-  HMCparams.Trajectories     = 1;
-  HMCparams.NoMetropolisUntil=  0;
-  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
-  //  HMCparams.StartingType     =std::string("ColdStart");
-  HMCparams.StartingType     =std::string("CheckpointStart");
-  HMCparams.MD = MD;
-  HMCWrapper TheHMC(HMCparams);
-
-  // Grid from the command line arguments --grid and --mpi
-  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
-
-  CheckpointerParameters CPparams;
-  CPparams.config_prefix = "ckpoint_DDHMC_lat";
-  CPparams.rng_prefix    = "ckpoint_DDHMC_rng";
-  CPparams.saveInterval  = 1;
-  CPparams.format        = "IEEE64BIG";
-  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
-  std::cout << "loaded NERSC checpointer"<<std::endl;
-  RNGModuleParameters RNGpar;
-  RNGpar.serial_seeds = "1 2 3 4 5";
-  RNGpar.parallel_seeds = "6 7 8 9 10";
-  TheHMC.Resources.SetRNGSeeds(RNGpar);
-
-  // Construct observables
-  // here there is too much indirection
-  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
-  TheHMC.Resources.AddObservable<PlaqObs>();
-  //////////////////////////////////////////////
-
-  const int Ls      = 12;
-  RealD M5  = 1.8;
-  RealD b   = 1.5;
-  RealD c   = 0.5;
-  Real beta         = 2.13;
-  //  Real light_mass   = 5.4e-4;
-  Real light_mass     = 7.8e-4;
-  Real light_mass_dir = 0.01;
-  Real strange_mass = 0.0362;
-  Real pv_mass      = 1.0;
-  std::vector<Real> hasenbusch({ 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
-  //  std::vector<Real> hasenbusch({ light_mass, 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
-  //  std::vector<Real> hasenbusch({ light_mass, 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated
-  //  std::vector<Real> hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass });
-
-  int SP_iters=9000;
-  
-  RationalActionParams OFRp; // Up/down
-  OFRp.lo       = 6.0e-5;
-  OFRp.hi       = 90.0;
-  OFRp.inv_pow  = 2;
-  OFRp.MaxIter  = SP_iters; // get most shifts by 2000, stop sharing space
-  OFRp.action_tolerance= 1.0e-8;
-  OFRp.action_degree   = 18;
-  OFRp.md_tolerance= 1.0e-7;
-  OFRp.md_degree   = 14;
-  //  OFRp.degree   = 20; converges
-  //  OFRp.degree   = 16;
-  OFRp.precision= 80;
-  OFRp.BoundsCheckFreq=0;
-  std::vector<RealD> ActionTolByPole({
-      //      1.0e-8,1.0e-8,1.0e-8,1.0e-8,
-      3.0e-7,1.0e-7,1.0e-8,1.0e-8,
-      1.0e-8,1.0e-8,1.0e-8,1.0e-8,
-      1.0e-8,1.0e-8,1.0e-8,1.0e-8,
-      1.0e-8,1.0e-8,1.0e-8,1.0e-8,
-      1.0e-8,1.0e-8
-    });
-  std::vector<RealD> MDTolByPole({
-      //      1.6e-5,5.0e-6,1.0e-6,3.0e-7, // soften convergence more more
-      //      1.0e-6,3.0e-7,1.0e-7,1.0e-7,
-      1.0e-5,1.0e-6,1.0e-7,1.0e-7, // soften convergence
-      1.0e-8,1.0e-8,1.0e-8,1.0e-8,
-      1.0e-8,1.0e-8,1.0e-8,1.0e-8,
-      1.0e-8,1.0e-8
-    });
-
-  auto GridPtr   = TheHMC.Resources.GetCartesian();
-  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
-
-  typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
-  typedef SchurDiagMooeeOperator<FermionEOFAAction ,FermionField > LinearOperatorEOFAD;
-
-  ////////////////////////////////////////////////////////////////
-  // Domain decomposed
-  ////////////////////////////////////////////////////////////////
-  Coordinate latt4  = GridPtr->GlobalDimensions();
-  Coordinate mpi    = GridPtr->ProcessorGrid();
-  Coordinate shm;
-
-  GlobalSharedMemory::GetShmDims(mpi,shm);
-  
-  Coordinate CommDim(Nd);
-  for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
-
-  Coordinate NonDirichlet(Nd+1,0);
-  Coordinate Dirichlet(Nd+1,0);
-  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
-  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
-  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
-  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
-  //Dirichlet[1] = 0;
-  //Dirichlet[2] = 0;
-  //Dirichlet[3] = 0;
-
-  // 
-  Coordinate Block4(Nd);
-  Block4[0] = Dirichlet[1];
-  Block4[1] = Dirichlet[2];
-  Block4[2] = Dirichlet[3];
-  Block4[3] = Dirichlet[4];
-
-  int Width=4;
-  TheHMC.Resources.SetMomentumFilter(new DDHMCFilter<WilsonImplD::Field>(Block4,Width));
-
-  //////////////////////////
-  // Fermion Grids
-  //////////////////////////
-  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
-  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
-
-  IwasakiGaugeActionR GaugeAction(beta);
-
-  // temporarily need a gauge field
-  LatticeGaugeFieldD  U(GridPtr); U=Zero();
-
-  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
-  TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file
-  TheHMC.initializeGaugeFieldAndRNGs(U);
-  std::cout << "loaded NERSC gauge field"<<std::endl;
-
-  // These lines are unecessary if BC are all periodic
-  std::vector<Complex> boundary = {1,1,1,-1};
-  FermionAction::ImplParams Params(boundary);
-  FermionAction::ImplParams ParamsDir(boundary);
-
-  Params.dirichlet=NonDirichlet;
-  ParamsDir.dirichlet=Dirichlet;
-  ParamsDir.partialDirichlet=0;
-  std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;
-
-  //  double StoppingCondition = 1e-14;
-  //  double MDStoppingCondition = 1e-9;
-  double StoppingCondition = 1e-8;
-  double MDStoppingCondition = 1e-8;
-  double MDStoppingConditionLoose = 1e-8;
-  double MDStoppingConditionStrange = 1e-8;
-  double MaxCGIterations = 300000;
-  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
-  ConjugateGradient<FermionField>  MDCG(MDStoppingCondition,MaxCGIterations);
-
-  ////////////////////////////////////
-  // Collect actions
-  ////////////////////////////////////
-  ActionLevel<HMCWrapper::Field> Level1(1);
-  ActionLevel<HMCWrapper::Field> Level2(3);
-  ActionLevel<HMCWrapper::Field> Level3(15);
-
-  ////////////////////////////////////
-  // Strange action
-  ////////////////////////////////////
-  FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params);
-  FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass,  M5,b,c, Params);
-
-  // Probably dominates the force - back to EOFA.
-  OneFlavourRationalParams SFRp;
-  SFRp.lo       = 0.1;
-  SFRp.hi       = 25.0;
-  SFRp.MaxIter  = 10000;
-  SFRp.tolerance= 1.0e-8;
-  SFRp.mdtolerance= 2.0e-6;
-  SFRp.degree   = 12;
-  SFRp.precision= 50;
-  
-  MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
-  MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c);
-  ConjugateGradient<FermionField>      ActionCG(StoppingCondition,MaxCGIterations);
-  ConjugateGradient<FermionField>  DerivativeCG(MDStoppingCondition,MaxCGIterations);
-  LinearOperatorEOFAD Strange_LinOp_L (Strange_Op_L);
-  LinearOperatorEOFAD Strange_LinOp_R (Strange_Op_R);
-
-  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
-    EOFA(Strange_Op_L, Strange_Op_R, 
-	 ActionCG, 
-	 ActionCG, ActionCG,
-	 DerivativeCG, DerivativeCG,
-	 SFRp, true);
-  Level2.push_back(&EOFA);
-
-  ////////////////////////////////////
-  // up down action
-  ////////////////////////////////////
-  std::vector<Real> light_den;
-  std::vector<Real> light_num;
-  std::vector<int> dirichlet_den;
-  std::vector<int> dirichlet_num;
-
-  int n_hasenbusch = hasenbusch.size();
-  light_den.push_back(light_mass);  dirichlet_den.push_back(0);
-  for(int h=0;h<n_hasenbusch;h++){
-    light_den.push_back(hasenbusch[h]); dirichlet_den.push_back(1);
-  }
-
-  for(int h=0;h<n_hasenbusch;h++){
-    light_num.push_back(hasenbusch[h]); dirichlet_num.push_back(1);
-  }
-  light_num.push_back(pv_mass);  dirichlet_num.push_back(0);
-
-  std::vector<FermionAction *> Numerators;
-  std::vector<FermionAction *> Denominators;
-  std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
-  
-  std::vector<GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys;
-
-  typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
-  std::vector<LinearOperatorD *> LinOpD;
-  
-  for(int h=0;h<n_hasenbusch+1;h++){
-    std::cout << GridLogMessage
-	      << " 2f quotient Action ";
-    std::cout << "det D("<<light_den[h]<<")";
-    if ( dirichlet_den[h] ) std::cout << "^dirichlet    ";
-    std::cout << "/ det D("<<light_num[h]<<")";
-    if ( dirichlet_num[h] ) std::cout << "^dirichlet    ";
-    std::cout << std::endl;
-
-    FermionAction::ImplParams ParamsNum(boundary);
-    FermionAction::ImplParams ParamsDen(boundary);
-    
-    if ( dirichlet_num[h]==1) ParamsNum.dirichlet = Dirichlet;
-    else                      ParamsNum.dirichlet = NonDirichlet;
-
-    if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet;
-    else                      ParamsDen.dirichlet = NonDirichlet;
-
-    if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
-    else                      ParamsNum.partialDirichlet = 0;
-
-    if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
-    else                      ParamsDen.partialDirichlet = 0;
-    
-    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
-    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));
-
-    LinOpD.push_back(new LinearOperatorD(*Denominators[h]));
-
-    double conv  = MDStoppingCondition;
-    if (h<3) conv= MDStoppingConditionLoose; // Relax on first two hasenbusch factors
-    
-    if(h!=0) {
-      Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],MDCG,CG));
-    } else {
-      Bdys.push_back( new GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
-      Bdys.push_back( new GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
-    }
-  }
-  for(int h=0;h<Bdys.size();h++){
-    Bdys[h]->SetTolerances(ActionTolByPole,MDTolByPole);
-  }
-  int nquo=Quotients.size();
-  Level1.push_back(Bdys[0]);
-  Level1.push_back(Bdys[1]);
-  Level2.push_back(Quotients[0]);
-  for(int h=1;h<nquo-1;h++){
-    Level2.push_back(Quotients[h]);
-  }
-  Level2.push_back(Quotients[nquo-1]);
-
-  /////////////////////////////////////////////////////////////
-  // Gauge action
-  /////////////////////////////////////////////////////////////
-  Level3.push_back(&GaugeAction);
-  TheHMC.TheAction.push_back(Level1);
-  TheHMC.TheAction.push_back(Level2);
-  TheHMC.TheAction.push_back(Level3);
-  std::cout << GridLogMessage << " Action complete "<< std::endl;
-  /////////////////////////////////////////////////////////////
-
-  TheHMC.Run();  // no smearing
-
-  Grid_finalize();
-} // main
-
-
-
--- a/HMC/Mobius2p1f_DD_EOFA_96I_mixed.cc
+++ b/HMC/Mobius2p1f_DD_EOFA_96I_mixed.cc
@@ -343,7 +343,7 @@ int main(int argc, char **argv) {
  // Probably dominates the force - back to EOFA.
  OneFlavourRationalParams SFRp;
  SFRp.lo       = 0.1;
-  SFRp.hi       = 30.0;
+  SFRp.hi       = 25.0;
  SFRp.MaxIter  = 10000;
  SFRp.tolerance= 1.0e-5;
  SFRp.mdtolerance= 2.0e-4;
--- a/HMC/Mobius2p1f_EOFA_96I_hmc.cc
+++ b/HMC/Mobius2p1f_EOFA_96I_hmc.cc
@@ -128,7 +128,7 @@ template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, c
      ////////////////////////////////////////////////////////////////////////////////////
      // Make a mixed precision conjugate gradient
      ////////////////////////////////////////////////////////////////////////////////////
-#if 0
+#if 1
      RealD delta=1.e-4;
      std::cout << GridLogMessage << "Calling reliable update Conjugate Gradient" <<std::endl;
      ConjugateGradientReliableUpdate<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations*MaxOuterIterations,delta,SinglePrecGrid5,LinOpF,LinOpD);
@@ -180,7 +180,7 @@ int main(int argc, char **argv) {
  // 4/2 => 0.6 dH
  // 3/3 => 0.8 dH .. depth 3, slower
  //MD.MDsteps =  4;
-  MD.MDsteps =  12;
+  MD.MDsteps =  14;
  MD.trajL   = 0.5;

  HMCparameters HMCparams;
@@ -204,7 +204,7 @@ int main(int argc, char **argv) {
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  std::cout << "loaded NERSC checpointer"<<std::endl;
  RNGModuleParameters RNGpar;
-  RNGpar.serial_seeds = "1 2 3 4 5 6 7 8 9 10";
+  RNGpar.serial_seeds = "1 2 3 4 5";
  RNGpar.parallel_seeds = "6 7 8 9 10";
  TheHMC.Resources.SetRNGSeeds(RNGpar);

@@ -218,14 +218,15 @@ int main(int argc, char **argv) {
  RealD M5  = 1.8;
  RealD b   = 1.5;
  RealD c   = 0.5;
-  RealD beta         = 2.13;
+  Real beta         = 2.13;
  //  Real light_mass   = 5.4e-4;
  Real light_mass     = 7.8e-4;
-  //  Real light_mass     = 7.8e-3;
  Real strange_mass = 0.0362;
  Real pv_mass      = 1.0;
-  std::vector<Real> hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.35 , 0.51, 0.6, 0.8 }); // Updated
-  //std::vector<Real> hasenbusch({ 0.0145, 0.045, 0.108, 0.25, 0.35 , 0.51, 0.6, 0.8 }); // Updated
+  //  std::vector<Real> hasenbusch({ 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
+  //  std::vector<Real> hasenbusch({ light_mass, 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
+  std::vector<Real> hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 }); // Updated
+  //  std::vector<Real> hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass });

  auto GridPtr   = TheHMC.Resources.GetCartesian();
  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
@@ -276,20 +277,20 @@ int main(int argc, char **argv) {

  //  double StoppingCondition = 1e-14;
  //  double MDStoppingCondition = 1e-9;
-  double StoppingCondition = 1e-14;
-  double MDStoppingCondition = 1e-9;
-  double MDStoppingConditionLoose = 1e-9;
-  double MDStoppingConditionStrange = 1e-9;
-  double MaxCGIterations = 50000;
+  double StoppingCondition = 1e-9;
+  double MDStoppingCondition = 1e-8;
+  double MDStoppingConditionLoose = 1e-8;
+  double MDStoppingConditionStrange = 1e-8;
+  double MaxCGIterations = 300000;
  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
  ConjugateGradient<FermionField>  MDCG(MDStoppingCondition,MaxCGIterations);

  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
-  ActionLevel<HMCWrapper::Field> Level1(1);
-  ActionLevel<HMCWrapper::Field> Level2(2);
-  ActionLevel<HMCWrapper::Field> Level3(4);
+  //  ActionLevel<HMCWrapper::Field> Level1(1);
+  ActionLevel<HMCWrapper::Field> Level2(1);
+  ActionLevel<HMCWrapper::Field> Level3(15);

  ////////////////////////////////////
  // Strange action
@@ -299,11 +300,11 @@ int main(int argc, char **argv) {

  // Probably dominates the force - back to EOFA.
  OneFlavourRationalParams SFRp;
-  SFRp.lo       = 0.8;
+  SFRp.lo       = 0.1;
  SFRp.hi       = 30.0;
  SFRp.MaxIter  = 10000;
-  SFRp.tolerance= 1.0e-12;
-  SFRp.mdtolerance= 1.0e-9;
+  SFRp.tolerance= 1.0e-8;
+  SFRp.mdtolerance= 2.0e-6;
  SFRp.degree   = 10;
  SFRp.precision= 50;
  
@@ -354,10 +355,8 @@ int main(int argc, char **argv) {
  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
    EOFA(Strange_Op_L, Strange_Op_R, 
 	 ActionCG, 
-	 //	 ActionCGL, ActionCGR,
-	 //	 DerivativeCGL, DerivativeCGR,
-	 ActionCG, ActionCG,
-	 DerivativeCG, DerivativeCG,
+	 ActionCGL, ActionCGR,
+	 DerivativeCGL, DerivativeCGR,
 	 SFRp, true);
  Level2.push_back(&EOFA);

@@ -444,14 +443,13 @@ int main(int argc, char **argv) {
  }
  int nquo=Quotients.size();
  for(int h=0;h<nquo;h++){
-    Level1.push_back(Quotients[h]);
+    Level2.push_back(Quotients[h]);
  }

  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  Level3.push_back(&GaugeAction);
-  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  TheHMC.TheAction.push_back(Level3);
  std::cout << GridLogMessage << " Action complete "<< std::endl;
--- a/HMC/Mobius2p1f_EOFA_96I_hmc_double.cc
+++ b/HMC/Mobius2p1f_EOFA_96I_hmc_double.cc
@@ -1,268 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./tests/Test_hmc_EODWFRatio.cc
-
-Copyright (C) 2015-2016
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/Grid.h>
-
-
-
-int main(int argc, char **argv) {
-  using namespace Grid;
-
-  std::cout << " Grid Initialise "<<std::endl;
-  
-  Grid_init(&argc, &argv);
-
-  CartesianCommunicator::BarrierWorld();
-  std::cout << GridLogMessage << " Clock skew check" <<std::endl;
-  
-  int threads = GridThread::GetThreads();
-
-   // Typedefs to simplify notation
-  typedef WilsonImplD FermionImplPolicy;
-  typedef MobiusFermionD FermionAction;
-  typedef MobiusEOFAFermionD FermionEOFAAction;
-  typedef typename FermionAction::FermionField FermionField;
-
-  typedef WilsonImplF FermionImplPolicyF;
-  typedef MobiusFermionF FermionActionF;
-  typedef MobiusEOFAFermionF FermionEOFAActionF;
-  typedef typename FermionActionF::FermionField FermionFieldF;
-
-  typedef Grid::XmlReader       Serialiser;
-
-  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
-  IntegratorParameters MD;
-  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;
-  //  MD.name    = std::string("Leap Frog");
-  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
-  MD.name    = std::string("Force Gradient");
-  //  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
-  //  MD.name    = std::string("MinimumNorm2");
-  // TrajL = 2
-  // 4/2 => 0.6 dH
-  // 3/3 => 0.8 dH .. depth 3, slower
-  //MD.MDsteps =  4;
-  MD.MDsteps =  8;
-  MD.trajL   = 0.5;
-
-  HMCparameters HMCparams;
-  HMCparams.StartTrajectory  = 1077;
-  HMCparams.Trajectories     = 20;
-  HMCparams.NoMetropolisUntil=  0;
-  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
-  HMCparams.StartingType     =std::string("ColdStart");
-  //  HMCparams.StartingType     =std::string("CheckpointStart");
-  HMCparams.MD = MD;
-  HMCWrapper TheHMC(HMCparams);
-
-  // Grid from the command line arguments --grid and --mpi
-  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
-
-  CheckpointerParameters CPparams;
-  CPparams.config_prefix = "ckpoint_HMC_lat";
-  CPparams.rng_prefix    = "ckpoint_HMC_rng";
-  CPparams.saveInterval  = 1;
-  CPparams.format        = "IEEE64BIG";
-  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
-  std::cout << "loaded NERSC checpointer"<<std::endl;
-  RNGModuleParameters RNGpar;
-  RNGpar.serial_seeds = "1 2 3 4 5 6 7 8 9 10";
-  RNGpar.parallel_seeds = "6 7 8 9 10";
-  TheHMC.Resources.SetRNGSeeds(RNGpar);
-
-  // Construct observables
-  // here there is too much indirection
-  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
-  TheHMC.Resources.AddObservable<PlaqObs>();
-  //////////////////////////////////////////////
-
-  const int Ls      = 12;
-  RealD M5  = 1.8;
-  RealD b   = 1.5;
-  RealD c   = 0.5;
-  RealD beta         = 2.13;
-  //  Real light_mass   = 5.4e-4;
-  Real light_mass     = 7.8e-4;
-  //  Real light_mass     = 7.8e-3;
-  Real strange_mass = 0.0362;
-  Real pv_mass      = 1.0;
-  std::vector<Real> hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.35 , 0.51, 0.6, 0.8 }); // Updated
-  //std::vector<Real> hasenbusch({ 0.0145, 0.045, 0.108, 0.25, 0.35 , 0.51, 0.6, 0.8 }); // Updated
-
-  auto GridPtr   = TheHMC.Resources.GetCartesian();
-  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
-
-  typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
-  typedef SchurDiagMooeeOperator<FermionEOFAAction ,FermionField > LinearOperatorEOFAD;
-
-  ////////////////////////////////////////////////////////////////
-  // Domain decomposed
-  ////////////////////////////////////////////////////////////////
-  Coordinate latt4  = GridPtr->GlobalDimensions();
-  Coordinate mpi    = GridPtr->ProcessorGrid();
-  Coordinate shm;
-
-  GlobalSharedMemory::GetShmDims(mpi,shm);
-
-  //////////////////////////
-  // Fermion Grids
-  //////////////////////////
-  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
-  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
-
-  IwasakiGaugeActionR GaugeAction(beta);
-
-  // temporarily need a gauge field
-  LatticeGaugeFieldD  U(GridPtr); U=Zero();
-
-  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
-  TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file
-  TheHMC.initializeGaugeFieldAndRNGs(U);
-  std::cout << "loaded NERSC gauge field"<<std::endl;
-
-  // These lines are unecessary if BC are all periodic
-  std::vector<Complex> boundary = {1,1,1,-1};
-  FermionAction::ImplParams Params(boundary);
-
-  //  double StoppingCondition = 1e-14;
-  //  double MDStoppingCondition = 1e-9;
-  double StoppingCondition = 1e-14;
-  double MDStoppingCondition = 1e-9;
-  double MDStoppingConditionLoose = 1e-9;
-  double MDStoppingConditionStrange = 1e-9;
-  double MaxCGIterations = 50000;
-  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
-  ConjugateGradient<FermionField>  MDCG(MDStoppingCondition,MaxCGIterations);
-
-  ////////////////////////////////////
-  // Collect actions
-  ////////////////////////////////////
-  ActionLevel<HMCWrapper::Field> Level1(1);
-  ActionLevel<HMCWrapper::Field> Level2(2);
-  ActionLevel<HMCWrapper::Field> Level3(4);
-
-  ////////////////////////////////////
-  // Strange action
-  ////////////////////////////////////
-  FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params);
-  FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass,  M5,b,c, Params);
-
-  // Probably dominates the force - back to EOFA.
-  OneFlavourRationalParams SFRp;
-  SFRp.lo       = 0.8;
-  SFRp.hi       = 30.0;
-  SFRp.MaxIter  = 10000;
-  SFRp.tolerance= 1.0e-12;
-  SFRp.mdtolerance= 1.0e-9;
-  SFRp.degree   = 10;
-  SFRp.precision= 50;
-  
-  MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
-  MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c);
-  ConjugateGradient<FermionField>      ActionCG(StoppingCondition,MaxCGIterations);
-  ConjugateGradient<FermionField>  DerivativeCG(MDStoppingCondition,MaxCGIterations);
-  LinearOperatorEOFAD Strange_LinOp_L (Strange_Op_L);
-  LinearOperatorEOFAD Strange_LinOp_R (Strange_Op_R);
-
-  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
-    EOFA(Strange_Op_L, Strange_Op_R, 
-	 ActionCG, 
-	 ActionCG, ActionCG,
-	 DerivativeCG, DerivativeCG,
-	 SFRp, true);
-  Level2.push_back(&EOFA);
-
-  ////////////////////////////////////
-  // up down action
-  ////////////////////////////////////
-  std::vector<Real> light_den;
-  std::vector<Real> light_num;
-
-  int n_hasenbusch = hasenbusch.size();
-  light_den.push_back(light_mass); 
-  for(int h=0;h<n_hasenbusch;h++){
-    light_den.push_back(hasenbusch[h]);
-  }
-
-  for(int h=0;h<n_hasenbusch;h++){
-    light_num.push_back(hasenbusch[h]);
-  }
-  light_num.push_back(pv_mass);
-
-  std::vector<FermionAction *> Numerators;
-  std::vector<FermionAction *> Denominators;
-  std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
-  
-  std::vector<OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys;
-
-  typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
-  std::vector<LinearOperatorD *> LinOpD;
-  
-  for(int h=0;h<n_hasenbusch+1;h++){
-    std::cout << GridLogMessage
-	      << " 2f quotient Action ";
-    std::cout << "det D("<<light_den[h]<<")";
-    std::cout << "/ det D("<<light_num[h]<<")";
-    std::cout << std::endl;
-
-    FermionAction::ImplParams ParamsNum(boundary);
-    FermionAction::ImplParams ParamsDen(boundary);
-    
-    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
-    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));
-
-    LinOpD.push_back(new LinearOperatorD(*Denominators[h]));
-
-    double conv  = MDStoppingCondition;
-    if (h<3) conv= MDStoppingConditionLoose; // Relax on first two hasenbusch factors
-    
-    Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],MDCG,CG,CG));
-  }
-  int nquo=Quotients.size();
-  for(int h=0;h<nquo;h++){
-    Level1.push_back(Quotients[h]);
-  }
-
-  /////////////////////////////////////////////////////////////
-  // Gauge action
-  /////////////////////////////////////////////////////////////
-  Level3.push_back(&GaugeAction);
-  TheHMC.TheAction.push_back(Level1);
-  TheHMC.TheAction.push_back(Level2);
-  TheHMC.TheAction.push_back(Level3);
-  std::cout << GridLogMessage << " Action complete "<< std::endl;
-  /////////////////////////////////////////////////////////////
-
-  TheHMC.Run();  // no smearing
-
-  Grid_finalize();
-} // main
-
-
-
--- a/MPI_benchmark/bench2.pbs
+++ b/MPI_benchmark/bench2.pbs
@@ -1,22 +0,0 @@
-#!/bin/bash
-#PBS -q EarlyAppAccess
-#PBS -l select=2
-#PBS -l walltime=01:00:00
-#PBS -A LatticeQCD_aesp_CNDA
-
-export TZ='/usr/share/zoneinfo/US/Central'
-export OMP_PROC_BIND=spread
-export OMP_NUM_THREADS=3
-unset OMP_PLACES
-
-cd $PBS_O_WORKDIR
-
-NNODES=`wc -l < $PBS_NODEFILE`
-NRANKS=12         # Number of MPI ranks per node
-NDEPTH=4          # Number of hardware threads per rank, spacing between MPI ranks on a node
-NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS
-
-NTOTRANKS=$(( NNODES * NRANKS ))
-
-CMD="mpiexec -np 2 -ppn 1  -envall ./gpu_tile_compact.sh ./halo_mpi --mpi 2.1.1.1"
-$CMD
--- a/MPI_benchmark/compile-command
+++ b/MPI_benchmark/compile-command
@@ -1 +0,0 @@
-mpicxx  -fsycl halo_mpi.cc -o halo_mpi
--- a/MPI_benchmark/gpu_tile_compact.sh
+++ b/MPI_benchmark/gpu_tile_compact.sh
@@ -1,30 +0,0 @@
-#!/bin/bash
-
-export NUMA_PMAP=(2 2 2 3 3 3 2 2 2 3 3 3 )
-export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
-export  GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )
-export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
-
-export PNUMA=${NUMA_PMAP[$PALS_LOCAL_RANKID]}
-export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
-export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
-export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}
-  
-
-export ZE_AFFINITY_MASK=$gpu_id.$tile_id
-export ONEAPI_DEVICE_FILTER=gpu,level_zero
-
-#unset EnableWalkerPartition
-#export EnableImplicitScaling=0
-#export GRID_MPICH_NIC_BIND=$NIC
-#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id
-#export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
-#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
-#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
-#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
-#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
-#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
-
-echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
-
-numactl -m $PNUMA -N $NUMA  "$@"
--- a/MPI_benchmark/halo_mpi.cc
+++ b/MPI_benchmark/halo_mpi.cc
@@ -1,333 +0,0 @@
-#include <cassert>
-#include <complex>
-#include <memory>
-#include <vector>
-#include <algorithm>
-#include <array>
-#include <string>
-#include <stdio.h>
-#include <stdlib.h>
-#include <strings.h>
-#include <ctime>
-#include <sys/time.h>
-
-#include <mpi.h>
-
-/**************************************************************
- * GPU - GPU memory cartesian halo exchange benchmark
- * Config: what is the target
- **************************************************************
- */
-#undef ACC_CUDA
-#undef  ACC_HIP
-#define  ACC_SYCL
-#undef  ACC_NONE
-
-/**************************************************************
- * Some MPI globals
- **************************************************************
- */
-MPI_Comm WorldComm;
-MPI_Comm WorldShmComm;
-
-int WorldSize;
-int WorldRank;
-
-int WorldShmSize;
-int WorldShmRank;
-
-/**************************************************************
- * Allocate buffers on the GPU, SYCL needs an init call and context
- **************************************************************
- */
-#ifdef ACC_CUDA
-#include <cuda.h>
-void acceleratorInit(void){}
-void *acceleratorAllocDevice(size_t bytes)
-{
-  void *ptr=NULL;
-  auto err = cudaMalloc((void **)&ptr,bytes);
-  assert(err==cudaSuccess);
-  return ptr;
-}
-void acceleratorFreeDevice(void *ptr){  cudaFree(ptr);}
-#endif
-#ifdef ACC_HIP
-#include <hip/hip_runtime.h>
-void acceleratorInit(void){}
-inline void *acceleratorAllocDevice(size_t bytes)
-{
-  void *ptr=NULL;
-  auto err = hipMalloc((void **)&ptr,bytes);
-  if( err != hipSuccess ) {
-    ptr = (void *) NULL;
-    printf(" hipMalloc failed for %ld %s \n",bytes,hipGetErrorString(err));
-  }
-  return ptr;
-};
-inline void acceleratorFreeDevice(void *ptr){ auto r=hipFree(ptr);};
-#endif
-#ifdef ACC_SYCL
-#include <sycl/CL/sycl.hpp>
-#include <sycl/usm.hpp>
-cl::sycl::queue *theAccelerator;
-void acceleratorInit(void)
-{
-  int nDevices = 1;
-#if 1
-  cl::sycl::gpu_selector selector;
-  cl::sycl::device selectedDevice { selector };
-  theAccelerator = new sycl::queue (selectedDevice);
-#else
-  cl::sycl::device selectedDevice {cl::sycl::gpu_selector_v  };
-  theAccelerator = new sycl::queue (selectedDevice);
-#endif
-  auto name = theAccelerator->get_device().get_info<sycl::info::device::name>();
-  printf("AcceleratorSyclInit: Selected device is %s\n",name.c_str()); fflush(stdout);
-}
-inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theAccelerator);};
-inline void acceleratorFreeDevice(void *ptr){free(ptr,*theAccelerator);};
-#endif
-#ifdef ACC_NONE
-void acceleratorInit(void){}
-inline void *acceleratorAllocDevice(size_t bytes){ return malloc(bytes);};
-inline void acceleratorFreeDevice(void *ptr){free(ptr);};
-#endif
-
-
-/**************************************************************
- * Microsecond timer
- **************************************************************
- */
-inline double usecond(void) {
-  struct timeval tv;
-  gettimeofday(&tv,NULL);
-  return 1.0e6*tv.tv_sec + 1.0*tv.tv_usec;
-}
-/**************************************************************
- * Main benchmark routine
- **************************************************************
- */
-void Benchmark(int64_t L,std::vector<int> cart_geom,bool use_device,int ncall)
-{
-  int64_t words = 3*4*2;
-  int64_t face,vol;
-  int Nd=cart_geom.size();
-  
-  /**************************************************************
-   * L^Nd volume, L^(Nd-1) faces, 12 complex per site
-   * Allocate memory for these
-   **************************************************************
-   */
-  face=1; for( int d=0;d<Nd-1;d++) face = face*L;
-  vol=1;  for( int d=0;d<Nd;d++) vol = vol*L;
-
-  
-  std::vector<void *> send_bufs;
-  std::vector<void *> recv_bufs;
-  size_t vw = face*words;
-  size_t bytes = face*words*sizeof(double);
-
-  if ( use_device ) {
-    for(int d=0;d<2*Nd;d++){
-      send_bufs.push_back(acceleratorAllocDevice(bytes));
-      recv_bufs.push_back(acceleratorAllocDevice(bytes));
-    }
-  } else {
-    for(int d=0;d<2*Nd;d++){
-      send_bufs.push_back(malloc(bytes));
-      recv_bufs.push_back(malloc(bytes));
-    }
-  }
-  /*********************************************************
-   * Build cartesian communicator
-   *********************************************************
-   */
-  int ierr;
-  int rank;
-  std::vector<int> coor(Nd);
-  MPI_Comm communicator;
-  std::vector<int> periodic(Nd,1);
-  MPI_Cart_create(WorldComm,Nd,&cart_geom[0],&periodic[0],0,&communicator);
-  MPI_Comm_rank(communicator,&rank);
-  MPI_Cart_coords(communicator,rank,Nd,&coor[0]);
-
-  static int reported;
-  if ( ! reported ) { 
-    printf("World Rank %d Shm Rank %d CartCoor %d %d %d %d\n",WorldRank,WorldShmRank,
-	 coor[0],coor[1],coor[2],coor[3]); fflush(stdout);
-    reported =1 ;
-  }
-  /*********************************************************
-   * Perform halo exchanges
-   *********************************************************
-   */
-  for(int d=0;d<Nd;d++){
-    if ( cart_geom[d]>1 ) {
-      double t0=usecond();
-
-      int from,to;
-      
-      MPI_Barrier(communicator);
-      for(int n=0;n<ncall;n++){
-	
-	void *xmit = (void *)send_bufs[d];
-	void *recv = (void *)recv_bufs[d];
-	
-	ierr=MPI_Cart_shift(communicator,d,1,&from,&to);
-	assert(ierr==0);
-	
-	ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,to,rank,
-			  recv,bytes,MPI_CHAR,from, from,
-			  communicator,MPI_STATUS_IGNORE);
-	assert(ierr==0);
-	
-	xmit = (void *)send_bufs[Nd+d];
-	recv = (void *)recv_bufs[Nd+d];
-	
-	ierr=MPI_Cart_shift(communicator,d,-1,&from,&to);
-	assert(ierr==0);
-	
-	ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,to,rank,
-			  recv,bytes,MPI_CHAR,from, from,
-			  communicator,MPI_STATUS_IGNORE);
-	assert(ierr==0);
-      }
-      MPI_Barrier(communicator);
-
-      double t1=usecond();
-      
-      double dbytes    = bytes*WorldShmSize;
-      double xbytes    = dbytes*2.0*ncall;
-      double rbytes    = xbytes;
-      double bidibytes = xbytes+rbytes;
-
-      if ( ! WorldRank ) {
-	printf("\t%12ld\t %12ld %16.0lf\n",L,bytes,bidibytes/(t1-t0)); fflush(stdout);
-      }
-    }
-  }
-  /*********************************************************
-   * Free memory
-   *********************************************************
-   */
-  if ( use_device ) {
-    for(int d=0;d<2*Nd;d++){
-      acceleratorFreeDevice(send_bufs[d]);
-      acceleratorFreeDevice(recv_bufs[d]);
-    }
-  } else {
-    for(int d=0;d<2*Nd;d++){
-      free(send_bufs[d]);
-      free(recv_bufs[d]);
-    }
-  }
-
-}
-
-/**************************************
- * Command line junk
- **************************************/
-
-std::string CmdOptionPayload(char ** begin, char ** end, const std::string & option)
-{
-  char ** itr = std::find(begin, end, option);
-  if (itr != end && ++itr != end) {
-    std::string payload(*itr);
-    return payload;
-  }
-  return std::string("");
-}
-bool CmdOptionExists(char** begin, char** end, const std::string& option)
-{
-  return std::find(begin, end, option) != end;
-}
-void CmdOptionIntVector(const std::string &str,std::vector<int> & vec)
-{
-  vec.resize(0);
-  std::stringstream ss(str);
-  int i;
-  while (ss >> i){
-    vec.push_back(i);
-    if(std::ispunct(ss.peek()))
-      ss.ignore();
-  }
-  return;
-}
-/**************************************
- * Command line junk
- **************************************/
-int main(int argc, char **argv)
-{
-  std::string arg;
-
-  acceleratorInit();
-
-  MPI_Init(&argc,&argv);
-
-  WorldComm = MPI_COMM_WORLD;
-  
-  MPI_Comm_split_type(WorldComm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
-
-  MPI_Comm_rank(WorldComm     ,&WorldRank);
-  MPI_Comm_size(WorldComm     ,&WorldSize);
-
-  MPI_Comm_rank(WorldShmComm     ,&WorldShmRank);
-  MPI_Comm_size(WorldShmComm     ,&WorldShmSize);
-
-  if ( WorldSize/WorldShmSize > 2) {
-    printf("This benchmark is meant to run on at most two nodes only\n");
-  }
-
-  auto mpi =std::vector<int>({1,1,1,1});
-
-  if( CmdOptionExists(argv,argv+argc,"--mpi") ){
-    arg = CmdOptionPayload(argv,argv+argc,"--mpi");
-    CmdOptionIntVector(arg,mpi);
-  } else {
-    printf("Must specify --mpi <n1.n2.n3.n4> command line argument\n");
-    exit(0);
-  }
-
-  if( !WorldRank ) {
-    printf("***********************************\n");
-    printf("%d ranks\n",WorldSize); 
-    printf("%d ranks-per-node\n",WorldShmSize);
-    printf("%d nodes\n",WorldSize/WorldShmSize);fflush(stdout);
-    printf("Cartesian layout: ");
-    for(int d=0;d<mpi.size();d++){
-      printf("%d ",mpi[d]);
-    }
-    printf("\n");fflush(stdout);
-    printf("***********************************\n");
-  }
-
-  
-  if( !WorldRank ) {
-    printf("=========================================================\n");
-    printf("= Benchmarking HOST memory MPI performance               \n");
-    printf("=========================================================\n");fflush(stdout);
-    printf("= L\t pkt bytes\t MB/s           \n");
-    printf("=========================================================\n");fflush(stdout);
-  }
-
-  for(int L=16;L<=64;L+=4){
-    Benchmark(L,mpi,false,100);
-  }  
-
-  if( !WorldRank ) {
-    printf("=========================================================\n");
-    printf("= Benchmarking DEVICE memory MPI performance             \n");
-    printf("=========================================================\n");fflush(stdout);
-  }
-  for(int L=16;L<=64;L+=4){
-    Benchmark(L,mpi,true,100);
-  }  
-
-  if( !WorldRank ) {
-    printf("=========================================================\n");
-    printf("= DONE   \n");
-    printf("=========================================================\n");
-  }
-  MPI_Finalize();
-}
--- a/benchmarks/Benchmark_dwf_fp32.cc
+++ b/benchmarks/Benchmark_dwf_fp32.cc
@@ -90,11 +90,11 @@ int main (int argc, char ** argv)
  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;

  for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
-  //  Dirichlet[0] = 0;
-  //  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
-  //  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
-  //  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
-  //  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
+  Dirichlet[0] = 0;
+  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
+  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
+  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
+  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];

  Benchmark(Ls,Dirichlet);

@@ -105,11 +105,11 @@ int main (int argc, char ** argv)
  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;

  for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
-  //  Dirichlet[0] = 0;
-  //  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
-  //  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
-  //  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
-  //  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
+  Dirichlet[0] = 0;
+  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
+  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
+  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
+  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
  
  Benchmark(Ls,Dirichlet);

@@ -185,7 +185,6 @@ void Benchmark(int Ls, Coordinate Dirichlet)
  GaugeField Umu(UGrid);
  GaugeField UmuCopy(UGrid);
  SU<Nc>::HotConfiguration(RNG4,Umu);
-  //  SU<Nc>::ColdConfiguration(Umu);
  UmuCopy=Umu;
  std::cout << GridLogMessage << "Random gauge initialised " << std::endl;

@@ -308,14 +307,6 @@ void Benchmark(int Ls, Coordinate Dirichlet)
    if(( n2e>1.0e-4) ) {
      std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
      FGrid->Barrier();
-      std::cout<<GridLogMessage << "RESULT" << std::endl;
-      //      std::cout << result<<std::endl;
-      std::cout << norm2(result)<<std::endl;
-      std::cout<<GridLogMessage << "REF" << std::endl;
-      std::cout << norm2(ref)<<std::endl;
-      std::cout<<GridLogMessage << "ERR" << std::endl;
-      std::cout << norm2(err)<<std::endl;
-      FGrid->Barrier();
      exit(-1);
    }
    assert (n2e< 1.0e-4 );
--- a/benchmarks/Benchmark_usqcd.cc
+++ b/benchmarks/Benchmark_usqcd.cc
@@ -1,968 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./benchmarks/Benchmark_usqcd.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#include <Grid/Grid.h>
-#include <Grid/algorithms/blas/BatchedBlas.h>
-
-using namespace Grid;
-
-std::vector<int> L_list;
-std::vector<int> Ls_list;
-std::vector<double> mflop_list;
-
-double mflop_ref;
-double mflop_ref_err;
-
-int NN_global;
-
-FILE * FP;
-
-struct time_statistics{
-  double mean;
-  double err;
-  double min;
-  double max;
-
-  void statistics(std::vector<double> v){
-      double sum = std::accumulate(v.begin(), v.end(), 0.0);
-      mean = sum / v.size();
-
-      std::vector<double> diff(v.size());
-      std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
-      double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
-      err = std::sqrt(sq_sum / (v.size()*(v.size() - 1)));
-
-      auto result = std::minmax_element(v.begin(), v.end());
-      min = *result.first;
-      max = *result.second;
-}
-};
-
-void comms_header(){
-  std::cout <<GridLogMessage << " L  "<<"\t"<<" Ls  "<<"\t"
-            <<"bytes\t MB/s uni  \t\t MB/s bidi "<<std::endl;
-};
-
-struct controls {
-  int Opt;
-  int CommsOverlap;
-  Grid::CartesianCommunicator::CommunicatorPolicy_t CommsAsynch;
-};
-
-class Benchmark {
-public:
-  static void Decomposition (void ) {
-
-    int threads = GridThread::GetThreads();
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << "= Grid is setup to use "<<threads<<" threads"<<std::endl;
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
-    std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
-    std::cout<<GridLogMessage<<"\tMPI tasks      : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
-    std::cout<<GridLogMessage<<"\tvReal          : "<<sizeof(vReal )*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vReal::Nsimd()))<<std::endl;
-    std::cout<<GridLogMessage<<"\tvRealF         : "<<sizeof(vRealF)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
-    std::cout<<GridLogMessage<<"\tvRealD         : "<<sizeof(vRealD)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl;
-    std::cout<<GridLogMessage<<"\tvComplex       : "<<sizeof(vComplex )*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplex::Nsimd()))<<std::endl;
-    std::cout<<GridLogMessage<<"\tvComplexF      : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl;
-    std::cout<<GridLogMessage<<"\tvComplexD      : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-
-  }
-
-  static void Comms(void)
-  {
-    int Nloop=200;
-    int nmu=0;
-    int maxlat=32;
-
-    Coordinate simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
-    Coordinate mpi_layout  = GridDefaultMpi();
-
-    for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
-
-    std::vector<double> t_time(Nloop);
-    time_statistics timestat;
-
-    std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
-    std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-    comms_header();
-
-    fprintf(FP,"Communications\n\n");
-    fprintf(FP,"Packet bytes, direction, GB/s per node\n");
-    for(int lat=16;lat<=maxlat;lat+=8){
-      //      for(int Ls=8;Ls<=8;Ls*=2){
-      { int Ls=12;
-
-	Coordinate latt_size  ({lat*mpi_layout[0],
-	      lat*mpi_layout[1],
-	      lat*mpi_layout[2],
-	      lat*mpi_layout[3]});
-
-	GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-	RealD Nrank = Grid._Nprocessors;
-	RealD Nnode = Grid.NodeCount();
-	RealD ppn = Nrank/Nnode;
-
-	std::vector<HalfSpinColourVectorD *> xbuf(8);
-	std::vector<HalfSpinColourVectorD *> rbuf(8);
-	//Grid.ShmBufferFreeAll();
-	uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
-	for(int d=0;d<8;d++){
-	  xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
-	  rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
-	  //	  bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
-	  //	  bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
-	}
-
-	//	int ncomm;
-	double dbytes;
-
-        for(int dir=0;dir<8;dir++) {
-	  int mu =dir % 4;
-	  if (mpi_layout[mu]>1 ) {
-
-	    std::vector<double> times(Nloop);
-	    for(int i=0;i<Nloop;i++){
-
-	      dbytes=0;	        
-	      double start=usecond();
-	      int xmit_to_rank;
-	      int recv_from_rank;
-
-	      if ( dir == mu ) { 
-		int comm_proc=1;
-		Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-	      } else { 
-		int comm_proc = mpi_layout[mu]-1;
-		Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-	      }
-	      Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
-				  (void *)&rbuf[dir][0], recv_from_rank,
-				  bytes);
-	      dbytes+=bytes;
-	     
-	      double stop=usecond();
-	      t_time[i] = stop-start; // microseconds
-
-	    }
-	    timestat.statistics(t_time);
-	  
-	    dbytes=dbytes*ppn;
-	    double xbytes    = dbytes*0.5;
-	    double bidibytes = dbytes;
-	  
-	    std::cout<<GridLogMessage << lat<<"\t"<<Ls<<"\t "
-		     << bytes << " \t "
-		     <<xbytes/timestat.mean
-		     << "\t\t"
-		     << bidibytes/timestat.mean<< std::endl;
-	    fprintf(FP,"%ld, %d, %f\n",(long)bytes,dir,bidibytes/timestat.mean/1000.);
-	  }
-	}
-	for(int d=0;d<8;d++){
-	  acceleratorFreeDevice(xbuf[d]);
-	  acceleratorFreeDevice(rbuf[d]);
-	}
-      }
-    }
-    fprintf(FP,"\n\n");
-    
-    return;
-  }
-
-  
-  static void Memory(void)
-  {
-    const int Nvec=8;
-    typedef Lattice< iVector< vReal,Nvec> > LatticeVec;
-    typedef iVector<vReal,Nvec> Vec;
-
-    Coordinate simd_layout = GridDefaultSimd(Nd,vReal::Nsimd());
-    Coordinate mpi_layout  = GridDefaultMpi();
-
-    fprintf(FP,"Memory Bandwidth\n\n");
-    fprintf(FP,"Bytes, GB/s per node\n");
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << "= Benchmarking a*x + y bandwidth"<<std::endl;
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<< "\t\tGB/s / node"<<std::endl;
-    std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
-  
-    //    uint64_t NP;
-    uint64_t NN;
-
-
-  uint64_t lmax=40;
-#define NLOOP (1000*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
-
-    GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
-    for(int lat=8;lat<=lmax;lat+=8){
-
-      Coordinate latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-      int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
-
-      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-
-      //      NP= Grid.RankCount();
-      NN =Grid.NodeCount();
-
-      Vec rn ; random(sRNG,rn);
-
-      LatticeVec z(&Grid); z=Zero();
-      LatticeVec x(&Grid); x=Zero();
-      LatticeVec y(&Grid); y=Zero();
-      double a=2.0;
-
-      uint64_t Nloop=NLOOP;
-
-      double start=usecond();
-      for(int i=0;i<Nloop;i++){
-	z=a*x-y;
-      }
-      double stop=usecond();
-      double time = (stop-start)/Nloop*1000;
-     
-      double flops=vol*Nvec*2;// mul,add
-      double bytes=3.0*vol*Nvec*sizeof(Real);
-      std::cout<<GridLogMessage<<std::setprecision(3) 
-	       << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.
-	       << "\t\t"<< bytes/time/NN <<std::endl;
-
-      fprintf(FP,"%ld, %f\n",(long)bytes,bytes/time/NN);
-
-    }
-    fprintf(FP,"\n\n");
-  };
-
-
-  static void BLAS(void)
-  {
-    //int nbasis, int nrhs, int coarseVol
-    int  basis[] = { 16,32,64 };
-    int  rhs[]   = { 8,16,32 };
-    int  vol  = 4*4*4*4;
-
-    GridBLAS blas;
-    
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << "= batched GEMM (double precision) "<<std::endl;
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << "  M  "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (coarse mrhs)"<<std::endl;
-    std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
-  
-    fprintf(FP,"GEMM\n\n M, N, K, BATCH, GF/s per rank\n");
-
-    for(int b=0;b<3;b++){
-    for(int r=0;r<3;r++){
-      int M=basis[b];
-      int N=rhs[r];
-      int K=basis[b];
-      int BATCH=vol;
-      double p=blas.benchmark(M,N,K,BATCH);
-
-      fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
-      
-      std::cout<<GridLogMessage<<std::setprecision(3) 
-	       << M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<BATCH<<"\t\t"<<p<<std::endl;
-    }}
-    std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
-    std::cout<<GridLogMessage << "  M  "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (block project)"<<std::endl;
-    std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
-    for(int b=0;b<3;b++){
-    for(int r=0;r<3;r++){
-      int M=basis[b];
-      int N=rhs[r];
-      int K=vol;
-      int BATCH=vol;
-      double p=blas.benchmark(M,N,K,BATCH);
-
-      fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
-      std::cout<<GridLogMessage<<std::setprecision(3) 
-	       << M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<BATCH<<"\t\t"<<p<<std::endl;
-    }}
-    std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
-    std::cout<<GridLogMessage << "  M  "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (block promote)"<<std::endl;
-    std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
-    for(int b=0;b<3;b++){
-    for(int r=0;r<3;r++){
-      int M=rhs[r];
-      int N=vol;
-      int K=basis[b];
-      int BATCH=vol;
-      double p=blas.benchmark(M,N,K,BATCH);
-
-      fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
-      std::cout<<GridLogMessage<<std::setprecision(3) 
-	       << M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<BATCH<<"\t\t"<<p<<std::endl;
-    }}
-    fprintf(FP,"\n\n\n");
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  };
-  
-
-  static void SU4(void)
-  {
-    const int Nc4=4;
-    typedef Lattice< iMatrix< vComplexF,Nc4> > LatticeSU4;
-
-    Coordinate simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd());
-    Coordinate mpi_layout  = GridDefaultMpi();
-    
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << "= Benchmarking z = y*x SU(4) bandwidth"<<std::endl;
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<< "\t\tGB/s / node"<<std::endl;
-    std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
-  
-    uint64_t NN;
-
-
-    uint64_t lmax=32;
-
-    GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
-    for(int lat=8;lat<=lmax;lat+=8){
-
-      Coordinate latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-      int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
-
-      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-
-      NN =Grid.NodeCount();
-
-
-      LatticeSU4 z(&Grid); z=Zero();
-      LatticeSU4 x(&Grid); x=Zero();
-      LatticeSU4 y(&Grid); y=Zero();
-      //      double a=2.0;
-
-      uint64_t Nloop=NLOOP;
-
-      double start=usecond();
-      for(int i=0;i<Nloop;i++){
-	z=x*y;
-      }
-      double stop=usecond();
-      double time = (stop-start)/Nloop*1000;
-     
-      double flops=vol*Nc4*Nc4*(6+(Nc4-1)*8);// mul,add
-      double bytes=3.0*vol*Nc4*Nc4*2*sizeof(RealF);
-      std::cout<<GridLogMessage<<std::setprecision(3) 
-	       << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.
-	       << "\t\t"<< bytes/time/NN <<std::endl;
-
-    }
-  };
-
-
-  static double DWF(int Ls,int L)
-  {
-    RealD mass=0.1;
-    RealD M5  =1.8;
-
-    double mflops;
-    double mflops_best = 0;
-    double mflops_worst= 0;
-    std::vector<double> mflops_all;
-
-    ///////////////////////////////////////////////////////
-    // Set/Get the layout & grid size
-    ///////////////////////////////////////////////////////
-    int threads = GridThread::GetThreads();
-    Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
-    Coordinate local({L,L,L,L});
-    Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
-
-    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, 
-								       GridDefaultSimd(Nd,vComplex::Nsimd()),
-								       GridDefaultMpi());
-    uint64_t NP = TmpGrid->RankCount();
-    uint64_t NN = TmpGrid->NodeCount();
-    NN_global=NN;
-    uint64_t SHM=NP/NN;
-
-
-    ///////// Welcome message ////////////
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << "Benchmark DWF on "<<L<<"^4 local volume "<<std::endl;
-    std::cout<<GridLogMessage << "* Nc             : "<<Nc<<std::endl;
-    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
-    std::cout<<GridLogMessage << "* Ls             : "<<Ls<<std::endl;
-    std::cout<<GridLogMessage << "* ranks          : "<<NP  <<std::endl;
-    std::cout<<GridLogMessage << "* nodes          : "<<NN  <<std::endl;
-    std::cout<<GridLogMessage << "* ranks/node     : "<<SHM <<std::endl;
-    std::cout<<GridLogMessage << "* ranks geom     : "<<GridCmdVectorIntToString(mpi)<<std::endl;
-    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-
-    ///////// Lattice Init ////////////
-    GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
-    GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
-    GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
-    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
-
-    
-    ///////// RNG Init ////////////
-    std::vector<int> seeds4({1,2,3,4});
-    std::vector<int> seeds5({5,6,7,8});
-    GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
-    GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
-    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
-
-    typedef DomainWallFermionF Action;
-    typedef typename Action::FermionField Fermion;
-    typedef LatticeGaugeFieldF Gauge;
-    
-    ///////// Source preparation ////////////
-    Gauge Umu(UGrid);  SU<Nc>::HotConfiguration(RNG4,Umu); 
-    Fermion src   (FGrid); random(RNG5,src);
-    Fermion src_e (FrbGrid);
-    Fermion src_o (FrbGrid);
-    Fermion r_e   (FrbGrid);
-    Fermion r_o   (FrbGrid);
-    Fermion r_eo  (FGrid);
-    Action Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
-
-    {
-
-      pickCheckerboard(Even,src_e,src);
-      pickCheckerboard(Odd,src_o,src);
-
-#ifdef AVX512
-      const int num_cases = 3;
-#else 
-      const int num_cases = 2;
-#endif      
-      std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
-
-      controls Cases [] = {
-	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent },
-	{  WilsonKernelsStatic::OptHandUnroll,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent },
-	{  WilsonKernelsStatic::OptInlineAsm ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent }
-      }; 
-
-      for(int c=0;c<num_cases;c++) {
-	
-	WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
-	WilsonKernelsStatic::Opt   = Cases[c].Opt;
-	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
-
-	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
-	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using ASM      WilsonKernels" <<std::endl;
-	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using UNROLLED WilsonKernels" <<std::endl;
-	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
-	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential Comms/Compute" <<std::endl;
-	std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
-	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-
-	int nwarm = 10;
-	double t0=usecond();
-	FGrid->Barrier();
-	for(int i=0;i<nwarm;i++){
-	  Dw.DhopEO(src_o,r_e,DaggerNo);
-	}
-	FGrid->Barrier();
-	double t1=usecond();
-	uint64_t ncall = 500;
-
-	FGrid->Broadcast(0,&ncall,sizeof(ncall));
-
-	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
-
-	time_statistics timestat;
-	std::vector<double> t_time(ncall);
-	for(uint64_t i=0;i<ncall;i++){
-	  t0=usecond();
-	  Dw.DhopEO(src_o,r_e,DaggerNo);
-	  t1=usecond();
-	  t_time[i] = t1-t0;
-	}
-	FGrid->Barrier();
-	
-	double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-
-	// Nc=3 gives
-	// 1344= 3*(2*8+6)*2*8 + 8*3*2*2 + 3*4*2*8
-	// 1344 = Nc* (6+(Nc-1)*8)*2*Nd + Nd*Nc*2*2  + Nd*Nc*Ns*2
-	//	double flops=(1344.0*volume)/2;
-	double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + 2*Nd*Nc*Ns  + 2*Nd*Nc*Ns*2;
-
-	double flops=(fps*volume)/2;
-	double mf_hi, mf_lo, mf_err;
-
-	timestat.statistics(t_time);
-	mf_hi = flops/timestat.min;
-	mf_lo = flops/timestat.max;
-	mf_err= flops/timestat.min * timestat.err/timestat.mean;
-
-	mflops = flops/timestat.mean;
-	mflops_all.push_back(mflops);
-	if ( mflops_best == 0   ) mflops_best = mflops;
-	if ( mflops_worst== 0   ) mflops_worst= mflops;
-	if ( mflops>mflops_best ) mflops_best = mflops;
-	if ( mflops<mflops_worst) mflops_worst= mflops;
-
-	std::cout<<GridLogMessage<< "Deo FlopsPerSite is "<<fps<<std::endl;
-	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
-	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
-	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;
-
-      }
-
-      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
-      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
-      std::cout<<GridLogMessage <<fmt << std::endl;
-      std::cout<<GridLogMessage ;
-
-      for(int i=0;i<mflops_all.size();i++){
-	std::cout<<mflops_all[i]/NN<<" ; " ;
-      }
-      std::cout<<std::endl;
-      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-
-    }
-    return mflops_best;
-  }
-
-
-  static double Staggered(int L)
-  {
-    double mflops;
-    double mflops_best = 0;
-    double mflops_worst= 0;
-    std::vector<double> mflops_all;
-
-    ///////////////////////////////////////////////////////
-    // Set/Get the layout & grid size
-    ///////////////////////////////////////////////////////
-    int threads = GridThread::GetThreads();
-    Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
-    Coordinate local({L,L,L,L});
-    Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
-    
-    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(latt4,
-								       GridDefaultSimd(Nd,vComplex::Nsimd()),
-								       GridDefaultMpi());
-    uint64_t NP = TmpGrid->RankCount();
-    uint64_t NN = TmpGrid->NodeCount();
-    NN_global=NN;
-    uint64_t SHM=NP/NN;
-
-
-    ///////// Welcome message ////////////
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << "Benchmark ImprovedStaggered on "<<L<<"^4 local volume "<<std::endl;
-    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
-    std::cout<<GridLogMessage << "* ranks          : "<<NP  <<std::endl;
-    std::cout<<GridLogMessage << "* nodes          : "<<NN  <<std::endl;
-    std::cout<<GridLogMessage << "* ranks/node     : "<<SHM <<std::endl;
-    std::cout<<GridLogMessage << "* ranks geom     : "<<GridCmdVectorIntToString(mpi)<<std::endl;
-    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-
-    ///////// Lattice Init ////////////
-    GridCartesian         * FGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
-    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);
-    
-    ///////// RNG Init ////////////
-    std::vector<int> seeds4({1,2,3,4});
-    GridParallelRNG          RNG4(FGrid);  RNG4.SeedFixedIntegers(seeds4);
-    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
-
-    RealD mass=0.1;
-    RealD c1=9.0/8.0;
-    RealD c2=-1.0/24.0;
-    RealD u0=1.0;
-
-    typedef ImprovedStaggeredFermionF Action;
-    typedef typename Action::FermionField Fermion; 
-    typedef LatticeGaugeFieldF Gauge;
-    
-    Gauge Umu(FGrid);  SU<Nc>::HotConfiguration(RNG4,Umu); 
-
-    typename Action::ImplParams params;
-    Action Ds(Umu,Umu,*FGrid,*FrbGrid,mass,c1,c2,u0,params);
-
-    ///////// Source preparation ////////////
-    Fermion src   (FGrid); random(RNG4,src);
-    Fermion src_e (FrbGrid);
-    Fermion src_o (FrbGrid);
-    Fermion r_e   (FrbGrid);
-    Fermion r_o   (FrbGrid);
-    Fermion r_eo  (FGrid);
-  
-    {
-
-      pickCheckerboard(Even,src_e,src);
-      pickCheckerboard(Odd,src_o,src);
-    
-      const int num_cases = 2;
-      std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
-      
-      controls Cases [] = {
-	{  StaggeredKernelsStatic::OptGeneric   ,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
-	{  StaggeredKernelsStatic::OptHandUnroll,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
-	{  StaggeredKernelsStatic::OptInlineAsm ,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  }
-      }; 
-
-      for(int c=0;c<num_cases;c++) {
-	
-	StaggeredKernelsStatic::Comms = Cases[c].CommsOverlap;
-	StaggeredKernelsStatic::Opt   = Cases[c].Opt;
-	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
-      
-	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-	if ( StaggeredKernelsStatic::Opt == StaggeredKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc StaggeredKernels" <<std::endl;
-	std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
-	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-	
-	int nwarm = 10;
-	double t0=usecond();
-	FGrid->Barrier();
-	for(int i=0;i<nwarm;i++){
-	  Ds.DhopEO(src_o,r_e,DaggerNo);
-	}
-	FGrid->Barrier();
-	double t1=usecond();
-	uint64_t ncall = 500;
-
-	FGrid->Broadcast(0,&ncall,sizeof(ncall));
-
-	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
-
-	time_statistics timestat;
-	std::vector<double> t_time(ncall);
-	for(uint64_t i=0;i<ncall;i++){
-	  t0=usecond();
-	  Ds.DhopEO(src_o,r_e,DaggerNo);
-	  t1=usecond();
-	  t_time[i] = t1-t0;
-	}
-	FGrid->Barrier();
-	
-	double volume=1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-	double flops=(1146.0*volume)/2;
-	double mf_hi, mf_lo, mf_err;
-	
-	timestat.statistics(t_time);
-	mf_hi = flops/timestat.min;
-	mf_lo = flops/timestat.max;
-	mf_err= flops/timestat.min * timestat.err/timestat.mean;
-
-	mflops = flops/timestat.mean;
-	mflops_all.push_back(mflops);
-	if ( mflops_best == 0   ) mflops_best = mflops;
-	if ( mflops_worst== 0   ) mflops_worst= mflops;
-	if ( mflops>mflops_best ) mflops_best = mflops;
-	if ( mflops<mflops_worst) mflops_worst= mflops;
-	
-	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
-	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
-	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;
-      
-      }
-
-      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-      std::cout<<GridLogMessage << L<<"^4  Deo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
-      std::cout<<GridLogMessage << L<<"^4  Deo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
-      std::cout<<GridLogMessage <<fmt << std::endl;
-      std::cout<<GridLogMessage ;
-
-      for(int i=0;i<mflops_all.size();i++){
-	std::cout<<mflops_all[i]/NN<<" ; " ;
-      }
-      std::cout<<std::endl;
-    }
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    return mflops_best;
-  }
-
-  static double Clover(int L)
-  {
-    double mflops;
-    double mflops_best = 0;
-    double mflops_worst= 0;
-    std::vector<double> mflops_all;
-
-    ///////////////////////////////////////////////////////
-    // Set/Get the layout & grid size
-    ///////////////////////////////////////////////////////
-    int threads = GridThread::GetThreads();
-    Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
-    Coordinate local({L,L,L,L});
-    Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
-    
-    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(latt4,
-								       GridDefaultSimd(Nd,vComplex::Nsimd()),
-								       GridDefaultMpi());
-    uint64_t NP = TmpGrid->RankCount();
-    uint64_t NN = TmpGrid->NodeCount();
-    NN_global=NN;
-    uint64_t SHM=NP/NN;
-
-
-    ///////// Welcome message ////////////
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << "Benchmark Clover on "<<L<<"^4 local volume "<<std::endl;
-    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
-    std::cout<<GridLogMessage << "* ranks          : "<<NP  <<std::endl;
-    std::cout<<GridLogMessage << "* nodes          : "<<NN  <<std::endl;
-    std::cout<<GridLogMessage << "* ranks/node     : "<<SHM <<std::endl;
-    std::cout<<GridLogMessage << "* ranks geom     : "<<GridCmdVectorIntToString(mpi)<<std::endl;
-    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-
-    ///////// Lattice Init ////////////
-    GridCartesian         * FGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
-    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);
-    
-    ///////// RNG Init ////////////
-    std::vector<int> seeds4({1,2,3,4});
-    GridParallelRNG          RNG4(FGrid);  RNG4.SeedFixedIntegers(seeds4);
-    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
-
-    RealD mass=0.1;
-    RealD csw=1.0;
-
-    typedef WilsonCloverFermionF Action;
-    typedef typename Action::FermionField Fermion; 
-    typedef LatticeGaugeFieldF Gauge;
-    
-    Gauge Umu(FGrid);  SU<Nc>::HotConfiguration(RNG4,Umu); 
-
-    Action Dc(Umu,*FGrid,*FrbGrid,mass,csw,csw);
-
-    ///////// Source preparation ////////////
-    Fermion src   (FGrid); random(RNG4,src);
-    Fermion r     (FGrid);
-  
-    {
-
-      const int num_cases = 1;
-      std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
-      
-      controls Cases [] = {
-	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
-      }; 
-
-      for(int c=0;c<num_cases;c++) {
-	
-	WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
-	WilsonKernelsStatic::Opt   = Cases[c].Opt;
-	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
-      
-	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-	std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
-	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-	
-	int nwarm = 10;
-	double t0=usecond();
-	FGrid->Barrier();
-	for(int i=0;i<nwarm;i++){
-	  Dc.M(src,r);
-	}
-	FGrid->Barrier();
-	double t1=usecond();
-	uint64_t ncall = 500;
-
-	FGrid->Broadcast(0,&ncall,sizeof(ncall));
-
-	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
-
-	time_statistics timestat;
-	std::vector<double> t_time(ncall);
-	for(uint64_t i=0;i<ncall;i++){
-	  t0=usecond();
-	  Dc.M(src,r);
-	  t1=usecond();
-	  t_time[i] = t1-t0;
-	}
-	FGrid->Barrier();
-	
-	double volume=1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-	double flops=(1344+ 24+6*6*8*2)*volume;
-	double mf_hi, mf_lo, mf_err;
-	
-	timestat.statistics(t_time);
-	mf_hi = flops/timestat.min;
-	mf_lo = flops/timestat.max;
-	mf_err= flops/timestat.min * timestat.err/timestat.mean;
-
-	mflops = flops/timestat.mean;
-	mflops_all.push_back(mflops);
-	if ( mflops_best == 0   ) mflops_best = mflops;
-	if ( mflops_worst== 0   ) mflops_worst= mflops;
-	if ( mflops>mflops_best ) mflops_best = mflops;
-	if ( mflops<mflops_worst) mflops_worst= mflops;
-	
-	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
-	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per rank   "<< mflops/NP<<std::endl;
-	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per node   "<< mflops/NN<<std::endl;
-      
-      }
-
-      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-      std::cout<<GridLogMessage << L<<"^4  Deo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
-      std::cout<<GridLogMessage << L<<"^4  Deo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
-      std::cout<<GridLogMessage <<fmt << std::endl;
-      std::cout<<GridLogMessage ;
-
-      for(int i=0;i<mflops_all.size();i++){
-	std::cout<<mflops_all[i]/NN<<" ; " ;
-      }
-      std::cout<<std::endl;
-    }
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    return mflops_best;
-  }
-};
-
-
-
-
-int main (int argc, char ** argv)
-{
-  Grid_init(&argc,&argv);
-
-  if (GlobalSharedMemory::WorldRank==0) { 
-    FP = fopen("Benchmark_usqcd.csv","w");
-  } else {
-    FP = fopen("/dev/null","w");
-  }
-
-  CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
-  LebesgueOrder::Block = std::vector<int>({2,2,2,2});
-
-  Benchmark::Decomposition();
-
-  int do_su4=0;
-  int do_memory=1;
-  int do_comms =1;
-  int do_blas  =1;
-
-  int sel=4;
-  std::vector<int> L_list({8,12,16,24,32});
-  int selm1=sel-1;
-
-  std::vector<double> clover;
-  std::vector<double> dwf4;
-  std::vector<double> staggered;
-
-  int Ls=1;
-  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << " Clover dslash 4D vectorised (temporarily Wilson)" <<std::endl;
-  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  for(int l=0;l<L_list.size();l++){
-    clover.push_back(Benchmark::DWF(1,L_list[l]));
-  }
-
-  Ls=12;
-  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
-  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  for(int l=0;l<L_list.size();l++){
-    double result = Benchmark::DWF(Ls,L_list[l]) ;
-    dwf4.push_back(result);
-  }
-
-  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << " Improved Staggered dslash 4D vectorised" <<std::endl;
-  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  for(int l=0;l<L_list.size();l++){
-    double result = Benchmark::Staggered(L_list[l]) ;
-    staggered.push_back(result);
-  }
-
-  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
-  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "L \t\t Clover \t\t DWF4 \t\t Staggered" <<std::endl;
-  for(int l=0;l<L_list.size();l++){
-    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<std::endl;
-  }
-  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-
-  int NN=NN_global;
-  if ( do_memory ) {
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << " Memory benchmark " <<std::endl;
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    Benchmark::Memory();
-  }
-
-  if ( do_blas ) {
-#if defined(GRID_CUDA) || defined(GRID_HIP)     || defined(GRID_SYCL)   
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << " Batched BLAS benchmark " <<std::endl;
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    Benchmark::BLAS();
-#endif
-  }
-
-  if ( do_su4 ) {
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << " SU(4) benchmark " <<std::endl;
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    Benchmark::SU4();
-  }
-  
-  if ( do_comms ) {
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << " Communications benchmark " <<std::endl;
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    Benchmark::Comms();
-  }
-
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered (GF/s per node)" <<std::endl;
-    fprintf(FP,"Per node summary table\n");
-    fprintf(FP,"\n");
-    fprintf(FP,"L , Wilson, DWF4, Staggered, GF/s per node\n");
-    fprintf(FP,"\n");
-    for(int l=0;l<L_list.size();l++){
-      std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<std::endl;
-      fprintf(FP,"%d , %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.);
-    }
-    fprintf(FP,"\n");
-
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << " Comparison point     result: "  << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl;
-    std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl;
-    std::cout<<std::setprecision(3);
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-
-  Grid_finalize();
-  fclose(FP);
-}
--- a/bootstrap.sh
+++ b/bootstrap.sh
@@ -1,12 +1,12 @@
 #!/usr/bin/env bash
 set -e

-EIGEN_URL='https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.tar.bz2'
-EIGEN_SHA256SUM='b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626'
+EIGEN_URL='https://gitlab.com/libeigen/eigen/-/archive/3.3.7/eigen-3.3.7.tar.bz2'
+EIGEN_SHA256SUM='685adf14bd8e9c015b78097c1dc22f2f01343756f196acdc76a678e1ae352e11'


 echo "-- deploying Eigen source..."
-ARC=$(basename ${EIGEN_URL})
+ARC=`basename ${EIGEN_URL}`
 wget ${EIGEN_URL} --no-check-certificate
 if command -v sha256sum; then
   echo "$EIGEN_SHA256SUM  $(basename "$EIGEN_URL")" \
@@ -14,8 +14,13 @@ if command -v sha256sum; then
 else
   echo "WARNING: could not verify checksum, please install sha256sum" >&2
 fi
-./scripts/update_eigen.sh "${ARC}"
-rm "${ARC}"
+./scripts/update_eigen.sh ${ARC}
+rm ${ARC}
+# patch for non-portable includes in Eigen 3.3.5
+# apparently already fixed in Eigen HEAD so it should not be 
+# a problem in the future (A.P.)
+patch Eigen/unsupported/Eigen/CXX11/Tensor scripts/eigen-3.3.5.Tensor.patch
+
 echo '-- generating Make.inc files...'
 ./scripts/filelist
 echo '-- generating configure script...'
--- a/configure.ac
+++ b/configure.ac
@@ -226,14 +226,23 @@ case ${ac_SFW_FP16} in
 esac

 ############### Default to accelerator cshift, but revert to host if UCX is buggy or other reasons
-AC_ARG_ENABLE([accelerator-aware-mpi],
-    [AS_HELP_STRING([--enable-accelerator-aware-mpi=yes|no],[run mpi transfers from device])],
-    [ac_ACCELERATOR_AWARE_MPI=${enable_accelerator_aware_mpi}], [ac_ACCELERATOR_AWARE_MPI=yes])
+AC_ARG_ENABLE([accelerator-cshift],
+    [AS_HELP_STRING([--enable-accelerator-cshift=yes|no],[run cshift on the device])],
+    [ac_ACC_CSHIFT=${enable_accelerator_cshift}], [ac_ACC_CSHIFT=yes])

-case ${ac_ACCELERATOR_AWARE_MPI} in
+AC_ARG_ENABLE([ucx-buggy],
+    [AS_HELP_STRING([--enable-ucx-buggy=yes|no],[enable workaround for UCX device buffer bugs])],
+    [ac_UCXBUGGY=${enable_ucx_buggy}], [ac_UCXBUGGY=no])
+
+case ${ac_UCXBUGGY} in
    yes)
-      AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on host])
-      AC_DEFINE([ACCELERATOR_AWARE_MPI],[1],[ Stencil can use device pointers]);;
+    ac_ACC_CSHIFT=no;;
+    *);;
+esac
+
+case ${ac_ACC_CSHIFT} in
+    yes)
+      AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ UCX device buffer bugs are not present]);;
    *);;
 esac

--- a/examples/Example_plaquette.cc
+++ b/examples/Example_plaquette.cc
@@ -1,183 +0,0 @@
-/* 
- * Example_plaquette.cc                                                               
- * 
- * D. Clarke 
- * 
- * Here I just want to create an incredibly simple main to get started with GRID and get used
- * to its syntax. If the reader is like me, they vaguely understand something about lattice coding,
- * they don't know a ton of C++, don't know much of the fine details, and certainly know nothing about GRID.
- *
- * Once you've made a new executable, like this one, you can bootstrap.sh again. At this point,
- * the code should be able to find your new executable. You can tell that bootstrap.sh worked by
- * having a look at Make.inc. You should see your executable inside there.
- *
- * Warning: This code illustrative only, not well tested, and not meant for production use. The best
- * way to read this code is to start at the main.
- * 
- */
-
-
-// All your mains should have this
-#include <Grid/Grid.h>
-using namespace Grid;
-
-
-// This copies what already exists in WilsonLoops.h. The point here is to be pedagogical and explain in
-// detail what everything does so we can see how GRID works.
-template <class Gimpl> class WLoops : public Gimpl {
-public:
-    // Gimpl seems to be an arbitrary class. Within this class, it is expected that certain types are
-    // already defined, things like Scalar and Field. This macro includes a bunch of #typedefs that
-    // implement this equivalence at compile time.
-    INHERIT_GIMPL_TYPES(Gimpl);
-
-    // Some example Gimpls can be found in GaugeImplementations.h, at the bottom. These are in turn built
-    // out of GaugeImplTypes, which can be found in GaugeImplTypes.h. The GaugeImplTypes contain the base
-    // field/vector/link/whatever types. These inherit from iScalar, iVector, and iMatrix objects, which
-    // are sort of the building blocks for gerenal math objects. The "i" at the beginning of these names
-    // indicates that they should be for internal use only. It seems like these base types have the
-    // acceleration, e.g. SIMD or GPU or what-have-you, abstracted away. How you accelerate these things
-    // appears to be controlled through a template parameter called vtype.
-
-    // The general math/physics objects, such as a color matrix, are built up by nesting these objects.
-    // For instance a general color matrix has two color indices, so it's built up like
-    //     iScalar<iScalar<iMatrix<vtype ...
-    // where the levels going from the inside out are color, spin, then Lorentz indices. Scalars have
-    // no indices, so it's what we use when such an index isn't needed. Lattice objects are made by one
-    // higher level of indexing using iVector.
-
-    // These types will be used for U and U_mu objects, respectively.
-    typedef typename Gimpl::GaugeLinkField GaugeMat;
-    typedef typename Gimpl::GaugeField GaugeLorentz;
-
-    // U_mu_nu(x)
-    static void dirPlaquette(GaugeMat &plaq, const std::vector<GaugeMat> &U, const int mu, const int nu) {
-        // Calls like CovShiftForward and CovShiftBackward have 3 arguments, and they multiply together
-        // the first and last argument. (Second arg gives the shift direction.) The CovShiftIdentityBackward
-        // has meanwhile only two arguments; it just returns the shifted (adjoint since backward) link. 
-        plaq = Gimpl::CovShiftForward(U[mu],mu,
-                   // Means Link*Cshift(field,mu,1), arguments are Link, mu, field in that order.
-                   Gimpl::CovShiftForward(U[nu],nu,
-                       Gimpl::CovShiftBackward(U[mu],mu,
-                           // This means Cshift(adj(Link), mu, -1)
-                           Gimpl::CovShiftIdentityBackward(U[nu], nu))));
-    }
-
-    // tr U_mu_nu(x)
-    static void traceDirPlaquette(ComplexField &plaq, const std::vector<GaugeMat> &U, const int mu, const int nu) {
-        // This .Grid() syntax seems to get the pointer to the GridBase. Apparently this is needed as argument
-        // to instantiate a Lattice object.
-        GaugeMat sp(U[0].Grid());
-        dirPlaquette(sp, U, mu, nu);
-        plaq = trace(sp);
-    }
-
-    // sum_mu_nu tr U_mu_nu(x)
-    static void sitePlaquette(ComplexField &Plaq, const std::vector<GaugeMat> &U) {
-        ComplexField sitePlaq(U[0].Grid());
-        Plaq = Zero();
-        // Nd=4 and Nc=3 are set as global constants in QCD.h
-        for (int mu = 1; mu < Nd; mu++) {
-            for (int nu = 0; nu < mu; nu++) {
-                traceDirPlaquette(sitePlaq, U, mu, nu);
-                Plaq = Plaq + sitePlaq;
-            }
-        }
-    }
-
-    // sum_mu_nu_x Re tr U_mu_nu(x)
-    static RealD sumPlaquette(const GaugeLorentz &Umu) {
-        std::vector<GaugeMat> U(Nd, Umu.Grid());
-        for (int mu = 0; mu < Nd; mu++) {
-            // Umu is a GaugeLorentz object, and as such has a non-trivial Lorentz index. We can
-            // access the element in the mu Lorentz index with this PeekIndex syntax.
-            U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
-        }
-        ComplexField Plaq(Umu.Grid());
-        sitePlaquette(Plaq, U);
-        // I guess this should be the line that sums over all space-time sites.
-        auto Tp = sum(Plaq);
-        // Until now, we have been working with objects inside the tensor nest. This TensorRemove gets
-        // rid of the tensor nest to return whatever is inside.
-        auto p  = TensorRemove(Tp);
-        return p.real();
-    }
-
-    // < Re tr U_mu_nu(x) >
-    static RealD avgPlaquette(const GaugeLorentz &Umu) {
-        // Real double type
-        RealD sumplaq = sumPlaquette(Umu);
-        // gSites() is the number of global sites. there is also lSites() for local sites.
-        double vol = Umu.Grid()->gSites();
-        // The number of orientations. 4*3/2=6 for Nd=4, as known.
-        double faces = (1.0 * Nd * (Nd - 1)) / 2.0;
-        return sumplaq / vol / faces / Nc;
-    }
-};
-
-
-// Next we show an example of how to construct an input parameter class. We first inherit
-// from Serializable. Then all class data members have to be defined using the
-// GRID_SERIALIZABLE_CLASS_MEMBERS macro. This variadic macro allows for arbitrarily many
-// class data members. In the below case, we make a parameter file holding the configuration
-// name. Here, it expects the name to be labeled with "conf_name" in the configuration file. 
-struct ConfParameters: Serializable {
-    GRID_SERIALIZABLE_CLASS_MEMBERS(
-        ConfParameters,
-        std::string, conf_name);
-
-    template <class ReaderClass>
-    ConfParameters(Reader<ReaderClass>& Reader){
-        // If we are reading an XML file, it should be structured like:
-        // <grid>
-        //   <parameters>
-        //     <conf_name>l20t20b06498a_nersc.302500</conf_name>
-        //   </parameters>
-        // </grid>
-        read(Reader, "parameters", *this);
-    }
-};
-
-
-
-// This syntax lets you pass command line arguments to main. An asterisk means that what follows is
-// a pointer. Two asterisks means what follows is a pointer to an array. 
-int main (int argc, char **argv)
-{
-    // This initializes Grid. Some command line options include
-    //   --mpi n.n.n.n
-    //   --threads n
-    //   --grid n.n.n.n
-    Grid_init(&argc, &argv);
-
-    // This is where you would specify a custom lattice size, if not from the command line. Here
-    // Nd is a global quantity that is currently set to 4.
-    Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
-    Coordinate mpi_layout  = GridDefaultMpi();
-    Coordinate latt_size   = GridDefaultLatt();
-
-    // Instantiate the spacetime Grid on which everything will be built.
-    GridCartesian GRID(latt_size,simd_layout,mpi_layout);
-
-    // The PeriodicGimplD type is what you want for gauge matrices. There is also a LatticeGaugeFieldD
-    // type that you can use, which will work perfectly with what follows. 
-    PeriodicGimplD::Field U(&GRID);
-
-    // Here we read in the parameter file params.json to get conf_name. The last argument is what the
-    // top organizational level is called in the param file. 
-    XmlReader Reader("Example_plaquette.xml",false, "grid");
-    ConfParameters param(Reader);  
-
-    // Load a lattice from SIMULATeQCD into U. SIMULATeQCD finds plaquette = 0.6381995717
-    FieldMetaData header;
-    NerscIO::readConfiguration(U, header, param.conf_name);
-
-    // Let's see what we find.
-    RealD plaq = WLoops<PeriodicGimplD>::avgPlaquette(U);
-
-    // This is how you make log messages.
-    std::cout << GridLogMessage << std::setprecision(std::numeric_limits<Real>::digits10 + 1) << "Plaquette = " << plaq << std::endl;
-
-    // To wrap things up.
-    Grid_finalize();
-}
--- a/m4/ax_cxx_compile_stdcxx_14.m4
+++ b/m4/ax_cxx_compile_stdcxx_14.m4
@@ -1,34 +0,0 @@
-# =============================================================================
-#  https://www.gnu.org/software/autoconf-archive/ax_cxx_compile_stdcxx_14.html
-# =============================================================================
-#
-# SYNOPSIS
-#
-#   AX_CXX_COMPILE_STDCXX_14([ext|noext], [mandatory|optional])
-#
-# DESCRIPTION
-#
-#   Check for baseline language coverage in the compiler for the C++14
-#   standard; if necessary, add switches to CXX and CXXCPP to enable
-#   support.
-#
-#   This macro is a convenience alias for calling the AX_CXX_COMPILE_STDCXX
-#   macro with the version set to C++14.  The two optional arguments are
-#   forwarded literally as the second and third argument respectively.
-#   Please see the documentation for the AX_CXX_COMPILE_STDCXX macro for
-#   more information.  If you want to use this macro, you also need to
-#   download the ax_cxx_compile_stdcxx.m4 file.
-#
-# LICENSE
-#
-#   Copyright (c) 2015 Moritz Klammler <moritz@klammler.eu>
-#
-#   Copying and distribution of this file, with or without modification, are
-#   permitted in any medium without royalty provided the copyright notice
-#   and this notice are preserved. This file is offered as-is, without any
-#   warranty.
-
-#serial 5
-
-AX_REQUIRE_DEFINED([AX_CXX_COMPILE_STDCXX])
-AC_DEFUN([AX_CXX_COMPILE_STDCXX_14], [AX_CXX_COMPILE_STDCXX([14], [$1], [$2])])
--- a/scripts/eigen-3.3.5.Tensor.patch
+++ b/scripts/eigen-3.3.5.Tensor.patch
@@ -0,0 +1,19 @@
+--- ./Eigen/unsupported/Eigen/CXX11/Tensor	2018-07-23 10:33:42.000000000 +0100
++++ Tensor	2018-08-28 16:15:56.000000000 +0100
+@@ -25,7 +25,7 @@
+ #include <utility>
+ #endif
+ 
+-#include <Eigen/src/Core/util/DisableStupidWarnings.h>
++#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
+ 
+ #include "../SpecialFunctions"
+ #include "src/util/CXX11Meta.h"
+@@ -147,6 +147,6 @@
+ 
+ #include "src/Tensor/TensorIO.h"
+ 
+-#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
++#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+ 
+ //#endif // EIGEN_CXX11_TENSOR_MODULE
--- a/scripts/prequisites.sh
+++ b/scripts/prequisites.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+#
+# Prepare the Spack-provided toolchain and libraries used to build Grid.
+#
+# Usage: source scripts/prequisites.sh [install]   (see NOTE below)
+#   install          clone Spack into $HOME and build every package listed below
+#   (anything else)  only source the existing $HOME/spack environment
+# Either way the packages are then loaded and their locations exported.
+# NOTE(review): the exports only reach the caller if this file is sourced,
+# so it is presumably meant to be sourced rather than executed - confirm.
+
+# $1 is quoted: unquoted, a missing argument leaves "[ = install ]" and the
+# test aborts with "unary operator expected" instead of evaluating to false.
+if [ "$1" = "install" ]
+then
+    # NOTE(review): dir is saved but never read again; a later cd back to it
+    # was probably intended - confirm before adding one.
+    dir=`pwd`
+    cd $HOME
+    git clone -c feature.manyFiles=true https://github.com/spack/spack.git
+    source $HOME/spack/share/spack/setup-env.sh
+
+    spack install autoconf
+    spack install automake
+    spack install c-lime cppflags=-fPIE
+    spack install fftw
+    spack install llvm
+    spack install gmp
+    spack install mpfr
+    spack install cuda@11.8
+    spack install openmpi
+    spack install openssl
+    spack install hdf5
+else
+    source $HOME/spack/share/spack/setup-env.sh
+fi
+
+# Bring the same set of packages into the current environment.
+spack load autoconf
+spack load automake
+spack load c-lime
+spack load fftw
+spack load llvm
+spack load gmp
+spack load mpfr
+spack load cuda@11.8
+spack load openmpi
+spack load openssl
+spack load hdf5
+
+# Export the second column of the matching "spack find --paths" line per package.
+export FFTW=`spack find --paths fftw    | grep ^fftw   | awk '{print $2}' `
+export HDF5=`spack find --paths hdf5    | grep ^hdf5   | awk '{print $2}' `
+export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
+export MPFR=`spack find --paths mpfr    | grep ^mpfr  | awk '{print $2}' `
+export GMP=`spack find --paths gmp      | grep ^gmp | awk '{print $2}' `
+export NVIDIA=$CUDA_HOME
+export NVIDIALIB=$NVIDIA/targets/x86_64-linux/lib/
+export LD_LIBRARY_PATH=$NVIDIALIB:$FFTW/lib/:$MPFR/lib:$LD_LIBRARY_PATH
--- a/systems/Aurora/benchmarks/bench1024.pbs
+++ b/systems/Aurora/benchmarks/bench1024.pbs
@@ -1,60 +0,0 @@
-#!/bin/bash
-
-## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
-
-#PBS -q EarlyAppAccess
-#PBS -l select=1024
-#PBS -l walltime=01:00:00
-#PBS -A LatticeQCD_aesp_CNDA
-
-#export OMP_PROC_BIND=spread
-#unset OMP_PLACES
-
-cd $PBS_O_WORKDIR
-
-source ../sourceme.sh
-
-cat $PBS_NODEFILE
-
-export OMP_NUM_THREADS=3
-export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
-
-#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
-#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
-#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
-
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
-#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
-export MPICH_OFI_NIC_POLICY=GPU
-export FI_CXI_CQ_FILL_PERCENT=10
-export FI_CXI_DEFAULT_CQ_SIZE=262144
-#export FI_CXI_DEFAULT_CQ_SIZE=131072
-#export FI_CXI_CQ_FILL_PERCENT=20
-
-# 12 ppn, 32 nodes, 384 ranks
-#
-CMD="mpiexec -np 12288 -ppn 12  -envall \
-	     ./gpu_tile_compact.sh \
-	     ./Benchmark_comms_host_device --mpi 8.6.16.16 --grid 64.48.64.284 \
-		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
-
-$CMD 
-
-CMD="mpiexec -np 12288 -ppn 12  -envall \
-	     ./gpu_tile_compact.sh \
-	     ./Benchmark_dwf_fp32 --mpi 8.8.8.24 --grid 128.128.128.384 \
-		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
-$CMD | tee 1024node.dwf.small.cq
-
-CMD="mpiexec -np 12288 -ppn 12  -envall \
-	     ./gpu_tile_compact.sh \
-	     ./Benchmark_dwf_fp32 --mpi 16.8.8.12 --grid 256.256.256.384 \
-		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
-$CMD | tee 1024node.dwf.cq
-
-
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Peter Boyle	b7c7000d0d	Don't need the numerical rounding tolerance in multigrid	2023-12-22 18:10:23 -05:00
Peter Boyle	551f6c4edd	Synchronise changes	2023-12-22 18:09:11 -05:00
Peter Boyle	defd814750	Speed up the coarsened matrix matrix evaluation. It is block project limited. Could be sped up with calls to Batched GEMM and a data layout change.	2023-12-22 18:07:03 -05:00
Peter Boyle	3d517bbd2a	Synchronise decouple from the launch Speeds up multileg stencils	2023-12-22 18:06:13 -05:00
Peter Boyle	78ab955fec	Better padded cell exchange	2023-12-22 18:05:41 -05:00
Peter Boyle	dd13937bb6	Better opt face gather scatter	2023-12-22 18:03:38 -05:00
Peter Boyle	66a1b63aa9	Faster grid/blas layout change. Halo exchange is now the only slow part. Revisit	2023-12-21 20:50:18 -05:00
Peter Boyle	22c611bd1a	Delete temp file	2023-12-21 18:32:31 -05:00
Peter Boyle	c9bb1bf8ea	Passing new BLAs based	2023-12-21 18:31:17 -05:00
Peter Boyle	9e489887cf	General coarse multiRHS move to BLAS implementation	2023-12-21 15:24:48 -05:00
Peter Boyle	9feb801bb9	Much simpler GPU implementation	2023-12-21 15:24:06 -05:00
Peter Boyle	c00b495933	Multigrid	2023-12-21 15:23:31 -05:00
Peter Boyle	d22eebe553	BLas options	2023-12-21 15:23:03 -05:00
Peter Boyle	8bcbd82680	BLAS based layout and implementation	2023-12-21 15:21:24 -05:00
Peter Boyle	dfa617c439	Batched SGEMM/DGEMM/ZGEMM/CGEMM Hip, Cuda version and vanilla CPU One MKL stub in comments, to be tested as different.	2023-12-21 14:01:18 -05:00
Peter Boyle	48d1f0df89	Optimised partially, working	2023-12-21 12:33:47 -05:00
Peter Boyle	b75cb7a12c	Blas batched partial implementation on Frontier only for now	2023-12-21 12:31:33 -05:00
Peter Boyle	332563e037	Debugged, reducing verbose	2023-12-21 12:30:57 -05:00
Peter Boyle	0cce97a4fe	verbosity only	2023-12-20 21:30:10 -05:00
Peter Boyle	95a8e4be64	rocblas	2023-12-20 21:27:59 -05:00
Peter Boyle	abcd6b8cb6	Faster version	2023-12-19 15:17:46 -05:00
Peter Boyle	e8f21c9b6d	Memmory verbose control improvement	2023-12-19 15:16:58 -05:00
Peter Boyle	e054078b11	Verbose	2023-12-05 16:15:17 -05:00
Peter Boyle	6835a7f208	Better logging, test on 81 point stencil	2023-11-29 19:20:47 -05:00
Peter Boyle	f59993b979	Nbasis§	2023-11-29 09:47:36 -05:00
Peter Boyle	2290b8f680	Verbose	2023-11-29 09:47:04 -05:00
Peter Boyle	2c54be651c	Further updates	2023-11-29 09:43:29 -05:00
Peter Boyle	e859a199df	Reduce volume to interior for coarse stencil -- worth up to 4x gain	2023-11-28 10:23:16 -05:00
Peter Boyle	0a3682ad0b	MultiRHS work	2023-11-28 07:43:37 -05:00
Peter Boyle	59abaeb5cd	Time stamp	2023-11-24 12:56:45 -05:00
Peter Boyle	3e448435d3	Restrict to interior	2023-11-23 18:23:29 -05:00
Peter Boyle	a294bc3c5b	Relax constraints for multiRHS	2023-11-23 18:20:42 -05:00
Peter Boyle	b302ad3d49	multiRHS test in place, passes Yay!	2023-11-23 18:20:15 -05:00
Peter Boyle	82fc4b1e94	Finalise	2023-11-23 18:19:41 -05:00
Peter Boyle	b4f1740380	Finalise message	2023-11-23 18:19:16 -05:00
Peter Boyle	031f85247c	multRHS initial support -- needs optimisation for multi project/promote. Bug fix in freeing intermediate grids to stop double free	2023-11-23 18:18:35 -05:00
Peter Boyle	639cc6f73a	better support for multiRHS coarse space Still to add restriction of domain of last loop to interior of padded cell (expect about 4.5x on test volume on Crusher)	2023-11-23 18:16:26 -05:00
Peter Boyle	09946cf1ba	Improved, works on 48^3 moving to multiRHS optimisations	2023-11-15 18:03:05 -05:00
Peter Boyle	f4fa95e7cb	Use 5.3.0	2023-11-15 18:01:38 -05:00
Peter Boyle	100e29e35e	Allow expression as argument to norm2	2023-11-15 18:00:44 -05:00
Peter Boyle	4cbe471a83	devVector	2023-11-15 18:00:07 -05:00
Peter Boyle	8bece1f861	Faster to transpose the matrix and apply with column major order	2023-11-15 17:58:38 -05:00
Peter Boyle	a3ca71ec01	Lots more setup options, still working on them	2023-11-15 17:58:04 -05:00
Peter Boyle	e0543e8af5	Implement flexible preconditioned CG	2023-11-15 17:57:39 -05:00
Peter Boyle	c1eb80d01a	Print which have converged	2023-11-15 17:57:08 -05:00
Peter Boyle	a26121d97b	Better printing	2023-11-15 17:56:45 -05:00
Peter Boyle	043031a757	Report resid on failed convergence	2023-11-15 17:56:22 -05:00
Peter Boyle	807aeebe4c	Resize tol in constructor	2023-11-15 17:55:57 -05:00
Peter Boyle	8aa1a37aad	For Mirs preconditioner solver	2023-11-15 17:55:32 -05:00
Peter Boyle	4efa042f50	C++17 change	2023-10-24 10:57:50 -04:00
Peter Boyle	c7cb37e970	c++17 accepted	2023-10-24 10:57:24 -04:00
Peter Boyle	d34b207eab	Avoid HIP warnings	2023-10-24 10:57:04 -04:00
Peter Boyle	0e6fa6f6b8	DOn't need the Cshift for the period optimisation	2023-10-24 10:56:31 -04:00
Peter Boyle	38b87de53f	This works around a stacksize limit on AMD GPU	2023-10-24 10:56:07 -04:00
Peter Boyle	aa5047a9e4	Faster blockProject blockPromote	2023-10-24 10:49:55 -04:00
Peter Boyle	24b6ee0df9	M4 file	2023-10-24 10:36:48 -04:00
Peter Boyle	1e79cc9cbe	Avoid compiler error	2023-10-24 10:36:09 -04:00
Peter Boyle	b3925df9c3	Verbose on CPU-GPU xfer, remove performance by default	2023-10-24 10:25:01 -04:00
Peter Boyle	351795ac3a	Better messaging	2023-10-20 19:33:04 -04:00
Peter Boyle	9c9c42d0df	Tests on frontier with real speed up . 3.5x on 16^3 at mq=0.01	2023-10-20 19:27:13 -04:00
Peter Boyle	b6ad1bafc7	Normal memory SendToRecvFrom asynchronous for use in general stencil code	2023-10-20 19:27:13 -04:00
Peter Boyle	a5ca40f446	Better verbose -- track CPU GPU motion under --log Memory, others go to debug output stream	2023-10-20 19:27:13 -04:00
Peter Boyle	9ab54c5565	Overlap comms & data copy/buffer assembly in Ghost zone exchange	2023-10-20 19:27:13 -04:00
Peter Boyle	4341d96bde	Massively sped up coarse grid mult, comms Save 3ms spend (60% of time !) on cudaMalloc !!	2023-10-20 19:27:13 -04:00
Peter Boyle	5fac47a26d	Faster halo exchange	2023-10-20 19:27:13 -04:00
Peter Boyle	e064f17346	Faster halo exchange	2023-10-20 19:27:13 -04:00
Peter Boyle	afe10ba2a2	More digits	2023-10-20 19:27:13 -04:00
Peter Boyle	7cc3435ba8	Imporved General coarsened matrix	2023-10-20 19:27:13 -04:00
Peter Boyle	541772313c	Verbosity	2023-10-20 19:27:13 -04:00
Peter Boyle	3747494a09	Notify delet public	2023-10-20 19:27:13 -04:00
Peter Boyle	f2b98d0dcc	Const safety	2023-10-20 19:27:13 -04:00
Peter Boyle	80471bf762	Alternate implementation involving face operations	2023-10-20 19:27:13 -04:00
Peter Boyle	a06f63c110	Improved I/O and non-lexico option exposed to SciDAC format	2023-10-20 19:27:13 -04:00
Peter Boyle	0ae4478cd9	Checkpoint the subspace and ldop	2023-10-20 19:27:13 -04:00
Peter Boyle	ae4e705e09	Use random vec as easier for debug	2023-10-20 19:27:13 -04:00
Peter Boyle	f5dcea9dbf	Updates for Frontier	2023-10-20 19:27:12 -04:00
Peter Boyle	2207309f8a	Spack rules	2023-10-16 18:38:24 -04:00
Peter Boyle	2111e7ab5f	Run at physical mass	2023-10-06 21:20:21 -04:00
Peter Boyle	d29abfdcaf	Transfer code to Frontier now	2023-10-06 21:03:34 -04:00
Peter Boyle	a751c42cc5	Checkpoint restore the setup	2023-10-06 21:03:08 -04:00
Peter Boyle	6a3bc9865e	Verbose change	2023-10-06 21:02:04 -04:00
Peter Boyle	4d5f7e4377	Verbose change	2023-10-06 21:01:37 -04:00
Peter Boyle	78b117fb78	Comment fix	2023-10-06 21:01:15 -04:00
Peter Boyle	ded63a1319	Verbose change/pretty print	2023-10-06 21:00:53 -04:00
Peter Boyle	df3e4d1e9c	Return fix	2023-10-06 21:00:21 -04:00
Peter Boyle	b58fd80379	I/O for coarse op and reorganise multigrid headers	2023-10-06 13:43:46 -04:00
Peter Boyle	7f6e0f57d0	No IO in file	2023-10-06 13:39:53 -04:00
Peter Boyle	cae27678d8	gpermute	2023-10-06 13:39:19 -04:00
Peter Boyle	48ff655bad	Slightly less verbose	2023-10-06 10:47:52 -04:00
Peter Boyle	2525ad4623	Slight clean up	2023-10-06 10:47:32 -04:00
Peter Boyle	e7020017c5	Reorganise multigrid	2023-10-06 10:47:12 -04:00
Peter Boyle	eacebfad74	Reorganise multigrid into multiple headers	2023-10-06 10:46:21 -04:00
Peter Boyle	3bc2da5321	Merge branch 'feature/scidac-wp1' of https://github.com/paboyle/Grid into feature/scidac-wp1	2023-10-05 16:57:59 -04:00
Peter Boyle	2d710d6bfd	Optimised parameters for 16^3	2023-10-05 16:56:55 -04:00
Peter Boyle	6532b7f32b	Eliminate older inefficient coarsening implementation	2023-10-05 16:56:15 -04:00
Peter Boyle	7b41b92d99	Only need to bad non-local dimensions	2023-10-05 16:55:48 -04:00
Peter Boyle	dd557af84b	ADEF1 and ADEF2 2 level CG	2023-10-05 16:55:19 -04:00
Peter Boyle	59b9d0e030	coalesceRead the blockSum	2023-10-05 16:54:48 -04:00
Peter Boyle	b82eee4733	Hermitian dealing with	2023-10-05 16:54:14 -04:00
Peter Boyle	6a87487544	Running on Frontier, fix RNG big volume y2k, affecting 5D RNG	2023-10-05 16:50:59 -04:00
Peter Boyle	fcf5023845	Running on Frontier	2023-10-05 16:50:59 -04:00
Peter Boyle	c8adad6d8b	First runs on Summit. PopulateAdag needs work	2023-10-05 16:50:54 -04:00
Peter Boyle	737d3ffb98	ADEF1 and 1 hop projection	2023-10-03 14:22:18 -04:00
Peter Boyle	b01e67bab1	coalescedReadGeneralPermute now working	2023-10-02 17:46:57 -04:00
Peter Boyle	8a70314f54	Merge branch 'develop' into feature/scidac-wp1	2023-10-02 17:24:55 -04:00
Peter Boyle	36ae6e5aba	Fastest GPU version. Need to work on the PaddedCell now to make much faster	2023-09-29 18:26:51 -04:00
Peter Boyle	9db585cfeb	Temporary commit while optimisation is carried out	2023-09-29 17:11:35 -04:00
Peter Boyle	c564611ba7	Annoying hack that is useful to preserve for profiling	2023-09-29 17:11:12 -04:00
Peter Boyle	e187bcb85c	Updating	2023-09-29 17:10:17 -04:00
Peter Boyle	be18ffe3b4	Further tuning and lanczos	2023-09-27 16:21:58 -04:00
Peter Boyle	0d63dce4e2	Timing info	2023-09-27 16:21:14 -04:00
Peter Boyle	26b30e1551	Flop count and projection to nearest neighbour (keeps redundant flops)	2023-09-27 16:20:11 -04:00
Peter Boyle	7fc58ac293	Verbose subspace init	2023-09-27 16:19:45 -04:00
Peter Boyle	3a86cce8c1	Compile	2023-09-27 16:19:18 -04:00
Peter Boyle	37884d369f	Coarse space is expensive, but gives a speed up in fine matrix multiplies now. Down to optimisation	2023-09-25 17:24:19 -04:00
Peter Boyle	9246e653cd	Basic non-local coarsening of operator test	2023-09-25 17:20:58 -04:00
Peter Boyle	64283c8673	Normal equations becomes linear function for easy base class pass aroudn	2023-09-25 17:19:39 -04:00
Peter Boyle	755002da9c	Comparison convenience	2023-09-25 17:16:33 -04:00
Peter Boyle	31b8e8b437	Better messaging	2023-09-25 17:16:14 -04:00
Peter Boyle	0ec0de97e6	Adef2 implemented and working in an HDCG like context	2023-09-25 17:15:03 -04:00
Peter Boyle	6c3ade5d89	Improved the coarsening	2023-09-25 17:14:40 -04:00
Peter Boyle	980c5f9a34	Update chebyshev setup	2023-09-25 17:12:22 -04:00
Peter Boyle	471ca5f281	Power method more iterations	2023-09-07 10:55:05 -04:00
Peter Boyle	e82ddcff5d	Working getting closer to HDCG but some low level engineering work still needed + MUCH work on optimisation	2023-09-07 10:53:51 -04:00
Peter Boyle	b9dcad89e8	Test cases for coarsening with non-local stencil	2023-09-07 10:53:22 -04:00
Peter Boyle	993f43ef4a	Even odd use case	2023-09-07 10:53:06 -04:00
Peter Boyle	2b43308208	First cut non-local coarsening	2023-08-25 17:38:07 -04:00
Peter Boyle	04a1ac3a76	First cut for non-local coarsening	2023-08-25 17:37:38 -04:00
Peter Boyle	990b8798bd	Merge remote-tracking branch 'refs/remotes/origin/develop' into develop	2023-08-25 17:36:45 -04:00
Peter Boyle	b334a73a44	Stencil improvement	2023-08-25 17:35:10 -04:00
Peter Boyle	5d113d1c70	Odd address sanitizer complain	2023-08-25 17:34:18 -04:00
Peter Boyle	c14977aeab	Random vector option for test purposes	2023-08-25 17:33:31 -04:00
Peter Boyle	3e94838204	Spread out improvement	2023-08-25 17:31:28 -04:00
Peter Boyle	c0a0b8ca62	NEON and address sanitiser	2023-08-25 17:30:30 -04:00