mirror of
https://github.com/paboyle/Grid.git
synced 2025-10-15 21:54:43 +01:00
Compare commits
130 Commits
feature/de
...
feature/S2
Author | SHA1 | Date | |
---|---|---|---|
|
c7b74db317 | ||
|
0ce201efbe | ||
|
6d8a3d8bb2 | ||
|
7dfd207ebb | ||
|
3a65a096f2 | ||
|
85b2bd4c93 | ||
|
35e10a1159 | ||
d418f78352 | |||
25163998a0 | |||
|
dc546aaa4b | ||
|
5364d580c9 | ||
|
2a9a6347e3 | ||
|
cfdb56f314 | ||
|
b517e88db3 | ||
bb317aba8d | |||
644cc6647e | |||
72397ce23b | |||
|
d60a80c098 | ||
|
bb8b6d9d73 | ||
|
677b4cc5b0 | ||
|
be565ffab6 | ||
|
df6120e5f6 | ||
|
21de6f7da8 | ||
|
dbe39f9ce0 | ||
|
ab3de50d5e | ||
|
c545bd2139 | ||
|
6a1c64fbdd | ||
|
b75809ed61 | ||
|
ecaf228e5c | ||
|
6d015ae8fc | ||
|
233150d93f | ||
|
7af8c77a52 | ||
|
a957e7bfa1 | ||
|
cee4c8ce8c | ||
|
96bf814d8c | ||
|
7ddc422788 | ||
|
e652fc2825 | ||
|
a49fa3f8d0 | ||
|
cd452a2f91 | ||
|
4f89f603ae | ||
|
11dc2c5e1d | ||
|
6fec3c15ca | ||
|
938c47480f | ||
|
3811d19298 | ||
|
83a3ab6b6f | ||
|
d66a9af6a3 | ||
|
adc90d3a86 | ||
|
ebbd015c5c | ||
|
4ab73b36b2 | ||
|
130e07a422 | ||
|
8f47bb367e | ||
|
0c3cb60135 | ||
|
9eae8fca5d | ||
|
882a217074 | ||
|
e465fce201 | ||
|
d41542c64b | ||
|
199818bd6c | ||
|
fe66c7ca30 | ||
|
e9177e4af3 | ||
|
d15a6c5933 | ||
25ab9325e7 | |||
19f9378b98 | |||
|
785bc7a14f | ||
|
1a1fe85428 | ||
|
0000d2e558 | ||
|
9ffd1ed4ce | ||
|
3d014864e2 | ||
1d22841811 | |||
|
a1cdda833f | ||
|
ad6db92690 | ||
|
e8ff9d8e50 | ||
|
795769c636 | ||
|
267a39d943 | ||
|
3624bd3d22 | ||
|
bc12dbbb38 | ||
|
eb8a008a8f | ||
c4d9aa1a21 | |||
6ae809ed40 | |||
|
311e2aab3f | ||
438dfbdb83 | |||
b2ce760cf4 | |||
|
b1ba209696 | ||
|
cb3e529b1e | ||
|
717f647418 | ||
|
98e7418187 | ||
|
fe05bf48b1 | ||
|
d2dd8f54e2 | ||
|
7726ee4b16 | ||
ba9bbe0221 | |||
4c3dd82d84 | |||
44e911b5b7 | |||
a7a16df9d0 | |||
382e0abefd | |||
6fdefe5b90 | |||
4788dd8e2e | |||
1cc5f221f3 | |||
93251bfba0 | |||
18b79508b8 | |||
4de5ed1613 | |||
0baaddbe98 | |||
|
8729c46169 | ||
|
09f81fe7c3 | ||
|
1876e5b7c0 | ||
|
355ec76257 | ||
b50fb34e71 | |||
de84d730ff | |||
|
c74d11e3d7 | ||
|
84cab5e6e7 | ||
c4fc972fec | |||
|
4f17c8d081 | ||
|
aaab753982 | ||
|
570b72a47b | ||
|
a5798a89ed | ||
|
3f3661a86f | ||
|
f7e2f9a401 | ||
|
2848a9b558 | ||
|
d4868991af | ||
|
e99d42404e | ||
|
3ba019c747 | ||
|
47429218bb | ||
|
5a4f9bf2e3 | ||
|
f617468e04 | ||
|
ee4046fe92 | ||
|
2a9cfeb9ea | ||
|
1147b8ea40 | ||
|
3f9119b39d | ||
|
35e8225abd | ||
|
bdbfbb7a14 | ||
|
f7d4be8d96 | ||
|
8d305df0db |
@@ -37,6 +37,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
#include <Grid/qcd/QCD.h>
|
||||
#include <Grid/qcd/spin/Spin.h>
|
||||
#include <Grid/qcd/gparity/Gparity.h>
|
||||
#include <Grid/qcd/spin/Pauli.h> // depends on Gparity
|
||||
#include <Grid/qcd/utils/Utils.h>
|
||||
#include <Grid/qcd/representations/Representations.h>
|
||||
NAMESPACE_CHECK(GridQCDCore);
|
||||
|
@@ -191,7 +191,7 @@ public:
|
||||
|
||||
Lattice<sobj> pgbuf(&pencil_g);
|
||||
autoView(pgbuf_v , pgbuf, CpuWrite);
|
||||
std::cout << "CPU view" << std::endl;
|
||||
//std::cout << "CPU view" << std::endl;
|
||||
|
||||
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
|
||||
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
|
||||
@@ -215,7 +215,7 @@ public:
|
||||
else if ( sign == forward ) div = 1.0;
|
||||
else assert(0);
|
||||
|
||||
std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
|
||||
//std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
|
||||
FFTW_plan p;
|
||||
{
|
||||
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
|
||||
@@ -229,7 +229,7 @@ public:
|
||||
}
|
||||
|
||||
// Barrel shift and collect global pencil
|
||||
std::cout << GridLogPerformance<<"Making pencil" << std::endl;
|
||||
//std::cout << GridLogPerformance<<"Making pencil" << std::endl;
|
||||
Coordinate lcoor(Nd), gcoor(Nd);
|
||||
result = source;
|
||||
int pc = processor_coor[dim];
|
||||
@@ -251,7 +251,7 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
|
||||
//std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
|
||||
// Loop over orthog coords
|
||||
int NN=pencil_g.lSites();
|
||||
GridStopWatch timer;
|
||||
@@ -274,7 +274,7 @@ public:
|
||||
usec += timer.useconds();
|
||||
flops+= flops_call*NN;
|
||||
|
||||
std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
|
||||
//std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
|
||||
// writing out result
|
||||
{
|
||||
autoView(pgbuf_v,pgbuf,CpuRead);
|
||||
@@ -291,7 +291,7 @@ public:
|
||||
}
|
||||
result = result*div;
|
||||
|
||||
std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
|
||||
//std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
|
||||
// destroying plan
|
||||
FFTW<scalar>::fftw_destroy_plan(p);
|
||||
#endif
|
||||
|
@@ -277,6 +277,38 @@ public:
|
||||
assert(0);
|
||||
}
|
||||
};
|
||||
template<class Matrix,class Field>
|
||||
class ShiftedNonHermitianLinearOperator : public LinearOperatorBase<Field> {
|
||||
Matrix &_Mat;
|
||||
RealD shift;
|
||||
public:
|
||||
ShiftedNonHermitianLinearOperator(Matrix &Mat,RealD shft): _Mat(Mat),shift(shft){};
|
||||
// Support for coarsening to a multigrid
|
||||
void OpDiag (const Field &in, Field &out) {
|
||||
_Mat.Mdiag(in,out);
|
||||
out = out + shift*in;
|
||||
}
|
||||
void OpDir (const Field &in, Field &out,int dir,int disp) {
|
||||
_Mat.Mdir(in,out,dir,disp);
|
||||
}
|
||||
void OpDirAll (const Field &in, std::vector<Field> &out){
|
||||
_Mat.MdirAll(in,out);
|
||||
};
|
||||
void Op (const Field &in, Field &out){
|
||||
_Mat.M(in,out);
|
||||
out = out + shift * in;
|
||||
}
|
||||
void AdjOp (const Field &in, Field &out){
|
||||
_Mat.Mdag(in,out);
|
||||
out = out + shift * in;
|
||||
}
|
||||
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
|
||||
assert(0);
|
||||
}
|
||||
void HermOp(const Field &in, Field &out){
|
||||
assert(0);
|
||||
}
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////
|
||||
// Even Odd Schur decomp operators; there are several
|
||||
|
@@ -269,7 +269,9 @@ public:
|
||||
RealD xscale = 2.0/(hi-lo);
|
||||
RealD mscale = -(hi+lo)/(hi-lo);
|
||||
Linop.HermOp(T0,y);
|
||||
grid->Barrier();
|
||||
axpby(T1,xscale,mscale,y,in);
|
||||
grid->Barrier();
|
||||
|
||||
// sum = .5 c[0] T0 + c[1] T1
|
||||
// out = ()*T0 + Coeffs[1]*T1;
|
||||
|
@@ -208,8 +208,8 @@ public:
|
||||
assert(Bkn.size()==batchCount);
|
||||
assert(Cmn.size()==batchCount);
|
||||
|
||||
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
|
||||
assert(OpB!=GridBLAS_OP_T);
|
||||
//assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
|
||||
//assert(OpB!=GridBLAS_OP_T);
|
||||
|
||||
int lda = m; // m x k column major
|
||||
int ldb = k; // k x n column major
|
||||
@@ -367,28 +367,67 @@ public:
|
||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk.adjoint() * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk.transpose() * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
|
||||
else
|
||||
eCmn = alpha * eAmk * eBkn.adjoint() ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||
else
|
||||
eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||
} );
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
else
|
||||
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
} );
|
||||
} else {
|
||||
assert(0);
|
||||
@@ -414,8 +453,8 @@ public:
|
||||
RealD t2=usecond();
|
||||
int32_t batchCount = Amk.size();
|
||||
|
||||
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
|
||||
assert(OpB!=GridBLAS_OP_T);
|
||||
//assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
|
||||
//assert(OpB!=GridBLAS_OP_T);
|
||||
|
||||
int lda = m; // m x k column major
|
||||
int ldb = k; // k x n column major
|
||||
@@ -514,28 +553,70 @@ public:
|
||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk.adjoint() * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk.transpose() * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
|
||||
else
|
||||
eCmn = alpha * eAmk * eBkn.adjoint() ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||
else
|
||||
eCmn = alpha * eAmk * eBkn.transpose() ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||
else
|
||||
eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||
} );
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
else
|
||||
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
} );
|
||||
} else {
|
||||
assert(0);
|
||||
@@ -661,29 +742,41 @@ public:
|
||||
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk.transpose() * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||
else
|
||||
eCmn = alpha * eAmk * eBkn.transpose() ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
} );
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
else
|
||||
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
});
|
||||
} else {
|
||||
assert(0);
|
||||
}
|
||||
@@ -809,28 +902,40 @@ public:
|
||||
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk.transpose() * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||
else
|
||||
eCmn = alpha * eAmk * eBkn.transpose() ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
else
|
||||
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
});
|
||||
} else {
|
||||
assert(0);
|
||||
|
@@ -144,11 +144,11 @@ public:
|
||||
acceleratorCopyDeviceToDevice(&BLAS_Y[offset],&y_v[0],sizeof(scalar_object)*vol);
|
||||
}
|
||||
RealD t4 = usecond();
|
||||
std::cout << "MulMatrix alloc took "<< t1-t0<<" us"<<std::endl;
|
||||
std::cout << "MulMatrix preamble took "<< t2-t1<<" us"<<std::endl;
|
||||
std::cout << "MulMatrix blas took "<< t3-t2<<" us"<<std::endl;
|
||||
std::cout << "MulMatrix copy took "<< t4-t3<<" us"<<std::endl;
|
||||
std::cout << "MulMatrix total "<< t4-t0<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance << "MulMatrix alloc took "<< t1-t0<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "MulMatrix preamble took "<< t2-t1<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "MulMatrix blas took "<< t3-t2<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "MulMatrix copy took "<< t4-t3<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "MulMatrix total "<< t4-t0<<" us"<<std::endl;
|
||||
}
|
||||
|
||||
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y)
|
||||
@@ -242,16 +242,16 @@ public:
|
||||
RealD flops = 8.0*M*N*K;
|
||||
flops = flops/(t4-t3)/1.e3;
|
||||
bytes = bytes/(t4-t3)/1.e3;
|
||||
std::cout << "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
|
||||
std::cout << "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
|
||||
std::cout << "InnerProductMatrix cp t2 "<< t2-t1<<" us"<<std::endl;
|
||||
std::cout << "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
|
||||
std::cout << "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
|
||||
std::cout << "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
|
||||
std::cout << "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
|
||||
std::cout << "InnerProductMatrix gsum t5 "<< t5-t4<<" us"<<std::endl;
|
||||
std::cout << "InnerProductMatrix cp t6 "<< t6-t5<<" us"<<std::endl;
|
||||
std::cout << "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t5 "<< t5-t4<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t6 "<< t6-t5<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
|
||||
#else
|
||||
int nrhs;
|
||||
GridBase *grid;
|
||||
@@ -358,17 +358,17 @@ public:
|
||||
flops = flops/(t4-t3)/1.e3;
|
||||
bytes = bytes/(t4-t3)/1.e3;
|
||||
xybytes = 4*xybytes/(t2-t1)/1.e3;
|
||||
std::cout << "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
|
||||
std::cout << "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
|
||||
std::cout << "InnerProductMatrix cp t2 "<< t2-t1<<" us "<<xybytes<<" GB/s"<<std::endl;
|
||||
std::cout << "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
|
||||
std::cout << "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
|
||||
std::cout << "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
|
||||
std::cout << "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
|
||||
std::cout << "InnerProductMatrix cp t5 "<< t5-t4<<" us"<<std::endl;
|
||||
std::cout << "InnerProductMatrix lsum t6l "<< t6l-t5<<" us"<<std::endl;
|
||||
std::cout << "InnerProductMatrix gsum t6 "<< t6-t6l<<" us"<<std::endl;
|
||||
std::cout << "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us "<<xybytes<<" GB/s"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t5 "<< t5-t4<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix lsum t6l "<< t6l-t5<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t6 "<< t6-t6l<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
@@ -63,7 +63,12 @@ class TwoLevelCGmrhs
|
||||
GridStopWatch SmoothTimer;
|
||||
GridStopWatch InsertTimer;
|
||||
|
||||
|
||||
/*
|
||||
Field rrr;
|
||||
Field sss;
|
||||
Field qqq;
|
||||
Field zzz;
|
||||
*/
|
||||
// more most opertor functions
|
||||
TwoLevelCGmrhs(RealD tol,
|
||||
Integer maxit,
|
||||
@@ -74,6 +79,12 @@ class TwoLevelCGmrhs
|
||||
MaxIterations(maxit),
|
||||
_FineLinop(FineLinop),
|
||||
_Smoother(Smoother)
|
||||
/*
|
||||
rrr(fine),
|
||||
sss(fine),
|
||||
qqq(fine),
|
||||
zzz(fine)
|
||||
*/
|
||||
{
|
||||
grid = fine;
|
||||
};
|
||||
@@ -81,8 +92,8 @@ class TwoLevelCGmrhs
|
||||
// Vector case
|
||||
virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
|
||||
{
|
||||
SolveSingleSystem(src,x);
|
||||
// SolvePrecBlockCG(src,x);
|
||||
// SolveSingleSystem(src,x);
|
||||
SolvePrecBlockCG(src,x);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@@ -657,6 +668,8 @@ public:
|
||||
CoarseField PleftProjMrhs(this->coarsegridmrhs);
|
||||
CoarseField PleftMss_projMrhs(this->coarsegridmrhs);
|
||||
|
||||
// this->rrr=in[0];
|
||||
|
||||
#undef SMOOTHER_BLOCK_SOLVE
|
||||
#if SMOOTHER_BLOCK_SOLVE
|
||||
this->SmoothTimer.Start();
|
||||
@@ -669,6 +682,7 @@ public:
|
||||
this->SmoothTimer.Stop();
|
||||
}
|
||||
#endif
|
||||
// this->sss=Min[0];
|
||||
|
||||
for(int rhs=0;rhs<nrhs;rhs++) {
|
||||
|
||||
@@ -705,9 +719,11 @@ public:
|
||||
this->_Projector.blockPromote(tmp,PleftMss_proj);// tmp= Q[in - A Min]
|
||||
this->PromoteTimer.Stop();
|
||||
this->FineTimer.Start();
|
||||
// this->qqq=tmp[0];
|
||||
for(int rhs=0;rhs<nrhs;rhs++) {
|
||||
axpy(out[rhs],1.0,Min[rhs],tmp[rhs]); // Min+tmp
|
||||
}
|
||||
// this->zzz=out[0];
|
||||
this->FineTimer.Stop();
|
||||
}
|
||||
};
|
||||
|
@@ -245,9 +245,10 @@ until convergence
|
||||
_HermOp(src_n,tmp);
|
||||
// std::cout << GridLogMessage<< tmp<<std::endl; exit(0);
|
||||
// std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl;
|
||||
RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
|
||||
// RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
|
||||
RealD vnum = real(innerProduct(tmp,tmp)); // HermOp^2.
|
||||
RealD vden = norm2(src_n);
|
||||
RealD na = vnum/vden;
|
||||
RealD na = std::sqrt(vnum/vden);
|
||||
if (fabs(evalMaxApprox/na - 1.0) < 0.0001)
|
||||
i=_MAX_ITER_IRL_MEVAPP_;
|
||||
evalMaxApprox = na;
|
||||
@@ -255,6 +256,7 @@ until convergence
|
||||
src_n = tmp;
|
||||
}
|
||||
}
|
||||
std::cout << GridLogIRL << " Final evalMaxApprox " << evalMaxApprox << std::endl;
|
||||
|
||||
std::vector<RealD> lme(Nm);
|
||||
std::vector<RealD> lme2(Nm);
|
||||
|
@@ -74,7 +74,7 @@ public:
|
||||
|
||||
void operator() (const Field &src, Field &psi){
|
||||
|
||||
psi=Zero();
|
||||
// psi=Zero();
|
||||
RealD cp, ssq,rsq;
|
||||
ssq=norm2(src);
|
||||
rsq=Tolerance*Tolerance*ssq;
|
||||
|
@@ -30,6 +30,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
/* END LEGAL */
|
||||
#pragma once
|
||||
|
||||
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
inline RealD AggregatePowerLaw(RealD x)
|
||||
@@ -95,7 +97,7 @@ public:
|
||||
|
||||
RealD scale;
|
||||
|
||||
ConjugateGradient<FineField> CG(1.0e-2,100,false);
|
||||
ConjugateGradient<FineField> CG(1.0e-3,400,false);
|
||||
FineField noise(FineGrid);
|
||||
FineField Mn(FineGrid);
|
||||
|
||||
@@ -108,7 +110,7 @@ public:
|
||||
|
||||
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
|
||||
|
||||
for(int i=0;i<1;i++){
|
||||
for(int i=0;i<4;i++){
|
||||
|
||||
CG(hermop,noise,subspace[b]);
|
||||
|
||||
@@ -124,6 +126,53 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
virtual void CreateSubspaceGCR(GridParallelRNG &RNG,LinearOperatorBase<FineField> &DiracOp,int nn=nbasis)
|
||||
{
|
||||
RealD scale;
|
||||
|
||||
TrivialPrecon<FineField> simple_fine;
|
||||
PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,30,DiracOp,simple_fine,12,12);
|
||||
FineField noise(FineGrid);
|
||||
FineField src(FineGrid);
|
||||
FineField guess(FineGrid);
|
||||
FineField Mn(FineGrid);
|
||||
|
||||
for(int b=0;b<nn;b++){
|
||||
|
||||
subspace[b] = Zero();
|
||||
gaussian(RNG,noise);
|
||||
scale = std::pow(norm2(noise),-0.5);
|
||||
noise=noise*scale;
|
||||
|
||||
DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;
|
||||
|
||||
for(int i=0;i<2;i++){
|
||||
// void operator() (const Field &src, Field &psi){
|
||||
#if 1
|
||||
std::cout << GridLogMessage << " inverting on noise "<<std::endl;
|
||||
src = noise;
|
||||
guess=Zero();
|
||||
GCR(src,guess);
|
||||
subspace[b] = guess;
|
||||
#else
|
||||
std::cout << GridLogMessage << " inverting on zero "<<std::endl;
|
||||
src=Zero();
|
||||
guess = noise;
|
||||
GCR(src,guess);
|
||||
subspace[b] = guess;
|
||||
#endif
|
||||
noise = subspace[b];
|
||||
scale = std::pow(norm2(noise),-0.5);
|
||||
noise=noise*scale;
|
||||
|
||||
}
|
||||
|
||||
DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|Op|f> "<<innerProduct(noise,Mn)<<std::endl;
|
||||
subspace[b] = noise;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
|
||||
// and this is the best I found
|
||||
@@ -160,14 +209,21 @@ public:
|
||||
|
||||
int b =0;
|
||||
{
|
||||
ComplexD ip;
|
||||
// Filter
|
||||
Chebyshev<FineField> Cheb(lo,hi,orderfilter);
|
||||
Cheb(hermop,noise,Mn);
|
||||
// normalise
|
||||
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
|
||||
subspace[b] = Mn;
|
||||
hermop.Op(Mn,tmp);
|
||||
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
|
||||
|
||||
hermop.Op(Mn,tmp);
|
||||
ip= innerProduct(Mn,tmp);
|
||||
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
|
||||
|
||||
hermop.AdjOp(Mn,tmp);
|
||||
ip = innerProduct(Mn,tmp);
|
||||
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
|
||||
b++;
|
||||
}
|
||||
|
||||
@@ -213,8 +269,18 @@ public:
|
||||
Mn=*Tnp;
|
||||
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
|
||||
subspace[b] = Mn;
|
||||
hermop.Op(Mn,tmp);
|
||||
std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
|
||||
|
||||
|
||||
ComplexD ip;
|
||||
|
||||
hermop.Op(Mn,tmp);
|
||||
ip= innerProduct(Mn,tmp);
|
||||
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
|
||||
|
||||
hermop.AdjOp(Mn,tmp);
|
||||
ip = innerProduct(Mn,tmp);
|
||||
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
|
||||
|
||||
b++;
|
||||
}
|
||||
|
||||
@@ -228,6 +294,70 @@ public:
|
||||
}
|
||||
assert(b==nn);
|
||||
}
|
||||
|
||||
|
||||
virtual void CreateSubspacePolyCheby(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
|
||||
int nn,
|
||||
double hi,
|
||||
double lo1,
|
||||
int orderfilter,
|
||||
double lo2,
|
||||
int orderstep)
|
||||
{
|
||||
RealD scale;
|
||||
|
||||
FineField noise(FineGrid);
|
||||
FineField Mn(FineGrid);
|
||||
FineField tmp(FineGrid);
|
||||
|
||||
// New normalised noise
|
||||
gaussian(RNG,noise);
|
||||
scale = std::pow(norm2(noise),-0.5);
|
||||
noise=noise*scale;
|
||||
|
||||
std::cout << GridLogMessage<<" CreateSubspacePolyCheby "<<std::endl;
|
||||
// Initial matrix element
|
||||
hermop.Op(noise,Mn);
|
||||
std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
|
||||
|
||||
int b =0;
|
||||
{
|
||||
// Filter
|
||||
std::cout << GridLogMessage << "Cheby "<<lo1<<","<<hi<<" "<<orderstep<<std::endl;
|
||||
Chebyshev<FineField> Cheb(lo1,hi,orderfilter);
|
||||
Cheb(hermop,noise,Mn);
|
||||
// normalise
|
||||
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
|
||||
subspace[b] = Mn;
|
||||
hermop.Op(Mn,tmp);
|
||||
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
|
||||
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|n> "<<norm2(Mn)<<std::endl;
|
||||
}
|
||||
|
||||
// Generate a full sequence of Chebyshevs
|
||||
for(int n=1;n<nn;n++){
|
||||
std::cout << GridLogMessage << "Cheby "<<lo2<<","<<hi<<" "<<orderstep<<std::endl;
|
||||
Chebyshev<FineField> Cheb(lo2,hi,orderstep);
|
||||
Cheb(hermop,subspace[n-1],Mn);
|
||||
|
||||
for(int m=0;m<n;m++){
|
||||
ComplexD c = innerProduct(subspace[m],Mn);
|
||||
Mn = Mn - c*subspace[m];
|
||||
}
|
||||
|
||||
// normalise
|
||||
scale = std::pow(norm2(Mn),-0.5);
|
||||
Mn=Mn*scale;
|
||||
|
||||
subspace[n]=Mn;
|
||||
|
||||
hermop.Op(Mn,tmp);
|
||||
std::cout<<GridLogMessage << "filt ["<<n<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
|
||||
std::cout<<GridLogMessage << "filt ["<<n<<"] <n|n> "<<norm2(Mn)<<std::endl;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
|
||||
int nn,
|
||||
double hi,
|
||||
|
@@ -441,8 +441,20 @@ public:
|
||||
std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
|
||||
}
|
||||
#else
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// Galerkin projection of matrix
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
|
||||
Aggregation<Fobj,CComplex,nbasis> & Subspace)
|
||||
{
|
||||
CoarsenOperator(linop,Subspace,Subspace);
|
||||
}
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// Petrov - Galerkin projection of matrix
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
|
||||
Aggregation<Fobj,CComplex,nbasis> & U,
|
||||
Aggregation<Fobj,CComplex,nbasis> & V)
|
||||
{
|
||||
std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
|
||||
GridBase *grid = FineGrid();
|
||||
@@ -458,11 +470,9 @@ public:
|
||||
// Orthogonalise the subblocks over the basis
|
||||
/////////////////////////////////////////////////////////////
|
||||
CoarseScalar InnerProd(CoarseGrid());
|
||||
blockOrthogonalise(InnerProd,Subspace.subspace);
|
||||
blockOrthogonalise(InnerProd,V.subspace);
|
||||
blockOrthogonalise(InnerProd,U.subspace);
|
||||
|
||||
// for(int s=0;s<Subspace.subspace.size();s++){
|
||||
// std::cout << " subspace norm "<<norm2(Subspace.subspace[s])<<std::endl;
|
||||
// }
|
||||
const int npoint = geom.npoint;
|
||||
|
||||
Coordinate clatt = CoarseGrid()->GlobalDimensions();
|
||||
@@ -542,7 +552,7 @@ public:
|
||||
std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
|
||||
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
|
||||
tphaseBZ-=usecond();
|
||||
phaV = phaF[p]*Subspace.subspace[i];
|
||||
phaV = phaF[p]*V.subspace[i];
|
||||
tphaseBZ+=usecond();
|
||||
|
||||
/////////////////////////////////////////////////////////////////////
|
||||
@@ -555,7 +565,7 @@ public:
|
||||
// std::cout << i << " " <<p << " MphaV "<<norm2(MphaV)<<" "<<norm2(phaV)<<std::endl;
|
||||
|
||||
tproj-=usecond();
|
||||
blockProject(coarseInner,MphaV,Subspace.subspace);
|
||||
blockProject(coarseInner,MphaV,U.subspace);
|
||||
coarseInner = conjugate(pha[p]) * coarseInner;
|
||||
|
||||
ComputeProj[p] = coarseInner;
|
||||
|
@@ -69,7 +69,7 @@ public:
|
||||
}
|
||||
|
||||
// FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
|
||||
void construct(pointer __p, const _Tp& __val) { assert(0);};
|
||||
void construct(pointer __p, const _Tp& __val) { };
|
||||
void construct(pointer __p) { };
|
||||
void destroy(pointer __p) { };
|
||||
};
|
||||
@@ -175,10 +175,11 @@ template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const d
|
||||
// Template typedefs
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<class T> using hostVector = std::vector<T,alignedAllocator<T> >; // Needs autoview
|
||||
template<class T> using Vector = std::vector<T,uvmAllocator<T> >; //
|
||||
template<class T> using Vector = std::vector<T,uvmAllocator<T> >; // Really want to deprecate
|
||||
template<class T> using uvmVector = std::vector<T,uvmAllocator<T> >; // auto migrating page
|
||||
template<class T> using deviceVector = std::vector<T,devAllocator<T> >; // device vector
|
||||
|
||||
/*
|
||||
template<class T> class vecView
|
||||
{
|
||||
protected:
|
||||
@@ -214,6 +215,7 @@ template<class T> vecView<T> VectorView(Vector<T> &vec,ViewMode _mode)
|
||||
#define autoVecView(v_v,v,mode) \
|
||||
auto v_v = VectorView(v,mode); \
|
||||
ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
|
||||
*/
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
|
@@ -9,6 +9,7 @@ static char print_buffer [ MAXLINE ];
|
||||
#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer << std::endl;
|
||||
#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer << std::endl;
|
||||
//#define dprintf(...)
|
||||
//#define mprintf(...)
|
||||
|
||||
////////////////////////////////////////////////////////////
|
||||
// For caching copies of data on device
|
||||
@@ -109,7 +110,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
|
||||
///////////////////////////////////////////////////////////
|
||||
assert(AccCache.state!=Empty);
|
||||
|
||||
dprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
|
||||
dprintf("MemoryManager: Discard(%lx) %lx",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
|
||||
assert(AccCache.accLock==0);
|
||||
assert(AccCache.cpuLock==0);
|
||||
assert(AccCache.CpuPtr!=(uint64_t)NULL);
|
||||
@@ -119,7 +120,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
|
||||
DeviceBytes -=AccCache.bytes;
|
||||
LRUremove(AccCache);
|
||||
AccCache.AccPtr=(uint64_t) NULL;
|
||||
dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
|
||||
dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
|
||||
}
|
||||
uint64_t CpuPtr = AccCache.CpuPtr;
|
||||
EntryErase(CpuPtr);
|
||||
@@ -139,7 +140,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
assert(AccCache.state!=Empty);
|
||||
|
||||
mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld\n",
|
||||
mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld",
|
||||
(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
|
||||
(uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock);
|
||||
if (AccCache.accLock!=0) return;
|
||||
@@ -153,7 +154,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
|
||||
AccCache.AccPtr=(uint64_t)NULL;
|
||||
AccCache.state=CpuDirty; // CPU primary now
|
||||
DeviceBytes -=AccCache.bytes;
|
||||
dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
|
||||
dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld ",(uint64_t)AccCache.AccPtr,DeviceBytes);
|
||||
}
|
||||
// uint64_t CpuPtr = AccCache.CpuPtr;
|
||||
DeviceEvictions++;
|
||||
@@ -167,7 +168,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
|
||||
assert(AccCache.AccPtr!=(uint64_t)NULL);
|
||||
assert(AccCache.CpuPtr!=(uint64_t)NULL);
|
||||
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
|
||||
mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
|
||||
mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
|
||||
DeviceToHostBytes+=AccCache.bytes;
|
||||
DeviceToHostXfer++;
|
||||
AccCache.state=Consistent;
|
||||
@@ -182,7 +183,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
|
||||
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
|
||||
DeviceBytes+=AccCache.bytes;
|
||||
}
|
||||
mprintf("MemoryManager: acceleratorCopyToDevice Clone size %ld AccPtr %lx <- CpuPtr %lx\n",
|
||||
mprintf("MemoryManager: acceleratorCopyToDevice Clone size %ld AccPtr %lx <- CpuPtr %lx",
|
||||
(uint64_t)AccCache.bytes,
|
||||
(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
|
||||
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
|
||||
@@ -210,7 +211,7 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
|
||||
void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
|
||||
{
|
||||
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
|
||||
dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr);
|
||||
dprintf("AcceleratorViewClose %lx",(uint64_t)Ptr);
|
||||
AcceleratorViewClose((uint64_t)Ptr);
|
||||
} else if( (mode==CpuRead)||(mode==CpuWrite)){
|
||||
CpuViewClose((uint64_t)Ptr);
|
||||
@@ -222,7 +223,7 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
|
||||
{
|
||||
uint64_t CpuPtr = (uint64_t)_CpuPtr;
|
||||
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
|
||||
dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr);
|
||||
dprintf("AcceleratorViewOpen %lx",(uint64_t)CpuPtr);
|
||||
return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
|
||||
} else if( (mode==CpuRead)||(mode==CpuWrite)){
|
||||
return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
|
||||
@@ -233,6 +234,9 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
|
||||
}
|
||||
void MemoryManager::EvictVictims(uint64_t bytes)
|
||||
{
|
||||
if(bytes>=DeviceMaxBytes) {
|
||||
printf("EvictVictims bytes %ld DeviceMaxBytes %ld\n",bytes,DeviceMaxBytes);
|
||||
}
|
||||
assert(bytes<DeviceMaxBytes);
|
||||
while(bytes+DeviceLRUBytes > DeviceMaxBytes){
|
||||
if ( DeviceLRUBytes > 0){
|
||||
@@ -265,7 +269,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
|
||||
assert(AccCache.cpuLock==0); // Programming error
|
||||
|
||||
if(AccCache.state!=Empty) {
|
||||
dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld\n",
|
||||
dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld",
|
||||
(uint64_t)AccCache.CpuPtr,
|
||||
(uint64_t)CpuPtr,
|
||||
(uint64_t)AccCache.bytes,
|
||||
@@ -305,7 +309,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
|
||||
AccCache.state = Consistent; // Empty + AccRead => Consistent
|
||||
}
|
||||
AccCache.accLock= 1;
|
||||
dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock);
|
||||
dprintf("Copied Empty entry into device accLock= %d",AccCache.accLock);
|
||||
} else if(AccCache.state==CpuDirty ){
|
||||
if(mode==AcceleratorWriteDiscard) {
|
||||
CpuDiscard(AccCache);
|
||||
@@ -318,21 +322,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
|
||||
AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
|
||||
}
|
||||
AccCache.accLock++;
|
||||
dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock);
|
||||
dprintf("CpuDirty entry into device ++accLock= %d",AccCache.accLock);
|
||||
} else if(AccCache.state==Consistent) {
|
||||
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
|
||||
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
|
||||
else
|
||||
AccCache.state = Consistent; // Consistent + AccRead => Consistent
|
||||
AccCache.accLock++;
|
||||
dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock);
|
||||
dprintf("Consistent entry into device ++accLock= %d",AccCache.accLock);
|
||||
} else if(AccCache.state==AccDirty) {
|
||||
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
|
||||
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
|
||||
else
|
||||
AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
|
||||
AccCache.accLock++;
|
||||
dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock);
|
||||
dprintf("AccDirty entry ++accLock= %d",AccCache.accLock);
|
||||
} else {
|
||||
assert(0);
|
||||
}
|
||||
@@ -341,7 +345,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
|
||||
// If view is opened on device must remove from LRU
|
||||
if(AccCache.LRU_valid==1){
|
||||
// must possibly remove from LRU as now locked on GPU
|
||||
dprintf("AccCache entry removed from LRU \n");
|
||||
dprintf("AccCache entry removed from LRU ");
|
||||
LRUremove(AccCache);
|
||||
}
|
||||
|
||||
@@ -364,10 +368,10 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
|
||||
AccCache.accLock--;
|
||||
// Move to LRU queue if not locked and close on device
|
||||
if(AccCache.accLock==0) {
|
||||
dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
|
||||
dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
|
||||
LRUinsert(AccCache);
|
||||
} else {
|
||||
dprintf("AccleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
|
||||
dprintf("AccleratorViewClose %lx AccLock decremented to %ld",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
|
||||
}
|
||||
}
|
||||
void MemoryManager::CpuViewClose(uint64_t CpuPtr)
|
||||
|
@@ -31,5 +31,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
#include <Grid/cartesian/Cartesian_base.h>
|
||||
#include <Grid/cartesian/Cartesian_full.h>
|
||||
#include <Grid/cartesian/Cartesian_red_black.h>
|
||||
#include <Grid/cartesian/CartesianCrossIcosahedron.h>
|
||||
|
||||
#endif
|
||||
|
199
Grid/cartesian/CartesianCrossIcosahedron.h
Normal file
199
Grid/cartesian/CartesianCrossIcosahedron.h
Normal file
@@ -0,0 +1,199 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/cartesian/CartesianCrossIcosahedron.h
|
||||
|
||||
Copyright (C) 2025
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#pragma once
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Grid Support.
|
||||
/////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
enum IcosahedralMeshType {
|
||||
IcosahedralVertices,
|
||||
IcosahedralEdges
|
||||
} ;
|
||||
enum NorthSouth {
|
||||
North = 1,
|
||||
South = 0
|
||||
};
|
||||
|
||||
const int num_icosahedron_tiles = 10;
|
||||
|
||||
class GridCartesianCrossIcosahedron: public GridCartesian {
|
||||
|
||||
public:
|
||||
|
||||
IcosahedralMeshType meshType;
|
||||
|
||||
IcosahedralMeshType MeshType(void) { return meshType; };
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
// Constructor takes a parent grid and possibly subdivides communicator.
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
/*
|
||||
GridCartesian(const Coordinate &dimensions,
|
||||
const Coordinate &simd_layout,
|
||||
const Coordinate &processor_grid,
|
||||
const GridCartesian &parent) : GridBase(processor_grid,parent,dummy)
|
||||
{
|
||||
assert(0); // No subdivision
|
||||
}
|
||||
GridCartesian(const Coordinate &dimensions,
|
||||
const Coordinate &simd_layout,
|
||||
const Coordinate &processor_grid,
|
||||
const GridCartesian &parent,int &split_rank) : GridBase(processor_grid,parent,split_rank)
|
||||
{
|
||||
assert(0); // No subdivision
|
||||
}
|
||||
*/
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
// Construct from comm world
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
GridCartesianCrossIcosahedron(const Coordinate &dimensions,
|
||||
const Coordinate &simd_layout,
|
||||
const Coordinate &processor_grid,
|
||||
IcosahedralMeshType _meshType) : GridCartesian(dimensions,simd_layout,processor_grid)
|
||||
{
|
||||
meshType = _meshType;
|
||||
Coordinate S2dimensions=dimensions;
|
||||
Coordinate S2simd =simd_layout;
|
||||
Coordinate S2procs =processor_grid;
|
||||
|
||||
assert(simd_layout[0]==1); // Force simd into perpendicular dimensions
|
||||
assert(simd_layout[1]==1); // to avoid pole storage complexity interacting with SIMD.
|
||||
assert(dimensions[_ndimension-1]==num_icosahedron_tiles);
|
||||
assert(processor_grid[_ndimension-1]<=2); // Keeps the patches that need a pole on the same node
|
||||
|
||||
// allocate the pole storage if we are seeking vertex domain data
|
||||
if ( meshType == IcosahedralVertices ) {
|
||||
InitPoles();
|
||||
}
|
||||
}
|
||||
|
||||
virtual ~GridCartesianCrossIcosahedron() = default;
|
||||
|
||||
////////////////////////////////////////////////
|
||||
// Use to decide if a given grid is icosahedral
|
||||
////////////////////////////////////////////////
|
||||
int hasNorthPole;
|
||||
int hasSouthPole;
|
||||
int northPoleOsite;
|
||||
int southPoleOsite;
|
||||
int northPoleOsites;
|
||||
int southPoleOsites;
|
||||
|
||||
virtual int isIcosahedral(void) override { return 1;}
|
||||
virtual int isIcosahedralVertex(void) override { return meshType==IcosahedralVertices;}
|
||||
virtual int isIcosahedralEdge (void) override { return meshType==IcosahedralEdges;}
|
||||
virtual int ownsNorthPole(void) const override { return hasNorthPole; };
|
||||
virtual int NorthPoleOsite(void) const override { return northPoleOsite; };
|
||||
virtual int NorthPoleOsites(void) const override { return northPoleOsites; };
|
||||
virtual int ownsSouthPole(void) const override { return hasSouthPole; };
|
||||
virtual int SouthPoleOsite(void) const override { return southPoleOsite; };
|
||||
virtual int SouthPoleOsites(void) const override { return southPoleOsites; };
|
||||
|
||||
void InitPoles(void)
|
||||
{
|
||||
int Ndm1 = _ndimension-1;
|
||||
///////////////////////
|
||||
// Add the extra pole storage
|
||||
///////////////////////
|
||||
// Vertices = 1x LxLx D1...Dn + 2.D1...Dn
|
||||
// Start after the LxL and don't include the 10 patch dim
|
||||
int OrthogSize = 1;
|
||||
for (int d = 2; d < Ndm1; d++) {
|
||||
OrthogSize *= _gdimensions[d];
|
||||
}
|
||||
_fsites += OrthogSize*2;
|
||||
_gsites += OrthogSize*2;
|
||||
|
||||
// Simd reduced sizes are multiplied up.
|
||||
// If the leading LxL are simd-ized, the vector objects will contain "redundant" lanes
|
||||
// which should contain identical north (south) pole data
|
||||
OrthogSize = 1;
|
||||
for (int d = 2; d < Ndm1; d++) {
|
||||
OrthogSize *= _rdimensions[d];
|
||||
}
|
||||
|
||||
// Grow the local volume to hold pole data
|
||||
// on rank (0,0) in the LxL planes
|
||||
// since SIMD must be placed in the orthogonal directions
|
||||
Coordinate pcoor = this->ThisProcessorCoor();
|
||||
Coordinate pgrid = this->ProcessorGrid();
|
||||
|
||||
const int xdim=0;
|
||||
const int ydim=1;
|
||||
/*
|
||||
*
|
||||
* /\/\/\/\/\
|
||||
* /\/\/\/\/\/
|
||||
* \/\/\/\/\/
|
||||
*
|
||||
* y
|
||||
* /
|
||||
* \x
|
||||
*
|
||||
* Labelling patches as 5 6 7 8 9
|
||||
* 0 1 2 3 4
|
||||
*
|
||||
* Will ban distribution of the patch dimension by more than 2.
|
||||
*
|
||||
* Hence all 5 patches associated with the pole must have the
|
||||
* appropriate "corner" of the patch L^2 located on the SAME rank.
|
||||
*/
|
||||
|
||||
if( (pcoor[xdim]==pgrid[xdim]-1) && (pcoor[ydim]==0) && (pcoor[Ndm1]==0) ){
|
||||
hasSouthPole =1;
|
||||
southPoleOsite=this->_osites;
|
||||
southPoleOsites=OrthogSize;
|
||||
this->_osites += OrthogSize;
|
||||
} else {
|
||||
hasSouthPole =0;
|
||||
southPoleOsites=0;
|
||||
southPoleOsite=0;
|
||||
}
|
||||
if( (pcoor[xdim]==0) && (pcoor[ydim]==pgrid[ydim]-1) && (pcoor[Ndm1]==pgrid[Ndm1]-1) ){
|
||||
hasNorthPole =1;
|
||||
northPoleOsite=this->_osites;
|
||||
northPoleOsites=OrthogSize;
|
||||
this->_osites += OrthogSize;
|
||||
} else {
|
||||
hasNorthPole =0;
|
||||
northPoleOsites=0;
|
||||
northPoleOsite=0;
|
||||
}
|
||||
std::cout << "Icosahedral vertex field volume " << this->_osites<<std::endl;
|
||||
std::cout << "Icosahedral south pole offset " << this->southPoleOsite<<std::endl;
|
||||
std::cout << "Icosahedral north pole offset " << this->northPoleOsite<<std::endl;
|
||||
std::cout << "Icosahedral south pole size " << this->southPoleOsites<<std::endl;
|
||||
std::cout << "Icosahedral north pole size " << this->northPoleOsites<<std::endl;
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
NAMESPACE_END(Grid);
|
@@ -86,10 +86,22 @@ public:
|
||||
|
||||
public:
|
||||
|
||||
// Icosahedral decisions
|
||||
virtual int isIcosahedral(void) { return 0;}
|
||||
virtual int isIcosahedralVertex(void) { return 0;}
|
||||
virtual int isIcosahedralEdge (void) { return 0;}
|
||||
virtual int ownsNorthPole(void) const { return 0; };
|
||||
virtual int ownsSouthPole(void) const { return 0; };
|
||||
virtual int NorthPoleOsite(void) const { return 0; };
|
||||
virtual int SouthPoleOsite(void) const { return 0; };
|
||||
virtual int NorthPoleOsites(void) const { std::cout << "base osites" <<std::endl;return 0; };
|
||||
virtual int SouthPoleOsites(void) const { std::cout << "base osites" <<std::endl;return 0; };
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// Checkerboarding interface is virtual and overridden by
|
||||
// GridCartesian / GridRedBlackCartesian
|
||||
////////////////////////////////////////////////////////////////
|
||||
|
||||
virtual int CheckerBoarded(int dim) =0;
|
||||
virtual int CheckerBoard(const Coordinate &site)=0;
|
||||
virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
|
||||
@@ -176,6 +188,8 @@ public:
|
||||
}
|
||||
return permute_type;
|
||||
}
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// Array sizing queries
|
||||
////////////////////////////////////////////////////////////////
|
||||
|
@@ -33,6 +33,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
///////////////////////////////////
|
||||
#include <Grid/communicator/SharedMemory.h>
|
||||
|
||||
#define NVLINK_GET
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
extern bool Stencil_force_mpi ;
|
||||
@@ -127,7 +129,7 @@ public:
|
||||
void GlobalSumVector(ComplexD *c,int N);
|
||||
void GlobalXOR(uint32_t &);
|
||||
void GlobalXOR(uint64_t &);
|
||||
|
||||
|
||||
template<class obj> void GlobalSumP2P(obj &o)
|
||||
{
|
||||
std::vector<obj> column;
|
||||
@@ -147,7 +149,8 @@ public:
|
||||
sizeof(obj),d*100+p);
|
||||
|
||||
}
|
||||
CommsComplete(list);
|
||||
if (!list.empty()) // avoid triggering assert in comms == none
|
||||
CommsComplete(list);
|
||||
for(int p=1;p<_processors[d];p++){
|
||||
accum = accum + column[p];
|
||||
}
|
||||
@@ -192,6 +195,11 @@ public:
|
||||
void *recv,
|
||||
int recv_from_rank,int do_recv,
|
||||
int xbytes,int rbytes,int dir);
|
||||
|
||||
// Could do a PollHtoD and have a CommsMerge dependence
|
||||
void StencilSendToRecvFromPollDtoH (std::vector<CommsRequest_t> &list);
|
||||
void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
|
||||
|
||||
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int xmit_to_rank,int do_xmit,
|
||||
|
@@ -30,6 +30,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
|
||||
Grid_MPI_Comm CartesianCommunicator::communicator_world;
|
||||
|
||||
////////////////////////////////////////////
|
||||
@@ -259,32 +260,39 @@ CartesianCommunicator::~CartesianCommunicator()
|
||||
}
|
||||
#ifdef USE_GRID_REDUCTION
|
||||
void CartesianCommunicator::GlobalSum(float &f){
|
||||
FlightRecorder::StepLog("GlobalSumP2P");
|
||||
CartesianCommunicator::GlobalSumP2P(f);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(double &d)
|
||||
{
|
||||
FlightRecorder::StepLog("GlobalSumP2P");
|
||||
CartesianCommunicator::GlobalSumP2P(d);
|
||||
}
|
||||
#else
|
||||
void CartesianCommunicator::GlobalSum(float &f){
|
||||
FlightRecorder::StepLog("AllReduce");
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(double &d)
|
||||
{
|
||||
FlightRecorder::StepLog("AllReduce");
|
||||
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
#endif
|
||||
void CartesianCommunicator::GlobalSum(uint32_t &u){
|
||||
FlightRecorder::StepLog("AllReduce");
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(uint64_t &u){
|
||||
FlightRecorder::StepLog("AllReduce");
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){
|
||||
FlightRecorder::StepLog("AllReduceVector");
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
@@ -362,8 +370,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||
int bytes)
|
||||
{
|
||||
std::vector<MpiCommsRequest_t> reqs(0);
|
||||
unsigned long xcrc = crc32(0L, Z_NULL, 0);
|
||||
unsigned long rcrc = crc32(0L, Z_NULL, 0);
|
||||
|
||||
int myrank = _processor;
|
||||
int ierr;
|
||||
@@ -379,9 +385,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||
communicator,MPI_STATUS_IGNORE);
|
||||
assert(ierr==0);
|
||||
|
||||
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
|
||||
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
|
||||
// printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
|
||||
}
|
||||
// Basic Halo comms primitive
|
||||
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
||||
@@ -399,6 +402,8 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
||||
|
||||
|
||||
#ifdef ACCELERATOR_AWARE_MPI
|
||||
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
|
||||
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
|
||||
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int dest,int dox,
|
||||
@@ -440,8 +445,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
||||
list.push_back(rrq);
|
||||
off_node_bytes+=rbytes;
|
||||
}
|
||||
#ifdef NVLINK_GET
|
||||
else {
|
||||
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
|
||||
assert(shm!=NULL);
|
||||
acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// This is a NVLINK PUT
|
||||
if (dox) {
|
||||
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
|
||||
tag= dir+_processor*32;
|
||||
@@ -450,9 +462,11 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
||||
list.push_back(xrq);
|
||||
off_node_bytes+=xbytes;
|
||||
} else {
|
||||
#ifndef NVLINK_GET
|
||||
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
|
||||
assert(shm!=NULL);
|
||||
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
return off_node_bytes;
|
||||
@@ -461,7 +475,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
|
||||
{
|
||||
int nreq=list.size();
|
||||
|
||||
/*finishes Get/Put*/
|
||||
acceleratorCopySynchronise();
|
||||
|
||||
if (nreq==0) return;
|
||||
@@ -561,53 +575,105 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
|
||||
|
||||
if (dox) {
|
||||
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
|
||||
#undef DEVICE_TO_HOST_CONCURRENT // pipeline
|
||||
#ifdef DEVICE_TO_HOST_CONCURRENT
|
||||
|
||||
tag= dir+_processor*32;
|
||||
|
||||
host_xmit = this->HostBufferMalloc(xbytes);
|
||||
acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
|
||||
CommsRequest_t srq;
|
||||
|
||||
srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
|
||||
|
||||
// ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
|
||||
// assert(ierr==0);
|
||||
// off_node_bytes+=xbytes;
|
||||
|
||||
CommsRequest_t srq;
|
||||
srq.PacketType = InterNodeXmit;
|
||||
srq.bytes = xbytes;
|
||||
// srq.req = xrq;
|
||||
srq.host_buf = host_xmit;
|
||||
srq.device_buf = xmit;
|
||||
srq.tag = tag;
|
||||
srq.dest = dest;
|
||||
srq.commdir = commdir;
|
||||
list.push_back(srq);
|
||||
#else
|
||||
tag= dir+_processor*32;
|
||||
|
||||
host_xmit = this->HostBufferMalloc(xbytes);
|
||||
const int chunks=1;
|
||||
for(int n=0;n<chunks;n++){
|
||||
void * host_xmitc = (void *)( (uint64_t) host_xmit + n*xbytes/chunks);
|
||||
void * xmitc = (void *)( (uint64_t) xmit + n*xbytes/chunks);
|
||||
acceleratorCopyFromDeviceAsynch(xmitc, host_xmitc,xbytes/chunks); // Make this Asynch
|
||||
}
|
||||
acceleratorCopySynchronise(); // Complete all pending copy transfers
|
||||
|
||||
ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
|
||||
assert(ierr==0);
|
||||
off_node_bytes+=xbytes;
|
||||
|
||||
CommsRequest_t srq;
|
||||
srq.PacketType = InterNodeXmit;
|
||||
srq.bytes = xbytes;
|
||||
srq.req = xrq;
|
||||
srq.host_buf = host_xmit;
|
||||
srq.device_buf = xmit;
|
||||
list.push_back(srq);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
return off_node_bytes;
|
||||
}
|
||||
/*
 * In the interest of better pipelining, poll for completion on each DtoH and
 * start the MPI_Isend in the meantime
 */
|
||||
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list)
|
||||
{
|
||||
int pending = 0;
|
||||
do {
|
||||
|
||||
pending = 0;
|
||||
|
||||
for(int idx = 0; idx<list.size();idx++){
|
||||
|
||||
if ( list[idx].PacketType==InterNodeRecv ) {
|
||||
|
||||
int flag = 0;
|
||||
MPI_Status status;
|
||||
int ierr = MPI_Test(&list[idx].req,&flag,&status);
|
||||
assert(ierr==0);
|
||||
|
||||
if ( flag ) {
|
||||
// std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl;
|
||||
acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes);
|
||||
list[idx].PacketType=InterNodeReceiveHtoD;
|
||||
} else {
|
||||
pending ++;
|
||||
}
|
||||
}
|
||||
}
|
||||
// std::cout << " PollIrecv "<<pending<<" pending requests"<<std::endl;
|
||||
} while ( pending );
|
||||
|
||||
}
|
||||
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list)
|
||||
{
|
||||
int pending = 0;
|
||||
do {
|
||||
|
||||
pending = 0;
|
||||
|
||||
for(int idx = 0; idx<list.size();idx++){
|
||||
|
||||
if ( list[idx].PacketType==InterNodeXmit ) {
|
||||
|
||||
if ( acceleratorEventIsComplete(list[idx].ev) ) {
|
||||
|
||||
void *host_xmit = list[idx].host_buf;
|
||||
uint32_t xbytes = list[idx].bytes;
|
||||
int dest = list[idx].dest;
|
||||
int tag = list[idx].tag;
|
||||
int commdir = list[idx].commdir;
|
||||
///////////////////
|
||||
// Send packet
|
||||
///////////////////
|
||||
|
||||
// std::cout << " DtoH is complete for index "<<idx<<" calling MPI_Isend "<<std::endl;
|
||||
|
||||
MPI_Request xrq;
|
||||
int ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
|
||||
assert(ierr==0);
|
||||
|
||||
list[idx].req = xrq; // Update the MPI request in the list
|
||||
|
||||
list[idx].PacketType=InterNodeXmitISend;
|
||||
|
||||
} else {
|
||||
// not done, so return to polling loop
|
||||
pending++;
|
||||
}
|
||||
}
|
||||
}
|
||||
} while (pending);
|
||||
}
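Read together with the pipelining comment above, these routines form a prepare / poll / complete cycle on the non accelerator-aware path. The sketch below is a minimal illustration of that ordering, inferred from the state transitions in this file; the actual stencil driver is not part of this hunk, and the names grid, list, xmit, dest, dox, recv, from, dor, xbytes, rbytes and dir are placeholders.

    // Illustrative call ordering only, for a CartesianCommunicator "grid" (placeholder names):
    std::vector<CommsRequest_t> list;
    grid.StencilSendToRecvFromPrepare (list, xmit, dest, dox, recv, from, dor, xbytes, rbytes, dir); // queue DtoH copies for the sends; receives registered as InterNodeRecv
    grid.StencilSendToRecvFromPollDtoH (list);       // when a DtoH copy completes, post the matching MPI_Isend (InterNodeXmit -> InterNodeXmitISend)
    grid.StencilSendToRecvFromPollIRecv(list);       // when an MPI_Irecv completes, queue the HtoD copy (InterNodeRecv -> InterNodeReceiveHtoD)
    grid.StencilSendToRecvFromComplete (list, dir);  // MPI_Waitall on the Isends, then acceleratorCopySynchronise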
|
||||
|
||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
@@ -644,69 +710,89 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
||||
* - complete all copies
|
||||
* - post MPI send asynch
|
||||
*/
|
||||
#ifdef NVLINK_GET
|
||||
if ( dor ) {
|
||||
|
||||
// static int printed;
|
||||
// if((printed<8) && this->IsBoss() ) {
|
||||
// printf("dir %d doX %d doR %d Face size %ld %ld\n",dir,dox,dor,xbytes,rbytes);
|
||||
// printed++;
|
||||
// }
|
||||
|
||||
if ( ! ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) ) {
|
||||
// Intranode
|
||||
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
|
||||
assert(shm!=NULL);
|
||||
|
||||
CommsRequest_t srq;
|
||||
|
||||
srq.ev = acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
|
||||
|
||||
srq.PacketType = IntraNodeRecv;
|
||||
srq.bytes = xbytes;
|
||||
// srq.req = xrq;
|
||||
srq.host_buf = NULL;
|
||||
srq.device_buf = xmit;
|
||||
srq.tag = -1;
|
||||
srq.dest = dest;
|
||||
srq.commdir = dir;
|
||||
list.push_back(srq);
|
||||
}
|
||||
}
|
||||
#else
|
||||
if (dox) {
|
||||
|
||||
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
|
||||
#ifdef DEVICE_TO_HOST_CONCURRENT
|
||||
tag= dir+_processor*32;
|
||||
// Find the send in the prepared list
|
||||
int list_idx=-1;
|
||||
for(int idx = 0; idx<list.size();idx++){
|
||||
|
||||
if ( (list[idx].device_buf==xmit)
|
||||
&&(list[idx].PacketType==InterNodeXmit)
|
||||
&&(list[idx].bytes==xbytes) ) {
|
||||
|
||||
list_idx = idx;
|
||||
host_xmit = list[idx].host_buf;
|
||||
}
|
||||
}
|
||||
assert(list_idx != -1); // found it
|
||||
ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
|
||||
assert(ierr==0);
|
||||
list[list_idx].req = xrq; // Update the MPI request in the list
|
||||
off_node_bytes+=xbytes;
|
||||
#endif
|
||||
} else {
|
||||
if ( !( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) ) {
|
||||
// Intranode
|
||||
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
|
||||
assert(shm!=NULL);
|
||||
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
|
||||
|
||||
CommsRequest_t srq;
|
||||
|
||||
srq.ev = acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
|
||||
|
||||
srq.PacketType = IntraNodeXmit;
|
||||
srq.bytes = xbytes;
|
||||
// srq.req = xrq;
|
||||
srq.host_buf = NULL;
|
||||
srq.device_buf = xmit;
|
||||
srq.tag = -1;
|
||||
srq.dest = dest;
|
||||
srq.commdir = dir;
|
||||
list.push_back(srq);
|
||||
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return off_node_bytes;
|
||||
}
|
||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
|
||||
{
|
||||
int nreq=list.size();
|
||||
acceleratorCopySynchronise(); // Complete all pending copy transfers D2D
|
||||
|
||||
if (nreq==0) return;
|
||||
std::vector<MPI_Status> status(nreq);
|
||||
std::vector<MPI_Request> MpiRequests(nreq);
|
||||
std::vector<MPI_Status> status;
|
||||
std::vector<MPI_Request> MpiRequests;
|
||||
|
||||
for(int r=0;r<list.size();r++){
|
||||
// Must check each Send buf is clear to reuse
|
||||
if ( list[r].PacketType == InterNodeXmitISend ) MpiRequests.push_back(list[r].req);
|
||||
// if ( list[r].PacketType == InterNodeRecv ) MpiRequests.push_back(list[r].req); // Already "Test" passed
|
||||
}
|
||||
|
||||
for(int r=0;r<nreq;r++){
|
||||
MpiRequests[r] = list[r].req;
|
||||
int nreq=MpiRequests.size();
|
||||
|
||||
if (nreq>0) {
|
||||
status.resize(MpiRequests.size());
|
||||
int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
|
||||
assert(ierr==0);
|
||||
}
|
||||
|
||||
int ierr = MPI_Waitall(nreq,&MpiRequests[0],&status[0]);
|
||||
assert(ierr==0);
|
||||
|
||||
for(int r=0;r<nreq;r++){
|
||||
if ( list[r].PacketType==InterNodeRecv ) {
|
||||
acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
|
||||
}
|
||||
}
|
||||
// for(int r=0;r<nreq;r++){
|
||||
// if ( list[r].PacketType==InterNodeRecv ) {
|
||||
// acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
|
||||
// }
|
||||
// }
|
||||
|
||||
|
||||
acceleratorCopySynchronise(); // Complete all pending copy transfers
|
||||
list.resize(0); // Delete the list
|
||||
this->HostBufferFreeAll(); // Clean up the buffer allocs
|
||||
this->StencilBarrier();
|
||||
#ifndef NVLINK_GET
|
||||
this->StencilBarrier(); // if PUT must check our nbrs have filled our receive buffers.
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
////////////////////////////////////////////
|
||||
@@ -715,6 +801,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
|
||||
|
||||
void CartesianCommunicator::StencilBarrier(void)
|
||||
{
|
||||
FlightRecorder::StepLog("NodeBarrier");
|
||||
MPI_Barrier (ShmComm);
|
||||
}
|
||||
//void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
|
||||
@@ -722,11 +809,13 @@ void CartesianCommunicator::StencilBarrier(void)
|
||||
//}
|
||||
void CartesianCommunicator::Barrier(void)
|
||||
{
|
||||
FlightRecorder::StepLog("GridBarrier");
|
||||
int ierr = MPI_Barrier(communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
|
||||
{
|
||||
FlightRecorder::StepLog("Broadcast");
|
||||
int ierr=MPI_Bcast(data,
|
||||
bytes,
|
||||
MPI_BYTE,
|
||||
@@ -745,6 +834,7 @@ void CartesianCommunicator::BarrierWorld(void){
|
||||
}
|
||||
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
|
||||
{
|
||||
FlightRecorder::StepLog("BroadcastWorld");
|
||||
int ierr= MPI_Bcast(data,
|
||||
bytes,
|
||||
MPI_BYTE,
|
||||
@@ -767,6 +857,7 @@ void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,
|
||||
}
|
||||
void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes)
|
||||
{
|
||||
FlightRecorder::StepLog("AllToAll");
|
||||
// MPI is a pain and uses "int" arguments
|
||||
// 64*64*64*128*16 == 500Million elements of data.
|
||||
// When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.
|
||||
|
@@ -91,7 +91,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(0);}
|
||||
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(list.size()==0);}
|
||||
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int dest,
|
||||
@@ -132,6 +132,8 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
||||
{
|
||||
return 2.0*bytes;
|
||||
}
|
||||
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
|
||||
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
|
||||
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int xmit_to_rank,int dox,
|
||||
@@ -139,7 +141,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
|
||||
int recv_from_rank,int dor,
|
||||
int xbytes,int rbytes, int dir)
|
||||
{
|
||||
return xbytes+rbytes;
|
||||
return 0.0;
|
||||
}
|
||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
|
@@ -50,12 +50,30 @@ typedef MPI_Request MpiCommsRequest_t;
#ifdef ACCELERATOR_AWARE_MPI
typedef MPI_Request CommsRequest_t;
#else
enum PacketType_t { InterNodeXmit, InterNodeRecv, IntraNodeXmit, IntraNodeRecv };
/*
 * Enable state transitions as each packet flows.
 */
enum PacketType_t {
  FaceGather,
  InterNodeXmit,
  InterNodeRecv,
  IntraNodeXmit,
  IntraNodeRecv,
  InterNodeXmitISend,
  InterNodeReceiveHtoD
};
/*
 * Package arguments needed for various actions along packet flow
 */
typedef struct {
  PacketType_t PacketType;
  void *host_buf;
  void *device_buf;
  int dest;
  int tag;
  int commdir;
  unsigned long bytes;
  acceleratorEvent_t ev;
  MpiCommsRequest_t req;
} CommsRequest_t;
#endif
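The enum above reads most naturally as a per-packet state machine. The summary below is added for orientation and is inferred from the Prepare/Poll/Complete routines elsewhere in this change; FaceGather is listed in the enum but does not appear in the hunks shown here, so its role is an assumption.

// Send side (non accelerator-aware MPI):
//   InterNodeXmit        -> InterNodeXmitISend   : DtoH copy event complete, MPI_Isend posted (PollDtoH)
//   InterNodeXmitISend   -> retired              : MPI_Waitall in StencilSendToRecvFromComplete
// Receive side:
//   InterNodeRecv        -> InterNodeReceiveHtoD : MPI_Test succeeds, HtoD copy queued (PollIRecv)
//   InterNodeReceiveHtoD -> retired              : acceleratorCopySynchronise in Complete
// IntraNodeXmit / IntraNodeRecv tag the NVLINK put/get copies, which only need the
// device-to-device transfers to be synchronised before the buffers are reused.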
|
||||
@@ -119,7 +137,7 @@ public:
|
||||
///////////////////////////////////////////////////
|
||||
static void SharedMemoryAllocate(uint64_t bytes, int flags);
|
||||
static void SharedMemoryFree(void);
|
||||
static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
|
||||
// static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
|
||||
static void SharedMemoryZero(void *dest,size_t bytes);
|
||||
|
||||
};
|
||||
|
@@ -542,12 +542,12 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
// Each MPI rank should allocate our own buffer
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#ifndef ACCELERATOR_AWARE_MPI
|
||||
printf("Host buffer allocate for GPU non-aware MPI\n");
|
||||
// printf("Host buffer allocate for GPU non-aware MPI\n");
|
||||
#if 0
|
||||
HostCommBuf= acceleratorAllocHost(bytes);
|
||||
#else
|
||||
HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
|
||||
#ifdef HAVE_NUMAIF_H
|
||||
#if 0
|
||||
#warning "Moving host buffers to specific NUMA domain"
|
||||
int numa;
|
||||
char *numa_name=(char *)getenv("MPI_BUF_NUMA");
|
||||
@@ -916,14 +916,14 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
|
||||
bzero(dest,bytes);
|
||||
#endif
|
||||
}
|
||||
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
|
||||
{
|
||||
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
|
||||
acceleratorCopyToDevice(src,dest,bytes);
|
||||
#else
|
||||
bcopy(src,dest,bytes);
|
||||
#endif
|
||||
}
|
||||
//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
|
||||
//{
|
||||
//#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
|
||||
// acceleratorCopyToDevice(src,dest,bytes);
|
||||
//#else
|
||||
// bcopy(src,dest,bytes);
|
||||
//#endif
|
||||
//}
|
||||
////////////////////////////////////////////////////////
|
||||
// Global shared functionality finished
|
||||
// Now move to per communicator functionality
|
||||
@@ -959,6 +959,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
|
||||
MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
|
||||
|
||||
ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
|
||||
// std::cerr << " SetCommunicator rank "<<r<<" comm "<<ShmCommBufs[r] <<std::endl;
|
||||
}
|
||||
ShmBufferFreeAll();
|
||||
|
||||
@@ -989,7 +990,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
|
||||
}
|
||||
#endif
|
||||
|
||||
//SharedMemoryTest();
|
||||
// SharedMemoryTest();
|
||||
}
|
||||
//////////////////////////////////////////////////////////////////
|
||||
// On node barrier
|
||||
@@ -1011,19 +1012,18 @@ void SharedMemory::SharedMemoryTest(void)
|
||||
check[0]=GlobalSharedMemory::WorldNode;
|
||||
check[1]=r;
|
||||
check[2]=magic;
|
||||
GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t));
|
||||
acceleratorCopyToDevice(check,ShmCommBufs[r],3*sizeof(uint64_t));
|
||||
}
|
||||
}
|
||||
ShmBarrier();
|
||||
for(uint64_t r=0;r<ShmSize;r++){
|
||||
ShmBarrier();
|
||||
GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
|
||||
ShmBarrier();
|
||||
acceleratorCopyFromDevice(ShmCommBufs[r],check,3*sizeof(uint64_t));
|
||||
assert(check[0]==GlobalSharedMemory::WorldNode);
|
||||
assert(check[1]==r);
|
||||
assert(check[2]==magic);
|
||||
ShmBarrier();
|
||||
}
|
||||
ShmBarrier();
|
||||
std::cout << GridLogDebug << " SharedMemoryTest has passed "<<std::endl;
|
||||
}
|
||||
|
||||
void *SharedMemory::ShmBuffer(int rank)
|
||||
|
@@ -122,10 +122,10 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
|
||||
{
|
||||
acceleratorMemSet(dest,0,bytes);
|
||||
}
|
||||
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
|
||||
{
|
||||
acceleratorCopyToDevice(src,dest,bytes);
|
||||
}
|
||||
//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
|
||||
//{
|
||||
// acceleratorCopyToDevice(src,dest,bytes);
|
||||
//}
|
||||
////////////////////////////////////////////////////////
|
||||
// Global shared functionality finished
|
||||
// Now move to per communicator functionality
|
||||
|
@@ -34,6 +34,8 @@ NAMESPACE_BEGIN(Grid);
|
||||
const int Cshift_verbose=0;
|
||||
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
|
||||
{
|
||||
assert(!rhs.Grid()->isIcosahedral());
|
||||
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
|
||||
@@ -68,7 +70,7 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
|
||||
if(Cshift_verbose) std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
|
||||
return ret;
|
||||
}
|
||||
#if 1
|
||||
|
||||
template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
|
||||
{
|
||||
int sshift[2];
|
||||
@@ -125,7 +127,11 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
||||
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
|
||||
static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
|
||||
static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
|
||||
|
||||
#ifndef ACCELERATOR_AWARE_MPI
|
||||
static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
|
||||
static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
|
||||
#endif
|
||||
|
||||
int cb= (cbmask==0x2)? Odd : Even;
|
||||
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
||||
RealD tcopy=0.0;
|
||||
@@ -156,16 +162,29 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
||||
// int rank = grid->_processor;
|
||||
int recv_from_rank;
|
||||
int xmit_to_rank;
|
||||
|
||||
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
||||
|
||||
tcomms-=usecond();
|
||||
grid->Barrier();
|
||||
|
||||
#ifdef ACCELERATOR_AWARE_MPI
    grid->SendToRecvFrom((void *)&send_buf[0],
			 xmit_to_rank,
			 (void *)&recv_buf[0],
			 recv_from_rank,
			 bytes);
#else
    // bouncy bouncy: MPI cannot be handed device pointers here, so stage the face through host buffers
    acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
    grid->SendToRecvFrom((void *)&hsend_buf[0],
			 xmit_to_rank,
			 (void *)&hrecv_buf[0],
			 recv_from_rank,
			 bytes);
    acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
#endif

    xbytes+=bytes;
    grid->Barrier();
    tcomms+=usecond();
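For reference, the #else branch above is the generic "bounce through the host" pattern used whenever MPI is not accelerator aware. The helper below is an illustrative sketch only (bounceSendRecv is a hypothetical name, not part of Grid), written with the same calls the branch uses.

// Illustrative only: stage a device buffer exchange through host memory.
// Assumes Grid-style helpers (acceleratorCopy*, SendToRecvFrom) are in scope.
template<class vobj>
void bounceSendRecv(GridBase *grid,
                    vobj *d_send, int xmit_to_rank,
                    vobj *d_recv, int recv_from_rank,
                    vobj *h_send, vobj *h_recv, size_t bytes)
{
  acceleratorCopyFromDevice(d_send, h_send, bytes);             // device -> host
  grid->SendToRecvFrom((void *)h_send, xmit_to_rank,
                       (void *)h_recv, recv_from_rank, bytes);  // host-side MPI exchange
  acceleratorCopyToDevice(h_recv, d_recv, bytes);               // host -> device
}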
|
||||
@@ -226,12 +245,16 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
||||
static std::vector<deviceVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
|
||||
scalar_object * recv_buf_extract_mpi;
|
||||
scalar_object * send_buf_extract_mpi;
|
||||
|
||||
|
||||
for(int s=0;s<Nsimd;s++){
|
||||
send_buf_extract[s].resize(buffer_size);
|
||||
recv_buf_extract[s].resize(buffer_size);
|
||||
}
|
||||
|
||||
#ifndef ACCELERATOR_AWARE_MPI
|
||||
hostVector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
|
||||
hostVector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
|
||||
#endif
|
||||
|
||||
int bytes = buffer_size*sizeof(scalar_object);
|
||||
|
||||
ExtractPointerArray<scalar_object> pointers(Nsimd); //
|
||||
@@ -283,11 +306,22 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
||||
|
||||
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
|
||||
recv_buf_extract_mpi = &recv_buf_extract[i][0];
|
||||
#ifdef ACCELERATOR_AWARE_MPI
|
||||
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
|
||||
xmit_to_rank,
|
||||
(void *)recv_buf_extract_mpi,
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
#else
|
||||
// bouncy bouncy
|
||||
acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes);
|
||||
grid->SendToRecvFrom((void *)&hsend_buf[0],
|
||||
xmit_to_rank,
|
||||
(void *)&hrecv_buf[0],
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
|
||||
#endif
|
||||
|
||||
xbytes+=bytes;
|
||||
grid->Barrier();
|
||||
@@ -311,234 +345,6 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
||||
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
||||
}
|
||||
}
|
||||
#else
|
||||
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||
{
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
|
||||
GridBase *grid=rhs.Grid();
|
||||
Lattice<vobj> temp(rhs.Grid());
|
||||
|
||||
int fd = rhs.Grid()->_fdimensions[dimension];
|
||||
int rd = rhs.Grid()->_rdimensions[dimension];
|
||||
int pd = rhs.Grid()->_processors[dimension];
|
||||
int simd_layout = rhs.Grid()->_simd_layout[dimension];
|
||||
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
|
||||
assert(simd_layout==1);
|
||||
assert(comm_dim==1);
|
||||
assert(shift>=0);
|
||||
assert(shift<fd);
|
||||
RealD tcopy=0.0;
|
||||
RealD tgather=0.0;
|
||||
RealD tscatter=0.0;
|
||||
RealD tcomms=0.0;
|
||||
uint64_t xbytes=0;
|
||||
|
||||
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
|
||||
static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
|
||||
static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
|
||||
vobj *send_buf;
|
||||
vobj *recv_buf;
|
||||
{
|
||||
grid->ShmBufferFreeAll();
|
||||
size_t bytes = buffer_size*sizeof(vobj);
|
||||
send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
|
||||
recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
|
||||
}
|
||||
|
||||
int cb= (cbmask==0x2)? Odd : Even;
|
||||
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
||||
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
int sx = (x+sshift)%rd;
|
||||
int comm_proc = ((x+sshift)/rd)%pd;
|
||||
|
||||
if (comm_proc==0) {
|
||||
|
||||
tcopy-=usecond();
|
||||
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
|
||||
tcopy+=usecond();
|
||||
|
||||
} else {
|
||||
|
||||
int words = buffer_size;
|
||||
if (cbmask != 0x3) words=words>>1;
|
||||
|
||||
int bytes = words * sizeof(vobj);
|
||||
|
||||
tgather-=usecond();
|
||||
Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
|
||||
tgather+=usecond();
|
||||
|
||||
// int rank = grid->_processor;
|
||||
int recv_from_rank;
|
||||
int xmit_to_rank;
|
||||
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
||||
|
||||
|
||||
tcomms-=usecond();
|
||||
// grid->Barrier();
|
||||
|
||||
acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
|
||||
grid->SendToRecvFrom((void *)&send_buf[0],
|
||||
xmit_to_rank,
|
||||
(void *)&recv_buf[0],
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
xbytes+=bytes;
|
||||
acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
|
||||
|
||||
// grid->Barrier();
|
||||
tcomms+=usecond();
|
||||
|
||||
tscatter-=usecond();
|
||||
Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
|
||||
tscatter+=usecond();
|
||||
}
|
||||
}
|
||||
if(Cshift_verbose){
|
||||
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||
{
|
||||
GridBase *grid=rhs.Grid();
|
||||
const int Nsimd = grid->Nsimd();
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
typedef typename vobj::scalar_object scalar_object;
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
|
||||
int fd = grid->_fdimensions[dimension];
|
||||
int rd = grid->_rdimensions[dimension];
|
||||
int ld = grid->_ldimensions[dimension];
|
||||
int pd = grid->_processors[dimension];
|
||||
int simd_layout = grid->_simd_layout[dimension];
|
||||
int comm_dim = grid->_processors[dimension] >1 ;
|
||||
|
||||
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
|
||||
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
|
||||
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
|
||||
|
||||
assert(comm_dim==1);
|
||||
assert(simd_layout==2);
|
||||
assert(shift>=0);
|
||||
assert(shift<fd);
|
||||
RealD tcopy=0.0;
|
||||
RealD tgather=0.0;
|
||||
RealD tscatter=0.0;
|
||||
RealD tcomms=0.0;
|
||||
uint64_t xbytes=0;
|
||||
|
||||
int permute_type=grid->PermuteType(dimension);
|
||||
|
||||
///////////////////////////////////////////////
|
||||
// Simd direction uses an extract/merge pair
|
||||
///////////////////////////////////////////////
|
||||
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
|
||||
// int words = sizeof(vobj)/sizeof(vector_type);
|
||||
|
||||
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
|
||||
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
|
||||
scalar_object * recv_buf_extract_mpi;
|
||||
scalar_object * send_buf_extract_mpi;
|
||||
{
|
||||
size_t bytes = sizeof(scalar_object)*buffer_size;
|
||||
grid->ShmBufferFreeAll();
|
||||
send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
|
||||
recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
|
||||
}
|
||||
for(int s=0;s<Nsimd;s++){
|
||||
send_buf_extract[s].resize(buffer_size);
|
||||
recv_buf_extract[s].resize(buffer_size);
|
||||
}
|
||||
|
||||
int bytes = buffer_size*sizeof(scalar_object);
|
||||
|
||||
ExtractPointerArray<scalar_object> pointers(Nsimd); //
|
||||
ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
|
||||
|
||||
///////////////////////////////////////////
|
||||
// Work out what to send where
|
||||
///////////////////////////////////////////
|
||||
int cb = (cbmask==0x2)? Odd : Even;
|
||||
int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
||||
|
||||
// loop over outer coord planes orthog to dim
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
// FIXME call local permute copy if none are offnode.
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
pointers[i] = &send_buf_extract[i][0];
|
||||
}
|
||||
tgather-=usecond();
|
||||
int sx = (x+sshift)%rd;
|
||||
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
|
||||
tgather+=usecond();
|
||||
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
|
||||
int inner_bit = (Nsimd>>(permute_type+1));
|
||||
int ic= (i&inner_bit)? 1:0;
|
||||
|
||||
int my_coor = rd*ic + x;
|
||||
int nbr_coor = my_coor+sshift;
|
||||
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
|
||||
|
||||
int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer
|
||||
int nbr_ox = (nbr_coor%rd); // outer coord of peer
|
||||
int nbr_lane = (i&(~inner_bit));
|
||||
|
||||
int recv_from_rank;
|
||||
int xmit_to_rank;
|
||||
|
||||
if (nbr_ic) nbr_lane|=inner_bit;
|
||||
|
||||
assert (sx == nbr_ox);
|
||||
|
||||
if(nbr_proc){
|
||||
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
||||
|
||||
tcomms-=usecond();
|
||||
// grid->Barrier();
|
||||
|
||||
acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
|
||||
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
|
||||
xmit_to_rank,
|
||||
(void *)recv_buf_extract_mpi,
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
|
||||
xbytes+=bytes;
|
||||
|
||||
// grid->Barrier();
|
||||
tcomms+=usecond();
|
||||
rpointers[i] = &recv_buf_extract[i][0];
|
||||
} else {
|
||||
rpointers[i] = &send_buf_extract[nbr_lane][0];
|
||||
}
|
||||
|
||||
}
|
||||
tscatter-=usecond();
|
||||
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
|
||||
tscatter+=usecond();
|
||||
|
||||
}
|
||||
if(Cshift_verbose){
|
||||
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
|
@@ -30,6 +30,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
|
||||
{
|
||||
assert(!rhs.Grid()->isIcosahedral());
|
||||
Lattice<vobj> ret(rhs.Grid());
|
||||
ret.Checkerboard() = rhs.Grid()->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension);
|
||||
Cshift_local(ret,rhs,dimension,shift);
|
||||
|
@@ -236,7 +236,7 @@ public:
|
||||
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
|
||||
vobj vtmp;
|
||||
vtmp = r;
|
||||
#if 0
|
||||
#if 1
|
||||
deviceVector<vobj> vvtmp(1);
|
||||
acceleratorPut(vvtmp[0],vtmp);
|
||||
vobj *vvtmp_p = & vvtmp[0];
|
||||
@@ -373,14 +373,17 @@ public:
|
||||
|
||||
template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
|
||||
typedef typename vobj::scalar_object sobj;
|
||||
for(int64_t g=0;g<o.Grid()->_gsites;g++){
|
||||
uint64_t gsites=1;
|
||||
uint64_t polesites=0;
|
||||
for(int d=0;d<o.Grid()->_ndimension;d++) gsites *= o.Grid()->_gdimensions[d];
|
||||
for(int64_t g=0;g<gsites;g++){
|
||||
|
||||
Coordinate gcoor;
|
||||
o.Grid()->GlobalIndexToGlobalCoor(g,gcoor);
|
||||
|
||||
sobj ss;
|
||||
peekSite(ss,o,gcoor);
|
||||
stream<<"[";
|
||||
stream<<"["<< g<<" : ";
|
||||
for(int d=0;d<gcoor.size();d++){
|
||||
stream<<gcoor[d];
|
||||
if(d!=gcoor.size()-1) stream<<",";
|
||||
@@ -388,6 +391,41 @@ template<class vobj> std::ostream& operator<< (std::ostream& stream, const Latti
|
||||
stream<<"]\t";
|
||||
stream<<ss<<std::endl;
|
||||
}
|
||||
if ( o.Grid()->isIcosahedral() ) {
|
||||
uint64_t psites=1;
|
||||
Coordinate perpdims;
|
||||
for(int d=2;d<o.Grid()->_ndimension-1;d++){
|
||||
int pd=o.Grid()->_gdimensions[d];
|
||||
psites*=pd;
|
||||
perpdims.push_back(pd);
|
||||
}
|
||||
for(uint64_t p=0;p<psites;p++){
|
||||
sobj ss;
|
||||
Coordinate orthog;
|
||||
Lexicographic::CoorFromIndex(orthog,p,perpdims);
|
||||
peekPole(ss,o,orthog,South);
|
||||
stream<<"[ SouthPole : ";
|
||||
for(int d=0;d<orthog.size();d++){
|
||||
stream<<orthog[d];
|
||||
if(d!=orthog.size()-1) stream<<",";
|
||||
}
|
||||
stream<<"]\t";
|
||||
stream<<ss<<std::endl;
|
||||
}
|
||||
for(uint64_t p=0;p<psites;p++){
|
||||
sobj ss;
|
||||
Coordinate orthog;
|
||||
Lexicographic::CoorFromIndex(orthog,p,perpdims);
|
||||
peekPole(ss,o,orthog,North);
|
||||
stream<<"[ NorthPole : ";
|
||||
for(int d=0;d<orthog.size();d++){
|
||||
stream<<orthog[d];
|
||||
if(d!=orthog.size()-1) stream<<",";
|
||||
}
|
||||
stream<<"]\t";
|
||||
stream<<ss<<std::endl;
|
||||
}
|
||||
}
|
||||
return stream;
|
||||
}
|
||||
|
||||
|
@@ -34,22 +34,86 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
|
||||
typedef typename iobj::scalar_type scalar_type;
|
||||
typedef typename iobj::vector_type vector_type;
|
||||
|
||||
l=Zero();
|
||||
|
||||
GridBase *grid = l.Grid();
|
||||
int Nsimd = grid->iSites();
|
||||
|
||||
autoView(l_v, l, CpuWrite);
|
||||
thread_for( o, grid->oSites(), {
|
||||
vector_type vI;
|
||||
Coordinate gcoor;
|
||||
ExtractBuffer<scalar_type> mergebuf(Nsimd);
|
||||
for(int i=0;i<grid->iSites();i++){
|
||||
grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor);
|
||||
mergebuf[i]=(Integer)gcoor[mu];
|
||||
int cartesian_vol = grid->oSites();
|
||||
if ( grid->isIcosahedral() ) {
|
||||
cartesian_vol = cartesian_vol - grid->NorthPoleOsites()-grid->SouthPoleOsites();
|
||||
}
|
||||
{
|
||||
autoView(l_v, l, CpuWrite);
|
||||
thread_for( o, cartesian_vol, {
|
||||
vector_type vI;
|
||||
Coordinate gcoor;
|
||||
ExtractBuffer<scalar_type> mergebuf(Nsimd);
|
||||
for(int i=0;i<grid->iSites();i++){
|
||||
grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor);
|
||||
mergebuf[i]=(Integer)gcoor[mu];
|
||||
}
|
||||
merge<vector_type,scalar_type>(vI,mergebuf);
|
||||
l_v[o]=vI;
|
||||
});
|
||||
}
|
||||
|
||||
if (grid->isIcosahedralVertex()) {
|
||||
uint64_t psites=1;
|
||||
Coordinate perpdims;
|
||||
typename iobj::scalar_object ss;
|
||||
for(int d=2;d<grid->_ndimension-1;d++){
|
||||
int pd=grid->_gdimensions[d];
|
||||
psites*=pd;
|
||||
perpdims.push_back(pd);
|
||||
}
|
||||
merge<vector_type,scalar_type>(vI,mergebuf);
|
||||
l_v[o]=vI;
|
||||
});
|
||||
for(uint64_t p=0;p<psites;p++){
|
||||
Coordinate orthog;
|
||||
Lexicographic::CoorFromIndex(orthog,p,perpdims);
|
||||
|
||||
int icoor;
|
||||
if ( mu>=2 && mu < grid->_ndimension-1) {
|
||||
icoor = orthog[mu-2];
|
||||
} else {
|
||||
icoor = -1;
|
||||
}
|
||||
|
||||
ss=scalar_type(icoor);
|
||||
|
||||
pokePole(ss,l,orthog,South);
|
||||
pokePole(ss,l,orthog,North);
|
||||
}
|
||||
}
|
||||
};
|
||||
template<class iobj> inline void LatticePole(Lattice<iobj> &l,NorthSouth pole)
|
||||
{
|
||||
typedef typename iobj::scalar_object sobj;
|
||||
typedef typename iobj::scalar_type scalar_type;
|
||||
typedef typename iobj::vector_type vector_type;
|
||||
|
||||
GridBase *grid = l.Grid();
|
||||
|
||||
l=Zero();
|
||||
|
||||
assert(grid->isIcosahedralVertex());
|
||||
|
||||
if (grid->isIcosahedralVertex()) {
|
||||
uint64_t psites=1;
|
||||
Coordinate perpdims;
|
||||
sobj ss;
|
||||
scalar_type one(1.0);
|
||||
ss=one;
|
||||
for(int d=2;d<l.Grid()->_ndimension-1;d++){
|
||||
int pd=l.Grid()->_gdimensions[d];
|
||||
psites*=pd;
|
||||
perpdims.push_back(pd);
|
||||
}
|
||||
for(uint64_t p=0;p<psites;p++){
|
||||
Coordinate orthog;
|
||||
Lexicographic::CoorFromIndex(orthog,p,perpdims);
|
||||
pokePole(ss,l,orthog,pole);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
NAMESPACE_END(Grid);
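A short usage sketch of the pole interface may help; it is illustrative only and assumes an already constructed icosahedral vertex grid pointer icosGrid, which is not defined in this hunk.

// Illustrative only; icosGrid is an assumed icosahedral vertex grid.
LatticeComplex pole(icosGrid);
LatticePole(pole, North);                 // 1.0 on every North-pole site, zero elsewhere

typedef LatticeComplex::scalar_object S;  // scalar site object
Coordinate orthog({0,0});                 // coordinate in the directions orthogonal to the sphere
S s; s = ComplexD(1.0);
pokePole(s, pole, orthog, South);         // set one South-pole site on the owning rank
S t;
peekPole(t, pole, orthog, South);         // value is broadcast to every rank, like peekSite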
|
||||
|
||||
|
@@ -141,7 +141,7 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
|
||||
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
|
||||
|
||||
ExtractBuffer<sobj> buf(Nsimd);
|
||||
autoView( l_v , l, CpuWrite);
|
||||
autoView( l_v , l, CpuRead);
|
||||
extract(l_v[odx],buf);
|
||||
|
||||
s = buf[idx];
|
||||
@@ -151,6 +151,136 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
|
||||
return;
|
||||
};
|
||||
|
||||
// zero for south pole, one for north pole
|
||||
template<class vobj,class sobj>
|
||||
void peekPole(sobj &s,const Lattice<vobj> &l,const Coordinate &orthog,NorthSouth isNorth)
|
||||
{
|
||||
s=Zero();
|
||||
|
||||
GridBase *grid=l.Grid();
|
||||
|
||||
assert(grid->isIcosahedral());
|
||||
assert(grid->isIcosahedralVertex());
|
||||
|
||||
int Nsimd = grid->Nsimd();
|
||||
|
||||
int rank;
|
||||
|
||||
int Ndm1 = grid->_ndimension-1;
|
||||
Coordinate pgrid = grid->ProcessorGrid();
|
||||
const int xdim=0;
|
||||
const int ydim=1;
|
||||
const int pdim=Ndm1;
|
||||
|
||||
int64_t pole_osite;
|
||||
int64_t pole_isite;
|
||||
Coordinate rdims;
|
||||
Coordinate idims;
|
||||
Coordinate ocoor;
|
||||
Coordinate icoor;
|
||||
Coordinate pcoor(grid->_ndimension);
|
||||
for(int d=2;d<Ndm1;d++){
|
||||
int dd=d-2;
|
||||
rdims.push_back(grid->_rdimensions[d]);
|
||||
idims.push_back(grid->_simd_layout[d]);
|
||||
icoor.push_back((orthog[dd]%grid->_ldimensions[d])/grid->_rdimensions[d]);
|
||||
ocoor.push_back(orthog[dd]%grid->_rdimensions[d]);
|
||||
pcoor[d] = orthog[dd]/grid->_ldimensions[d];
|
||||
}
|
||||
Lexicographic::IndexFromCoor(ocoor,pole_osite,rdims);
|
||||
Lexicographic::IndexFromCoor(icoor,pole_isite,idims);
|
||||
|
||||
int64_t osite;
|
||||
if(isNorth == North){
|
||||
pcoor[xdim] = 0;
|
||||
pcoor[ydim] = pgrid[ydim]-1;
|
||||
pcoor[Ndm1] = pgrid[Ndm1]-1;
|
||||
osite = pole_osite + grid->NorthPoleOsite();
|
||||
} else {
|
||||
pcoor[xdim] = pgrid[xdim]-1;
|
||||
pcoor[ydim] = 0;
|
||||
pcoor[Ndm1] = 0;
|
||||
osite = pole_osite + grid->SouthPoleOsite();
|
||||
}
|
||||
|
||||
rank = grid->RankFromProcessorCoor(pcoor);
|
||||
|
||||
if ( rank == grid->ThisRank() ) {
|
||||
ExtractBuffer<sobj> buf(Nsimd);
|
||||
autoView( l_v , l, CpuWrite);
|
||||
extract(l_v[osite],buf);
|
||||
s = buf[pole_isite];
|
||||
}
|
||||
grid->Broadcast(rank,s);
|
||||
|
||||
return;
|
||||
};
|
||||
template<class vobj,class sobj>
|
||||
void pokePole(const sobj &s,Lattice<vobj> &l,const Coordinate &orthog,NorthSouth isNorth)
|
||||
{
|
||||
GridBase *grid=l.Grid();
|
||||
|
||||
assert(grid->isIcosahedral());
|
||||
assert(grid->isIcosahedralVertex());
|
||||
|
||||
grid->Broadcast(grid->BossRank(),s);
|
||||
|
||||
int Nsimd = grid->Nsimd();
|
||||
int rank;
|
||||
int Ndm1 = grid->_ndimension-1;
|
||||
Coordinate pgrid = grid->ProcessorGrid();
|
||||
const int xdim=0;
|
||||
const int ydim=1;
|
||||
const int pdim=Ndm1;
|
||||
|
||||
int64_t pole_osite;
|
||||
int64_t pole_isite;
|
||||
Coordinate rdims;
|
||||
Coordinate idims;
|
||||
Coordinate ocoor;
|
||||
Coordinate icoor;
|
||||
Coordinate pcoor(grid->_ndimension,0);
|
||||
for(int d=2;d<Ndm1;d++){
|
||||
int dd = d-2;
|
||||
rdims.push_back(grid->_rdimensions[d]);
|
||||
idims.push_back(grid->_simd_layout[d]);
|
||||
icoor.push_back((orthog[dd]%grid->_ldimensions[d])/grid->_rdimensions[d]);
|
||||
ocoor.push_back(orthog[dd]%grid->_rdimensions[d]);
|
||||
pcoor[d] = orthog[dd]/grid->_ldimensions[d];
|
||||
|
||||
int o = orthog[dd];
|
||||
int r = grid->_rdimensions[d];
|
||||
int omr = o % r;
|
||||
}
|
||||
Lexicographic::IndexFromCoor(ocoor,pole_osite,rdims);
|
||||
Lexicographic::IndexFromCoor(icoor,pole_isite,idims);
|
||||
|
||||
int64_t osite;
|
||||
if(isNorth ==North){
|
||||
pcoor[xdim] = 0;
|
||||
pcoor[ydim] = pgrid[ydim]-1;
|
||||
pcoor[Ndm1] = pgrid[Ndm1]-1;
|
||||
osite = pole_osite + grid->NorthPoleOsite();
|
||||
} else {
|
||||
pcoor[xdim] = pgrid[xdim]-1;
|
||||
pcoor[ydim] = 0;
|
||||
pcoor[Ndm1] = 0;
|
||||
osite = pole_osite + grid->SouthPoleOsite();
|
||||
}
|
||||
|
||||
rank = grid->RankFromProcessorCoor(pcoor);
|
||||
|
||||
// extract-modify-merge cycle is easiest way and this is not perf critical
|
||||
if ( rank == grid->ThisRank() ) {
|
||||
ExtractBuffer<sobj> buf(Nsimd);
|
||||
autoView( l_v , l, CpuWrite);
|
||||
extract(l_v[osite],buf);
|
||||
buf[pole_isite] = s;
|
||||
merge(l_v[osite],buf);
|
||||
}
|
||||
return;
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////
|
||||
// Peek a scalar object from the SIMD array
|
||||
//////////////////////////////////////////////////////////
|
||||
@@ -179,7 +309,7 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
|
||||
for(int w=0;w<words;w++){
|
||||
pt[w] = getlane(vp[w],idx);
|
||||
}
|
||||
// std::cout << "peekLocalSite "<<site<<" "<<odx<<","<<idx<<" "<<s<<std::endl;
|
||||
|
||||
return;
|
||||
};
|
||||
template<class vobj,class sobj>
|
||||
|
@@ -55,7 +55,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
|
||||
d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));
|
||||
|
||||
//copy offsets to device
|
||||
acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
|
||||
acceleratorCopyToDeviceAsynch(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
|
||||
|
||||
|
||||
gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
|
||||
@@ -88,7 +88,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
acceleratorCopyFromDeviceAsync(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
|
||||
acceleratorCopyFromDeviceAsynch(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
|
||||
|
||||
//sync after copy
|
||||
accelerator_barrier();
|
||||
|
@@ -466,6 +466,12 @@ public:
|
||||
static deviceVector<vobj> recv_buf;
|
||||
send_buf.resize(buffer_size*2*depth);
|
||||
recv_buf.resize(buffer_size*2*depth);
|
||||
#ifndef ACCELERATOR_AWARE_MPI
|
||||
static hostVector<vobj> hsend_buf;
|
||||
static hostVector<vobj> hrecv_buf;
|
||||
hsend_buf.resize(buffer_size*2*depth);
|
||||
hrecv_buf.resize(buffer_size*2*depth);
|
||||
#endif
|
||||
|
||||
std::vector<MpiCommsRequest_t> fwd_req;
|
||||
std::vector<MpiCommsRequest_t> bwd_req;
|
||||
@@ -495,9 +501,16 @@ public:
|
||||
t_gather+=usecond()-t;
|
||||
|
||||
t=usecond();
|
||||
#ifdef ACCELERATOR_AWARE_MPI
|
||||
grid->SendToRecvFromBegin(fwd_req,
|
||||
(void *)&send_buf[d*buffer_size], xmit_to_rank,
|
||||
(void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
|
||||
#else
|
||||
acceleratorCopyFromDevice(&send_buf[d*buffer_size],&hsend_buf[d*buffer_size],bytes);
|
||||
grid->SendToRecvFromBegin(fwd_req,
|
||||
(void *)&hsend_buf[d*buffer_size], xmit_to_rank,
|
||||
(void *)&hrecv_buf[d*buffer_size], recv_from_rank, bytes, tag);
|
||||
#endif
|
||||
t_comms+=usecond()-t;
|
||||
}
|
||||
for ( int d=0;d < depth ; d ++ ) {
|
||||
@@ -508,9 +521,16 @@ public:
|
||||
t_gather+= usecond() - t;
|
||||
|
||||
t=usecond();
|
||||
#ifdef ACCELERATOR_AWARE_MPI
|
||||
grid->SendToRecvFromBegin(bwd_req,
|
||||
(void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
|
||||
(void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
|
||||
#else
|
||||
acceleratorCopyFromDevice(&send_buf[(d+depth)*buffer_size],&hsend_buf[(d+depth)*buffer_size],bytes);
|
||||
grid->SendToRecvFromBegin(bwd_req,
|
||||
(void *)&hsend_buf[(d+depth)*buffer_size], recv_from_rank,
|
||||
(void *)&hrecv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
|
||||
#endif
|
||||
t_comms+=usecond()-t;
|
||||
}
|
||||
|
||||
@@ -533,8 +553,13 @@ public:
|
||||
|
||||
t=usecond();
|
||||
grid->CommsComplete(fwd_req);
|
||||
#ifndef ACCELERATOR_AWARE_MPI
|
||||
for ( int d=0;d < depth ; d ++ ) {
|
||||
acceleratorCopyToDevice(&hrecv_buf[d*buffer_size],&recv_buf[d*buffer_size],bytes);
|
||||
}
|
||||
#endif
|
||||
t_comms+= usecond() - t;
|
||||
|
||||
|
||||
t=usecond();
|
||||
for ( int d=0;d < depth ; d ++ ) {
|
||||
ScatterSlice(recv_buf,to,nld-depth+d,dimension,plane*buffer_size); plane++;
|
||||
@@ -543,6 +568,11 @@ public:
|
||||
|
||||
t=usecond();
|
||||
grid->CommsComplete(bwd_req);
|
||||
#ifndef ACCELERATOR_AWARE_MPI
|
||||
for ( int d=0;d < depth ; d ++ ) {
|
||||
acceleratorCopyToDevice(&hrecv_buf[(d+depth)*buffer_size],&recv_buf[(d+depth)*buffer_size],bytes);
|
||||
}
|
||||
#endif
|
||||
t_comms+= usecond() - t;
|
||||
|
||||
t=usecond();
|
||||
|
@@ -49,7 +49,7 @@ static constexpr int Tm = 7;

static constexpr int Nc=Config_Nc;
static constexpr int Ns=4;
static constexpr int Nd=4;
static constexpr int Nd=Config_Nd;
static constexpr int Nhs=2; // half spinor
static constexpr int Nds=8; // double stored gauge field
static constexpr int Ngp=2; // gparity index range
@@ -75,6 +75,7 @@ static constexpr int InverseYes=1;
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;

const int SpinorIndex = 2;
const int PauliIndex = 2; //TensorLevel counts from the bottom!
template<typename T> struct isSpinor {
  static constexpr bool value = (SpinorIndex==T::TensorLevel);
};
Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h  (new file, 196 lines)
@@ -0,0 +1,196 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion5D.h
|
||||
|
||||
Copyright (C) 2020 - 2025
|
||||
|
||||
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
|
||||
Author: Nils Meyer <nils.meyer@ur.de>
|
||||
Author: Christoph Lehner <christoph@lhnr.de>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <Grid/qcd/action/fermion/WilsonFermion5D.h>
|
||||
#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
|
||||
#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
|
||||
#include <Grid/qcd/action/fermion/CloverHelpers.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
// see Grid/qcd/action/fermion/CompactWilsonCloverFermion.h for description
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
class CompactWilsonCloverFermion5D : public WilsonFermion5D<Impl>,
|
||||
public WilsonCloverHelpers<Impl>,
|
||||
public CompactWilsonCloverHelpers<Impl> {
|
||||
/////////////////////////////////////////////
|
||||
// Sizes
|
||||
/////////////////////////////////////////////
|
||||
|
||||
public:
|
||||
|
||||
INHERIT_COMPACT_CLOVER_SIZES(Impl);
|
||||
|
||||
/////////////////////////////////////////////
|
||||
// Type definitions
|
||||
/////////////////////////////////////////////
|
||||
|
||||
public:
|
||||
|
||||
INHERIT_IMPL_TYPES(Impl);
|
||||
INHERIT_CLOVER_TYPES(Impl);
|
||||
INHERIT_COMPACT_CLOVER_TYPES(Impl);
|
||||
|
||||
typedef WilsonFermion5D<Impl> WilsonBase;
|
||||
typedef WilsonCloverHelpers<Impl> Helpers;
|
||||
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
|
||||
|
||||
/////////////////////////////////////////////
|
||||
// Constructors
|
||||
/////////////////////////////////////////////
|
||||
|
||||
public:
|
||||
|
||||
CompactWilsonCloverFermion5D(GaugeField& _Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||
GridCartesian &FourDimGrid,
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
const RealD _mass,
|
||||
const RealD _csw_r = 0.0,
|
||||
const RealD _csw_t = 0.0,
|
||||
const RealD _cF = 1.0,
|
||||
const ImplParams& impl_p = ImplParams());
|
||||
|
||||
/////////////////////////////////////////////
|
||||
// Member functions (implementing interface)
|
||||
/////////////////////////////////////////////
|
||||
|
||||
public:
|
||||
|
||||
virtual void Instantiatable() {};
|
||||
int ConstEE() override { return 0; };
|
||||
int isTrivialEE() override { return 0; };
|
||||
|
||||
void Dhop(const FermionField& in, FermionField& out, int dag) override;
|
||||
|
||||
void DhopOE(const FermionField& in, FermionField& out, int dag) override;
|
||||
|
||||
void DhopEO(const FermionField& in, FermionField& out, int dag) override;
|
||||
|
||||
void DhopDir(const FermionField& in, FermionField& out, int dir, int disp) override;
|
||||
|
||||
void DhopDirAll(const FermionField& in, std::vector<FermionField>& out) /* override */;
|
||||
|
||||
void M(const FermionField& in, FermionField& out) override;
|
||||
|
||||
void Mdag(const FermionField& in, FermionField& out) override;
|
||||
|
||||
void Meooe(const FermionField& in, FermionField& out) override;
|
||||
|
||||
void MeooeDag(const FermionField& in, FermionField& out) override;
|
||||
|
||||
void Mooee(const FermionField& in, FermionField& out) override;
|
||||
|
||||
void MooeeDag(const FermionField& in, FermionField& out) override;
|
||||
|
||||
void MooeeInv(const FermionField& in, FermionField& out) override;
|
||||
|
||||
void MooeeInvDag(const FermionField& in, FermionField& out) override;
|
||||
|
||||
void Mdir(const FermionField& in, FermionField& out, int dir, int disp) override;
|
||||
|
||||
void MdirAll(const FermionField& in, std::vector<FermionField>& out) override;
|
||||
|
||||
void MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) override;
|
||||
|
||||
void MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
|
||||
|
||||
void MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
|
||||
|
||||
/////////////////////////////////////////////
|
||||
// Member functions (internals)
|
||||
/////////////////////////////////////////////
|
||||
|
||||
void MooeeInternal(const FermionField& in,
|
||||
FermionField& out,
|
||||
const CloverDiagonalField& diagonal,
|
||||
const CloverTriangleField& triangle);
|
||||
|
||||
/////////////////////////////////////////////
|
||||
// Helpers
|
||||
/////////////////////////////////////////////
|
||||
|
||||
void ImportGauge(const GaugeField& _Umu) override;
|
||||
|
||||
/////////////////////////////////////////////
|
||||
// Helpers
|
||||
/////////////////////////////////////////////
|
||||
|
||||
private:
|
||||
|
||||
template<class Field>
|
||||
const MaskField* getCorrectMaskField(const Field &in) const {
|
||||
if(in.Grid()->_isCheckerBoarded) {
|
||||
if(in.Checkerboard() == Odd) {
|
||||
return &this->BoundaryMaskOdd;
|
||||
} else {
|
||||
return &this->BoundaryMaskEven;
|
||||
}
|
||||
} else {
|
||||
return &this->BoundaryMask;
|
||||
}
|
||||
}
|
||||
|
||||
template<class Field>
|
||||
void ApplyBoundaryMask(Field& f) {
|
||||
const MaskField* m = getCorrectMaskField(f); assert(m != nullptr);
|
||||
assert(m != nullptr);
|
||||
CompactHelpers::ApplyBoundaryMask(f, *m);
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////
|
||||
// Member Data
|
||||
/////////////////////////////////////////////
|
||||
|
||||
public:
|
||||
|
||||
RealD csw_r;
|
||||
RealD csw_t;
|
||||
RealD cF;
|
||||
int n_rhs;
|
||||
|
||||
bool fixedBoundaries;
|
||||
|
||||
CloverDiagonalField Diagonal, DiagonalEven, DiagonalOdd;
|
||||
CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;
|
||||
|
||||
CloverTriangleField Triangle, TriangleEven, TriangleOdd;
|
||||
CloverTriangleField TriangleInv, TriangleInvEven, TriangleInvOdd;
|
||||
|
||||
FermionField Tmp;
|
||||
|
||||
MaskField BoundaryMask, BoundaryMaskEven, BoundaryMaskOdd;
|
||||
};
|
||||
|
||||
NAMESPACE_END(Grid);
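For orientation, the new 5D compact clover operator is constructed like the existing 4D CompactWilsonCloverFermion. A hedged sketch, assuming the gauge field Umu, the grid pointers UGrid, UrbGrid, FGrid, FrbGrid and the parameter values have been set up elsewhere (none of them are defined in this file):

// Illustrative only; grids, gauge field and parameter values are assumptions, not part of this patch.
RealD mass = 0.01, csw_r = 1.0, csw_t = 1.0, cF = 1.0;
CompactWilsonCloverFermion5DD Dcc(Umu,
                                  *FGrid, *FrbGrid,   // 5d grid / red-black grid
                                  *UGrid, *UrbGrid,   // 4d grid / red-black grid
                                  mass, csw_r, csw_t, cF);
CompactWilsonCloverFermion5DD::FermionField src(FGrid), res(FGrid);
Dcc.M(src, res);                                      // apply the full operator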
|
@@ -123,10 +123,10 @@ public:
|
||||
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
|
||||
|
||||
peekLocalSite(ScalarUmu, Umu_v, lcoor);
|
||||
for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu);
|
||||
for (int mu = 0; mu < Nd; mu++) ScalarUds(mu) = ScalarUmu(mu);
|
||||
|
||||
peekLocalSite(ScalarUmu, Uadj_v, lcoor);
|
||||
for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu);
|
||||
for (int mu = 0; mu < Nd; mu++) ScalarUds(mu + Nd) = ScalarUmu(mu);
|
||||
|
||||
pokeLocalSite(ScalarUds, Uds_v, lcoor);
|
||||
});
|
||||
|
@@ -55,6 +55,7 @@ NAMESPACE_CHECK(Wilson);
|
||||
NAMESPACE_CHECK(WilsonTM);
|
||||
#include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
|
||||
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h> // 4d compact wilson clover fermions
|
||||
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h> // 5d compact wilson clover fermions
|
||||
NAMESPACE_CHECK(WilsonClover);
|
||||
#include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types
|
||||
NAMESPACE_CHECK(Wilson5D);
|
||||
@@ -84,6 +85,15 @@ NAMESPACE_CHECK(DomainWall);
|
||||
#include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h>
|
||||
#include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h>
|
||||
NAMESPACE_CHECK(Overlap);
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Two spin wilson fermion based
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include <Grid/qcd/action/fermion/TwoSpinWilsonFermion3plus1D.h>
|
||||
NAMESPACE_CHECK(TwoSpinWilson);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
@@ -164,12 +174,17 @@ typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiS
|
||||
|
||||
// Compact Clover fermions
|
||||
template <typename WImpl> using CompactWilsonClover = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>;
|
||||
template <typename WImpl> using CompactWilsonClover5D = CompactWilsonCloverFermion5D<WImpl, CompactCloverHelpers<WImpl>>;
|
||||
template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>;
|
||||
|
||||
typedef CompactWilsonClover<WilsonImplD2> CompactWilsonCloverFermionD2;
|
||||
typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF;
|
||||
typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD;
|
||||
|
||||
typedef CompactWilsonClover5D<WilsonImplD2> CompactWilsonCloverFermion5DD2;
|
||||
typedef CompactWilsonClover5D<WilsonImplF> CompactWilsonCloverFermion5DF;
|
||||
typedef CompactWilsonClover5D<WilsonImplD> CompactWilsonCloverFermion5DD;
|
||||
|
||||
typedef CompactWilsonExpClover<WilsonImplD2> CompactWilsonExpCloverFermionD2;
|
||||
typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF;
|
||||
typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD;
|
||||
|
@@ -41,8 +41,9 @@ NAMESPACE_CHECK(Compressor);
|
||||
NAMESPACE_CHECK(FermionOperatorImpl);
|
||||
#include <Grid/qcd/action/fermion/FermionOperator.h>
|
||||
NAMESPACE_CHECK(FermionOperator);
|
||||
#include <Grid/qcd/action/fermion/WilsonKernels.h> //used by all wilson type fermions
|
||||
#include <Grid/qcd/action/fermion/WilsonKernels.h> //used by all wilson type fermions
|
||||
#include <Grid/qcd/action/fermion/StaggeredKernels.h> //used by all wilson type fermions
|
||||
#include <Grid/qcd/action/fermion/TwoSpinWilsonKernels.h> //used for 3D fermions, pauli in place of Dirac
|
||||
NAMESPACE_CHECK(Kernels);
|
||||
|
||||
#endif
|
||||
|
@@ -180,6 +180,12 @@ NAMESPACE_CHECK(ImplGparityWilson);
|
||||
#include <Grid/qcd/action/fermion/StaggeredImpl.h>
|
||||
NAMESPACE_CHECK(ImplStaggered);
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
// Two component spinor Wilson action for 3d / Boston
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
#include <Grid/qcd/action/fermion/TwoSpinWilsonImpl.h>
|
||||
NAMESPACE_CHECK(ImplTwoSpinWilson);
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
// Single flavour one component spinors with colour index. 5d vec
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
|
@@ -274,7 +274,7 @@ public:
|
||||
autoView( Uds_v , Uds, CpuWrite);
|
||||
autoView( Utmp_v, Utmp, CpuWrite);
|
||||
thread_foreach(ss,Utmp_v,{
|
||||
Uds_v[ss](0)(mu+4) = Utmp_v[ss]();
|
||||
Uds_v[ss](0)(mu+Nd) = Utmp_v[ss]();
|
||||
});
|
||||
}
|
||||
Utmp = Uconj;
|
||||
@@ -286,7 +286,7 @@ public:
|
||||
autoView( Uds_v , Uds, CpuWrite);
|
||||
autoView( Utmp_v, Utmp, CpuWrite);
|
||||
thread_foreach(ss,Utmp_v,{
|
||||
Uds_v[ss](1)(mu+4) = Utmp_v[ss]();
|
||||
Uds_v[ss](1)(mu+Nd) = Utmp_v[ss]();
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -320,7 +320,7 @@ public:
|
||||
}
|
||||
|
||||
Uconj = conjugate(*Upoke);
|
||||
pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu + 4);
|
||||
pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu + Nd);
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -36,6 +36,8 @@ public:
|
||||
static const std::vector<int> directions;
|
||||
static const std::vector<int> displacements;
|
||||
static const int npoint = 16;
|
||||
static std::vector<int> MakeDirections(void);
|
||||
static std::vector<int> MakeDisplacements(void);
|
||||
};
|
||||
|
||||
template <class Impl>
|
||||
|
@@ -40,6 +40,8 @@ public:
|
||||
static const std::vector<int> directions;
|
||||
static const std::vector<int> displacements;
|
||||
const int npoint = 16;
|
||||
static std::vector<int> MakeDirections(void);
|
||||
static std::vector<int> MakeDisplacements(void);
|
||||
};
|
||||
|
||||
template<class Impl>
|
||||
|
@@ -36,6 +36,8 @@ public:
|
||||
static const std::vector<int> directions;
|
||||
static const std::vector<int> displacements;
|
||||
static const int npoint = 8;
|
||||
static std::vector<int> MakeDirections(void);
|
||||
static std::vector<int> MakeDisplacements(void);
|
||||
};
|
||||
|
||||
template <class Impl>
|
||||
|
@@ -141,9 +141,9 @@ public:
|
||||
Udag = Udag *phases;
|
||||
|
||||
InsertGaugeField(Uds,U,mu);
|
||||
InsertGaugeField(Uds,Udag,mu+4);
|
||||
InsertGaugeField(Uds,Udag,mu+Nd);
|
||||
// PokeIndex<LorentzIndex>(Uds, U, mu);
|
||||
// PokeIndex<LorentzIndex>(Uds, Udag, mu + 4);
|
||||
// PokeIndex<LorentzIndex>(Uds, Udag, mu + Nd);
|
||||
|
||||
// 3 hop based on thin links. Crazy huh ?
|
||||
U = PeekIndex<LorentzIndex>(Uthin, mu);
|
||||
@@ -156,7 +156,7 @@ public:
|
||||
UUUdag = UUUdag *phases;
|
||||
|
||||
InsertGaugeField(UUUds,UUU,mu);
|
||||
InsertGaugeField(UUUds,UUUdag,mu+4);
|
||||
InsertGaugeField(UUUds,UUUdag,mu+Nd);
|
||||
|
||||
}
|
||||
}
|
||||
|
175  Grid/qcd/action/fermion/TwoSpinWilsonFermion3plus1D.h  Normal file
@@ -0,0 +1,175 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/TwoSpinWilsonFermion3plus1D.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
#pragma once

NAMESPACE_BEGIN(Grid);

class TwoSpinWilsonFermion3plus1DStatic {
public:
  // S-direction is INNERMOST and takes no part in the parity.
  static const std::vector<int> directions;
  static const std::vector<int> displacements;
  static constexpr int npoint = 6;
  static std::vector<int> MakeDirections(void);
  static std::vector<int> MakeDisplacements(void);
};
|
||||
|
||||
template<class Impl>
|
||||
class TwoSpinWilsonFermion3plus1D : public TwoSpinWilsonKernels<Impl>, public TwoSpinWilsonFermion3plus1DStatic
|
||||
{
|
||||
public:
|
||||
INHERIT_IMPL_TYPES(Impl);
|
||||
typedef TwoSpinWilsonKernels<Impl> Kernels;
|
||||
|
||||
FermionField _tmp;
|
||||
FermionField &tmp(void) { return _tmp; }
|
||||
|
||||
int Dirichlet;
|
||||
Coordinate Block;
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Implement the abstract base
|
||||
///////////////////////////////////////////////////////////////
|
||||
GridBase *GaugeGrid(void) { return _ThreeDimGrid ;}
|
||||
GridBase *GaugeRedBlackGrid(void) { return _ThreeDimRedBlackGrid ;}
|
||||
GridBase *FermionGrid(void) { return _FourDimGrid;}
|
||||
GridBase *FermionRedBlackGrid(void) { return _FourDimRedBlackGrid;}
|
||||
|
||||
// full checkerboard operations; leave unimplemented as abstract for now
|
||||
virtual void M (const FermionField &in, FermionField &out){assert(0);};
|
||||
virtual void Mdag (const FermionField &in, FermionField &out){assert(0);};
|
||||
|
||||
// half checkerboard operations; leave unimplemented as abstract for now
|
||||
virtual void Meooe (const FermionField &in, FermionField &out);
|
||||
virtual void Mooee (const FermionField &in, FermionField &out);
|
||||
virtual void MooeeInv (const FermionField &in, FermionField &out);
|
||||
|
||||
virtual void MeooeDag (const FermionField &in, FermionField &out);
|
||||
virtual void MooeeDag (const FermionField &in, FermionField &out);
|
||||
virtual void MooeeInvDag (const FermionField &in, FermionField &out);
|
||||
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
|
||||
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
|
||||
|
||||
// These can be overridden by fancy 5d chiral action
|
||||
virtual void DhopDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
|
||||
virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
|
||||
virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
|
||||
|
||||
// void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
|
||||
void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
|
||||
void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
|
||||
|
||||
// Implement hopping term non-hermitian hopping term; half cb or both
|
||||
// Implement s-diagonal DW
|
||||
void DW (const FermionField &in, FermionField &out,int dag);
|
||||
void Dhop (const FermionField &in, FermionField &out,int dag);
|
||||
void DhopOE(const FermionField &in, FermionField &out,int dag);
|
||||
void DhopEO(const FermionField &in, FermionField &out,int dag);
|
||||
|
||||
void DhopComms (const FermionField &in, FermionField &out);
|
||||
void DhopCalc (const FermionField &in, FermionField &out,uint64_t *ids);
|
||||
|
||||
// add a DhopComm
|
||||
// -- suboptimal interface will presently trigger multiple comms.
|
||||
void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
|
||||
void DhopDirAll(const FermionField &in,std::vector<FermionField> &out);
|
||||
void DhopDirComms(const FermionField &in);
|
||||
void DhopDirCalc(const FermionField &in, FermionField &out,int point);
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// New methods added
|
||||
///////////////////////////////////////////////////////////////
|
||||
void DerivInternal(StencilImpl & st,
|
||||
DoubledGaugeField & U,
|
||||
GaugeField &mat,
|
||||
const FermionField &A,
|
||||
const FermionField &B,
|
||||
int dag);
|
||||
|
||||
void DhopInternal(StencilImpl & st,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in,
|
||||
FermionField &out,
|
||||
int dag);
|
||||
|
||||
void DhopInternalOverlappedComms(StencilImpl & st,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in,
|
||||
FermionField &out,
|
||||
int dag);
|
||||
|
||||
void DhopInternalSerialComms(StencilImpl & st,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in,
|
||||
FermionField &out,
|
||||
int dag);
|
||||
|
||||
// Constructors
|
||||
TwoSpinWilsonFermion3plus1D(GaugeField &_Umu,
|
||||
GridCartesian &FourDimGrid,
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
GridCartesian &ThreeDimGrid,
|
||||
GridRedBlackCartesian &ThreeDimRedBlackGrid,
|
||||
double _M5,const ImplParams &p= ImplParams());
|
||||
|
||||
virtual void DirichletBlock(const Coordinate & block)
|
||||
{
|
||||
}
|
||||
|
||||
// DoubleStore
|
||||
void ImportGauge(const GaugeField &_Umu);
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Data members require to support the functionality
|
||||
///////////////////////////////////////////////////////////////
|
||||
public:
|
||||
|
||||
// Add these to the support from Wilson
|
||||
GridBase *_ThreeDimGrid;
|
||||
GridBase *_ThreeDimRedBlackGrid;
|
||||
GridBase *_FourDimGrid;
|
||||
GridBase *_FourDimRedBlackGrid;
|
||||
|
||||
double M5;
|
||||
int Ls;
|
||||
|
||||
//Defines the stencils for even and odd
|
||||
StencilImpl Stencil;
|
||||
StencilImpl StencilEven;
|
||||
StencilImpl StencilOdd;
|
||||
|
||||
// Copy of the gauge field , with even and odd subsets
|
||||
DoubledGaugeField Umu;
|
||||
DoubledGaugeField UmuEven;
|
||||
DoubledGaugeField UmuOdd;
|
||||
|
||||
};
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
222  Grid/qcd/action/fermion/TwoSpinWilsonImpl.h  Normal file
@@ -0,0 +1,222 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/FermionOperatorImpl.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#pragma once
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
// Single flavour four spinors with colour index
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
template <class S, class Representation = FundamentalRepresentation,class Options = CoeffReal >
|
||||
class TwoSpinWilsonImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
|
||||
public:
|
||||
|
||||
static const int Dimension = Representation::Dimension;
|
||||
static const bool isFundamental = Representation::isFundamental;
|
||||
|
||||
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
|
||||
INHERIT_GIMPL_TYPES(Gimpl);
|
||||
|
||||
//Necessary?
|
||||
constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;}
|
||||
|
||||
typedef typename Options::_Coeff_t Coeff_t;
|
||||
|
||||
template <typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
|
||||
template <typename vtype> using iImplPropagator = iScalar<iMatrix<iMatrix<vtype, Dimension>, Nhs> >;
|
||||
template <typename vtype> using iImplHalfSpinor = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
|
||||
template <typename vtype> using iImplHalfCommSpinor = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
|
||||
template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
|
||||
|
||||
typedef iImplSpinor<Simd> SiteSpinor;
|
||||
typedef iImplPropagator<Simd> SitePropagator;
|
||||
typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
|
||||
typedef iImplHalfCommSpinor<Simd> SiteHalfCommSpinor;
|
||||
typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
|
||||
|
||||
typedef Lattice<SiteSpinor> FermionField;
|
||||
typedef Lattice<SitePropagator> PropagatorField;
|
||||
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
|
||||
|
||||
typedef SimpleCompressor<SiteSpinor> Compressor;
|
||||
typedef WilsonImplParams ImplParams;
|
||||
typedef CartesianStencil<SiteSpinor, SiteSpinor, ImplParams> StencilImpl;
|
||||
typedef const typename StencilImpl::View_type StencilView;
|
||||
|
||||
ImplParams Params;
|
||||
|
||||
TwoSpinWilsonImpl(const ImplParams &p = ImplParams()) : Params(p){
|
||||
};
|
||||
|
||||
template<class _Spinor>
|
||||
static accelerator_inline void multLink(_Spinor &phi,
|
||||
const SiteDoubledGaugeField &U,
|
||||
const _Spinor &chi,
|
||||
int mu)
|
||||
{
|
||||
auto UU = coalescedRead(U(mu));
|
||||
mult(&phi(), &UU, &chi());
|
||||
}
|
||||
template<class _Spinor>
|
||||
static accelerator_inline void multLink(_Spinor &phi,
|
||||
const SiteDoubledGaugeField &U,
|
||||
const _Spinor &chi,
|
||||
int mu,
|
||||
StencilEntry *SE,
|
||||
StencilView &St)
|
||||
{
|
||||
multLink(phi,U,chi,mu);
|
||||
}
|
||||
|
||||
template<class _SpinorField>
|
||||
inline void multLinkField(_SpinorField & out,
|
||||
const DoubledGaugeField &Umu,
|
||||
const _SpinorField & phi,
|
||||
int mu)
|
||||
{
|
||||
const int Nsimd = SiteHalfSpinor::Nsimd();
|
||||
autoView( out_v, out, AcceleratorWrite);
|
||||
autoView( phi_v, phi, AcceleratorRead);
|
||||
autoView( Umu_v, Umu, AcceleratorRead);
|
||||
typedef decltype(coalescedRead(out_v[0])) calcSpinor;
|
||||
accelerator_for(sss,out.Grid()->oSites(),Nsimd,{
|
||||
calcSpinor tmp;
|
||||
multLink(tmp,Umu_v[sss],phi_v(sss),mu);
|
||||
coalescedWrite(out_v[sss],tmp);
|
||||
});
|
||||
}
|
||||
|
||||
template <class ref>
|
||||
static accelerator_inline void loadLinkElement(Simd ®, ref &memory)
|
||||
{
|
||||
reg = memory;
|
||||
}
|
||||
|
||||
inline void DoubleStore(GridBase *GaugeGrid,
|
||||
DoubledGaugeField &Uds,
|
||||
const GaugeField &Umu)
|
||||
{
|
||||
typedef typename Simd::scalar_type scalar_type;
|
||||
|
||||
conformable(Uds.Grid(), GaugeGrid);
|
||||
conformable(Umu.Grid(), GaugeGrid);
|
||||
|
||||
GaugeLinkField U(GaugeGrid);
|
||||
GaugeLinkField tmp(GaugeGrid);
|
||||
|
||||
Lattice<iScalar<vInteger> > coor(GaugeGrid);
|
||||
////////////////////////////////////////////////////
|
||||
// apply any boundary phase or twists
|
||||
////////////////////////////////////////////////////
|
||||
for (int mu = 0; mu < Nd; mu++) {
|
||||
|
||||
////////// boundary phase /////////////
|
||||
auto pha = Params.boundary_phases[mu];
|
||||
scalar_type phase( real(pha),imag(pha) );
|
||||
|
||||
int L = GaugeGrid->GlobalDimensions()[mu];
|
||||
int Lmu = L - 1;
|
||||
|
||||
LatticeCoordinate(coor, mu);
|
||||
|
||||
U = PeekIndex<LorentzIndex>(Umu, mu);
|
||||
|
||||
// apply any twists
|
||||
RealD theta = Params.twist_n_2pi_L[mu] * 2*M_PI / L;
|
||||
if ( theta != 0.0) {
|
||||
scalar_type twphase(::cos(theta),::sin(theta));
|
||||
U = twphase*U;
|
||||
std::cout << GridLogMessage << " Twist ["<<mu<<"] "<< Params.twist_n_2pi_L[mu]<< " phase"<<phase <<std::endl;
|
||||
}
|
||||
|
||||
tmp = where(coor == Lmu, phase * U, U);
|
||||
PokeIndex<LorentzIndex>(Uds, tmp, mu);
|
||||
|
||||
U = adj(Cshift(U, mu, -1));
|
||||
U = where(coor == 0, conjugate(phase) * U, U);
|
||||
PokeIndex<LorentzIndex>(Uds, U, mu + Nd);
|
||||
}
|
||||
}
|
||||
|
||||
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
|
||||
GaugeLinkField link(mat.Grid());
|
||||
link = TraceIndex<SpinIndex>(outerProduct(Btilde,A));
|
||||
PokeIndex<LorentzIndex>(mat,link,mu);
|
||||
}
|
||||
|
||||
inline void outerProductImpl(PropagatorField &mat, const FermionField &B, const FermionField &A){
|
||||
mat = outerProduct(B,A);
|
||||
}
|
||||
|
||||
inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
|
||||
mat = TraceIndex<SpinIndex>(P);
|
||||
}
|
||||
|
||||
inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds)
|
||||
{
|
||||
for (int mu = 0; mu < Nd; mu++)
|
||||
mat[mu] = PeekIndex<LorentzIndex>(Uds, mu);
|
||||
}
|
||||
|
||||
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu)
|
||||
{
|
||||
int Ls=Btilde.Grid()->_fdimensions[0];
|
||||
autoView( mat_v , mat, AcceleratorWrite);
|
||||
{
|
||||
const int Nsimd = SiteSpinor::Nsimd();
|
||||
autoView( Btilde_v , Btilde, AcceleratorRead);
|
||||
autoView( Atilde_v , Atilde, AcceleratorRead);
|
||||
accelerator_for(sss,mat.Grid()->oSites(),Nsimd,{
|
||||
int sU=sss;
|
||||
typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
|
||||
ColorMatrixType sum;
|
||||
zeroit(sum);
|
||||
for(int s=0;s<Ls;s++){
|
||||
int sF = s+Ls*sU;
|
||||
for(int spn=0;spn<Ns;spn++){ //sum over spin
|
||||
auto bb = coalescedRead(Btilde_v[sF]()(spn) ); //color vector
|
||||
auto aa = coalescedRead(Atilde_v[sF]()(spn) );
|
||||
auto op = outerProduct(bb,aa);
|
||||
sum = sum + op;
|
||||
}
|
||||
}
|
||||
coalescedWrite(mat_v[sU](mu)(), sum);
|
||||
});
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
typedef TwoSpinWilsonImpl<vComplex, FundamentalRepresentation, CoeffReal > TwoSpinWilsonImplR; // Real.. whichever prec
|
||||
typedef TwoSpinWilsonImpl<vComplexF, FundamentalRepresentation, CoeffReal > TwoSpinWilsonImplF; // Float
|
||||
typedef TwoSpinWilsonImpl<vComplexD, FundamentalRepresentation, CoeffReal > TwoSpinWilsonImplD; // Double
|
||||
typedef TwoSpinWilsonImpl<vComplexD2, FundamentalRepresentation, CoeffReal > TwoSpinWilsonImplD2; // Double
|
||||
|
||||
NAMESPACE_END(Grid);
|
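A minimal usage sketch for the two new headers above (not part of the diff): it assumes the caller already owns the four grids and a gauge field, since their construction is not shown here, and the function name, M5 value and zero source are illustrative only.

#include <Grid/Grid.h>
using namespace Grid;

typedef TwoSpinWilsonFermion3plus1D<TwoSpinWilsonImplD> TwoSpinWilsonOpD;

// Sketch: construct the 3+1d two-spin Wilson operator and apply its hopping term.
// Assumptions: Umu (on the 3d gauge grid) and the four grids are built elsewhere;
// M5 is an illustrative value, not taken from the diff.
void applyTwoSpinDhop(TwoSpinWilsonOpD::GaugeField &Umu,
                      GridCartesian         &FourDimGrid,
                      GridRedBlackCartesian &FourDimRedBlackGrid,
                      GridCartesian         &ThreeDimGrid,
                      GridRedBlackCartesian &ThreeDimRedBlackGrid)
{
  double M5 = 1.8;                      // illustrative mass parameter
  WilsonImplParams params;              // default boundary phases

  TwoSpinWilsonOpD Dop(Umu, FourDimGrid, FourDimRedBlackGrid,
                       ThreeDimGrid, ThreeDimRedBlackGrid, M5, params);

  TwoSpinWilsonOpD::FermionField src(&FourDimGrid), res(&FourDimGrid);
  src = Zero();                         // placeholder source
  Dop.Dhop(src, res, DaggerNo);         // s-local 3d hopping term over the 4d field
}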
84  Grid/qcd/action/fermion/TwoSpinWilsonKernels.h  Normal file
@@ -0,0 +1,84 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonKernels.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#pragma once
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Helper routines that implement Wilson stencil for a single site.
|
||||
// Common to both the WilsonFermion and WilsonFermion5D
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template<class Impl> class TwoSpinWilsonKernels : public FermionOperator<Impl> {
|
||||
public:
|
||||
|
||||
INHERIT_IMPL_TYPES(Impl);
|
||||
typedef FermionOperator<Impl> Base;
|
||||
typedef AcceleratorVector<int,STENCIL_MAX> StencilVector;
|
||||
public:
|
||||
|
||||
static void DhopKernel(StencilImpl &st, DoubledGaugeField &U, SiteSpinor * buf,
|
||||
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||
int interior=1,int exterior=1) ;
|
||||
|
||||
static void DhopKernel(StencilImpl &st, DoubledGaugeField &U, SiteSpinor * buf,
|
||||
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||
uint64_t *ids);
|
||||
|
||||
static void DhopDagKernel(StencilImpl &st, DoubledGaugeField &U, SiteSpinor * buf,
|
||||
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||
int interior=1,int exterior=1) ;
|
||||
|
||||
static void DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteSpinor *buf, int Ls,
|
||||
int Nsite, const FermionField &in, std::vector<FermionField> &out) ;
|
||||
|
||||
static void DhopDirKernel(StencilImpl &st, DoubledGaugeField &U,SiteSpinor * buf,
|
||||
int Ls, int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma);
|
||||
|
||||
private:
|
||||
|
||||
static accelerator_inline void DhopDirK(StencilView &st, DoubledGaugeFieldView &U,SiteSpinor * buf,
|
||||
int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dirdisp, int gamma);
|
||||
|
||||
static accelerator_inline void DhopDirXp(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
|
||||
static accelerator_inline void DhopDirYp(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
|
||||
static accelerator_inline void DhopDirZp(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
|
||||
static accelerator_inline void DhopDirXm(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
|
||||
static accelerator_inline void DhopDirYm(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
|
||||
static accelerator_inline void DhopDirZm(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
|
||||
|
||||
public:
|
||||
TwoSpinWilsonKernels(const ImplParams &p = ImplParams()) : Base(p){};
|
||||
};
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
|
@@ -484,6 +484,11 @@ public:
|
||||
this->face_table_computed=1;
|
||||
assert(this->u_comm_offset==this->_unified_buffer_size);
|
||||
accelerator_barrier();
|
||||
#ifdef NVLINK_GET
|
||||
this->_grid->StencilBarrier(); // He can now get mu local gather, I can get his
|
||||
// Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check
|
||||
// Or issue barrier AFTER the DMA is running
|
||||
#endif
|
||||
}
|
||||
|
||||
};
|
||||
|
@@ -38,6 +38,8 @@ public:
|
||||
static int MortonOrder;
|
||||
static const std::vector<int> directions;
|
||||
static const std::vector<int> displacements;
|
||||
static std::vector<int> MakeDirections(void);
|
||||
static std::vector<int> MakeDisplacements(void);
|
||||
static const int npoint = 8;
|
||||
};
|
||||
|
||||
|
@@ -62,6 +62,8 @@ public:
|
||||
static const std::vector<int> directions;
|
||||
static const std::vector<int> displacements;
|
||||
static constexpr int npoint = 8;
|
||||
static std::vector<int> MakeDirections(void);
|
||||
static std::vector<int> MakeDisplacements(void);
|
||||
};
|
||||
|
||||
template<class Impl>
|
||||
@@ -91,13 +93,13 @@ public:
|
||||
virtual void Mdag (const FermionField &in, FermionField &out){assert(0);};
|
||||
|
||||
// half checkerboard operations; leave unimplemented as abstract for now
|
||||
virtual void Meooe (const FermionField &in, FermionField &out){assert(0);};
|
||||
virtual void Mooee (const FermionField &in, FermionField &out){assert(0);};
|
||||
virtual void MooeeInv (const FermionField &in, FermionField &out){assert(0);};
|
||||
virtual void Meooe (const FermionField &in, FermionField &out);
|
||||
virtual void Mooee (const FermionField &in, FermionField &out);
|
||||
virtual void MooeeInv (const FermionField &in, FermionField &out);
|
||||
|
||||
virtual void MeooeDag (const FermionField &in, FermionField &out){assert(0);};
|
||||
virtual void MooeeDag (const FermionField &in, FermionField &out){assert(0);};
|
||||
virtual void MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
|
||||
virtual void MeooeDag (const FermionField &in, FermionField &out);
|
||||
virtual void MooeeDag (const FermionField &in, FermionField &out);
|
||||
virtual void MooeeInvDag (const FermionField &in, FermionField &out);
|
||||
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
|
||||
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
|
||||
|
||||
|
@@ -166,7 +166,7 @@ public:
|
||||
|
||||
U = adj(Cshift(U, mu, -1));
|
||||
U = where(coor == 0, conjugate(phase) * U, U);
|
||||
PokeIndex<LorentzIndex>(Uds, U, mu + 4);
|
||||
PokeIndex<LorentzIndex>(Uds, U, mu + Nd);
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -56,7 +56,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
|
||||
Frbgrid,
|
||||
Ugrid,
|
||||
Urbgrid,
|
||||
4.0,p)
|
||||
Nd*1.0,p)
|
||||
|
||||
{
|
||||
update(_mass,_mu);
|
||||
@@ -83,7 +83,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
//axpibg5x(out,in,a,b); // out = a*in + b*i*G5*in
|
||||
for (int s=0;s<(int)this->mass.size();s++) {
|
||||
ComplexD a = 4.0+this->mass[s];
|
||||
ComplexD a = Nd*1.0+this->mass[s];
|
||||
ComplexD b(0.0,this->mu[s]);
|
||||
axpbg5y_ssp(out,a,in,b,in,s,s);
|
||||
}
|
||||
@@ -92,7 +92,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
|
||||
virtual void MooeeDag(const FermionField &in, FermionField &out) {
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
for (int s=0;s<(int)this->mass.size();s++) {
|
||||
ComplexD a = 4.0+this->mass[s];
|
||||
ComplexD a = Nd*1.0+this->mass[s];
|
||||
ComplexD b(0.0,-this->mu[s]);
|
||||
axpbg5y_ssp(out,a,in,b,in,s,s);
|
||||
}
|
||||
@@ -101,7 +101,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
|
||||
for (int s=0;s<(int)this->mass.size();s++) {
|
||||
RealD m = this->mass[s];
|
||||
RealD tm = this->mu[s];
|
||||
RealD mtil = 4.0+this->mass[s];
|
||||
RealD mtil = Nd*1.0+this->mass[s];
|
||||
RealD sq = mtil*mtil+tm*tm;
|
||||
ComplexD a = mtil/sq;
|
||||
ComplexD b(0.0, -tm /sq);
|
||||
@@ -112,7 +112,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
|
||||
for (int s=0;s<(int)this->mass.size();s++) {
|
||||
RealD m = this->mass[s];
|
||||
RealD tm = this->mu[s];
|
||||
RealD mtil = 4.0+this->mass[s];
|
||||
RealD mtil = Nd*1.0+this->mass[s];
|
||||
RealD sq = mtil*mtil+tm*tm;
|
||||
ComplexD a = mtil/sq;
|
||||
ComplexD b(0.0,tm /sq);
|
||||
@@ -126,7 +126,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
|
||||
this->Dhop(in, out, DaggerNo);
|
||||
FermionField tmp(out.Grid());
|
||||
for (int s=0;s<(int)this->mass.size();s++) {
|
||||
ComplexD a = 4.0+this->mass[s];
|
||||
ComplexD a = Nd*1.0+this->mass[s];
|
||||
ComplexD b(0.0,this->mu[s]);
|
||||
axpbg5y_ssp(tmp,a,in,b,in,s,s);
|
||||
}
|
||||
|
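For reference on the Mooee/MooeeInv hunks above: with \( \tilde m_s = Nd + m_s \) and twist \( \mu_s \), the even-even block and its inverse are
\[ M_{ee} = \tilde m_s + i\mu_s\gamma_5, \qquad M_{ee}^{-1} = \frac{\tilde m_s - i\mu_s\gamma_5}{\tilde m_s^2 + \mu_s^2}, \]
which matches the coefficient pattern a = mtil/sq, b = -i*tm/sq (sign flipped for the dagger) with sq = mtil*mtil + tm*tm used in the code.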
@@ -0,0 +1,376 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion5DImplementation.h
|
||||
|
||||
Copyright (C) 2017 - 2025
|
||||
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Guido Cossu <guido.cossu@ed.ac.uk>
|
||||
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
|
||||
Author: Christoph Lehner <christoph@lhnr.de>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#include <Grid/Grid.h>
|
||||
#include <Grid/qcd/spin/Dirac.h>
|
||||
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h>
|
||||
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
template<class Impl, class CloverHelpers>
|
||||
CompactWilsonCloverFermion5D<Impl, CloverHelpers>::CompactWilsonCloverFermion5D(GaugeField& _Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||
GridCartesian &FourDimGrid,
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
const RealD _mass,
|
||||
const RealD _csw_r,
|
||||
const RealD _csw_t,
|
||||
const RealD _cF,
|
||||
const ImplParams& impl_p)
|
||||
: WilsonBase(_Umu, FiveDimGrid, FiveDimRedBlackGrid, FourDimGrid, FourDimRedBlackGrid, _mass, impl_p)
|
||||
, csw_r(_csw_r)
|
||||
, csw_t(_csw_t)
|
||||
, cF(_cF)
|
||||
, fixedBoundaries(impl_p.boundary_phases[Nd-1] == 0.0)
|
||||
, Diagonal(&FourDimGrid), Triangle(&FourDimGrid)
|
||||
, DiagonalEven(&FourDimRedBlackGrid), TriangleEven(&FourDimRedBlackGrid)
|
||||
, DiagonalOdd(&FourDimRedBlackGrid), TriangleOdd(&FourDimRedBlackGrid)
|
||||
, DiagonalInv(&FourDimGrid), TriangleInv(&FourDimGrid)
|
||||
, DiagonalInvEven(&FourDimRedBlackGrid), TriangleInvEven(&FourDimRedBlackGrid)
|
||||
, DiagonalInvOdd(&FourDimRedBlackGrid), TriangleInvOdd(&FourDimRedBlackGrid)
|
||||
, Tmp(&FiveDimGrid)
|
||||
, BoundaryMask(&FiveDimGrid)
|
||||
, BoundaryMaskEven(&FiveDimRedBlackGrid), BoundaryMaskOdd(&FiveDimRedBlackGrid)
|
||||
{
|
||||
assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3);
|
||||
|
||||
csw_r *= 0.5;
|
||||
csw_t *= 0.5;
|
||||
//if (clover_anisotropy.isAnisotropic)
|
||||
// csw_r /= clover_anisotropy.xi_0;
|
||||
|
||||
ImportGauge(_Umu);
|
||||
if (fixedBoundaries) {
|
||||
this->BoundaryMaskEven.Checkerboard() = Even;
|
||||
this->BoundaryMaskOdd.Checkerboard() = Odd;
|
||||
CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
|
||||
}
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Dhop(const FermionField& in, FermionField& out, int dag) {
|
||||
WilsonBase::Dhop(in, out, dag);
|
||||
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopOE(const FermionField& in, FermionField& out, int dag) {
|
||||
WilsonBase::DhopOE(in, out, dag);
|
||||
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopEO(const FermionField& in, FermionField& out, int dag) {
|
||||
WilsonBase::DhopEO(in, out, dag);
|
||||
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
|
||||
WilsonBase::DhopDir(in, out, dir, disp);
|
||||
if(this->fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
|
||||
WilsonBase::DhopDirAll(in, out);
|
||||
if(this->fixedBoundaries) {
|
||||
for(auto& o : out) ApplyBoundaryMask(o);
|
||||
}
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::M(const FermionField& in, FermionField& out) {
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
|
||||
Mooee(in, Tmp);
|
||||
axpy(out, 1.0, out, Tmp);
|
||||
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Mdag(const FermionField& in, FermionField& out) {
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
WilsonBase::Dhop(in, out, DaggerYes); // call base to save applying bc
|
||||
MooeeDag(in, Tmp);
|
||||
axpy(out, 1.0, out, Tmp);
|
||||
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Meooe(const FermionField& in, FermionField& out) {
|
||||
WilsonBase::Meooe(in, out);
|
||||
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MeooeDag(const FermionField& in, FermionField& out) {
|
||||
WilsonBase::MeooeDag(in, out);
|
||||
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Mooee(const FermionField& in, FermionField& out) {
|
||||
if(in.Grid()->_isCheckerBoarded) {
|
||||
if(in.Checkerboard() == Odd) {
|
||||
MooeeInternal(in, out, DiagonalOdd, TriangleOdd);
|
||||
} else {
|
||||
MooeeInternal(in, out, DiagonalEven, TriangleEven);
|
||||
}
|
||||
} else {
|
||||
MooeeInternal(in, out, Diagonal, Triangle);
|
||||
}
|
||||
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeDag(const FermionField& in, FermionField& out) {
|
||||
Mooee(in, out); // blocks are hermitian
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInv(const FermionField& in, FermionField& out) {
|
||||
if(in.Grid()->_isCheckerBoarded) {
|
||||
if(in.Checkerboard() == Odd) {
|
||||
MooeeInternal(in, out, DiagonalInvOdd, TriangleInvOdd);
|
||||
} else {
|
||||
MooeeInternal(in, out, DiagonalInvEven, TriangleInvEven);
|
||||
}
|
||||
} else {
|
||||
MooeeInternal(in, out, DiagonalInv, TriangleInv);
|
||||
}
|
||||
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInvDag(const FermionField& in, FermionField& out) {
|
||||
MooeeInv(in, out); // blocks are hermitian
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Mdir(const FermionField& in, FermionField& out, int dir, int disp) {
|
||||
DhopDir(in, out, dir, disp);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MdirAll(const FermionField& in, std::vector<FermionField>& out) {
|
||||
DhopDirAll(in, out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
|
||||
assert(!fixedBoundaries); // TODO check for changes required for open bc
|
||||
|
||||
// NOTE: code copied from original clover term
|
||||
conformable(X.Grid(), Y.Grid());
|
||||
conformable(X.Grid(), force.Grid());
|
||||
GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
|
||||
GaugeField clover_force(force.Grid());
|
||||
PropagatorField Lambda(force.Grid());
|
||||
|
||||
// Guido: Here we are hitting some performance issues:
|
||||
// need to extract the components of the DoubledGaugeField
|
||||
// for each call
|
||||
// Possible solution
|
||||
// Create a vector object to store them? (cons: wasting space)
|
||||
std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
|
||||
|
||||
Impl::extractLinkField(U, this->Umu);
|
||||
|
||||
force = Zero();
|
||||
// Derivative of the Wilson hopping term
|
||||
this->DhopDeriv(force, X, Y, dag);
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
// Clover term derivative
|
||||
///////////////////////////////////////////////////////////
|
||||
Impl::outerProductImpl(Lambda, X, Y);
|
||||
//std::cout << "Lambda:" << Lambda << std::endl;
|
||||
|
||||
Gamma::Algebra sigma[] = {
|
||||
Gamma::Algebra::SigmaXY,
|
||||
Gamma::Algebra::SigmaXZ,
|
||||
Gamma::Algebra::SigmaXT,
|
||||
Gamma::Algebra::MinusSigmaXY,
|
||||
Gamma::Algebra::SigmaYZ,
|
||||
Gamma::Algebra::SigmaYT,
|
||||
Gamma::Algebra::MinusSigmaXZ,
|
||||
Gamma::Algebra::MinusSigmaYZ,
|
||||
Gamma::Algebra::SigmaZT,
|
||||
Gamma::Algebra::MinusSigmaXT,
|
||||
Gamma::Algebra::MinusSigmaYT,
|
||||
Gamma::Algebra::MinusSigmaZT};
|
||||
|
||||
/*
|
||||
sigma_{\mu \nu}=
|
||||
| 0 sigma[0] sigma[1] sigma[2] |
|
||||
| sigma[3] 0 sigma[4] sigma[5] |
|
||||
| sigma[6] sigma[7] 0 sigma[8] |
|
||||
| sigma[9] sigma[10] sigma[11] 0 |
|
||||
*/
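  // Reading the table: the loop below walks the off-diagonal (mu,nu) pairs in row-major
  // order, so count = 0,1,2 picks sigma[0..2] for the mu = x row, count = 3,4,5 the
  // mu = y row, and so on, matching the layout of sigma[] above.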
|
||||
|
||||
int count = 0;
|
||||
clover_force = Zero();
|
||||
for (int mu = 0; mu < 4; mu++)
|
||||
{
|
||||
force_mu = Zero();
|
||||
for (int nu = 0; nu < 4; nu++)
|
||||
{
|
||||
if (mu == nu)
|
||||
continue;
|
||||
|
||||
RealD factor;
|
||||
if (nu == 4 || mu == 4)
|
||||
{
|
||||
factor = 2.0 * csw_t;
|
||||
}
|
||||
else
|
||||
{
|
||||
factor = 2.0 * csw_r;
|
||||
}
|
||||
PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
|
||||
Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok
|
||||
force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu); // checked
|
||||
count++;
|
||||
}
|
||||
|
||||
pokeLorentz(clover_force, U[mu] * force_mu, mu);
|
||||
}
|
||||
//clover_force *= csw;
|
||||
force += clover_force;
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
|
||||
assert(0);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
|
||||
assert(0);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInternal(const FermionField& in,
|
||||
FermionField& out,
|
||||
const CloverDiagonalField& diagonal,
|
||||
const CloverTriangleField& triangle) {
|
||||
assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
conformable(in, out);
|
||||
CompactHelpers::MooeeKernel(diagonal.oSites(), this->Ls, in, out, diagonal, triangle);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::ImportGauge(const GaugeField& _Umu) {
|
||||
// NOTE: parts copied from original implementation
|
||||
|
||||
// Import gauge into base class
|
||||
double t0 = usecond();
|
||||
WilsonBase::ImportGauge(_Umu); // NOTE: called here and in wilson constructor -> performed twice, but can't avoid that
|
||||
|
||||
// Initialize temporary variables
|
||||
double t1 = usecond();
|
||||
conformable(_Umu.Grid(), this->GaugeGrid());
|
||||
GridBase* grid = _Umu.Grid();
|
||||
typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
|
||||
CloverField TmpOriginal(grid);
|
||||
CloverField TmpInverse(grid);
|
||||
|
||||
// Compute the field strength terms mu>nu
|
||||
double t2 = usecond();
|
||||
WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
|
||||
WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
|
||||
WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
|
||||
WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
|
||||
WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
|
||||
WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
|
||||
|
||||
// Compute the Clover Operator acting on Colour and Spin
|
||||
// multiply here by the clover coefficients for the anisotropy
|
||||
double t3 = usecond();
|
||||
TmpOriginal = Helpers::fillCloverYZ(Bx) * csw_r;
|
||||
TmpOriginal += Helpers::fillCloverXZ(By) * csw_r;
|
||||
TmpOriginal += Helpers::fillCloverXY(Bz) * csw_r;
|
||||
TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
|
||||
TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
|
||||
TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
|
||||
|
||||
// Instantiate the clover term
|
||||
// - In case of the standard clover the mass term is added
|
||||
// - In case of the exponential clover the clover term is exponentiated
|
||||
double t4 = usecond();
|
||||
CloverHelpers::InstantiateClover(TmpOriginal, TmpInverse, csw_t, 4.0 + this->M5 /*this->diag_mass*/);
|
||||
|
||||
// Convert the data layout of the clover term
|
||||
double t5 = usecond();
|
||||
CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
|
||||
|
||||
// Modify the clover term at the temporal boundaries in case of open boundary conditions
|
||||
double t6 = usecond();
|
||||
if(fixedBoundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, 4.0 + this->M5 /*this->diag_mass*/);
|
||||
|
||||
// Invert the Clover term
|
||||
// In case of the exponential clover with (anti-)periodic boundary conditions exp(-Clover) saved
|
||||
// in TmpInverse can be used. In all other cases the clover term has to be explictly inverted.
|
||||
// TODO: For now this inversion is explictly done on the CPU
|
||||
double t7 = usecond();
|
||||
CloverHelpers::InvertClover(TmpInverse, Diagonal, Triangle, DiagonalInv, TriangleInv, fixedBoundaries);
|
||||
|
||||
// Fill the remaining clover fields
|
||||
double t8 = usecond();
|
||||
pickCheckerboard(Even, DiagonalEven, Diagonal);
|
||||
pickCheckerboard(Even, TriangleEven, Triangle);
|
||||
pickCheckerboard(Odd, DiagonalOdd, Diagonal);
|
||||
pickCheckerboard(Odd, TriangleOdd, Triangle);
|
||||
pickCheckerboard(Even, DiagonalInvEven, DiagonalInv);
|
||||
pickCheckerboard(Even, TriangleInvEven, TriangleInv);
|
||||
pickCheckerboard(Odd, DiagonalInvOdd, DiagonalInv);
|
||||
pickCheckerboard(Odd, TriangleInvOdd, TriangleInv);
|
||||
|
||||
// Report timings
|
||||
double t9 = usecond();
|
||||
|
||||
std::cout << GridLogDebug << "CompactWilsonCloverFermion5D::ImportGauge timings:" << std::endl;
|
||||
std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "allocations = " << (t2 - t1) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "field strength = " << (t3 - t2) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "fill clover = " << (t4 - t3) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "instantiate clover = " << (t5 - t4) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "convert layout = " << (t6 - t5) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "modify boundaries = " << (t7 - t6) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "invert clover = " << (t8 - t7) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "pick cbs = " << (t9 - t8) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "total = " << (t9 - t0) / 1e6 << std::endl;
|
||||
}
|
||||
|
||||
NAMESPACE_END(Grid);
|
@@ -240,7 +240,7 @@ void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, std::ve
|
||||
this->ceo.resize(Ls);
|
||||
|
||||
for(int i=0; i<Ls; ++i){
|
||||
this->bee[i] = 4.0 - this->M5 + 1.0;
|
||||
this->bee[i] = Nd*1.0 - this->M5 + 1.0;
|
||||
this->cee[i] = 1.0;
|
||||
}
|
||||
|
||||
|
@@ -0,0 +1,486 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/TwoSpinWilsonFermion2plus1D.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/WilsonFermion5D.h>
|
||||
#include <Grid/perfmon/PerfCount.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
// 5d lattice for DWF.
|
||||
template<class Impl>
|
||||
TwoSpinWilsonFermion3plus1D<Impl>::TwoSpinWilsonFermion3plus1D(GaugeField &_Umu,
|
||||
GridCartesian &FourDimGrid,
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
GridCartesian &ThreeDimGrid,
|
||||
GridRedBlackCartesian &ThreeDimRedBlackGrid,
|
||||
RealD _M5,const ImplParams &p) :
|
||||
Kernels(p),
|
||||
_FourDimGrid (&FourDimGrid),
|
||||
_FourDimRedBlackGrid(&FourDimRedBlackGrid),
|
||||
_ThreeDimGrid (&ThreeDimGrid),
|
||||
_ThreeDimRedBlackGrid(&ThreeDimRedBlackGrid),
|
||||
Stencil (_FourDimGrid,npoint,Even,directions,displacements,p),
|
||||
StencilEven(_FourDimRedBlackGrid,npoint,Even,directions,displacements,p), // source is Even
|
||||
StencilOdd (_FourDimRedBlackGrid,npoint,Odd ,directions,displacements,p), // source is Odd
|
||||
M5(_M5),
|
||||
Umu(_ThreeDimGrid),
|
||||
UmuEven(_ThreeDimRedBlackGrid),
|
||||
UmuOdd (_ThreeDimRedBlackGrid),
|
||||
_tmp(&FourDimRedBlackGrid),
|
||||
Dirichlet(0)
|
||||
{
|
||||
// some assertions
|
||||
assert(FourDimGrid._ndimension==Nd+1);
|
||||
assert(ThreeDimGrid._ndimension==Nd);
|
||||
assert(ThreeDimRedBlackGrid._ndimension==Nd);
|
||||
assert(FourDimRedBlackGrid._ndimension==Nd+1);
|
||||
assert(FourDimRedBlackGrid._checker_dim==1); // Don't checker the s direction
|
||||
|
||||
// extent of fifth dim and not spread out
|
||||
Ls=FourDimGrid._fdimensions[0];
|
||||
assert(FourDimRedBlackGrid._fdimensions[0]==Ls);
|
||||
assert(FourDimGrid._processors[0] ==1);
|
||||
assert(FourDimRedBlackGrid._processors[0] ==1);
|
||||
|
||||
// Other dimensions must match the decomposition of the four-D fields
|
||||
for(int d=0;d<Nd;d++){
|
||||
|
||||
assert(FourDimGrid._processors[d+1] ==ThreeDimGrid._processors[d]);
|
||||
assert(FourDimRedBlackGrid._processors[d+1] ==ThreeDimGrid._processors[d]);
|
||||
assert(ThreeDimRedBlackGrid._processors[d] ==ThreeDimGrid._processors[d]);
|
||||
|
||||
assert(FourDimGrid._fdimensions[d+1] ==ThreeDimGrid._fdimensions[d]);
|
||||
assert(FourDimRedBlackGrid._fdimensions[d+1]==ThreeDimGrid._fdimensions[d]);
|
||||
assert(ThreeDimRedBlackGrid._fdimensions[d] ==ThreeDimGrid._fdimensions[d]);
|
||||
|
||||
assert(FourDimGrid._simd_layout[d+1] ==ThreeDimGrid._simd_layout[d]);
|
||||
assert(FourDimRedBlackGrid._simd_layout[d+1]==ThreeDimGrid._simd_layout[d]);
|
||||
assert(ThreeDimRedBlackGrid._simd_layout[d] ==ThreeDimGrid._simd_layout[d]);
|
||||
}
|
||||
|
||||
if ( p.dirichlet.size() == Nd+1) {
|
||||
Coordinate block = p.dirichlet;
|
||||
for(int d=0;d<Nd+1;d++) {
|
||||
if ( block[d] ){
|
||||
Dirichlet = 1;
|
||||
std::cout << GridLogMessage << " WilsonFermion: non-trivial Dirichlet condition "<< block << std::endl;
|
||||
std::cout << GridLogMessage << " WilsonFermion: partial Dirichlet "<< p.partialDirichlet << std::endl;
|
||||
Block = block;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
Coordinate block(Nd+1,0);
|
||||
Block = block;
|
||||
}
|
||||
|
||||
// Dimension zero of the five-d is the Ls direction
|
||||
assert(FourDimRedBlackGrid._simd_layout[0]==1);
|
||||
assert(FourDimGrid._simd_layout[0] ==1);
|
||||
|
||||
// Allocate the required comms buffer
|
||||
ImportGauge(_Umu);
|
||||
// Build lists of exterior only nodes
|
||||
int LLs = FourDimGrid._rdimensions[0];
|
||||
int vol3;
|
||||
vol3=ThreeDimGrid.oSites();
|
||||
Stencil.BuildSurfaceList(LLs,vol3);
|
||||
|
||||
vol3=ThreeDimRedBlackGrid.oSites();
|
||||
StencilEven.BuildSurfaceList(LLs,vol3);
|
||||
StencilOdd.BuildSurfaceList(LLs,vol3);
|
||||
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void TwoSpinWilsonFermion3plus1D<Impl>::ImportGauge(const GaugeField &_Umu)
|
||||
{
|
||||
GaugeField HUmu(_Umu.Grid());
|
||||
HUmu = _Umu*(-0.5);
|
||||
Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
|
||||
pickCheckerboard(Even,UmuEven,Umu);
|
||||
pickCheckerboard(Odd ,UmuOdd,Umu);
|
||||
}
|
||||
template<class Impl>
|
||||
void TwoSpinWilsonFermion3plus1D<Impl>::DhopDir(const FermionField &in, FermionField &out,int dir5,int disp)
|
||||
{
|
||||
int dir = dir5-1; // Maps to the ordering above in "directions" that is passed to stencil
|
||||
// we drop off the innermost fifth dimension
|
||||
// assert( (disp==1)||(disp==-1) );
|
||||
// assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;
|
||||
|
||||
int skip = (disp==1) ? 0 : 1;
|
||||
int dirdisp = dir+skip*Nd;
|
||||
int gamma = dir+(1-skip)*Nd;
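    // i.e. disp=+1 gives skip=0, so dirdisp=dir and gamma=dir+Nd;
    //      disp=-1 gives skip=1, so dirdisp=dir+Nd and gamma=dir.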
|
||||
|
||||
Compressor compressor(DaggerNo);
|
||||
Stencil.HaloExchange(in,compressor);
|
||||
|
||||
uint64_t Nsite = Umu.Grid()->oSites();
|
||||
Kernels::DhopDirKernel(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out,dirdisp,gamma);
|
||||
|
||||
};
|
||||
template<class Impl>
|
||||
void TwoSpinWilsonFermion3plus1D<Impl>::DhopDirAll(const FermionField &in, std::vector<FermionField> &out)
|
||||
{
|
||||
Compressor compressor(DaggerNo);
|
||||
Stencil.HaloExchange(in,compressor);
|
||||
uint64_t Nsite = Umu.Grid()->oSites();
|
||||
Kernels::DhopDirAll(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out);
|
||||
};
|
||||
|
||||
|
||||
template<class Impl>
|
||||
void TwoSpinWilsonFermion3plus1D<Impl>::DerivInternal(StencilImpl & st,
|
||||
DoubledGaugeField & U,
|
||||
GaugeField &mat,
|
||||
const FermionField &A,
|
||||
const FermionField &B,
|
||||
int dag)
|
||||
{
|
||||
assert((dag==DaggerNo) ||(dag==DaggerYes));
|
||||
|
||||
conformable(st.Grid(),A.Grid());
|
||||
conformable(st.Grid(),B.Grid());
|
||||
|
||||
Compressor compressor(dag);
|
||||
|
||||
FermionField Btilde(B.Grid());
|
||||
FermionField Atilde(B.Grid());
|
||||
|
||||
st.HaloExchange(B,compressor);
|
||||
|
||||
Atilde=A;
|
||||
int LLs = B.Grid()->_rdimensions[0];
|
||||
|
||||
|
||||
for (int mu = 0; mu < Nd; mu++) {
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Flip gamma if dag
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
int gamma = mu;
|
||||
if (!dag) gamma += Nd;
|
||||
|
||||
////////////////////////
|
||||
// Call the single hop
|
||||
////////////////////////
|
||||
|
||||
int Usites = U.Grid()->oSites();
|
||||
|
||||
Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, Usites, B, Btilde, mu,gamma);
|
||||
|
||||
////////////////////////////
|
||||
// spin trace outer product
|
||||
////////////////////////////
|
||||
Impl::InsertForce5D(mat, Btilde, Atilde, mu);
|
||||
}
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void TwoSpinWilsonFermion3plus1D<Impl>::DhopDeriv(GaugeField &mat,
|
||||
const FermionField &A,
|
||||
const FermionField &B,
|
||||
int dag)
|
||||
{
|
||||
conformable(A.Grid(),FermionGrid());
|
||||
conformable(A.Grid(),B.Grid());
|
||||
|
||||
//conformable(GaugeGrid(),mat.Grid());// this is not general! leaving as a comment
|
||||
|
||||
mat.Checkerboard() = A.Checkerboard();
|
||||
// mat.checkerboard = A.checkerboard;
|
||||
|
||||
DerivInternal(Stencil,Umu,mat,A,B,dag);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void TwoSpinWilsonFermion3plus1D<Impl>::DhopDerivEO(GaugeField &mat,
|
||||
const FermionField &A,
|
||||
const FermionField &B,
|
||||
int dag)
|
||||
{
|
||||
conformable(A.Grid(),FermionRedBlackGrid());
|
||||
conformable(A.Grid(),B.Grid());
|
||||
|
||||
assert(B.Checkerboard()==Odd);
|
||||
assert(A.Checkerboard()==Even);
|
||||
mat.Checkerboard() = Even;
|
||||
|
||||
DerivInternal(StencilOdd,UmuEven,mat,A,B,dag);
|
||||
}
|
||||
|
||||
|
||||
template<class Impl>
|
||||
void TwoSpinWilsonFermion3plus1D<Impl>::DhopDerivOE(GaugeField &mat,
|
||||
const FermionField &A,
|
||||
const FermionField &B,
|
||||
int dag)
|
||||
{
|
||||
conformable(A.Grid(),FermionRedBlackGrid());
|
||||
conformable(A.Grid(),B.Grid());
|
||||
|
||||
assert(B.Checkerboard()==Even);
|
||||
assert(A.Checkerboard()==Odd);
|
||||
mat.Checkerboard() = Odd;
|
||||
|
||||
DerivInternal(StencilEven,UmuOdd,mat,A,B,dag);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void TwoSpinWilsonFermion3plus1D<Impl>::DhopInternal(StencilImpl & st,
|
||||
DoubledGaugeField & U,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
DhopInternalSerialComms(st,U,in,out,dag);
|
||||
}
|
||||
|
||||
|
||||
template<class Impl>
|
||||
void TwoSpinWilsonFermion3plus1D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
|
||||
DoubledGaugeField & U,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
GRID_TRACE("DhopInternalOverlappedComms");
|
||||
Compressor compressor(dag);
|
||||
|
||||
int LLs = in.Grid()->_rdimensions[0];
|
||||
int len = U.Grid()->oSites();
|
||||
|
||||
/////////////////////////////
|
||||
// Start comms // Gather: are intranode and inter-node gathers differentiated?

|
||||
/////////////////////////////
|
||||
{
|
||||
// std::cout << " TwoSpinWilsonFermion3plus1D gather " <<std::endl;
|
||||
GRID_TRACE("Gather");
|
||||
st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine
|
||||
}
|
||||
|
||||
// std::cout << " TwoSpinWilsonFermion3plus1D Communicate Begin " <<std::endl;
|
||||
std::vector<std::vector<CommsRequest_t> > requests;
|
||||
|
||||
#if 1
|
||||
/////////////////////////////
|
||||
// Overlap with comms
|
||||
/////////////////////////////
|
||||
st.CommunicateBegin(requests);
|
||||
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
|
||||
#endif
|
||||
|
||||
/////////////////////////////
|
||||
// do the compute interior
|
||||
/////////////////////////////
|
||||
if (dag == DaggerYes) {
|
||||
GRID_TRACE("DhopDagInterior");
|
||||
Kernels::DhopDagKernel(st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
|
||||
} else {
|
||||
GRID_TRACE("DhopInterior");
|
||||
Kernels::DhopKernel (st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
|
||||
}
|
||||
|
||||
//ifdef GRID_ACCELERATED
|
||||
#if 0
|
||||
/////////////////////////////
|
||||
// Overlap with comms -- on GPU the interior kernel call is nonblocking
|
||||
/////////////////////////////
|
||||
st.CommunicateBegin(requests);
|
||||
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
|
||||
#endif
|
||||
|
||||
|
||||
/////////////////////////////
|
||||
// Complete comms
|
||||
/////////////////////////////
|
||||
// std::cout << " TwoSpinWilsonFermion3plus1D Comms Complete " <<std::endl;
|
||||
st.CommunicateComplete(requests);
|
||||
// traceStop(id);
|
||||
|
||||
/////////////////////////////
|
||||
// do the compute exterior
|
||||
/////////////////////////////
|
||||
{
|
||||
// std::cout << " TwoSpinWilsonFermion3plus1D Comms Merge " <<std::endl;
|
||||
GRID_TRACE("Merge");
|
||||
st.CommsMerge(compressor);
|
||||
}
|
||||
|
||||
|
||||
// std::cout << " TwoSpinWilsonFermion3plus1D Exterior " <<std::endl;
|
||||
if (dag == DaggerYes) {
|
||||
GRID_TRACE("DhopDagExterior");
|
||||
Kernels::DhopDagKernel(st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
|
||||
} else {
|
||||
GRID_TRACE("DhopExterior");
|
||||
Kernels::DhopKernel   (st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
|
||||
}
|
||||
// std::cout << " TwoSpinWilsonFermion3plus1D Done " <<std::endl;
|
||||
}
|
||||
|
||||
|
||||
template<class Impl>
|
||||
void TwoSpinWilsonFermion3plus1D<Impl>::DhopInternalSerialComms(StencilImpl & st,
|
||||
DoubledGaugeField & U,
|
||||
const FermionField &in,
|
||||
FermionField &out,int dag)
|
||||
{
|
||||
GRID_TRACE("DhopInternalSerialComms");
|
||||
Compressor compressor(dag);
|
||||
|
||||
int LLs = in.Grid()->_rdimensions[0];
|
||||
|
||||
// std::cout << " TwoSpinWilsonFermion3plus1D Halo exch " <<std::endl;
|
||||
{
|
||||
GRID_TRACE("HaloExchange");
|
||||
st.HaloExchangeOpt(in,compressor);
|
||||
}
|
||||
|
||||
// std::cout << " TwoSpinWilsonFermion3plus1D Dhop " <<std::endl;
|
||||
if (dag == DaggerYes) {
|
||||
GRID_TRACE("DhopDag");
|
||||
Kernels::DhopDagKernel(st,U,st.CommBuf(),LLs,U.oSites(),in,out);
|
||||
} else {
|
||||
GRID_TRACE("Dhop");
|
||||
Kernels::DhopKernel(st,U,st.CommBuf(),LLs,U.oSites(),in,out);
|
||||
}
|
||||
// std::cout << " TwoSpinWilsonFermion3plus1D Done " <<std::endl;
|
||||
}
|
||||
|
||||
|
||||
template<class Impl>
|
||||
void TwoSpinWilsonFermion3plus1D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
conformable(in.Grid(),FermionRedBlackGrid()); // verifies half grid
|
||||
conformable(in.Grid(),out.Grid()); // drops the cb check
|
||||
|
||||
assert(in.Checkerboard()==Even);
|
||||
out.Checkerboard() = Odd;
|
||||
|
||||
DhopInternal(StencilEven,UmuOdd,in,out,dag);
|
||||
}
|
||||
template<class Impl>
|
||||
void TwoSpinWilsonFermion3plus1D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
conformable(in.Grid(),FermionRedBlackGrid()); // verifies half grid
|
||||
conformable(in.Grid(),out.Grid()); // drops the cb check
|
||||
|
||||
assert(in.Checkerboard()==Odd);
|
||||
out.Checkerboard() = Even;
|
||||
|
||||
DhopInternal(StencilOdd,UmuEven,in,out,dag);
|
||||
}
|
||||
template<class Impl>
|
||||
void TwoSpinWilsonFermion3plus1D<Impl>::DhopComms(const FermionField &in, FermionField &out)
|
||||
{
|
||||
int dag = 0;
|
||||
conformable(in.Grid(),FermionGrid()); // verifies full grid
|
||||
conformable(in.Grid(),out.Grid());
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
Compressor compressor(dag);
|
||||
Stencil.HaloExchangeOpt(in,compressor);
|
||||
}
|
||||
template<class Impl>
|
||||
void TwoSpinWilsonFermion3plus1D<Impl>::DhopCalc(const FermionField &in, FermionField &out,uint64_t *ids)
|
||||
{
|
||||
conformable(in.Grid(),FermionGrid()); // verifies full grid
|
||||
conformable(in.Grid(),out.Grid());
|
||||
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
|
||||
int LLs = in.Grid()->_rdimensions[0];
|
||||
Kernels::DhopKernel(Stencil,Umu,Stencil.CommBuf(),LLs,Umu.oSites(),in,out,ids);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void TwoSpinWilsonFermion3plus1D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
conformable(in.Grid(),FermionGrid()); // verifies full grid
|
||||
conformable(in.Grid(),out.Grid());
|
||||
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
|
||||
DhopInternal(Stencil,Umu,in,out,dag);
|
||||
}
|
||||
template<class Impl>
|
||||
void TwoSpinWilsonFermion3plus1D<Impl>::DW(const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
out.Checkerboard()=in.Checkerboard();
|
||||
Dhop(in,out,dag); // -0.5 is included
|
||||
axpy(out,Nd*1.0-M5,in,out);
|
||||
}
|
||||
template <class Impl>
|
||||
void TwoSpinWilsonFermion3plus1D<Impl>::Meooe(const FermionField &in, FermionField &out)
|
||||
{
|
||||
if (in.Checkerboard() == Odd) {
|
||||
DhopEO(in, out, DaggerNo);
|
||||
} else {
|
||||
DhopOE(in, out, DaggerNo);
|
||||
}
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void TwoSpinWilsonFermion3plus1D<Impl>::MeooeDag(const FermionField &in, FermionField &out)
|
||||
{
|
||||
if (in.Checkerboard() == Odd) {
|
||||
DhopEO(in, out, DaggerYes);
|
||||
} else {
|
||||
DhopOE(in, out, DaggerYes);
|
||||
}
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void TwoSpinWilsonFermion3plus1D<Impl>::Mooee(const FermionField &in, FermionField &out)
|
||||
{
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
typename FermionField::scalar_type scal(Nd*1.0 + M5);
|
||||
out = scal * in;
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void TwoSpinWilsonFermion3plus1D<Impl>::MooeeDag(const FermionField &in, FermionField &out)
|
||||
{
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
Mooee(in, out);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void TwoSpinWilsonFermion3plus1D<Impl>::MooeeInv(const FermionField &in, FermionField &out)
|
||||
{
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
out = (1.0/(Nd*1.0 + M5))*in;
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void TwoSpinWilsonFermion3plus1D<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
|
||||
{
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
MooeeInv(in,out);
|
||||
}
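For reference, a summary of the diagonal and hopping pieces implemented above; Dhop already carries the -1/2 folded into the doubled links in ImportGauge, per the comment in DW.

```latex
% Site-diagonal term and its inverse, as implemented in Mooee / MooeeInv,
% and the full operator applied by DW (Dhop includes the -1/2 from ImportGauge).
\begin{align}
  M_{ee}\,\psi &= (N_d + M_5)\,\psi, &
  M_{ee}^{-1}\,\psi &= \frac{1}{N_d + M_5}\,\psi,\\
  D_W\,\psi &= D_{\mathrm{hop}}\,\psi + (N_d - M_5)\,\psi.
\end{align}
```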
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
|
||||
|
||||
|
@@ -0,0 +1,441 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/TwoSpinWilsonKernels.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#pragma once
|
||||
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
|
||||
////////////////////////////////////////////
|
||||
// Generic implementation; move to different file?
|
||||
////////////////////////////////////////////
|
||||
|
||||
#define GENERIC_STENCIL_LEG(Dir,spProj,Recon) \
|
||||
SE = st.GetEntry(ptype, Dir, sF); \
|
||||
if (SE->_is_local) { \
|
||||
int perm= SE->_permute; \
|
||||
auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane); \
|
||||
spProj(chi,tmp); \
|
||||
} else { \
|
||||
chi = coalescedRead(buf[SE->_offset],lane); \
|
||||
} \
|
||||
acceleratorSynchronise(); \
|
||||
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
|
||||
Recon(result, Uchi);
|
||||
|
||||
#define GENERIC_STENCIL_LEG_INT(Dir,spProj,Recon) \
|
||||
SE = st.GetEntry(ptype, Dir, sF); \
|
||||
if (SE->_is_local) { \
|
||||
int perm= SE->_permute; \
|
||||
auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane); \
|
||||
spProj(chi,tmp); \
|
||||
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
|
||||
Recon(result, Uchi); \
|
||||
} \
|
||||
acceleratorSynchronise();
|
||||
|
||||
#define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon) \
|
||||
SE = st.GetEntry(ptype, Dir, sF); \
|
||||
if (!SE->_is_local ) { \
|
||||
auto chi = coalescedRead(buf[SE->_offset],lane); \
|
||||
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
|
||||
Recon(result, Uchi); \
|
||||
nmu++; \
|
||||
} \
|
||||
acceleratorSynchronise();
|
||||
|
||||
#define GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon) \
|
||||
if (SE->_is_local ) { \
|
||||
int perm= SE->_permute; \
|
||||
auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane); \
|
||||
spProj(chi,tmp); \
|
||||
} else { \
|
||||
chi = coalescedRead(buf[SE->_offset],lane); \
|
||||
} \
|
||||
acceleratorSynchronise(); \
|
||||
Impl::multLink(Uchi, U[sU], chi, dir, SE, st); \
|
||||
Recon(result, Uchi);
|
||||
|
||||
#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon) \
|
||||
if (gamma == Dir) { \
|
||||
GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon); \
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////
|
||||
// All-legs kernels; comms then compute
|
||||
////////////////////////////////////////////////////////////////////
|
||||
template <class Impl> accelerator_inline
|
||||
void TwoSpinWilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
|
||||
SiteSpinor *buf, int sF,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
typedef decltype(coalescedRead(in[0])) calcSpinor;
|
||||
calcSpinor chi;
|
||||
calcSpinor Uchi;
|
||||
calcSpinor result;
|
||||
StencilEntry *SE;
|
||||
int ptype;
|
||||
const int Nsimd = SiteSpinor::Nsimd();
|
||||
const int lane=acceleratorSIMTlane(Nsimd);
|
||||
GENERIC_STENCIL_LEG(Xp,pauliProjXp,pauliAssign);
|
||||
GENERIC_STENCIL_LEG(Yp,pauliProjYp,pauliAdd);
|
||||
GENERIC_STENCIL_LEG(Zp,pauliProjZp,pauliAdd);
|
||||
GENERIC_STENCIL_LEG(Xm,pauliProjXm,pauliAdd);
|
||||
GENERIC_STENCIL_LEG(Ym,pauliProjYm,pauliAdd);
|
||||
GENERIC_STENCIL_LEG(Zm,pauliProjZm,pauliAdd);
|
||||
coalescedWrite(out[sF],result,lane);
|
||||
};
|
||||
|
||||
template <class Impl> accelerator_inline
|
||||
void TwoSpinWilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U,
|
||||
SiteSpinor *buf, int sF,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
typedef decltype(coalescedRead(in[0])) calcSpinor;
|
||||
calcSpinor chi;
|
||||
// calcSpinor *chi_p;
|
||||
calcSpinor Uchi;
|
||||
calcSpinor result;
|
||||
StencilEntry *SE;
|
||||
int ptype;
|
||||
|
||||
const int Nsimd = SiteSpinor::Nsimd();
|
||||
const int lane=acceleratorSIMTlane(Nsimd);
|
||||
GENERIC_STENCIL_LEG(Xm,pauliProjXp,pauliAssign);
|
||||
GENERIC_STENCIL_LEG(Ym,pauliProjYp,pauliAdd);
|
||||
GENERIC_STENCIL_LEG(Zm,pauliProjZp,pauliAdd);
|
||||
GENERIC_STENCIL_LEG(Xp,pauliProjXm,pauliAdd);
|
||||
GENERIC_STENCIL_LEG(Yp,pauliProjYm,pauliAdd);
|
||||
GENERIC_STENCIL_LEG(Zp,pauliProjZm,pauliAdd);
|
||||
coalescedWrite(out[sF], result,lane);
|
||||
};
|
||||
////////////////////////////////////////////////////////////////////
|
||||
// Interior kernels
|
||||
////////////////////////////////////////////////////////////////////
|
||||
template <class Impl> accelerator_inline
|
||||
void TwoSpinWilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,
|
||||
SiteSpinor *buf, int sF,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
typedef decltype(coalescedRead(in[0])) calcSpinor;
|
||||
calcSpinor chi;
|
||||
// calcSpinor *chi_p;
|
||||
calcSpinor Uchi;
|
||||
calcSpinor result;
|
||||
StencilEntry *SE;
|
||||
int ptype;
|
||||
const int Nsimd = SiteSpinor::Nsimd();
|
||||
const int lane=acceleratorSIMTlane(Nsimd);
|
||||
|
||||
result=Zero();
|
||||
GENERIC_STENCIL_LEG_INT(Xp,pauliProjXp,pauliAdd);
|
||||
GENERIC_STENCIL_LEG_INT(Yp,pauliProjYp,pauliAdd);
|
||||
GENERIC_STENCIL_LEG_INT(Zp,pauliProjZp,pauliAdd);
|
||||
GENERIC_STENCIL_LEG_INT(Xm,pauliProjXm,pauliAdd);
|
||||
GENERIC_STENCIL_LEG_INT(Ym,pauliProjYm,pauliAdd);
|
||||
GENERIC_STENCIL_LEG_INT(Zm,pauliProjZm,pauliAdd);
|
||||
coalescedWrite(out[sF], result,lane);
|
||||
};
|
||||
|
||||
template <class Impl> accelerator_inline
|
||||
void TwoSpinWilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,
|
||||
SiteSpinor *buf, int sF,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
typedef decltype(coalescedRead(in[0])) calcSpinor;
|
||||
const int Nsimd = SiteSpinor::Nsimd();
|
||||
const int lane=acceleratorSIMTlane(Nsimd);
|
||||
|
||||
calcSpinor chi;
|
||||
// calcSpinor *chi_p;
|
||||
calcSpinor Uchi;
|
||||
calcSpinor result;
|
||||
StencilEntry *SE;
|
||||
int ptype;
|
||||
result=Zero();
|
||||
GENERIC_STENCIL_LEG_INT(Xm,pauliProjXp,pauliAdd);
|
||||
GENERIC_STENCIL_LEG_INT(Ym,pauliProjYp,pauliAdd);
|
||||
GENERIC_STENCIL_LEG_INT(Zm,pauliProjZp,pauliAdd);
|
||||
GENERIC_STENCIL_LEG_INT(Xp,pauliProjXm,pauliAdd);
|
||||
GENERIC_STENCIL_LEG_INT(Yp,pauliProjYm,pauliAdd);
|
||||
GENERIC_STENCIL_LEG_INT(Zp,pauliProjZm,pauliAdd);
|
||||
coalescedWrite(out[sF], result,lane);
|
||||
};
|
||||
////////////////////////////////////////////////////////////////////
|
||||
// Exterior kernels
|
||||
////////////////////////////////////////////////////////////////////
|
||||
template <class Impl> accelerator_inline
|
||||
void TwoSpinWilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,
|
||||
SiteSpinor *buf, int sF,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
typedef decltype(coalescedRead(in[0])) calcSpinor;
|
||||
// calcSpinor *chi_p;
|
||||
calcSpinor Uchi;
|
||||
calcSpinor result;
|
||||
StencilEntry *SE;
|
||||
int ptype;
|
||||
int nmu=0;
|
||||
const int Nsimd = SiteSpinor::Nsimd();
|
||||
const int lane=acceleratorSIMTlane(Nsimd);
|
||||
result=Zero();
|
||||
GENERIC_STENCIL_LEG_EXT(Xp,pauliProjXp,pauliAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(Yp,pauliProjYp,pauliAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(Zp,pauliProjZp,pauliAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(Xm,pauliProjXm,pauliAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(Ym,pauliProjYm,pauliAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(Zm,pauliProjZm,pauliAdd);
|
||||
if ( nmu ) {
|
||||
auto out_t = coalescedRead(out[sF],lane);
|
||||
out_t = out_t + result;
|
||||
coalescedWrite(out[sF],out_t,lane);
|
||||
}
|
||||
};
|
||||
|
||||
template <class Impl> accelerator_inline
|
||||
void TwoSpinWilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,
|
||||
SiteSpinor *buf, int sF,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
typedef decltype(coalescedRead(in[0])) calcSpinor;
|
||||
// calcSpinor *chi_p;
|
||||
calcSpinor Uchi;
|
||||
calcSpinor result;
|
||||
StencilEntry *SE;
|
||||
int ptype;
|
||||
int nmu=0;
|
||||
const int Nsimd = SiteSpinor::Nsimd();
|
||||
const int lane=acceleratorSIMTlane(Nsimd);
|
||||
result=Zero();
|
||||
GENERIC_STENCIL_LEG_EXT(Xm,pauliProjXp,pauliAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(Ym,pauliProjYp,pauliAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(Zm,pauliProjZp,pauliAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(Xp,pauliProjXm,pauliAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(Yp,pauliProjYm,pauliAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(Zp,pauliProjZm,pauliAdd);
|
||||
if ( nmu ) {
|
||||
auto out_t = coalescedRead(out[sF],lane);
|
||||
out_t = out_t + result;
|
||||
coalescedWrite(out[sF],out_t,lane);
|
||||
}
|
||||
};
|
||||
|
||||
#define DhopDirMacro(Dir,spProj,spRecon) \
|
||||
template <class Impl> accelerator_inline \
|
||||
void TwoSpinWilsonKernels<Impl>::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteSpinor *buf, int sF, \
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \
|
||||
{ \
|
||||
typedef decltype(coalescedRead(in[0])) calcSpinor; \
|
||||
calcSpinor chi; \
|
||||
calcSpinor result; \
|
||||
calcSpinor Uchi; \
|
||||
StencilEntry *SE; \
|
||||
int ptype; \
|
||||
const int Nsimd = SiteSpinor::Nsimd(); \
|
||||
const int lane=acceleratorSIMTlane(Nsimd); \
|
||||
\
|
||||
SE = st.GetEntry(ptype, dir, sF); \
|
||||
GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,spRecon); \
|
||||
coalescedWrite(out[sF], result,lane); \
|
||||
}
|
||||
|
||||
DhopDirMacro(Xp,pauliProjXp,pauliAssign);
|
||||
DhopDirMacro(Yp,pauliProjYp,pauliAssign);
|
||||
DhopDirMacro(Zp,pauliProjZp,pauliAssign);
|
||||
DhopDirMacro(Xm,pauliProjXm,pauliAssign);
|
||||
DhopDirMacro(Ym,pauliProjYm,pauliAssign);
|
||||
DhopDirMacro(Zm,pauliProjZm,pauliAssign);
|
||||
|
||||
template <class Impl> accelerator_inline
|
||||
void TwoSpinWilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteSpinor *buf, int sF,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma)
|
||||
{
|
||||
typedef decltype(coalescedRead(in[0])) calcSpinor;
|
||||
calcSpinor chi;
|
||||
calcSpinor result;
|
||||
calcSpinor Uchi;
|
||||
StencilEntry *SE;
|
||||
int ptype;
|
||||
const int Nsimd = SiteSpinor::Nsimd();
|
||||
const int lane=acceleratorSIMTlane(Nsimd);
|
||||
|
||||
SE = st.GetEntry(ptype, dir, sF);
|
||||
GENERIC_DHOPDIR_LEG(Xp,pauliProjXp,pauliAssign);
|
||||
GENERIC_DHOPDIR_LEG(Yp,pauliProjYp,pauliAssign);
|
||||
GENERIC_DHOPDIR_LEG(Zp,pauliProjZp,pauliAssign);
|
||||
GENERIC_DHOPDIR_LEG(Xm,pauliProjXm,pauliAssign);
|
||||
GENERIC_DHOPDIR_LEG(Ym,pauliProjYm,pauliAssign);
|
||||
GENERIC_DHOPDIR_LEG(Zm,pauliProjZm,pauliAssign);
|
||||
coalescedWrite(out[sF], result,lane);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void TwoSpinWilsonKernels<Impl>::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteSpinor *buf, int Ls,
|
||||
int Nsite, const FermionField &in, std::vector<FermionField> &out)
|
||||
{
|
||||
autoView(U_v ,U,AcceleratorRead);
|
||||
autoView(in_v ,in,AcceleratorRead);
|
||||
autoView(st_v ,st,AcceleratorRead);
|
||||
|
||||
autoView(out_Xm,out[0],AcceleratorWrite);
|
||||
autoView(out_Ym,out[1],AcceleratorWrite);
|
||||
autoView(out_Zm,out[2],AcceleratorWrite);
|
||||
autoView(out_Xp,out[4],AcceleratorWrite);
|
||||
autoView(out_Yp,out[5],AcceleratorWrite);
|
||||
autoView(out_Zp,out[6],AcceleratorWrite);
|
||||
auto CBp=st.CommBuf();
|
||||
accelerator_for(sss,Nsite*Ls,Simd::Nsimd(),{
|
||||
int sU=sss/Ls;
|
||||
int sF =sss;
|
||||
DhopDirXm(st_v,U_v,CBp,sF,sU,in_v,out_Xm,0);
|
||||
DhopDirYm(st_v,U_v,CBp,sF,sU,in_v,out_Ym,1);
|
||||
DhopDirZm(st_v,U_v,CBp,sF,sU,in_v,out_Zm,2);
|
||||
DhopDirXp(st_v,U_v,CBp,sF,sU,in_v,out_Xp,3);
|
||||
DhopDirYp(st_v,U_v,CBp,sF,sU,in_v,out_Yp,4);
|
||||
DhopDirZp(st_v,U_v,CBp,sF,sU,in_v,out_Zp,5);
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
template <class Impl>
|
||||
void TwoSpinWilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,SiteSpinor *buf, int Ls,
|
||||
int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma)
|
||||
{
|
||||
assert(dirdisp<=5);
|
||||
assert(dirdisp>=0);
|
||||
|
||||
autoView(U_v ,U ,AcceleratorRead);
|
||||
autoView(in_v ,in ,AcceleratorRead);
|
||||
autoView(out_v,out,AcceleratorWrite);
|
||||
autoView(st_v ,st ,AcceleratorRead);
|
||||
auto CBp=st.CommBuf();
|
||||
#define LoopBody(Dir) \
|
||||
case Dir : \
|
||||
accelerator_for(ss,Nsite,Simd::Nsimd(),{ \
|
||||
for(int s=0;s<Ls;s++){ \
|
||||
int sU=ss; \
|
||||
int sF = s+Ls*sU; \
|
||||
DhopDir##Dir(st_v,U_v,CBp,sF,sU,in_v,out_v,dirdisp);\
|
||||
} \
|
||||
}); \
|
||||
break;
|
||||
|
||||
switch(gamma){
|
||||
LoopBody(Xp);
|
||||
LoopBody(Yp);
|
||||
LoopBody(Zp);
|
||||
|
||||
LoopBody(Xm);
|
||||
LoopBody(Ym);
|
||||
LoopBody(Zm);
|
||||
default:
|
||||
assert(0);
|
||||
break;
|
||||
}
|
||||
#undef LoopBody
|
||||
}
|
||||
|
||||
|
||||
#define KERNEL_CALLNB(A) \
|
||||
const uint64_t NN = Nsite*Ls; \
|
||||
accelerator_forNB( ss, NN, Simd::Nsimd(), { \
|
||||
int sF = ss; \
|
||||
int sU = ss/Ls; \
|
||||
TwoSpinWilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v); \
|
||||
});
|
||||
|
||||
#define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier();
|
||||
|
||||
#define KERNEL_CALL_EXT(A) \
|
||||
const uint64_t sz = st.surface_list.size(); \
|
||||
auto ptr = &st.surface_list[0]; \
|
||||
accelerator_forNB( ss, sz, Simd::Nsimd(), { \
|
||||
int sF = ptr[ss]; \
|
||||
int sU = sF/Ls; \
|
||||
TwoSpinWilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v); \
|
||||
}); \
|
||||
accelerator_barrier();
|
||||
|
||||
|
||||
template <class Impl>
|
||||
void TwoSpinWilsonKernels<Impl>::DhopKernel(StencilImpl &st, DoubledGaugeField &U, SiteSpinor * buf,
|
||||
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||
int interior,int exterior)
|
||||
{
|
||||
autoView(U_v , U,AcceleratorRead);
|
||||
autoView(in_v , in,AcceleratorRead);
|
||||
autoView(out_v,out,AcceleratorWrite);
|
||||
autoView(st_v , st,AcceleratorRead);
|
||||
|
||||
if( interior && exterior ) {
|
||||
acceleratorFenceComputeStream();
|
||||
KERNEL_CALL(GenericDhopSite);
|
||||
return;
|
||||
} else if( interior ) {
|
||||
KERNEL_CALLNB(GenericDhopSiteInt);
|
||||
return;
|
||||
} else if( exterior ) {
|
||||
// // dependent on result of merge
|
||||
acceleratorFenceComputeStream();
|
||||
KERNEL_CALL_EXT(GenericDhopSiteExt);
|
||||
return;
|
||||
}
|
||||
assert(0 && " Kernel optimisation case not covered ");
|
||||
}
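A hedged, standalone sketch of the (interior, exterior) dispatch above; it is not Grid code. The overlapped-comms caller shown earlier in this diff passes (1,0) before CommunicateComplete and (0,1) after CommsMerge, while the serial-comms caller is assumed to take the default both-legs case.

```cpp
// Illustrative only: the three dispatch cases of DhopKernel/DhopDagKernel above.
#include <cassert>
#include <iostream>
#include <string>

std::string dispatch(int interior, int exterior) {
  if (interior && exterior) return "GenericDhopSite    (all legs; serial-comms path)";
  if (interior)             return "GenericDhopSiteInt (non-blocking; overlaps comms)";
  if (exterior)             return "GenericDhopSiteExt (surface sites; after CommsMerge)";
  assert(0 && " Kernel optimisation case not covered ");
  return "";
}

int main() {
  std::cout << dispatch(1, 1) << "\n";  // serial-comms path (assumed default flags)
  std::cout << dispatch(1, 0) << "\n";  // overlapped path, before CommunicateComplete
  std::cout << dispatch(0, 1) << "\n";  // overlapped path, after CommsMerge
  return 0;
}
```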
|
||||
|
||||
template <class Impl>
|
||||
void TwoSpinWilsonKernels<Impl>::DhopDagKernel(StencilImpl &st, DoubledGaugeField &U, SiteSpinor * buf,
|
||||
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||
int interior,int exterior)
|
||||
{
|
||||
autoView(U_v ,U,AcceleratorRead);
|
||||
autoView(in_v ,in,AcceleratorRead);
|
||||
autoView(out_v,out,AcceleratorWrite);
|
||||
autoView(st_v ,st,AcceleratorRead);
|
||||
|
||||
if( interior && exterior ) {
|
||||
acceleratorFenceComputeStream();
|
||||
KERNEL_CALL(GenericDhopSiteDag);
|
||||
return;
|
||||
} else if( interior ) {
|
||||
KERNEL_CALLNB(GenericDhopSiteDagInt); return;
|
||||
} else if( exterior ) {
|
||||
// Dependent on result of merge
|
||||
acceleratorFenceComputeStream();
|
||||
KERNEL_CALL_EXT(GenericDhopSiteDagExt); return;
|
||||
}
|
||||
assert(0 && " Kernel optimisation case not covered ");
|
||||
}
|
||||
|
||||
#undef KERNEL_CALLNB
|
||||
#undef KERNEL_CALL
|
||||
|
||||
NAMESPACE_END(Grid);
|
@@ -61,7 +61,7 @@ WilsonCloverFermion<Impl, CloverHelpers>::WilsonCloverFermion(GaugeField&
|
||||
diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0);
|
||||
} else {
|
||||
csw_r = _csw_r * 0.5;
|
||||
diag_mass = 4.0 + _mass;
|
||||
diag_mass = Nd*1.0 + _mass;
|
||||
}
|
||||
csw_t = _csw_t * 0.5;
|
||||
|
||||
@@ -297,9 +297,9 @@ void WilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField &force, const F
|
||||
{
|
||||
if (mu == nu)
|
||||
continue;
|
||||
|
||||
|
||||
RealD factor;
|
||||
if (nu == 4 || mu == 4)
|
||||
if (nu == (Nd-1) || mu == (Nd-1)) // This was a bug: mu/nu is never 4; the temporal direction is (Nd-1)=3
|
||||
{
|
||||
factor = 2.0 * csw_t;
|
||||
}
|
||||
@@ -307,9 +307,11 @@ void WilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField &force, const F
|
||||
{
|
||||
factor = 2.0 * csw_r;
|
||||
}
|
||||
PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
|
||||
Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok
|
||||
force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu); // checked
|
||||
if ( mu < Nd && nu < Nd ) { // Allow restricting the range to Nd=3, but preserve the SigmaMuNu table ordering by counting ALL pairs
|
||||
PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
|
||||
Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok
|
||||
force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu); // checked
|
||||
}
|
||||
count++;
|
||||
}
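The count++ bookkeeping above is the point of the change: the SigmaMuNu table index must advance for every ordered (mu,nu) pair, even when the force term itself is skipped. A standalone sketch, with the loop extent fixed to 4 purely as an assumption for illustration:

```cpp
// Illustrative only: count advances over ALL ordered mu!=nu pairs so sigma[count]
// stays aligned with (mu,nu), while the force term is restricted to mu,nu < Nd.
#include <cstdio>

int main() {
  const int Nloop = 4;  // extent of the sigma-table loops (assumption for this sketch)
  const int Nd    = 4;  // could be reduced (e.g. to 3) without reordering the table
  int count = 0;
  for (int mu = 0; mu < Nloop; mu++) {
    for (int nu = 0; nu < Nloop; nu++) {
      if (mu == nu) continue;
      if (mu < Nd && nu < Nd)
        std::printf("force term uses sigma[%2d] for (mu,nu)=(%d,%d)\n", count, mu, nu);
      count++;  // always advance, even when the force term is skipped
    }
  }
  return 0;
}
```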
|
||||
|
||||
|
@@ -14,6 +14,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Guido Cossu <guido.cossu@ed.ac.uk>
|
||||
Author: Andrew Lawson <andrew.lawson1991@gmail.com>
|
||||
Author: Vera Guelpers <V.M.Guelpers@soton.ac.uk>
|
||||
Author: Christoph Lehner <christoph@lhnr.de>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
@@ -62,10 +63,10 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
|
||||
Dirichlet(0)
|
||||
{
|
||||
// some assertions
|
||||
assert(FiveDimGrid._ndimension==5);
|
||||
assert(FourDimGrid._ndimension==4);
|
||||
assert(FourDimRedBlackGrid._ndimension==4);
|
||||
assert(FiveDimRedBlackGrid._ndimension==5);
|
||||
assert(FiveDimGrid._ndimension==Nd+1);
|
||||
assert(FourDimGrid._ndimension==Nd);
|
||||
assert(FourDimRedBlackGrid._ndimension==Nd);
|
||||
assert(FiveDimRedBlackGrid._ndimension==Nd+1);
|
||||
assert(FiveDimRedBlackGrid._checker_dim==1); // Don't checker the s direction
|
||||
|
||||
// extent of fifth dim and not spread out
|
||||
@@ -75,7 +76,7 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
|
||||
assert(FiveDimRedBlackGrid._processors[0] ==1);
|
||||
|
||||
// Other dimensions must match the decomposition of the four-D fields
|
||||
for(int d=0;d<4;d++){
|
||||
for(int d=0;d<Nd;d++){
|
||||
|
||||
assert(FiveDimGrid._processors[d+1] ==FourDimGrid._processors[d]);
|
||||
assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
|
||||
@@ -92,11 +93,13 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
|
||||
|
||||
if ( p.dirichlet.size() == Nd+1) {
|
||||
Coordinate block = p.dirichlet;
|
||||
if ( block[0] || block[1] || block[2] || block[3] || block[4] ){
|
||||
Dirichlet = 1;
|
||||
std::cout << GridLogMessage << " WilsonFermion: non-trivial Dirichlet condition "<< block << std::endl;
|
||||
std::cout << GridLogMessage << " WilsonFermion: partial Dirichlet "<< p.partialDirichlet << std::endl;
|
||||
Block = block;
|
||||
for(int d=0;d<Nd+1;d++) {
|
||||
if ( block[d] ){
|
||||
Dirichlet = 1;
|
||||
std::cout << GridLogMessage << " WilsonFermion: non-trivial Dirichlet condition "<< block << std::endl;
|
||||
std::cout << GridLogMessage << " WilsonFermion: partial Dirichlet "<< p.partialDirichlet << std::endl;
|
||||
Block = block;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
Coordinate block(Nd+1,0);
|
||||
@@ -111,7 +114,7 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
|
||||
assert(FiveDimGrid._simd_layout[0] ==nsimd);
|
||||
assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);
|
||||
|
||||
for(int d=0;d<4;d++){
|
||||
for(int d=0;d<Nd;d++){
|
||||
assert(FourDimGrid._simd_layout[d]==1);
|
||||
assert(FourDimRedBlackGrid._simd_layout[d]==1);
|
||||
assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
|
||||
@@ -182,8 +185,8 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
|
||||
// assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;
|
||||
|
||||
int skip = (disp==1) ? 0 : 1;
|
||||
int dirdisp = dir+skip*4;
|
||||
int gamma = dir+(1-skip)*4;
|
||||
int dirdisp = dir+skip*Nd;
|
||||
int gamma = dir+(1-skip)*Nd;
|
||||
|
||||
Compressor compressor(DaggerNo);
|
||||
Stencil.HaloExchange(in,compressor);
|
||||
@@ -482,7 +485,55 @@ void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag
|
||||
{
|
||||
out.Checkerboard()=in.Checkerboard();
|
||||
Dhop(in,out,dag); // -0.5 is included
|
||||
axpy(out,4.0-M5,in,out);
|
||||
axpy(out,Nd*1.0-M5,in,out);
|
||||
}
|
||||
template <class Impl>
|
||||
void WilsonFermion5D<Impl>::Meooe(const FermionField &in, FermionField &out)
|
||||
{
|
||||
if (in.Checkerboard() == Odd) {
|
||||
DhopEO(in, out, DaggerNo);
|
||||
} else {
|
||||
DhopOE(in, out, DaggerNo);
|
||||
}
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion5D<Impl>::MeooeDag(const FermionField &in, FermionField &out)
|
||||
{
|
||||
if (in.Checkerboard() == Odd) {
|
||||
DhopEO(in, out, DaggerYes);
|
||||
} else {
|
||||
DhopOE(in, out, DaggerYes);
|
||||
}
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion5D<Impl>::Mooee(const FermionField &in, FermionField &out)
|
||||
{
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
typename FermionField::scalar_type scal(Nd*1.0 + M5);
|
||||
out = scal * in;
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion5D<Impl>::MooeeDag(const FermionField &in, FermionField &out)
|
||||
{
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
Mooee(in, out);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::MooeeInv(const FermionField &in, FermionField &out)
|
||||
{
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
out = (1.0/(Nd*1.0 + M5))*in;
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
|
||||
{
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
MooeeInv(in,out);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
@@ -586,7 +637,7 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
|
||||
A = one / (abs(W) * sinha * 2.0) * one / (sinhaLs * 2.0);
|
||||
F = eaLs * (one - Wea + (Wema - one) * mass*mass);
|
||||
F = F + emaLs * (Wema - one + (one - Wea) * mass*mass);
|
||||
F = F - abs(W) * sinha * 4.0 * mass;
|
||||
F = F - abs(W) * sinha * (Nd* 1.0) * mass;
|
||||
|
||||
Bpp = (A/F) * (ema2Ls - one) * (one - Wema) * (one - mass*mass * one);
|
||||
Bmm = (A/F) * (one - ea2Ls) * (one - Wea) * (one - mass*mass * one);
|
||||
|
@@ -63,7 +63,7 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
|
||||
if (anisotropyCoeff.isAnisotropic){
|
||||
diag_mass = mass + 1.0 + (Nd-1)*(anisotropyCoeff.nu / anisotropyCoeff.xi_0);
|
||||
} else {
|
||||
diag_mass = 4.0 + mass;
|
||||
diag_mass = Nd*1.0 + mass;
|
||||
}
|
||||
|
||||
int vol4;
|
||||
@@ -354,8 +354,8 @@ void WilsonFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int
|
||||
Stencil.HaloExchange(in, compressor);
|
||||
|
||||
int skip = (disp == 1) ? 0 : 1;
|
||||
int dirdisp = dir + skip * 4;
|
||||
int gamma = dir + (1 - skip) * 4;
|
||||
int dirdisp = dir + skip * Nd;
|
||||
int gamma = dir + (1 - skip) * Nd;
|
||||
|
||||
DhopDirCalc(in, out, dirdisp, gamma, DaggerNo);
|
||||
};
|
||||
@@ -370,8 +370,8 @@ void WilsonFermion<Impl>::DhopDirAll(const FermionField &in, std::vector<Fermion
|
||||
for(int disp=-1;disp<=1;disp+=2){
|
||||
|
||||
int skip = (disp == 1) ? 0 : 1;
|
||||
int dirdisp = dir + skip * 4;
|
||||
int gamma = dir + (1 - skip) * 4;
|
||||
int dirdisp = dir + skip * Nd;
|
||||
int gamma = dir + (1 - skip) * Nd;
|
||||
|
||||
DhopDirCalc(in, out[dirdisp], dirdisp, gamma, DaggerNo);
|
||||
}
|
||||
|
@@ -97,7 +97,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
distance = st._distances[DIR]; \
|
||||
sl = st._simd_layout[direction]; \
|
||||
inplace_twist = 0; \
|
||||
if(SE->_around_the_world && st.parameters.twists[DIR % 4]){ \
|
||||
if(SE->_around_the_world && st.parameters.twists[DIR % Nd]){ \
|
||||
if(sl == 1){ \
|
||||
g = (F+1) % 2; \
|
||||
}else{ \
|
||||
|
@@ -63,7 +63,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
|
||||
} else { \
|
||||
chi = coalescedRead(buf[SE->_offset],lane); \
|
||||
} \
|
||||
acceleratorSynchronise(); \
|
||||
acceleratorSynchronise(); \
|
||||
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
|
||||
Recon(result, Uchi);
|
||||
|
||||
@@ -517,7 +517,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
|
||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
|
||||
#endif
|
||||
} else if( exterior ) {
|
||||
// dependent on result of merge
|
||||
// // dependent on result of merge
|
||||
acceleratorFenceComputeStream();
|
||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL_EXT(GenericDhopSiteExt); return;}
|
||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteExt); return;}
|
||||
|
@@ -0,0 +1,45 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation5D.cc.master
|
||||
|
||||
Copyright (C) 2017 - 2025
|
||||
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Guido Cossu <guido.cossu@ed.ac.uk>
|
||||
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
|
||||
Author: Mattia Bruno <mattia.bruno@cern.ch>
|
||||
Author: Christoph Lehner <christoph@lhnr.de>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#include <Grid/Grid.h>
|
||||
#include <Grid/qcd/spin/Dirac.h>
|
||||
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermion5DImplementation.h>
|
||||
#include <Grid/qcd/action/fermion/CloverHelpers.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
#include "impl.h"
|
||||
template class CompactWilsonCloverFermion5D<IMPLEMENTATION, CompactCloverHelpers<IMPLEMENTATION>>;
|
||||
template class CompactWilsonCloverFermion5D<IMPLEMENTATION, CompactExpCloverHelpers<IMPLEMENTATION>>;
|
||||
|
||||
NAMESPACE_END(Grid);
|
@@ -32,8 +32,30 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
// S-direction is INNERMOST and takes no part in the parity.
|
||||
const std::vector<int> ImprovedStaggeredFermion5DStatic::directions({1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4});
|
||||
const std::vector<int> ImprovedStaggeredFermion5DStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3, 3, 3, -3, -3, -3, -3});
|
||||
const std::vector<int> ImprovedStaggeredFermion5DStatic::directions(ImprovedStaggeredFermion5DStatic::MakeDirections());
|
||||
const std::vector<int> ImprovedStaggeredFermion5DStatic::displacements(ImprovedStaggeredFermion5DStatic::MakeDisplacements());
|
||||
std::vector<int> ImprovedStaggeredFermion5DStatic::MakeDirections(void)
|
||||
{
|
||||
std::vector<int> directions(4*Nd);
|
||||
for(int d=0;d<Nd;d++){
|
||||
directions[d+Nd*0] = d+1;
|
||||
directions[d+Nd*1] = d+1;
|
||||
directions[d+Nd*2] = d+1;
|
||||
directions[d+Nd*3] = d+1;
|
||||
}
|
||||
return directions;
|
||||
}
|
||||
std::vector<int> ImprovedStaggeredFermion5DStatic::MakeDisplacements(void)
|
||||
{
|
||||
std::vector<int> displacements(4*Nd);
|
||||
for(int d=0;d<Nd;d++){
|
||||
displacements[d+Nd*0] =+1;
|
||||
displacements[d+Nd*1] =-1;
|
||||
displacements[d+Nd*2] =+3;
|
||||
displacements[d+Nd*3] =-3;
|
||||
}
|
||||
return displacements;
|
||||
}
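As a quick cross-check, a standalone sketch (not Grid code) showing that for Nd=4 the MakeDirections/MakeDisplacements builders above reproduce the literal tables they replace:

```cpp
// Illustrative only: for Nd=4 this prints
// directions    = 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4
// displacements = 1 1 1 1 -1 -1 -1 -1 3 3 3 3 -3 -3 -3 -3
#include <cstdio>
#include <vector>

int main() {
  const int Nd = 4;
  std::vector<int> directions(4 * Nd), displacements(4 * Nd);
  for (int d = 0; d < Nd; d++) {
    directions[d + Nd * 0] = d + 1;  displacements[d + Nd * 0] = +1;
    directions[d + Nd * 1] = d + 1;  displacements[d + Nd * 1] = -1;
    directions[d + Nd * 2] = d + 1;  displacements[d + Nd * 2] = +3;
    directions[d + Nd * 3] = d + 1;  displacements[d + Nd * 3] = -3;
  }
  for (int v : directions)    std::printf("%d ", v);
  std::printf("\n");
  for (int v : displacements) std::printf("%d ", v);
  std::printf("\n");
  return 0;
}
```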
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
|
@@ -32,5 +32,26 @@ NAMESPACE_BEGIN(Grid);
|
||||
|
||||
const std::vector<int> ImprovedStaggeredFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3});
|
||||
const std::vector<int> ImprovedStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3, 3, 3, -3, -3, -3, -3});
|
||||
|
||||
std::vector<int> ImprovedStaggeredFermionStatic::MakeDirections(void)
|
||||
{
|
||||
std::vector<int> directions(4*Nd);
|
||||
for(int d=0;d<Nd;d++){
|
||||
directions[d+Nd*0] = d;
|
||||
directions[d+Nd*1] = d;
|
||||
directions[d+Nd*2] = d;
|
||||
directions[d+Nd*3] = d;
|
||||
}
|
||||
return directions;
|
||||
}
|
||||
std::vector<int> ImprovedStaggeredFermionStatic::MakeDisplacements(void)
|
||||
{
|
||||
std::vector<int> displacements(4*Nd);
|
||||
for(int d=0;d<Nd;d++){
|
||||
displacements[d+Nd*0] =+1;
|
||||
displacements[d+Nd*1] =-1;
|
||||
displacements[d+Nd*2] =+3;
|
||||
displacements[d+Nd*3] =-3;
|
||||
}
|
||||
return displacements;
|
||||
}
|
||||
NAMESPACE_END(Grid);
|
||||
|
@@ -30,7 +30,27 @@ directory
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
const std::vector<int> NaiveStaggeredFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3});
|
||||
const std::vector<int> NaiveStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1});
|
||||
//const std::vector<int> NaiveStaggeredFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3});
|
||||
//const std::vector<int> NaiveStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1});
|
||||
const std::vector<int> NaiveStaggeredFermionStatic::directions(NaiveStaggeredFermionStatic::MakeDirections());
|
||||
const std::vector<int> NaiveStaggeredFermionStatic::displacements(NaiveStaggeredFermionStatic::MakeDisplacements());
|
||||
std::vector<int> NaiveStaggeredFermionStatic::MakeDirections(void)
|
||||
{
|
||||
std::vector<int> directions(4*Nd);
|
||||
for(int d=0;d<Nd;d++){
|
||||
directions[d+Nd*0] = d;
|
||||
directions[d+Nd*1] = d;
|
||||
}
|
||||
return directions;
|
||||
}
|
||||
std::vector<int> NaiveStaggeredFermionStatic::MakeDisplacements(void)
|
||||
{
|
||||
std::vector<int> displacements(4*Nd);
|
||||
for(int d=0;d<Nd;d++){
|
||||
displacements[d+Nd*0] =+1;
|
||||
displacements[d+Nd*1] =-1;
|
||||
}
|
||||
return displacements;
|
||||
}
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
@@ -0,0 +1,61 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/TwoSpinWilsonFermion3plus1D.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
// S-direction is INNERMOST and takes no part in the parity.
|
||||
|
||||
const std::vector<int> TwoSpinWilsonFermion3plus1DStatic::directions (TwoSpinWilsonFermion3plus1DStatic::MakeDirections());
|
||||
const std::vector<int> TwoSpinWilsonFermion3plus1DStatic::displacements(TwoSpinWilsonFermion3plus1DStatic::MakeDisplacements());
|
||||
|
||||
std::vector<int> TwoSpinWilsonFermion3plus1DStatic::MakeDirections (void)
|
||||
{
|
||||
std::vector<int> directions(2*Nd);
|
||||
for(int d=0;d<Nd;d++){
|
||||
directions[d] = d+1;
|
||||
directions[d+Nd] = d+1;
|
||||
}
|
||||
return directions;
|
||||
}
|
||||
std::vector<int> TwoSpinWilsonFermion3plus1DStatic::MakeDisplacements(void)
|
||||
{
|
||||
std::vector<int> displacements(2*Nd);
|
||||
for(int d=0;d<Nd;d++){
|
||||
displacements[d] = +1;
|
||||
displacements[d+Nd] = -1;
|
||||
}
|
||||
return displacements;
|
||||
}
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
@@ -0,0 +1,40 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/TwoSpinWilsonFermion3plus1DImplementation.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
#include "impl.h"
|
||||
template class TwoSpinWilsonFermion3plus1D<IMPLEMENTATION>;
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
@@ -0,0 +1,40 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
||||
|
||||
Copyright (C) 2015, 2020
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/TwoSpinWilsonKernelsImplementation.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
#include "impl.h"
|
||||
template class TwoSpinWilsonKernels<IMPLEMENTATION>;
|
||||
|
||||
NAMESPACE_END(Grid);
|
@@ -34,8 +34,28 @@ directory
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
// S-direction is INNERMOST and takes no part in the parity.
|
||||
const std::vector<int> WilsonFermion5DStatic::directions ({1,2,3,4, 1, 2, 3, 4});
|
||||
const std::vector<int> WilsonFermion5DStatic::displacements({1,1,1,1,-1,-1,-1,-1});
|
||||
|
||||
const std::vector<int> WilsonFermion5DStatic::directions (WilsonFermion5DStatic::MakeDirections());
|
||||
const std::vector<int> WilsonFermion5DStatic::displacements(WilsonFermion5DStatic::MakeDisplacements());
|
||||
|
||||
std::vector<int> WilsonFermion5DStatic::MakeDirections (void)
|
||||
{
|
||||
std::vector<int> directions(2*Nd);
|
||||
for(int d=0;d<Nd;d++){
|
||||
directions[d] = d+1;
|
||||
directions[d+Nd] = d+1;
|
||||
}
|
||||
return directions;
|
||||
}
|
||||
std::vector<int> WilsonFermion5DStatic::MakeDisplacements(void)
|
||||
{
|
||||
std::vector<int> displacements(2*Nd);
|
||||
for(int d=0;d<Nd;d++){
|
||||
displacements[d] = +1;
|
||||
displacements[d+Nd] = -1;
|
||||
}
|
||||
return displacements;
|
||||
}
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
|
@@ -33,9 +33,27 @@ directory
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
const std::vector<int> WilsonFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3});
|
||||
const std::vector<int> WilsonFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1});
|
||||
const std::vector<int> WilsonFermionStatic::directions(WilsonFermionStatic::MakeDirections());
|
||||
const std::vector<int> WilsonFermionStatic::displacements(WilsonFermionStatic::MakeDisplacements());
|
||||
int WilsonFermionStatic::HandOptDslash;
|
||||
std::vector<int> WilsonFermionStatic::MakeDirections (void)
|
||||
{
|
||||
std::vector<int> directions(2*Nd);
|
||||
for(int d=0;d<Nd;d++){
|
||||
directions[d] = d;
|
||||
directions[d+Nd] = d;
|
||||
}
|
||||
return directions;
|
||||
}
|
||||
std::vector<int> WilsonFermionStatic::MakeDisplacements(void)
|
||||
{
|
||||
std::vector<int> displacements(2*Nd);
|
||||
for(int d=0;d<Nd;d++){
|
||||
displacements[d] = +1;
|
||||
displacements[d+Nd] = -1;
|
||||
}
|
||||
return displacements;
|
||||
}
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
|
@@ -0,0 +1 @@
|
||||
../CompactWilsonCloverFermion5DInstantiation.cc.master
|
@@ -0,0 +1 @@
|
||||
../CompactWilsonCloverFermion5DInstantiation.cc.master
|
@@ -36,11 +36,16 @@ DWF_IMPL_LIST=" \
|
||||
ZWilsonImplF \
|
||||
ZWilsonImplD2 "
|
||||
|
||||
TWOSPIN_WILSON_IMPL_LIST=" \
|
||||
TwoSpinWilsonImplF \
|
||||
TwoSpinWilsonImplD "
|
||||
|
||||
|
||||
GDWF_IMPL_LIST=" \
|
||||
GparityWilsonImplF \
|
||||
GparityWilsonImplD "
|
||||
|
||||
IMPL_LIST="$STAG_IMPL_LIST $WILSON_IMPL_LIST $DWF_IMPL_LIST $GDWF_IMPL_LIST"
|
||||
IMPL_LIST="$STAG_IMPL_LIST $WILSON_IMPL_LIST $DWF_IMPL_LIST $GDWF_IMPL_LIST $TWOSPIN_WILSON_IMPL_LIST"
|
||||
|
||||
for impl in $IMPL_LIST
|
||||
do
|
||||
@@ -62,7 +67,7 @@ do
|
||||
done
|
||||
done
|
||||
|
||||
CC_LIST="CompactWilsonCloverFermionInstantiation"
|
||||
CC_LIST="CompactWilsonCloverFermionInstantiation CompactWilsonCloverFermion5DInstantiation"
|
||||
|
||||
for impl in $COMPACT_WILSON_IMPL_LIST
|
||||
do
|
||||
@@ -110,7 +115,12 @@ do
|
||||
done
|
||||
done
|
||||
|
||||
CC_LIST=" \
|
||||
ImprovedStaggeredFermion5DInstantiation \
|
||||
StaggeredKernelsInstantiation "
|
||||
CC_LIST="TwoSpinWilsonFermion3plus1DInstantiation.cc.master TwoSpinWilsonKernelsInstantiation.cc.master"
|
||||
|
||||
for impl in $TWOSPIN_WILSON_IMPL_LIST
|
||||
do
|
||||
for f in $CC_LIST
|
||||
do
|
||||
ln -f -s ../$f.cc.master $impl/$f$impl.cc
|
||||
done
|
||||
done
|
||||
|
@@ -76,27 +76,27 @@ public:
|
||||
return action;
|
||||
};
|
||||
|
||||
virtual void deriv(const GaugeField &Umu,GaugeField & dSdU) {
|
||||
virtual void deriv(const GaugeField &U, GaugeField &dSdU) {
|
||||
//extend Ta to include Lorentz indexes
|
||||
RealD factor_p = c_plaq/RealD(Nc)*0.5;
|
||||
RealD factor_r = c_rect/RealD(Nc)*0.5;
|
||||
|
||||
GridBase *grid = Umu.Grid();
|
||||
GridBase *grid = U.Grid();
|
||||
|
||||
std::vector<GaugeLinkField> U (Nd,grid);
|
||||
std::vector<GaugeLinkField> Umu (Nd,grid);
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
|
||||
Umu[mu] = PeekIndex<LorentzIndex>(U,mu);
|
||||
}
|
||||
std::vector<GaugeLinkField> RectStaple(Nd,grid), Staple(Nd,grid);
|
||||
WilsonLoops<Gimpl>::StapleAndRectStapleAll(Staple, RectStaple, U, workspace);
|
||||
WilsonLoops<Gimpl>::StapleAndRectStapleAll(Staple, RectStaple, Umu, workspace);
|
||||
|
||||
GaugeLinkField dSdU_mu(grid);
|
||||
GaugeLinkField staple(grid);
|
||||
|
||||
for (int mu=0; mu < Nd; mu++){
|
||||
dSdU_mu = Ta(U[mu]*Staple[mu])*factor_p;
|
||||
dSdU_mu = dSdU_mu + Ta(U[mu]*RectStaple[mu])*factor_r;
|
||||
|
||||
dSdU_mu = Ta(Umu[mu]*Staple[mu])*factor_p;
|
||||
dSdU_mu = dSdU_mu + Ta(Umu[mu]*RectStaple[mu])*factor_r;
|
||||
|
||||
PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
|
||||
}
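In formula form, a hedged restatement of the force assembled per direction in the loop above; S_mu and R_mu denote the plaquette and rectangle staples returned by StapleAndRectStapleAll.

```latex
% Sketch of the per-direction force in deriv(), using factor_p = c_plaq/(2 N_c)
% and factor_r = c_rect/(2 N_c) as in the code above.
\begin{equation}
  \frac{\partial S}{\partial U_\mu}
  = \mathrm{Ta}\!\left(U_\mu S_\mu\right)\frac{c_{\mathrm{plaq}}}{2N_c}
  + \mathrm{Ta}\!\left(U_\mu R_\mu\right)\frac{c_{\mathrm{rect}}}{2N_c}
\end{equation}
```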
|
||||
|
||||
|
@@ -73,20 +73,23 @@ public:
|
||||
// extend Ta to include Lorentz indexes
|
||||
|
||||
RealD factor = 0.5 * beta / RealD(Nc);
|
||||
GridBase *grid = U.Grid();
|
||||
|
||||
GaugeLinkField Umu(U.Grid());
|
||||
GaugeLinkField dSdU_mu(U.Grid());
|
||||
GaugeLinkField dSdU_mu(grid);
|
||||
std::vector<GaugeLinkField> Umu(Nd, grid);
|
||||
for (int mu = 0; mu < Nd; mu++) {
|
||||
Umu[mu] = PeekIndex<LorentzIndex>(U, mu);
|
||||
}
|
||||
|
||||
Umu = PeekIndex<LorentzIndex>(U, mu);
|
||||
|
||||
for (int mu = 0; mu < Nd; mu++) {
|
||||
// Staple in direction mu
|
||||
WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu);
|
||||
dSdU_mu = Ta(Umu * dSdU_mu) * factor;
|
||||
|
||||
WilsonLoops<Gimpl>::Staple(dSdU_mu, Umu, mu);
|
||||
dSdU_mu = Ta(Umu[mu] * dSdU_mu) * factor;
|
||||
|
||||
PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
RealD beta;
|
||||
};
|
||||
|
@@ -111,8 +111,8 @@ public:
|
||||
};
|
||||
|
||||
void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) {
|
||||
std::string config, rng;
|
||||
this->build_filenames(traj, Params, config, rng);
|
||||
std::string config, rng, smr;
|
||||
this->build_filenames(traj, Params, config, smr, rng);
|
||||
this->check_filename(rng);
|
||||
this->check_filename(config);
|
||||
|
||||
|
@@ -75,7 +75,7 @@ public:
|
||||
GridParallelRNG &pRNG) {
|
||||
if ((traj % Params.saveInterval) == 0) {
|
||||
std::string config, rng, smr;
|
||||
this->build_filenames(traj, Params, config, rng);
|
||||
this->build_filenames(traj, Params, config, smr, rng);
|
||||
GridBase *grid = SmartConfig.get_U(false).Grid();
|
||||
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
|
||||
BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
|
||||
@@ -102,7 +102,7 @@ public:
|
||||
if ( Params.saveSmeared ) {
|
||||
IldgWriter _IldgWriter(grid->IsBoss());
|
||||
_IldgWriter.open(smr);
|
||||
_IldgWriter.writeConfiguration<GaugeStats>(SmartConfig.get_U(true), traj, config, config);
|
||||
_IldgWriter.writeConfiguration<GaugeStats>(SmartConfig.get_U(true), traj, smr, smr);
|
||||
_IldgWriter.close();
|
||||
|
||||
std::cout << GridLogMessage << "Written ILDG Configuration on " << smr
|
||||
@@ -118,8 +118,8 @@ public:
|
||||
|
||||
void CheckpointRestore(int traj, GaugeField &U, GridSerialRNG &sRNG,
|
||||
GridParallelRNG &pRNG) {
|
||||
std::string config, rng;
|
||||
this->build_filenames(traj, Params, config, rng);
|
||||
std::string config, rng, smr;
|
||||
this->build_filenames(traj, Params, config, smr, rng);
|
||||
this->check_filename(rng);
|
||||
this->check_filename(config);
|
||||
|
||||
|
@@ -107,8 +107,8 @@ class ScidacHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
|
||||
|
||||
void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG,
|
||||
GridParallelRNG &pRNG) {
|
||||
std::string config, rng;
|
||||
this->build_filenames(traj, Params, config, rng);
|
||||
std::string config, rng, smr;
|
||||
this->build_filenames(traj, Params, config, smr, rng);
|
||||
this->check_filename(rng);
|
||||
this->check_filename(config);
|
||||
|
||||
|
@@ -62,15 +62,15 @@ accelerator_inline int stencilIndex(int mu, int nu) {

/*! @brief structure holding the link treatment */
struct SmearingParameters{
SmearingParameters(){}
struct HISQSmearingParameters{
HISQSmearingParameters(){}
Real c_1; // 1 link
Real c_naik; // Naik term
Real c_3; // 3 link
Real c_5; // 5 link
Real c_7; // 7 link
Real c_lp; // 5 link Lepage
SmearingParameters(Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp)
HISQSmearingParameters(Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp)
: c_1(c1),
c_naik(cnaik),
c_3(c3),
@@ -86,7 +86,7 @@ class Smear_HISQ : public Gimpl {

private:
GridCartesian* const _grid;
SmearingParameters _linkTreatment;
HISQSmearingParameters _linkTreatment;

public:

@@ -117,7 +117,7 @@ public:
// IN--u_thin
void smear(GF& u_smr, GF& u_naik, GF& u_thin) const {

SmearingParameters lt = this->_linkTreatment;
HISQSmearingParameters lt = this->_linkTreatment;
auto grid = this->_grid;

// Create a padded cell of extra padding depth=1 and fill the padding.
@@ -158,8 +158,8 @@ RealD WilsonFlowBase<Gimpl>::energyDensityCloverleaf(const RealD t, const GaugeF
LatticeComplexD R(U.Grid());
R = Zero();

for(int mu=0;mu<3;mu++){
for(int nu=mu+1;nu<4;nu++){
for(int mu=0;mu<Nd-1;mu++){
for(int nu=mu+1;nu<Nd;nu++){
WilsonLoops<Gimpl>::FieldStrength(F, U, mu, nu);
R = R + trace(F*F);
}
@@ -207,11 +207,14 @@ std::vector<RealD> WilsonFlowBase<Gimpl>::flowMeasureEnergyDensityCloverleaf(con
}

template <class Gimpl>
void WilsonFlowBase<Gimpl>::setDefaultMeasurements(int topq_meas_interval){
addMeasurement(1, [](int step, RealD t, const typename Gimpl::GaugeField &U){
void WilsonFlowBase<Gimpl>::setDefaultMeasurements(int meas_interval){
addMeasurement(meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : " << step << " " << t << " " << energyDensityPlaquette(t,U) << std::endl;
});
addMeasurement(topq_meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
addMeasurement(meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
std::cout << GridLogMessage << "[WilsonFlow] Energy density (cloverleaf) : " << step << " " << t << " " << energyDensityCloverleaf(t,U) << std::endl;
});
addMeasurement(meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
std::cout << GridLogMessage << "[WilsonFlow] Top. charge : " << step << " " << WilsonLoops<Gimpl>::TopologicalCharge(U) << std::endl;
});
}
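The updated setDefaultMeasurements registers all three default observables at the single meas_interval argument. Custom observables follow the same addMeasurement pattern; a small sketch, assuming WF is a WilsonFlow<Gimpl> instance as used in the tests further down this changeset:

WF.addMeasurement(meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
  std::cout << GridLogMessage << "[WilsonFlow] Plaquette : " << step << " " << t << " "
            << WilsonLoops<Gimpl>::avgPlaquette(U) << std::endl;
});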
@@ -249,6 +252,11 @@ void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const{

out = in;
RealD taus = 0.;

// Perform initial t=0 measurements
for(auto const &meas : this->functions)
meas.second(0,taus,out);

for (unsigned int step = 1; step <= Nstep; step++) { //step indicates the number of smearing steps applied at the time of measurement
auto start = std::chrono::high_resolution_clock::now();
evolve_step(out, taus);
@@ -333,6 +341,11 @@ void WilsonFlowAdaptive<Gimpl>::smear(GaugeField& out, const GaugeField& in) con
RealD taus = 0.;
RealD eps = init_epsilon;
unsigned int step = 0;

// Perform initial t=0 measurements
for(auto const &meas : this->functions)
meas.second(step,taus,out);

do{
int step_success = evolve_step_adaptive(out, taus, eps);
step += step_success; //step will not be incremented if the integration step fails
220
Grid/qcd/spin/Pauli.h
Normal file
@@ -0,0 +1,220 @@
|
||||
#ifndef GRID_QCD_PAULI_H
|
||||
#define GRID_QCD_PAULI_H
|
||||
|
||||
#include <array>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
//
|
||||
/*
|
||||
* Pauli basis
|
||||
* sx sy sz ident
|
||||
* (0 1) , (0 -i) , ( 1 0 )
|
||||
* (1 0) (i 0) ( 0 -1)
|
||||
*
|
||||
* These are hermitian.
|
||||
*
|
||||
* Also supply wilson "projectors" (1+/-sx), (1+/-sy), (1+/-sz)
|
||||
*
|
||||
* spPauliProjXm
|
||||
* spPauliProjYm etc...
|
||||
*/
|
||||
class Pauli {
|
||||
public:
|
||||
GRID_SERIALIZABLE_ENUM(Algebra, undef,
|
||||
SigmaX , 0,
|
||||
MinusSigmaX , 1,
|
||||
SigmaY , 2,
|
||||
MinusSigmaY , 3,
|
||||
SigmaZ , 4,
|
||||
MinusSigmaZ , 5,
|
||||
Identity , 6,
|
||||
MinusIdentity , 7);
|
||||
|
||||
static constexpr unsigned int nPauli = 8;
|
||||
static const std::array<const char *, nPauli> name;
|
||||
static const std::array<std::array<Algebra, nPauli>, nPauli> mul;
|
||||
static const std::array<Algebra, nPauli> adj;
|
||||
static const std::array<const Pauli, 4> gmu;
|
||||
static const std::array<const Pauli, 16> gall;
|
||||
Algebra g;
|
||||
public:
|
||||
accelerator Pauli(Algebra initg): g(initg) {}
|
||||
};
|
||||
|
||||
#define CopyImplementation(iTemplate,multPauli,multFlavour) \
|
||||
template<class vtype> \
|
||||
accelerator_inline void multPauli(iTemplate<vtype, Nhs> &ret, const iTemplate<vtype, Nhs> &rhs) { \
|
||||
multFlavour(ret,rhs); \
|
||||
}
|
||||
|
||||
CopyImplementation(iVector,multPauliSigmaX,multFlavourSigmaX);
|
||||
CopyImplementation(iMatrix,lmultPauliSigmaX,lmultFlavourSigmaX);
|
||||
CopyImplementation(iMatrix,rmultPauliSigmaX,rmultFlavourSigmaX);
|
||||
|
||||
CopyImplementation(iVector,multPauliMinusSigmaX ,multFlavourMinusSigmaX);
|
||||
CopyImplementation(iMatrix,lmultPauliMinusSigmaX,lmultFlavourMinusSigmaX);
|
||||
CopyImplementation(iMatrix,rmultPauliMinusSigmaX,rmultFlavourMinusSigmaX);
|
||||
|
||||
CopyImplementation(iVector,multPauliSigmaY,multFlavourSigmaY);
|
||||
CopyImplementation(iMatrix,lmultPauliSigmaY,lmultFlavourSigmaY);
|
||||
CopyImplementation(iMatrix,rmultPauliSigmaY,rmultFlavourSigmaY);
|
||||
|
||||
CopyImplementation(iVector,multPauliMinusSigmaY ,multFlavourMinusSigmaY);
|
||||
CopyImplementation(iMatrix,lmultPauliMinusSigmaY,lmultFlavourMinusSigmaY);
|
||||
CopyImplementation(iMatrix,rmultPauliMinusSigmaY,rmultFlavourMinusSigmaY);
|
||||
|
||||
CopyImplementation(iVector,multPauliSigmaZ,multFlavourSigmaZ);
|
||||
CopyImplementation(iMatrix,lmultPauliSigmaZ,lmultFlavourSigmaZ);
|
||||
CopyImplementation(iMatrix,rmultPauliSigmaZ,rmultFlavourSigmaZ);
|
||||
|
||||
CopyImplementation(iVector,multPauliMinusSigmaZ ,multFlavourMinusSigmaZ);
|
||||
CopyImplementation(iMatrix,lmultPauliMinusSigmaZ,lmultFlavourMinusSigmaZ);
|
||||
CopyImplementation(iMatrix,rmultPauliMinusSigmaZ,rmultFlavourMinusSigmaZ);
|
||||
|
||||
CopyImplementation(iVector,multPauliIdentity,multFlavourIdentity);
|
||||
CopyImplementation(iMatrix,lmultPauliIdentity,lmultFlavourIdentity);
|
||||
CopyImplementation(iMatrix,rmultPauliIdentity,rmultFlavourIdentity);
|
||||
|
||||
CopyImplementation(iVector,multPauliMinusIdentity ,multFlavourMinusIdentity);
|
||||
CopyImplementation(iMatrix,lmultPauliMinusIdentity,lmultFlavourMinusIdentity);
|
||||
CopyImplementation(iMatrix,rmultPauliMinusIdentity,rmultFlavourMinusIdentity);
|
||||
|
||||
/*
|
||||
* sx sy sz ident
|
||||
* (0 1) , (0 -i) , ( 1 0 )
|
||||
* (1 0) (i 0) ( 0 -1)
|
||||
*/
|
||||
template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliProjXp (iVector<vtype,Nhs> &hspin,const iVector<vtype,Nhs> &fspin)
|
||||
{
|
||||
hspin(0)=fspin(0)+fspin(1);
|
||||
hspin(1)=fspin(1)+fspin(0);
|
||||
}
|
||||
template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliProjXm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Nhs> &fspin)
|
||||
{
|
||||
hspin(0)=fspin(0)-fspin(1);
|
||||
hspin(1)=fspin(1)-fspin(0);
|
||||
}
|
||||
|
||||
template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliProjYp (iVector<vtype,Nhs> &hspin,const iVector<vtype,Nhs> &fspin)
|
||||
{
|
||||
hspin(0)=fspin(0)-timesI(fspin(1));
|
||||
hspin(1)=fspin(1)+timesI(fspin(0));
|
||||
}
|
||||
template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliProjYm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Nhs> &fspin)
|
||||
{
|
||||
hspin(0)=fspin(0)+timesI(fspin(1));
|
||||
hspin(1)=fspin(1)-timesI(fspin(0));
|
||||
}
|
||||
template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliProjZp (iVector<vtype,Nhs> &hspin,const iVector<vtype,Nhs> &fspin)
|
||||
{
|
||||
hspin(0)=fspin(0)+fspin(0);
|
||||
hspin(1)=Zero();
|
||||
}
|
||||
template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliProjZm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Nhs> &fspin)
|
||||
{
|
||||
hspin(0)=Zero();
|
||||
hspin(1)=fspin(1)+fspin(1);
|
||||
}
|
||||
template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliAssign(iVector<vtype,Nhs> &fspin,const iVector<vtype,Nhs> &hspin)
|
||||
{
|
||||
fspin = hspin;
|
||||
}
|
||||
template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliAdd (iVector<vtype,Nhs> &fspin,const iVector<vtype,Nhs> &hspin)
|
||||
{
|
||||
fspin = fspin + hspin;
|
||||
}
|
||||
|
||||
template<class vtype>
|
||||
accelerator_inline auto operator*(const Pauli &G, const iVector<vtype, Nhs> &arg)
|
||||
->typename std::enable_if<matchGridTensorIndex<iVector<vtype, Nhs>, PauliIndex>::value, iVector<vtype, Nhs>>::type
|
||||
{
|
||||
iVector<vtype, Nhs> ret;
|
||||
|
||||
switch (G.g)
|
||||
{
|
||||
case Pauli::Algebra::SigmaX:
|
||||
multPauliSigmaX(ret, arg); break;
|
||||
case Pauli::Algebra::MinusSigmaX:
|
||||
multPauliMinusSigmaX(ret, arg); break;
|
||||
case Pauli::Algebra::SigmaY:
|
||||
multPauliSigmaY(ret, arg); break;
|
||||
case Pauli::Algebra::MinusSigmaY:
|
||||
multPauliMinusSigmaY(ret, arg); break;
|
||||
case Pauli::Algebra::SigmaZ:
|
||||
multPauliSigmaZ(ret, arg); break;
|
||||
case Pauli::Algebra::MinusSigmaZ:
|
||||
multPauliMinusSigmaZ(ret, arg); break;
|
||||
case Pauli::Algebra::Identity:
|
||||
multPauliIdentity(ret, arg); break;
|
||||
case Pauli::Algebra::MinusIdentity:
|
||||
multPauliMinusIdentity(ret, arg); break;
|
||||
default: assert(0);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
template<class vtype>
|
||||
accelerator_inline auto operator*(const Pauli &G, const iMatrix<vtype, Nhs> &arg)
|
||||
->typename std::enable_if<matchGridTensorIndex<iMatrix<vtype, Nhs>, PauliIndex>::value, iMatrix<vtype, Nhs>>::type
|
||||
{
|
||||
iMatrix<vtype, Nhs> ret;
|
||||
|
||||
switch (G.g)
|
||||
{
|
||||
case Pauli::Algebra::SigmaX:
|
||||
lmultPauliSigmaX(ret, arg); break;
|
||||
case Pauli::Algebra::MinusSigmaX:
|
||||
lmultPauliMinusSigmaX(ret, arg); break;
|
||||
case Pauli::Algebra::SigmaY:
|
||||
lmultPauliSigmaY(ret, arg); break;
|
||||
case Pauli::Algebra::MinusSigmaY:
|
||||
lmultPauliMinusSigmaY(ret, arg); break;
|
||||
case Pauli::Algebra::SigmaZ:
|
||||
lmultPauliSigmaZ(ret, arg); break;
|
||||
case Pauli::Algebra::MinusSigmaZ:
|
||||
lmultPauliMinusSigmaZ(ret, arg); break;
|
||||
case Pauli::Algebra::Identity:
|
||||
lmultPauliIdentity(ret, arg); break;
|
||||
case Pauli::Algebra::MinusIdentity:
|
||||
lmultPauliMinusIdentity(ret, arg); break;
|
||||
default: assert(0);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
template<class vtype>
|
||||
accelerator_inline auto operator*(const iMatrix<vtype, Nhs> &arg, const Pauli &G)
|
||||
->typename std::enable_if<matchGridTensorIndex<iMatrix<vtype, Nhs>, PauliIndex>::value, iMatrix<vtype, Nhs>>::type
|
||||
{
|
||||
iMatrix<vtype, Nhs> ret;
|
||||
|
||||
switch (G.g)
|
||||
{
|
||||
case Pauli::Algebra::SigmaX:
|
||||
rmultPauliSigmaX(ret, arg); break;
|
||||
case Pauli::Algebra::MinusSigmaX:
|
||||
rmultPauliMinusSigmaX(ret, arg); break;
|
||||
case Pauli::Algebra::SigmaY:
|
||||
rmultPauliSigmaY(ret, arg); break;
|
||||
case Pauli::Algebra::MinusSigmaY:
|
||||
rmultPauliMinusSigmaY(ret, arg); break;
|
||||
case Pauli::Algebra::SigmaZ:
|
||||
rmultPauliSigmaZ(ret, arg); break;
|
||||
case Pauli::Algebra::MinusSigmaZ:
|
||||
rmultPauliMinusSigmaZ(ret, arg); break;
|
||||
case Pauli::Algebra::Identity:
|
||||
rmultPauliIdentity(ret, arg); break;
|
||||
case Pauli::Algebra::MinusIdentity:
|
||||
rmultPauliMinusIdentity(ret, arg); break;
|
||||
default: assert(0);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
#endif // GRID_QCD_PAULI_H
|
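A brief usage note on the new Pauli.h (a sketch, not part of the file): the pauliProj helpers implement the (1 +/- sigma) combinations listed in the header comment, e.g. (1+sigma_x)f = (f(0)+f(1), f(1)+f(0)) as in pauliProjXp, and sigma_z = diag(1,-1) gives (1+sigma_z)f = (2 f(0), 0) as in pauliProjZp. Assuming v is an iVector<vtype,Nhs> carrying the Pauli/flavour index, so the operator* overloads above apply:

Pauli sy(Pauli::Algebra::SigmaY);
auto w = sy * v;        // left multiplication by sigma_y
iVector<vtype,Nhs> h;
pauliProjXp(h, v);      // h = (1 + sigma_x) v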
@@ -179,20 +179,17 @@ public:
//////////////////////////////////////////////////
// average over all x,y,z the temporal loop
//////////////////////////////////////////////////
static ComplexD avgPolyakovLoop(const GaugeField &Umu) { //assume Nd=4
static ComplexD avgPolyakovLoop(const GaugeField &Umu) {
GaugeMat Ut(Umu.Grid()), P(Umu.Grid());
ComplexD out;
int T = Umu.Grid()->GlobalDimensions()[3];
int X = Umu.Grid()->GlobalDimensions()[0];
int Y = Umu.Grid()->GlobalDimensions()[1];
int Z = Umu.Grid()->GlobalDimensions()[2];

Ut = peekLorentz(Umu,3); //Select temporal direction
uint64_t vol = Umu.Grid()->gSites();
int T = Umu.Grid()->GlobalDimensions()[Nd-1];
Ut = peekLorentz(Umu,Nd-1); //Select temporal direction
P = Ut;
for (int t=1;t<T;t++){
P = Gimpl::CovShiftForward(Ut,3,P);
P = Gimpl::CovShiftForward(Ut,Nd-1,P);
}
RealD norm = 1.0/(Nc*X*Y*Z*T);
RealD norm = 1.0/(Nc*vol);
out = sum(trace(P))*norm;
return out;
}
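For reference, the quantity returned is the volume-averaged Polyakov loop, P_avg = (1/(Nc*V)) * sum_x tr prod_{t=0..T-1} U_{Nd-1}(x,t) with V = gSites(); since gSites() = X*Y*Z*T, the new 1/(Nc*vol) normalisation is numerically identical to the old 1/(Nc*X*Y*Z*T) while removing the hard-wired Nd=4 assumption.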
@@ -215,7 +212,7 @@ public:

double vol = Umu.Grid()->gSites();

return p.real() / vol / (4.0 * Nc ) ;
return p.real() / vol / (Nd * Nc ) ;
};

//////////////////////////////////////////////////
@@ -292,19 +289,21 @@ public:
//////////////////////////////////////////////////
// the sum over all nu-oriented staples for nu != mu on each site
//////////////////////////////////////////////////
static void Staple(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
static void Staple(GaugeMat &staple, const GaugeLorentz &U, int mu) {

GridBase *grid = Umu.Grid();

std::vector<GaugeMat> U(Nd, grid);
std::vector<GaugeMat> Umu(Nd, U.Grid());
for (int d = 0; d < Nd; d++) {
U[d] = PeekIndex<LorentzIndex>(Umu, d);
Umu[d] = PeekIndex<LorentzIndex>(U, d);
}
Staple(staple, U, mu);
Staple(staple, Umu, mu);
}

static void Staple(GaugeMat &staple, const std::vector<GaugeMat> &U, int mu) {
staple = Zero();
static void Staple(GaugeMat &staple, const std::vector<GaugeMat> &Umu, int mu) {

autoView(staple_v, staple, AcceleratorWrite);
accelerator_for(i, staple.Grid()->oSites(), Simd::Nsimd(), {
staple_v[i] = Zero();
});

for (int nu = 0; nu < Nd; nu++) {

@@ -318,12 +317,12 @@ public:
// |
// __|
//


staple += Gimpl::ShiftStaple(
Gimpl::CovShiftForward(
U[nu], nu,
Umu[nu], nu,
Gimpl::CovShiftBackward(
U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))),
Umu[mu], mu, Gimpl::CovShiftIdentityBackward(Umu[nu], nu))),
mu);

// __
@@ -333,8 +332,8 @@ public:
//

staple += Gimpl::ShiftStaple(
Gimpl::CovShiftBackward(U[nu], nu,
Gimpl::CovShiftBackward(U[mu], mu, U[nu])), mu);
Gimpl::CovShiftBackward(Umu[nu], nu,
Gimpl::CovShiftBackward(Umu[mu], mu, Umu[nu])), mu);
}
}
}
@@ -738,6 +737,7 @@ public:
//cf https://arxiv.org/pdf/hep-lat/9701012.pdf Eq 6
//output is the charge by timeslice: sum over timeslices to obtain the total
static std::vector<Real> TimesliceTopologicalChargeMxN(const GaugeLorentz &U, int M, int N){
// Audit: 4D epsilon is hard coded
assert(Nd == 4);
std::vector<std::vector<GaugeMat*> > F(Nd,std::vector<GaugeMat*>(Nd,nullptr));
//Note F_numu = - F_munu
@@ -827,6 +827,25 @@ public:
return out;
}

//Compute the 5Li topological charge density
static std::vector<Real> TopologicalChargeDensity5Li(const GaugeLorentz &U){

static const int exts[5][2] = { {1,1}, {2,2}, {1,2}, {1,3}, {3,3} };
std::vector<std::vector<Real> > loops = TimesliceTopologicalCharge5LiContributions(U);

double c5=1./20.;
double c4=1./5.-2.*c5;
double c3=(-64.+640.*c5)/45.;
double c2=(1-64.*c5)/9.;
double c1=(19.-55.*c5)/9.;

int Lt = loops[0].size();
std::vector<Real> out(Lt,0.);
for(int t=0;t<Lt;t++)
out[t] += c1*loops[0][t] + c2*loops[1][t] + c3*loops[2][t] + c4*loops[3][t] + c5*loops[4][t];
return out;
}

static Real TopologicalCharge5Li(const GaugeLorentz &U){
std::vector<Real> Qt = TimesliceTopologicalCharge5Li(U);
Real Q = 0.;
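The coefficients above form the 5-loop-improved ("5Li") charge built from the (m,n) loop extents {1,1},{2,2},{1,2},{1,3},{3,3} declared in exts (cf. the hep-lat/9701012 reference cited earlier in this file):

Q_5Li(t) = c1*Q_{1,1}(t) + c2*Q_{2,2}(t) + c3*Q_{1,2}(t) + c4*Q_{1,3}(t) + c5*Q_{3,3}(t),
c5 = 1/20, c4 = 1/5 - 2*c5, c3 = (-64 + 640*c5)/45, c2 = (1 - 64*c5)/9, c1 = (19 - 55*c5)/9,

exactly as coded in the loop over timeslices.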
@@ -1453,7 +1472,7 @@ public:
//////////////////////////////////////////////////
static Real sumWilsonLoop(const GaugeLorentz &Umu,
const int R1, const int R2) {
std::vector<GaugeMat> U(4, Umu.Grid());
std::vector<GaugeMat> U(Nd, Umu.Grid());

for (int mu = 0; mu < Umu.Grid()->_ndimension; mu++) {
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
@@ -1472,7 +1491,7 @@ public:
//////////////////////////////////////////////////
static Real sumTimelikeWilsonLoop(const GaugeLorentz &Umu,
const int R1, const int R2) {
std::vector<GaugeMat> U(4, Umu.Grid());
std::vector<GaugeMat> U(Nd, Umu.Grid());

for (int mu = 0; mu < Umu.Grid()->_ndimension; mu++) {
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
@@ -1490,8 +1509,8 @@ public:
// sum over all x,y,z,t and over all planes of spatial Wilson loop
//////////////////////////////////////////////////
static Real sumSpatialWilsonLoop(const GaugeLorentz &Umu,
const int R1, const int R2) {
std::vector<GaugeMat> U(4, Umu.Grid());
const int R1, const int R2) {
std::vector<GaugeMat> U(Nd, Umu.Grid());

for (int mu = 0; mu < Umu.Grid()->_ndimension; mu++) {
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
@@ -363,12 +363,16 @@ public:
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
|
||||
{
|
||||
// std::cout << "Communicate Begin "<<std::endl;
|
||||
// _grid->Barrier();
|
||||
FlightRecorder::StepLog("Communicate begin");
|
||||
// All GPU kernel tasks must complete
|
||||
// accelerator_barrier(); // All kernels should ALREADY be complete
|
||||
// _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
|
||||
// But the HaloGather had a barrier too.
|
||||
for(int i=0;i<Packets.size();i++){
|
||||
// std::cout << "Communicate prepare "<<i<<std::endl;
|
||||
// _grid->Barrier();
|
||||
_grid->StencilSendToRecvFromPrepare(MpiReqs,
|
||||
Packets[i].send_buf,
|
||||
Packets[i].to_rank,Packets[i].do_send,
|
||||
@@ -376,8 +380,15 @@ public:
|
||||
Packets[i].from_rank,Packets[i].do_recv,
|
||||
Packets[i].xbytes,Packets[i].rbytes,i);
|
||||
}
|
||||
// std::cout << "Communicate PollDtoH "<<std::endl;
|
||||
// _grid->Barrier();
|
||||
_grid->StencilSendToRecvFromPollDtoH (MpiReqs); /* Starts MPI*/
|
||||
// std::cout << "Communicate CopySynch "<<std::endl;
|
||||
// _grid->Barrier();
|
||||
acceleratorCopySynchronise();
|
||||
// Starts intranode
|
||||
for(int i=0;i<Packets.size();i++){
|
||||
// std::cout << "Communicate Begin "<<i<<std::endl;
|
||||
_grid->StencilSendToRecvFromBegin(MpiReqs,
|
||||
Packets[i].send_buf,
|
||||
Packets[i].to_rank,Packets[i].do_send,
|
||||
@@ -385,6 +396,7 @@ public:
|
||||
Packets[i].from_rank,Packets[i].do_recv,
|
||||
Packets[i].xbytes,Packets[i].rbytes,i);
|
||||
}
|
||||
FlightRecorder::StepLog("Communicate begin has finished");
|
||||
// Get comms started then run checksums
|
||||
// Having this PRIOR to the dslash seems to make Sunspot work... (!)
|
||||
for(int i=0;i<Packets.size();i++){
|
||||
@@ -395,7 +407,14 @@ public:
|
||||
|
||||
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
|
||||
{
|
||||
// std::cout << "Communicate Complete "<<std::endl;
|
||||
// _grid->Barrier();
|
||||
FlightRecorder::StepLog("Start communicate complete");
|
||||
// std::cout << "Communicate Complete PollIRecv "<<std::endl;
|
||||
// _grid->Barrier();
|
||||
_grid->StencilSendToRecvFromPollIRecv(MpiReqs);
|
||||
// std::cout << "Communicate Complete Complete "<<std::endl;
|
||||
// _grid->Barrier();
|
||||
_grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
|
||||
if ( this->partialDirichlet ) DslashLogPartial();
|
||||
else if ( this->fullDirichlet ) DslashLogDirichlet();
|
||||
@@ -428,6 +447,7 @@ public:
|
||||
Communicate();
|
||||
CommsMergeSHM(compress);
|
||||
CommsMerge(compress);
|
||||
accelerator_barrier();
|
||||
}
|
||||
|
||||
template<class compressor> int HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
|
||||
@@ -483,6 +503,9 @@ public:
|
||||
void HaloGather(const Lattice<vobj> &source,compressor &compress)
|
||||
{
|
||||
// accelerator_barrier();
|
||||
//////////////////////////////////
|
||||
// I will overwrite my send buffers
|
||||
//////////////////////////////////
|
||||
_grid->StencilBarrier();// Synch shared memory on a single nodes
|
||||
|
||||
assert(source.Grid()==_grid);
|
||||
@@ -496,7 +519,11 @@ public:
|
||||
HaloGatherDir(source,compress,point,face_idx);
|
||||
}
|
||||
accelerator_barrier(); // All my local gathers are complete
|
||||
// _grid->StencilBarrier();// Synch shared memory on a single nodes
|
||||
#ifdef NVLINK_GET
|
||||
_grid->StencilBarrier(); // He can now get mu local gather, I can get his
|
||||
// Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check
|
||||
// Or issue barrier AFTER the DMA is running
|
||||
#endif
|
||||
face_table_computed=1;
|
||||
assert(u_comm_offset==_unified_buffer_size);
|
||||
}
|
||||
@@ -535,6 +562,7 @@ public:
|
||||
coalescedWrite(to[j] ,coalescedRead(from [j]));
|
||||
});
|
||||
acceleratorFenceComputeStream();
|
||||
// Also fenced in WilsonKernels
|
||||
}
|
||||
}
|
||||
|
||||
@@ -663,7 +691,7 @@ public:
|
||||
}
|
||||
}
|
||||
}
|
||||
std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
|
||||
// std::cout << "BuildSurfaceList size is "<<surface_list_size<<std::endl;
|
||||
surface_list.resize(surface_list_size);
|
||||
std::vector<int> surface_list_host(surface_list_size);
|
||||
int32_t ss=0;
|
||||
@@ -683,6 +711,7 @@ public:
|
||||
}
|
||||
}
|
||||
acceleratorCopyToDevice(&surface_list_host[0],&surface_list[0],surface_list_size*sizeof(int));
|
||||
// std::cout << GridLogMessage<<"BuildSurfaceList size is "<<surface_list_size<<std::endl;
|
||||
}
|
||||
/// Introduce a block structure and switch off comms on boundaries
|
||||
void DirichletBlock(const Coordinate &dirichlet_block)
|
||||
@@ -774,8 +803,8 @@ public:
|
||||
this->_entries_host_p = &_entries[0];
|
||||
this->_entries_p = &_entries_device[0];
|
||||
|
||||
std::cout << GridLogMessage << " Stencil object allocated for "<<std::dec<<this->_osites
|
||||
<<" sites table "<<std::hex<<this->_entries_p<< " GridPtr "<<_grid<<std::dec<<std::endl;
|
||||
// std::cout << GridLogMessage << " Stencil object allocated for "<<std::dec<<this->_osites
|
||||
// <<" sites table "<<std::hex<<this->_entries_p<< " GridPtr "<<_grid<<std::dec<<std::endl;
|
||||
|
||||
for(int ii=0;ii<npoints;ii++){
|
||||
|
||||
|
@@ -242,19 +242,33 @@ inline void *acceleratorAllocDevice(size_t bytes)
|
||||
return ptr;
|
||||
};
|
||||
|
||||
typedef int acceleratorEvent_t;
|
||||
|
||||
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
|
||||
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
|
||||
inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);};
|
||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
|
||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
|
||||
inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
|
||||
inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
|
||||
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
|
||||
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
|
||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
|
||||
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
|
||||
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
|
||||
acceleratorCopyToDevice(from,to,bytes);
|
||||
return 0;
|
||||
}
|
||||
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
|
||||
acceleratorCopyFromDevice(from,to,bytes);
|
||||
return 0;
|
||||
}
|
||||
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
|
||||
{
|
||||
cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
|
||||
return 0;
|
||||
}
|
||||
inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
|
||||
inline void acceleratorEventWait(acceleratorEvent_t ev)
|
||||
{
|
||||
//auto discard=cudaStreamSynchronize(ev);
|
||||
}
|
||||
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}
|
||||
|
||||
|
||||
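The asynchronous copy entry points now return an acceleratorEvent_t (a plain int stub on CUDA/HIP, a sycl::event on SYCL). A hedged sketch of the portable calling pattern, with host_buf, device_buf and bytes as placeholder names:

acceleratorEvent_t ev = acceleratorCopyToDeviceAsynch(host_buf, device_buf, bytes);
// ... overlap independent work here ...
acceleratorEventWait(ev);                 // no-op on CUDA/HIP, ev.wait() on SYCL
if ( acceleratorEventIsComplete(ev) ) { /* transfer finished */ }
acceleratorCopySynchronise();             // drains the dedicated copy stream/queue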
inline int acceleratorIsCommunicable(void *ptr)
|
||||
@@ -323,7 +337,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
|
||||
cgh.parallel_for( \
|
||||
sycl::nd_range<3>(global,local), \
|
||||
[=] (sycl::nd_item<3> item) /*mutable*/ \
|
||||
[[intel::reqd_sub_group_size(16)]] \
|
||||
[[sycl::reqd_sub_group_size(16)]] \
|
||||
{ \
|
||||
auto iter1 = item.get_global_id(0); \
|
||||
auto iter2 = item.get_global_id(1); \
|
||||
@@ -343,11 +357,28 @@ inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
|
||||
|
||||
inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); }
|
||||
|
||||
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes);}
|
||||
inline void acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); }
|
||||
inline void acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); }
|
||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
|
||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
|
||||
|
||||
///////
|
||||
// Asynch event interface
|
||||
///////
|
||||
typedef sycl::event acceleratorEvent_t;
|
||||
|
||||
inline void acceleratorEventWait(acceleratorEvent_t ev)
|
||||
{
|
||||
ev.wait();
|
||||
}
|
||||
|
||||
inline int acceleratorEventIsComplete(acceleratorEvent_t ev)
|
||||
{
|
||||
return (ev.get_info<sycl::info::event::command_execution_status>() == sycl::info::event_command_status::complete);
|
||||
}
|
||||
|
||||
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes);}
|
||||
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
|
||||
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
|
||||
|
||||
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
|
||||
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
|
||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}
|
||||
|
||||
inline int acceleratorIsCommunicable(void *ptr)
|
||||
@@ -358,8 +389,10 @@ inline int acceleratorIsCommunicable(void *ptr)
|
||||
else return 0;
|
||||
#endif
|
||||
return 1;
|
||||
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////
|
||||
@@ -459,7 +492,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
|
||||
inline void *acceleratorAllocHost(size_t bytes)
|
||||
{
|
||||
void *ptr=NULL;
|
||||
auto err = hipMallocHost((void **)&ptr,bytes);
|
||||
auto err = hipHostMalloc((void **)&ptr,bytes);
|
||||
if( err != hipSuccess ) {
|
||||
ptr = (void *) NULL;
|
||||
fprintf(stderr," hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
|
||||
@@ -492,23 +525,35 @@ inline void *acceleratorAllocDevice(size_t bytes)
|
||||
inline void acceleratorFreeHost(void *ptr){ auto discard=hipFree(ptr);};
|
||||
inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);};
|
||||
inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
|
||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
|
||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
|
||||
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
|
||||
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
|
||||
|
||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}
|
||||
|
||||
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
|
||||
typedef int acceleratorEvent_t;
|
||||
|
||||
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
|
||||
{
|
||||
auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream);
|
||||
return 0;
|
||||
}
|
||||
inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
|
||||
auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream);
|
||||
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
|
||||
acceleratorCopyToDevice(from,to,bytes);
|
||||
return 0;
|
||||
}
|
||||
inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
|
||||
auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream);
|
||||
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
|
||||
acceleratorCopyFromDevice(from,to,bytes);
|
||||
return 0;
|
||||
}
|
||||
inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); };
|
||||
|
||||
inline void acceleratorEventWait(acceleratorEvent_t ev)
|
||||
{
|
||||
// auto discard=hipStreamSynchronize(ev);
|
||||
}
|
||||
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
inline void acceleratorPin(void *ptr,unsigned long bytes)
|
||||
@@ -545,6 +590,8 @@ inline void acceleratorPin(void *ptr,unsigned long bytes)
|
||||
|
||||
#undef GRID_SIMT
|
||||
|
||||
typedef int acceleratorEvent_t;
|
||||
|
||||
inline void acceleratorMem(void)
|
||||
{
|
||||
/*
|
||||
@@ -565,8 +612,13 @@ inline void acceleratorMem(void)
|
||||
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
|
||||
|
||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); }
|
||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
|
||||
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes);}
|
||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); }
|
||||
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { acceleratorCopyToDevice(from,to,bytes); return 0; }
|
||||
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { acceleratorCopyFromDevice(from,to,bytes); return 0; }
|
||||
inline void acceleratorEventWait(acceleratorEvent_t ev){}
|
||||
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev); return 1;}
|
||||
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); return 0;}
|
||||
|
||||
inline void acceleratorCopySynchronise(void) {};
|
||||
|
||||
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
|
||||
@@ -655,9 +707,9 @@ inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
|
||||
acceleratorCopySynchronise();
|
||||
}
|
||||
|
||||
template<class T> void acceleratorPut(T& dev,T&host)
|
||||
template<class T> void acceleratorPut(T& dev,const T&host)
|
||||
{
|
||||
acceleratorCopyToDevice(&host,&dev,sizeof(T));
|
||||
acceleratorCopyToDevice((void *)&host,&dev,sizeof(T));
|
||||
}
|
||||
template<class T> T acceleratorGet(T& dev)
|
||||
{
|
||||
|
@@ -73,9 +73,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
#define thread_critical DO_PRAGMA(omp critical)
|
||||
|
||||
#ifdef GRID_OMP
|
||||
inline void thread_bcopy(void *from, void *to,size_t bytes)
|
||||
inline void thread_bcopy(const void *from, void *to,size_t bytes)
|
||||
{
|
||||
uint64_t *ufrom = (uint64_t *)from;
|
||||
const uint64_t *ufrom = (const uint64_t *)from;
|
||||
uint64_t *uto = (uint64_t *)to;
|
||||
assert(bytes%8==0);
|
||||
uint64_t words=bytes/8;
|
||||
@@ -84,7 +84,7 @@ inline void thread_bcopy(void *from, void *to,size_t bytes)
|
||||
});
|
||||
}
|
||||
#else
|
||||
inline void thread_bcopy(void *from, void *to,size_t bytes)
|
||||
inline void thread_bcopy(const void *from, void *to,size_t bytes)
|
||||
{
|
||||
bcopy(from,to,bytes);
|
||||
}
|
||||
|
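With the const-qualified source pointer, thread_bcopy now accepts read-only buffers directly. A tiny illustrative sketch (vector names are placeholders); note the byte count must be a multiple of 8, as asserted above:

const std::vector<double> src(1024, 1.0);
std::vector<double> dst(1024, 0.0);
thread_bcopy(src.data(), dst.data(), src.size()*sizeof(double)); // 8192 bytes, copied as 64-bit words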
@@ -187,9 +187,10 @@ void GridParseLayout(char **argv,int argc,
|
||||
Coordinate &latt_c,
|
||||
Coordinate &mpi_c)
|
||||
{
|
||||
auto mpi =std::vector<int>({1,1,1,1});
|
||||
auto latt=std::vector<int>({8,8,8,8});
|
||||
|
||||
auto mpi =std::vector<int>(Nd,1);
|
||||
auto latt=std::vector<int>(Nd,8);
|
||||
std::cout << "Default mpi "<<mpi<<std::endl;
|
||||
std::cout << "Default latt"<<latt<<std::endl;
|
||||
GridThread::SetMaxThreads();
|
||||
|
||||
std::string arg;
|
||||
@@ -228,6 +229,9 @@ void GridParseLayout(char **argv,int argc,
|
||||
}
|
||||
// Copy back into coordinate format
|
||||
int nd = mpi.size();
|
||||
std::cout << "mpi.size() "<<nd<<std::endl;
|
||||
std::cout << "latt.size() "<<latt.size()<<std::endl;
|
||||
std::cout << "Nd "<<Nd<<std::endl;
|
||||
assert(latt.size()==nd);
|
||||
latt_c.resize(nd);
|
||||
mpi_c.resize(nd);
|
||||
@@ -509,7 +513,14 @@ void Grid_init(int *argc,char ***argv)
|
||||
Grid_default_latt,
|
||||
Grid_default_mpi);
|
||||
|
||||
|
||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--flightrecorder") ){
|
||||
std::cout << GridLogMessage <<" Enabling flight recorder " <<std::endl;
|
||||
FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
|
||||
FlightRecorder::PrintEntireLog = 1;
|
||||
FlightRecorder::ChecksumComms = 1;
|
||||
FlightRecorder::ChecksumCommsSend=1;
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
|
||||
std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
|
||||
std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
|
||||
@@ -631,12 +642,11 @@ void Grid_debug_handler_init(void)
|
||||
sa.sa_flags = SA_SIGINFO;
|
||||
// sigaction(SIGSEGV,&sa,NULL);
|
||||
sigaction(SIGTRAP,&sa,NULL);
|
||||
sigaction(SIGBUS,&sa,NULL);
|
||||
// sigaction(SIGBUS,&sa,NULL);
|
||||
// sigaction(SIGUSR2,&sa,NULL);
|
||||
|
||||
feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
|
||||
|
||||
sigaction(SIGFPE,&sa,NULL);
|
||||
// feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
|
||||
// sigaction(SIGFPE,&sa,NULL);
|
||||
sigaction(SIGKILL,&sa,NULL);
|
||||
sigaction(SIGILL,&sa,NULL);
|
||||
|
||||
@@ -651,3 +661,4 @@ void Grid_debug_handler_init(void)
|
||||
}
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
|
@@ -50,7 +50,7 @@ namespace Grid{
|
||||
int64_t index64;
|
||||
IndexFromCoorReversed(coor,index64,dims);
|
||||
if ( index64>=2*1024*1024*1024LL ){
|
||||
std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
|
||||
// std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
|
||||
}
|
||||
assert(index64<2*1024*1024*1024LL);
|
||||
index = (int) index64;
|
||||
|
@@ -66,6 +66,7 @@ namespace Grid{
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
template <class T> void writeFile(T& in, std::string const fname){
|
||||
#ifdef HAVE_LIME
|
||||
// Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
|
||||
@@ -73,7 +74,7 @@ template <class T> void writeFile(T& in, std::string const fname){
|
||||
Grid::emptyUserRecord record;
|
||||
Grid::ScidacWriter WR(in.Grid()->IsBoss());
|
||||
WR.open(fname);
|
||||
WR.writeScidacFieldRecord(in,record,0);
|
||||
WR.writeScidacFieldRecord(in,record,0); // Lexico
|
||||
WR.close();
|
||||
#endif
|
||||
// What is the appropriate way to throw error?
|
||||
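writeFile wraps a single-record ScidacWriter dump (active only when HAVE_LIME is defined). A hedged usage sketch, where grid is a placeholder pointer to whatever Grid the field lives on:

LatticeComplexD field(grid);          // any Lattice type with a Grid() works
writeFile(field, "field.scidac");     // writes one SciDAC field record, lexicographic order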
@@ -107,8 +108,18 @@ int main(int argc, char **argv) {
|
||||
|
||||
for (int conf = CPar.StartConfiguration; conf <= CPar.EndConfiguration; conf+= CPar.Skip){
|
||||
|
||||
#if 0
|
||||
CPNersc.CheckpointRestore(conf, Umu, sRNG, pRNG);
|
||||
#else
|
||||
// Don't require Grid format RNGs
|
||||
FieldMetaData header;
|
||||
std::string file, filesmr;
|
||||
file = CPar.conf_path + "/" + CPar.conf_prefix + "." + std::to_string(conf);
|
||||
filesmr = CPar.conf_path + "/" + CPar.conf_smr_prefix + "." + std::to_string(conf);
|
||||
|
||||
NerscIO::readConfiguration(Umu,header,file);
|
||||
#endif
|
||||
|
||||
std::cout << std::setprecision(15);
|
||||
std::cout << GridLogMessage << "Initial plaquette: "<< WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu) << std::endl;
|
||||
|
||||
@@ -116,6 +127,7 @@ int main(int argc, char **argv) {
|
||||
std::string file_post = CPar.conf_prefix + "." + std::to_string(conf);
|
||||
|
||||
WilsonFlow<PeriodicGimplR> WF(WFPar.step_size,WFPar.steps,WFPar.meas_interval);
|
||||
|
||||
WF.addMeasurement(WFPar.meas_interval_density, [&file_pre,&file_post,&conf](int step, RealD t, const typename PeriodicGimplR::GaugeField &U){
|
||||
|
||||
typedef typename PeriodicGimplR::GaugeLinkField GaugeMat;
|
||||
@@ -165,33 +177,48 @@ int main(int argc, char **argv) {
|
||||
//double coeff = 2.0 / (1.0 * Nd * (Nd - 1)) / 3.0;
|
||||
//Plq = coeff * Plq;
|
||||
|
||||
int tau = std::round(t);
|
||||
std::string efile = file_pre + "E_dnsty_" + std::to_string(tau) + "_" + file_post;
|
||||
writeFile(R,efile);
|
||||
std::string tfile = file_pre + "Top_dnsty_" + std::to_string(tau) + "_" + file_post;
|
||||
writeFile(qfield,tfile);
|
||||
|
||||
RealD WFlow_TC5Li = WilsonLoops<PeriodicGimplR>::TopologicalCharge5Li(U);
|
||||
|
||||
int tau = std::round(t);
|
||||
|
||||
std::string efile = file_pre + "E_dnsty_" + std::to_string(tau) + "_" + file_post;
|
||||
// writeFile(R,efile);
|
||||
|
||||
std::string tfile = file_pre + "Top_dnsty_" + std::to_string(tau) + "_" + file_post;
|
||||
// writeFile(qfield,tfile);
|
||||
|
||||
std::string ufile = file_pre + "U_" + std::to_string(tau) + "_" + file_post;
|
||||
{
|
||||
// PeriodicGimplR::GaugeField Ucopy = U;
|
||||
// NerscIO::writeConfiguration(Ucopy,ufile);
|
||||
}
|
||||
|
||||
RealD E = real(sum(R))/ RealD(U.Grid()->gSites());
|
||||
RealD T = real( sum(qfield) );
|
||||
Coordinate scoor; for (int mu=0; mu < Nd; mu++) scoor[mu] = 0;
|
||||
RealD E0 = real(peekSite(R,scoor));
|
||||
RealD T0 = real(peekSite(qfield,scoor));
|
||||
std::cout << GridLogMessage << "[WilsonFlow] Saved energy density (clover) & topo. charge density: " << conf << " " << step << " " << tau << " "
|
||||
<< "(E_avg,T_sum) " << E << " " << T << " (E, T at origin) " << E0 << " " << T0 << std::endl;
|
||||
<< "(E_avg,T_sum) " << E << " " << T << " (E, T at origin) " << E0 << " " << T0 << " Q5Li "<< WFlow_TC5Li << std::endl;
|
||||
|
||||
});
|
||||
|
||||
int t=WFPar.maxTau;
|
||||
WF.smear(Uflow, Umu);
|
||||
|
||||
// NerscIO::writeConfiguration(Uflow,filesmr);
|
||||
|
||||
|
||||
RealD WFlow_plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(Uflow);
|
||||
RealD WFlow_TC = WilsonLoops<PeriodicGimplR>::TopologicalCharge(Uflow);
|
||||
RealD WFlow_TC5Li = WilsonLoops<PeriodicGimplR>::TopologicalCharge5Li(Uflow);
|
||||
RealD WFlow_T0 = WF.energyDensityPlaquette(t,Uflow); // t
|
||||
RealD WFlow_EC = WF.energyDensityCloverleaf(t,Uflow);
|
||||
std::cout << GridLogMessage << "Plaquette "<< conf << " " << WFlow_plaq << std::endl;
|
||||
std::cout << GridLogMessage << "T0 "<< conf << " " << WFlow_T0 << std::endl;
|
||||
std::cout << GridLogMessage << "TC0 "<< conf << " " << WFlow_EC << std::endl;
|
||||
std::cout << GridLogMessage << "TopologicalCharge "<< conf << " " << WFlow_TC << std::endl;
|
||||
std::cout << GridLogMessage << "Plaquette "<< conf << " " << WFlow_plaq << std::endl;
|
||||
std::cout << GridLogMessage << "T0 "<< conf << " " << WFlow_T0 << std::endl;
|
||||
std::cout << GridLogMessage << "TC0 "<< conf << " " << WFlow_EC << std::endl;
|
||||
std::cout << GridLogMessage << "TopologicalCharge "<< conf << " " << WFlow_TC << std::endl;
|
||||
std::cout << GridLogMessage << "TopologicalCharge5Li "<< conf << " " << WFlow_TC5Li<< std::endl;
|
||||
|
||||
std::cout<< GridLogMessage << " Admissibility check:\n";
|
||||
const double sp_adm = 0.067; // admissible threshold
|
||||
|
@@ -25,13 +25,20 @@ directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/Grid.h>
|
||||
|
||||
#if Nc == 3
|
||||
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
|
||||
#include <Grid/qcd/smearing/JacobianAction.h>
|
||||
#endif
|
||||
|
||||
using namespace Grid;
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
#if Nc != 3
|
||||
#warning FTHMC2p1f will not work for Nc != 3
|
||||
std::cout << "This program will currently only work for Nc == 3." << std::endl;
|
||||
#else
|
||||
std::cout << std::setprecision(12);
|
||||
|
||||
Grid_init(&argc, &argv);
|
||||
@@ -220,7 +227,6 @@ int main(int argc, char **argv)
|
||||
TheHMC.Run(SmearingPolicy); // for smearing
|
||||
|
||||
Grid_finalize();
|
||||
#endif
|
||||
} // main
|
||||
|
||||
|
||||
|
||||
|
@@ -24,14 +24,22 @@ See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#include <Grid/Grid.h>
|
||||
|
||||
#if Nc == 3
|
||||
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
|
||||
#include <Grid/qcd/smearing/JacobianAction.h>
|
||||
#endif
|
||||
|
||||
using namespace Grid;
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
#if Nc != 3
|
||||
#warning FTHMC2p1f_3GeV will not work for Nc != 3
|
||||
std::cout << "This program will currently only work for Nc == 3." << std::endl;
|
||||
#else
|
||||
std::cout << std::setprecision(12);
|
||||
|
||||
Grid_init(&argc, &argv);
|
||||
@@ -220,6 +228,7 @@ int main(int argc, char **argv)
|
||||
TheHMC.Run(SmearingPolicy); // for smearing
|
||||
|
||||
Grid_finalize();
|
||||
#endif
|
||||
} // main
|
||||
|
||||
|
||||
|
@@ -25,13 +25,20 @@ directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/Grid.h>
|
||||
|
||||
#if Nc == 3
|
||||
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
|
||||
#include <Grid/qcd/smearing/JacobianAction.h>
|
||||
#endif
|
||||
|
||||
using namespace Grid;
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
#if Nc != 3
|
||||
#warning HMC2p1f_3GeV will not work for Nc != 3
|
||||
std::cout << "This program will currently only work for Nc == 3." << std::endl;
|
||||
#else
|
||||
std::cout << std::setprecision(12);
|
||||
|
||||
Grid_init(&argc, &argv);
|
||||
@@ -220,6 +227,7 @@ int main(int argc, char **argv)
|
||||
TheHMC.Run(SmearingPolicy); // for smearing
|
||||
|
||||
Grid_finalize();
|
||||
#endif
|
||||
} // main
|
||||
|
||||
|
||||
|
5
TODO
@@ -1,3 +1,8 @@

* Clean up the extract merge and replace with insertLane/extractLane

-----

i) Refine subspace with HDCG & recompute
ii) Block Lanczos in coarse space
iii) Batched block project in the operator computation
@@ -52,7 +52,7 @@ int main (int argc, char ** argv)
|
||||
|
||||
int threads = GridThread::GetThreads();
|
||||
|
||||
int Ls=8;
|
||||
int Ls=16;
|
||||
for(int i=0;i<argc;i++) {
|
||||
if(std::string(argv[i]) == "-Ls"){
|
||||
std::stringstream ss(argv[i+1]); ss >> Ls;
|
||||
|
@@ -175,8 +175,8 @@ public:
|
||||
timestat.statistics(t_time);
|
||||
|
||||
dbytes=dbytes*ppn;
|
||||
double xbytes = dbytes*0.5;
|
||||
double bidibytes = dbytes;
|
||||
double xbytes = dbytes;
|
||||
double bidibytes = dbytes*2.0;
|
||||
|
||||
std::cout<<GridLogMessage << lat<<"\t"<<Ls<<"\t "
|
||||
<< bytes << " \t "
|
||||
@@ -492,17 +492,18 @@ public:
|
||||
}
|
||||
FGrid->Barrier();
|
||||
double t1=usecond();
|
||||
uint64_t ncall = 500;
|
||||
|
||||
FGrid->Broadcast(0,&ncall,sizeof(ncall));
|
||||
uint64_t no = 50;
|
||||
uint64_t ni = 100;
|
||||
|
||||
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
|
||||
|
||||
time_statistics timestat;
|
||||
std::vector<double> t_time(ncall);
|
||||
for(uint64_t i=0;i<ncall;i++){
|
||||
std::vector<double> t_time(no);
|
||||
for(uint64_t i=0;i<no;i++){
|
||||
t0=usecond();
|
||||
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||
for(uint64_t j=0;j<ni;j++){
|
||||
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||
}
|
||||
t1=usecond();
|
||||
t_time[i] = t1-t0;
|
||||
}
|
||||
@@ -520,11 +521,11 @@ public:
|
||||
double mf_hi, mf_lo, mf_err;
|
||||
|
||||
timestat.statistics(t_time);
|
||||
mf_hi = flops/timestat.min;
|
||||
mf_lo = flops/timestat.max;
|
||||
mf_hi = flops/timestat.min*ni;
|
||||
mf_lo = flops/timestat.max*ni;
|
||||
mf_err= flops/timestat.min * timestat.err/timestat.mean;
|
||||
|
||||
mflops = flops/timestat.mean;
|
||||
mflops = flops/timestat.mean*ni;
|
||||
mflops_all.push_back(mflops);
|
||||
if ( mflops_best == 0 ) mflops_best = mflops;
|
||||
if ( mflops_worst== 0 ) mflops_worst= mflops;
|
||||
@@ -535,6 +536,7 @@ public:
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call "<< timestat.mean/ni<<std::endl;
|
||||
|
||||
}
|
||||
|
||||
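A note on the reworked timing: each of the no = 50 timed samples now wraps ni = 100 Dhop calls, so t_time[i] covers 100 applications; the per-call time is therefore timestat.mean/ni and the rate is flops/timestat.mean*ni. Worked example (assuming, as in this file, flops counts one call and times are in microseconds): a sample of 20,000 us for its 100 calls gives 200 us per call, and with flops = 1.0e9 the report is 1.0e9/20000*100 = 5.0e6 Mflop/s, i.e. 5 Tflop/s.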
@@ -654,17 +656,19 @@ public:
|
||||
}
|
||||
FGrid->Barrier();
|
||||
double t1=usecond();
|
||||
uint64_t ncall = 500;
|
||||
|
||||
FGrid->Broadcast(0,&ncall,sizeof(ncall));
|
||||
uint64_t no = 50;
|
||||
uint64_t ni = 100;
|
||||
|
||||
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
|
||||
|
||||
time_statistics timestat;
|
||||
std::vector<double> t_time(ncall);
|
||||
for(uint64_t i=0;i<ncall;i++){
|
||||
std::vector<double> t_time(no);
|
||||
for(uint64_t i=0;i<no;i++){
|
||||
t0=usecond();
|
||||
Ds.DhopEO(src_o,r_e,DaggerNo);
|
||||
for(uint64_t j=0;j<ni;j++){
|
||||
Ds.DhopEO(src_o,r_e,DaggerNo);
|
||||
}
|
||||
t1=usecond();
|
||||
t_time[i] = t1-t0;
|
||||
}
|
||||
@@ -675,11 +679,11 @@ public:
|
||||
double mf_hi, mf_lo, mf_err;
|
||||
|
||||
timestat.statistics(t_time);
|
||||
mf_hi = flops/timestat.min;
|
||||
mf_lo = flops/timestat.max;
|
||||
mf_hi = flops/timestat.min*ni;
|
||||
mf_lo = flops/timestat.max*ni;
|
||||
mf_err= flops/timestat.min * timestat.err/timestat.mean;
|
||||
|
||||
mflops = flops/timestat.mean;
|
||||
mflops = flops/timestat.mean*ni;
|
||||
mflops_all.push_back(mflops);
|
||||
if ( mflops_best == 0 ) mflops_best = mflops;
|
||||
if ( mflops_worst== 0 ) mflops_worst= mflops;
|
||||
@@ -689,6 +693,7 @@ public:
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call "<< timestat.mean/ni<<std::endl;
|
||||
|
||||
}
|
||||
|
||||
@@ -792,19 +797,18 @@ public:
|
||||
Dc.M(src,r);
|
||||
}
|
||||
FGrid->Barrier();
|
||||
double t1=usecond();
|
||||
uint64_t ncall = 500;
|
||||
|
||||
FGrid->Broadcast(0,&ncall,sizeof(ncall));
|
||||
uint64_t ni = 100;
|
||||
uint64_t no = 50;
|
||||
|
||||
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
|
||||
|
||||
time_statistics timestat;
|
||||
std::vector<double> t_time(ncall);
|
||||
for(uint64_t i=0;i<ncall;i++){
|
||||
t0=usecond();
|
||||
Dc.M(src,r);
|
||||
t1=usecond();
|
||||
std::vector<double> t_time(no);
|
||||
for(uint64_t i=0;i<no;i++){
|
||||
double t0=usecond();
|
||||
for(uint64_t j=0;j<ni;j++){
|
||||
Dc.M(src,r);
|
||||
}
|
||||
double t1=usecond();
|
||||
t_time[i] = t1-t0;
|
||||
}
|
||||
FGrid->Barrier();
|
||||
@@ -814,20 +818,21 @@ public:
|
||||
double mf_hi, mf_lo, mf_err;
|
||||
|
||||
timestat.statistics(t_time);
|
||||
mf_hi = flops/timestat.min;
|
||||
mf_lo = flops/timestat.max;
|
||||
mf_hi = flops/timestat.min*ni;
|
||||
mf_lo = flops/timestat.max*ni;
|
||||
mf_err= flops/timestat.min * timestat.err/timestat.mean;
|
||||
|
||||
mflops = flops/timestat.mean;
|
||||
mflops = flops/timestat.mean*ni;
|
||||
mflops_all.push_back(mflops);
|
||||
if ( mflops_best == 0 ) mflops_best = mflops;
|
||||
if ( mflops_worst== 0 ) mflops_worst= mflops;
|
||||
if ( mflops>mflops_best ) mflops_best = mflops;
|
||||
if ( mflops<mflops_worst) mflops_worst= mflops;
|
||||
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<" "<<timestat.mean<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per rank "<< mflops/NP<<std::endl;
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per node "<< mflops/NN<<std::endl;
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov us per call "<< timestat.mean/ni<<std::endl;
|
||||
|
||||
}
|
||||
|
||||
@@ -872,7 +877,7 @@ int main (int argc, char ** argv)
|
||||
int do_dslash=1;
|
||||
|
||||
int sel=4;
|
||||
std::vector<int> L_list({8,12,16,24});
|
||||
std::vector<int> L_list({8,12,16,24,32});
|
||||
int selm1=sel-1;
|
||||
|
||||
std::vector<double> clover;
|
||||
|
20
configure.ac
@@ -151,7 +151,7 @@ AC_ARG_ENABLE([tracing],
|
||||
case ${ac_TRACING} in
|
||||
nvtx)
|
||||
AC_DEFINE([GRID_TRACING_NVTX],[1],[use NVTX])
|
||||
LIBS="${LIBS} -lnvToolsExt64_1"
|
||||
LIBS="${LIBS} -lnvToolsExt"
|
||||
;;
|
||||
roctx)
|
||||
AC_DEFINE([GRID_TRACING_ROCTX],[1],[use ROCTX])
|
||||
@@ -198,6 +198,8 @@ AC_ARG_ENABLE([Nc],
|
||||
[ac_Nc=${enable_Nc}], [ac_Nc=3])
|
||||
|
||||
case ${ac_Nc} in
|
||||
1)
|
||||
AC_DEFINE([Config_Nc],[1],[Gauge group Nc]);;
|
||||
2)
|
||||
AC_DEFINE([Config_Nc],[2],[Gauge group Nc]);;
|
||||
3)
|
||||
@@ -211,6 +213,21 @@ case ${ac_Nc} in
|
||||
*)
|
||||
AC_MSG_ERROR(["Unsupport gauge group choice Nc = ${ac_Nc}"]);;
|
||||
esac
|
||||
############### Nd
|
||||
AC_ARG_ENABLE([Nd],
|
||||
[AS_HELP_STRING([--enable-Nd=2|3|4],[enable default LGT dimension])],
|
||||
[ac_Nd=${enable_Nd}], [ac_Nd=4])
|
||||
|
||||
case ${ac_Nd} in
|
||||
2)
|
||||
AC_DEFINE([Config_Nd],[2],[Gauge field dimension Nd]);;
|
||||
3)
|
||||
AC_DEFINE([Config_Nd],[3],[Gauge field dimension Nd]);;
|
||||
4)
|
||||
AC_DEFINE([Config_Nd],[4],[Gauge field dimension Nd]);;
|
||||
*)
|
||||
AC_MSG_ERROR(["Unsupport dimension Nd = ${ac_Nd}"]);;
|
||||
esac
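As an illustrative invocation (not taken from this changeset), the new switch combines with the existing Nc one as ../configure --enable-Nd=3 --enable-Nc=2 plus the usual options; any other value falls through to the AC_MSG_ERROR branch above.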
|
||||
|
||||
############### Symplectic group
|
||||
AC_ARG_ENABLE([Sp],
|
||||
@@ -818,6 +835,7 @@ os (target) : $target_os
|
||||
compiler vendor : ${ax_cv_cxx_compiler_vendor}
|
||||
compiler version : ${ax_cv_gxx_version}
|
||||
----- BUILD OPTIONS -----------------------------------
|
||||
Nd : ${ac_Nd}
|
||||
Nc : ${ac_Nc}
|
||||
SIMD : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG}
|
||||
Threading : ${ac_openmp}
|
||||
|
@@ -93,10 +93,13 @@ int main(int argc, char ** argv)
Real coeff = (width*width) / Real(4*Iterations);

chi=kronecker;

// chi = (1-p^2/2N)^N kronecker
for(int n = 0; n < Iterations; ++n) {
Laplacian.M(chi,psi);
chi = chi - coeff*psi;
RealD n2 = norm2(chi);
chi = chi * (1.0/std::sqrt(n2));
}

std::cout << " Wuppertal smeared operator is chi = \n" << chi <<std::endl;
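Each pass above applies chi <- (1 - coeff*M) chi with coeff = width*width/(4*Iterations) (plus a re-normalisation), so after N = Iterations passes chi approximates exp(-(width^2/4) M) acting on the point source; this is the Gaussian ("Wuppertal") smearing limit that the (1-p^2/2N)^N comment refers to, up to the sign/normalisation convention of the Laplacian M.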
74
systems/Aurora/benchmarks/bench16.pbs
Normal file
@@ -0,0 +1,74 @@
#!/bin/bash

##PBS -q LatticeQCD_aesp_CNDA
#PBS -q debug-scaling
##PBS -q prod
#PBS -l select=16
#PBS -l walltime=00:20:00
#PBS -A LatticeQCD_aesp_CNDA

cd $PBS_O_WORKDIR

source ../sourceme.sh

cp $PBS_NODEFILE nodefile

export OMP_NUM_THREADS=4
export MPICH_OFI_NIC_POLICY=GPU

#export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16

#
# Local vol 16.16.16.32
#

LX=16
LY=16
LZ=16
LT=32

NX=2
NY=2
NZ=4
NT=1

GX=2
GY=2
GZ=1
GT=3

PX=$((NX * GX ))
PY=$((NY * GY ))
PZ=$((NZ * GZ ))
PT=$((NT * GT ))

VX=$((PX * LX ))
VY=$((PY * LY ))
VZ=$((PZ * LZ ))
VT=$((PT * LT ))

NP=$((PX*PY*PZ*PT))
VOL=${VX}.${VY}.${VZ}.${VT}
AT=8
MPI=${PX}.${PY}.${PZ}.${PT}

CMD="mpiexec -np $NP -ppn 12 -envall \
./gpu_tile.sh ./Benchmark_dwf_fp32 --mpi $MPI --grid $VOL \
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads $AT --comms-overlap "

echo VOL $VOL
echo MPI $MPI
echo NPROC $NP
echo $CMD
$CMD
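A quick arithmetic check of the decomposition in bench16.pbs above (numbers copied from the script; the snippet is illustrative only):

    # processor grid 4.4.4.3 -> 192 ranks, matching select=16 nodes at 12 ranks per node (-ppn 12)
    PX=$((2*2)); PY=$((2*2)); PZ=$((4*1)); PT=$((1*3))
    echo $((PX*PY*PZ*PT))   # 192
    echo $((16*12))         # 192
    # global volume 64.64.64.96 = (4*16).(4*16).(4*16).(3*32) from the 16.16.16.32 local volume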
@@ -19,7 +19,7 @@ export ONEAPI_DEVICE_FILTER=gpu,level_zero

export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:3
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:4
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
@@ -30,8 +30,8 @@ echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_A

if [ $PALS_RANKID = "0" ]
then
numactl -p $NUMAP -N $NUMAP unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@"
# numactl -p $NUMAP -N $NUMAP "$@"
# numactl -p $NUMAP -N $NUMAP unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@"
numactl -p $NUMAP -N $NUMAP "$@"
else
numactl -p $NUMAP -N $NUMAP "$@"
fi
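With the unitrace line commented out above, both branches of the rank-0 test now launch the plain binary. Re-enabling kernel/MPI/SYCL tracing on rank 0 only would presumably amount to restoring the original branch, roughly:

    if [ $PALS_RANKID = "0" ]
    then
      numactl -p $NUMAP -N $NUMAP unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@"
    else
      numactl -p $NUMAP -N $NUMAP "$@"
    fi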
@@ -1,18 +1,19 @@
#Ahead of time compile for PVC

export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib"
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/"
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc"
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/ -fPIC"

#JIT compile
#export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
#export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions "

../../configure \
../configure \
--enable-simd=GPU \
--enable-reduction=grid \
--enable-gen-simd-width=64 \
--enable-comms=mpi-auto \
--enable-debug \
--prefix $HOME/gpt-install \
--disable-gparity \
--disable-fermion-reps \
--with-lime=$CLIME \
22 systems/Frontier-rocm631/config-command Normal file
@@ -0,0 +1,22 @@
CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
../../configure --enable-comms=mpi-auto \
--with-lime=$CLIME \
--enable-unified=no \
--enable-shm=nvlink \
--enable-tracing=none \
--enable-accelerator=hip \
--enable-gen-simd-width=64 \
--disable-gparity \
--disable-fermion-reps \
--enable-simd=GPU \
--with-gmp=$OLCF_GMP_ROOT \
--with-fftw=$FFTW_DIR/.. \
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
--disable-fermion-reps \
CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas"

16 systems/Frontier-rocm631/sourceme631.sh Normal file
@@ -0,0 +1,16 @@

echo spack
. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh

#module load cce/15.0.1

module load rocm/6.3.1
module load cray-fftw
module load craype-accel-amd-gfx90a
export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH

#Ugly hacks to get down level software working on current system
#export LD_LIBRARY_PATH=/opt/cray/libfabric/1.20.1/lib64/:$LD_LIBRARY_PATH
#export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
#ln -s /opt/rocm-6.0.0/lib/libamdhip64.so.6 .
@@ -30,14 +30,10 @@ source ${root}/sourceme.sh

export OMP_NUM_THREADS=7
export MPICH_GPU_SUPPORT_ENABLED=1
export MPICH_SMP_SINGLE_COPY_MODE=XPMEM

for vol in 32.32.32.64
#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
#64.64.32.96
for vol in 64.64.32.64
do
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.ov.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.ov.$vol

srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.seq.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.seq.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol -Ls 16
done
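For the 64.64.32.64 global volume run with --mpi 2.2.2.2 above, the job uses 2*2*2*2 = 16 ranks and each rank holds a 32.32.16.32 local volume; a minimal check (illustrative only):

    # local extent per dimension = global extent / ranks in that dimension
    echo $((64/2)) $((64/2)) $((32/2)) $((64/2))   # 32 32 16 32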
@@ -3,20 +3,19 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
--with-lime=$CLIME \
--enable-unified=no \
--enable-shm=nvlink \
--enable-tracing=timer \
--enable-tracing=none \
--enable-accelerator=hip \
--enable-gen-simd-width=64 \
--disable-gparity \
--disable-fermion-reps \
--enable-simd=GPU \
--enable-accelerator-cshift \
--with-gmp=$OLCF_GMP_ROOT \
--with-fftw=$FFTW_DIR/.. \
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
--disable-fermion-reps \
CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas"
CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas"

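The CXXFLAGS change in this hunk fixes a shell-expansion slip: -I{$ROCM_PATH}/include/ leaves the braces in the path, whereas -I${ROCM_PATH}/include/ expands as intended. A short illustration (the path value is made up):

    ROCM_PATH=/opt/rocm           # hypothetical value
    echo -I{$ROCM_PATH}/include/  # -> -I{/opt/rocm}/include/   (braces survive; broken include path)
    echo -I${ROCM_PATH}/include/  # -> -I/opt/rocm/include/     (intended include path)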
@@ -1,12 +1,25 @@

echo spack
. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
spack load c-lime
module load emacs
module load PrgEnv-gnu
module load rocm
module load cray-mpich
module load gmp

module load cce/15.0.1
module load rocm/5.3.0
module load cray-fftw
module load craype-accel-amd-gfx90a

#Ugly hacks to get down level software working on current system
export LD_LIBRARY_PATH=/opt/cray/libfabric/1.20.1/lib64/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
ln -s /opt/rocm-6.0.0/lib/libamdhip64.so.6 .

#echo spack load c-lime
#spack load c-lime
#module load emacs
##module load PrgEnv-gnu
##module load cray-mpich
##module load cray-fftw
##module load craype-accel-amd-gfx90a
##export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
#Hack for lib
#export LD_LIBRARY_PATH=`pwd`:$LD_LIBRARY_PATH
##export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
Some files were not shown because too many files have changed in this diff.