mirror of https://github.com/paboyle/Grid.git synced 2025-06-20 16:56:55 +01:00

Compare commits


111 Commits

SHA1 Message Date
7aa06329d0 Update for new stencil compression options 2025-06-17 18:06:19 +02:00
9d6a38c44c Compressed comms options as Sloppy 2025-06-17 16:43:53 +02:00
6ec5cee368 Preparing for compressed comms 2025-06-17 16:38:10 +02:00
f2e9a68825 Simplify 2025-06-13 17:32:05 +02:00
d88750e6b6 Sloppy + non-sloppy 2025-06-13 16:42:01 +02:00
821358eda7 Remove partial Dirichlet. Favour introducing reduced-precision comms options 2025-06-13 05:08:45 +02:00
fce6e1f135 Kill core files for quota reasons 2025-06-13 05:08:15 +02:00
8f0bb3e676 remove partial dirichlet 2025-06-13 05:07:56 +02:00
262c70d967 Use sloppy comms options 2025-06-13 05:07:23 +02:00
da43ef7c2d Remove partial Dirichlet option. It's going nowhere 2025-06-13 05:05:15 +02:00
7b60ab5df1 Warning suppress 2025-06-13 05:04:55 +02:00
f6b961a64e Warning suppress 2025-06-13 05:04:47 +02:00
f1ed988aa3 Interface to reduced precision comms 2025-06-13 05:04:12 +02:00
eea51bb604 Suppress annoying warnings 2025-06-13 05:03:36 +02:00
9203126aa5 Scripts 2025-06-11 15:30:16 +02:00
f90ba4712a Update for Jupiter 2025-06-11 15:24:34 +02:00
3737a24096 Updated python output 2025-06-03 14:09:29 -04:00
d418f78352 Making running on Aurora more debuggable 2025-05-23 20:58:16 +00:00
25163998a0 Makes SYCL compiler happy 2025-05-23 20:57:11 +00:00
dc546aaa4b Updated config options for BNL cluster 2025-05-13 18:44:47 -04:00
5364d580c9 Output chirality, eigenvector density files and python source lego plot 2025-05-13 18:44:47 -04:00
2a9a6347e3 Do not require Grid format RNGs and also to the 5Li reporting 2025-05-13 18:44:47 -04:00
cfdb56f314 Run measurements at t=0 too 2025-05-13 18:44:46 -04:00
b517e88db3 Update README 2025-05-13 16:49:21 -04:00
bb317aba8d Lattice operator= for SYCL 2025-05-13 12:50:58 +00:00
644cc6647e JSON update 2025-05-13 12:50:58 +00:00
72397ce23b SYCL interface change 2025-05-13 12:50:58 +00:00
d60a80c098 Fixes and visualisation 2025-04-29 18:04:23 -04:00
bb8b6d9d73 Fix 2025-04-29 18:04:04 -04:00
677b4cc5b0 Make all tests compile 2025-04-24 20:33:26 -04:00
be565ffab6 update mac config command 2025-04-24 14:50:06 -04:00
df6120e5f6 CPU compile oops fix 2025-04-24 14:50:06 -04:00
21de6f7da8 Merge pull request #477 from lehner/feature/wilson-clover-5d
Feature/wilson clover 5d
2025-04-24 14:44:48 -04:00
dbe39f9ce0 Merge pull request #471 from edbennett/fix-wflow
Shave off rough edges in Wilson flow test
2025-04-24 14:40:31 -04:00
ab3de50d5e Merge pull request #473 from UCL-ARC/gauge_action_deriv
WilsonGaugeAction deriv
2025-04-24 14:39:10 -04:00
c545bd2139 Merge pull request #465 from edbennett/allow-nonsu3-compilation
guard against trying to compile SU3-specific code when Nc ≠ 3
2025-04-24 14:35:51 -04:00
6a1c64fbdd Merge pull request #470 from paboyle/specflow
Spectral flow, DWF/Mobius kernel measurement
2025-04-24 14:34:33 -04:00
b75809ed61 Update README 2025-04-24 14:27:22 -04:00
ecaf228e5c Update README 2025-04-24 14:25:32 -04:00
6d015ae8fc Visualisation tools 2025-04-24 13:47:34 -04:00
233150d93f Bug fix for non-accelerator-aware MPI, thanks Shuhei for finding it. 2025-04-24 11:40:46 -04:00
7af8c77a52 Normalise 2025-04-24 11:37:39 -04:00
a957e7bfa1 Adding DWF evec Chirality measurement 2025-04-22 22:17:51 +00:00
cee4c8ce8c Merge branch 'develop' of https://github.com/paboyle/Grid into specflow 2025-04-18 19:55:36 +00:00
96bf814d8c Add checkerboarding to 5D compact clover 2025-04-10 23:05:39 +02:00
7ddc422788 CompactWilsonClover5D 2025-04-10 23:05:29 +02:00
e652fc2825 Shared Memory test reenabled on every Grid object creation.
Const improvements in Accelerator.h
2025-04-07 11:51:40 -04:00
a49fa3f8d0 ROCM 6.3.1 appears to work 2025-04-07 11:50:59 -04:00
cd452a2f91 Slurm update 2025-04-04 18:40:20 -04:00
4f89f603ae Changes to add back shared memory test on GPU 2025-04-04 18:40:15 -04:00
11dc2c5e1d PVdagM initialise 2025-04-04 18:35:06 -04:00
6fec3c15ca Cleaner printing 2025-04-04 18:35:06 -04:00
938c47480f Updated compile on Frontier.
Unsatisfactory hacks
2025-04-04 18:35:06 -04:00
3811d19298 Fence 2025-04-04 18:35:06 -04:00
83a3ab6b6f Barrier -- not 100% sure this was needed 2025-04-04 18:35:05 -04:00
d66a9af6a3 No compile fix 2025-04-04 18:35:05 -04:00
adc90d3a86 NVLINK GET/PUT on cuda aware mpi 2025-04-04 18:35:05 -04:00
ebbd015c5c Deprecate shared memory copy as direction matters on nvidia GPU 2025-04-04 18:35:05 -04:00
4ab73b36b2 Deprecate shared memory copy as direction matters on GPU 2025-04-04 18:35:05 -04:00
130e07a422 Non hermitian support 2025-04-04 18:35:05 -04:00
8f47bb367e Shifted non herm 2025-04-04 18:35:05 -04:00
0c3cb60135 Script update 2025-04-04 18:35:05 -04:00
9eae8fca5d Size output 2025-04-04 18:35:05 -04:00
882a217074 Example of Useful prerequisite installs with spack 2025-03-26 11:28:53 -04:00
e465fce201 Merge remote-tracking branch 'upstream/develop' into gauge_action_deriv 2025-03-24 10:12:42 +00:00
d41542c64b reverted sp2n test wilsonfundfermiongauge to original 2025-03-24 08:29:15 +00:00
199818bd6c Merge pull request #475 from lehner/feature-aurora
Sync with GPT on Aurora
2025-03-13 08:55:55 -04:00
fe66c7ca30 verbosity 2025-03-13 12:49:36 +00:00
e9177e4af3 BLAS compatibility 2025-03-13 08:48:23 +00:00
d15a6c5933 Merge branch 'develop' of https://github.com/paboyle/Grid into feature-aurora 2025-03-13 07:29:55 +00:00
25ab9325e7 Use hostVector but remove construct resize 2025-03-11 15:02:32 +00:00
19f9378b98 Should work on Aurora now 2025-03-11 13:50:43 +00:00
785bc7a14f Adding staple zeroing fix 2025-03-10 12:29:04 +00:00
1a1fe85428 Merge remote-tracking branch 'upstream' into gauge_action_deriv 2025-03-10 08:37:36 +00:00
0000d2e558 Merge branch 'develop' into gauge_action_deriv 2025-03-10 08:35:57 +00:00
9ffd1ed4ce Merged 2025-03-08 15:30:08 +00:00
3d014864e2 Making LLVM happy 2025-03-06 14:19:25 -05:00
1d22841811 Working on Aurora; the GPT issue that turned up is fixed 2025-03-06 03:20:18 +00:00
a1cdda833f Update WorkArounds.txt 2025-03-05 14:04:23 -05:00
ad6db92690 Update WorkArounds.txt 2025-03-05 14:00:26 -05:00
e8ff9d8e50 Update WorkArounds.txt 2025-03-05 14:00:04 -05:00
795769c636 Update WorkArounds.txt 2025-03-05 13:50:41 -05:00
267a39d943 Update WorkArounds.txt 2025-03-05 13:49:43 -05:00
3624bd3d22 Update WorkArounds.txt 2025-03-05 13:45:09 -05:00
bc12dbbb38 Update WorkArounds.txt 2025-03-05 12:48:56 -05:00
eb8a008a8f Create WorkArounds.txt 2025-03-05 12:41:59 -05:00
c4d9aa1a21 Config command that makes GPT happier 2025-02-27 20:12:49 +00:00
6ae809ed40 Print not liked on GPT compile 2025-02-27 20:12:49 +00:00
311e2aab3f Update Accelerator.h 2025-02-26 11:42:52 -05:00
438dfbdb83 Only throw if there is a pending list entry in CommsComplete 2025-02-25 16:57:27 +00:00
b2ce760cf4 Verbose issue with GPT 2025-02-25 16:55:23 +00:00
b1ba209696 Latest upstream with non-su3 patch and modified Sp_WilsonFundFermionGauge test to be small (#22)
Co-authored-by: Mashy Green <mashy@me.com>

merging non-su3 patch
2025-02-24 11:38:42 +00:00
cb3e529b1e Merge branch 'paboyle:develop' into develop 2025-02-24 11:29:09 +00:00
717f647418 added the WilsonFlow patch from upstream PR #471 2025-02-24 08:41:31 +00:00
98e7418187 Merge remote-tracking branch 'upstream/develop' into gauge_action_deriv 2025-02-24 08:33:05 +00:00
fe05bf48b1 Improvements to WilsonGaugeAction deriv function (#16)
* patched version + modifications to deriv -> staple in qcd/gauge

* Cleaning up and aligning variable naming between action deriv versions

* Removing the regression test files that were also in this branch for a clean PR

* Reverting whitespace changes

* Fixing after reverting too much!

---------

Co-authored-by: Mashy Green <mashy@me.com>
2025-02-17 18:52:04 +00:00
d2dd8f54e2 Fixing after reverting too much! 2025-02-17 17:32:27 +00:00
7726ee4b16 Reverting whitespace changes 2025-02-17 17:16:28 +00:00
355ec76257 Merge pull request #18 from UCL-ARC/bugfix/nvtx
Bugfix/nvtx
2025-02-03 11:05:42 +00:00
84cab5e6e7 no comms and log cleanup 2025-02-01 16:37:21 +01:00
4f17c8d081 Merge branch 'paboyle:develop' into bugfix/nvtx 2025-01-29 13:10:12 +00:00
aaab753982 Reverting to older version of nvtx for Tursa support 2025-01-29 12:57:38 +00:00
570b72a47b Bugfix. Sorry! 2025-01-21 15:37:39 -05:00
a5798a89ed Merge branch 'develop' into specflow 2025-01-21 12:13:24 -05:00
f7e2f9a401 Checking in spectral flow and DWF/Mobius kernel eigenvalue measurement 2025-01-16 20:47:33 +00:00
2848a9b558 DWF Kernel lanczos working(?) 2025-01-16 01:29:56 +00:00
d4868991af Fixed wrong lib for NVTX in configure.ac and updated to nvtx3 2025-01-10 14:53:19 +00:00
e99d42404e Removing the regression test files that were also in this branch for a clean PR 2024-12-16 16:31:22 +00:00
3ba019c747 Cleaning up and aligning variable naming between action deriv versions 2024-12-03 15:23:00 +00:00
47429218bb patched version + modifications to deriv -> staple in qcd/gauge 2024-11-27 16:29:22 +00:00
8d305df0db guard against trying to compile SU3-specific code when Nc ≠ 3 2024-05-24 14:00:56 +01:00
120 changed files with 4878 additions and 1334 deletions

View File

@ -51,11 +51,13 @@ directory
#pragma nv_diag_suppress cast_to_qualified_type
//disables nvcc specific warning in many files
#pragma nv_diag_suppress esa_on_defaulted_function_ignored
#pragma nv_diag_suppress declared_but_not_referenced
#pragma nv_diag_suppress extra_semicolon
#else
//disables nvcc specific warning in json.hpp
#pragma diag_suppress unsigned_compare_with_zero
#pragma diag_suppress cast_to_qualified_type
#pragma diag_suppress declared_but_not_referenced
//disables nvcc specific warning in many files
#pragma diag_suppress esa_on_defaulted_function_ignored
#pragma diag_suppress extra_semicolon

View File

@ -191,7 +191,7 @@ public:
Lattice<sobj> pgbuf(&pencil_g);
autoView(pgbuf_v , pgbuf, CpuWrite);
std::cout << "CPU view" << std::endl;
//std::cout << "CPU view" << std::endl;
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
@ -215,7 +215,7 @@ public:
else if ( sign == forward ) div = 1.0;
else assert(0);
std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
//std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
FFTW_plan p;
{
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
@ -229,7 +229,7 @@ public:
}
// Barrel shift and collect global pencil
std::cout << GridLogPerformance<<"Making pencil" << std::endl;
//std::cout << GridLogPerformance<<"Making pencil" << std::endl;
Coordinate lcoor(Nd), gcoor(Nd);
result = source;
int pc = processor_coor[dim];
@ -251,7 +251,7 @@ public:
}
}
std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
//std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
// Loop over orthog coords
int NN=pencil_g.lSites();
GridStopWatch timer;
@ -274,7 +274,7 @@ public:
usec += timer.useconds();
flops+= flops_call*NN;
std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
//std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
// writing out result
{
autoView(pgbuf_v,pgbuf,CpuRead);
@ -291,7 +291,7 @@ public:
}
result = result*div;
std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
//std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
// destroying plan
FFTW<scalar>::fftw_destroy_plan(p);
#endif

View File

@ -277,6 +277,38 @@ public:
assert(0);
}
};
template<class Matrix,class Field>
class ShiftedNonHermitianLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
RealD shift;
public:
ShiftedNonHermitianLinearOperator(Matrix &Mat,RealD shft): _Mat(Mat),shift(shft){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out);
out = out + shift*in;
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
_Mat.MdirAll(in,out);
};
void Op (const Field &in, Field &out){
_Mat.M(in,out);
out = out + shift * in;
}
void AdjOp (const Field &in, Field &out){
_Mat.Mdag(in,out);
out = out + shift * in;
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
assert(0);
}
void HermOp(const Field &in, Field &out){
assert(0);
}
};
//////////////////////////////////////////////////////////
// Even Odd Schur decomp operators; there are several
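Note: the ShiftedNonHermitianLinearOperator added above exposes M + shift (and its adjoint) through the non-Hermitian Op/AdjOp interface, while the Hermitian entry points deliberately assert. A minimal usage sketch follows; it is an illustration only, and the action Ddwf, the grid FGrid and the RNG are assumptions, not taken from this diff.
// Illustration only: wrap an action M as (M + shift) via the new operator.
// Ddwf (a DomainWallFermionD), FGrid and RNG5 are assumed to exist already.
RealD shift = 0.1;
ShiftedNonHermitianLinearOperator<DomainWallFermionD, LatticeFermionD> ShiftedOp(Ddwf, shift);
LatticeFermionD src(FGrid), out(FGrid);
gaussian(RNG5, src);
ShiftedOp.Op(src, out);      // out = (M + shift) src
ShiftedOp.AdjOp(src, out);   // out = (Mdag + shift) src
// ShiftedOp.HermOp(src, out) would assert(0): only the non-Hermitian interface is provided.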

View File

@ -269,7 +269,9 @@ public:
RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo);
Linop.HermOp(T0,y);
grid->Barrier();
axpby(T1,xscale,mscale,y,in);
grid->Barrier();
// sum = .5 c[0] T0 + c[1] T1
// out = ()*T0 + Coeffs[1]*T1;

View File

@ -208,8 +208,8 @@ public:
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
assert(OpB!=GridBLAS_OP_T);
//assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
//assert(OpB!=GridBLAS_OP_T);
int lda = m; // m x k column major
int ldb = k; // k x n column major
@ -367,28 +367,67 @@ public:
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
else
eCmn = alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
else
eCmn = alpha * eAmk.adjoint() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
else
eCmn = alpha * eAmk.transpose() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
else
eCmn = alpha * eAmk * eBkn.adjoint() ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
});
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
else
eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
} );
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
else
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
} );
} else {
assert(0);
@ -414,8 +453,8 @@ public:
RealD t2=usecond();
int32_t batchCount = Amk.size();
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
assert(OpB!=GridBLAS_OP_T);
//assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
//assert(OpB!=GridBLAS_OP_T);
int lda = m; // m x k column major
int ldb = k; // k x n column major
@ -514,28 +553,70 @@ public:
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
else
eCmn = alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
else
eCmn = alpha * eAmk.adjoint() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
else
eCmn = alpha * eAmk.transpose() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
else
eCmn = alpha * eAmk * eBkn.adjoint() ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
else
eCmn = alpha * eAmk * eBkn.transpose() ;
});
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
else
eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
} );
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
else
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
} );
} else {
assert(0);
@ -661,29 +742,41 @@ public:
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
else
eCmn = alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
else
eCmn = alpha * eAmk.transpose() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
else
eCmn = alpha * eAmk * eBkn.transpose() ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
} );
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
else
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
});
} else {
assert(0);
}
@ -809,28 +902,40 @@ public:
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
else
eCmn = alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
else
eCmn = alpha * eAmk.transpose() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
else
eCmn = alpha * eAmk * eBkn.transpose() ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
else
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
});
} else {
assert(0);
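Note: the repeated beta guard introduced above follows standard BLAS semantics: when beta is zero the output matrix must not be read, because 0 times an uninitialised (possibly NaN) entry is NaN and would poison the product. A small standalone Eigen example of the hazard, independent of Grid; all names here are illustrative.
#include <Eigen/Dense>
#include <cmath>
#include <iostream>
#include <limits>

int main() {
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(2, 2);
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(2, 2);
  Eigen::MatrixXd C(2, 2);
  C.setConstant(std::numeric_limits<double>::quiet_NaN()); // stands in for uninitialised output
  double alpha = 1.0, beta = 0.0;

  Eigen::MatrixXd naive = beta * C + alpha * A * B;        // 0 * NaN = NaN: result is poisoned
  Eigen::MatrixXd guarded(2, 2);
  if (std::abs(beta) != 0.0)
    guarded = beta * C + alpha * A * B;
  else
    guarded = alpha * A * B;                               // C never read when beta == 0

  std::cout << "naive(0,0)   = " << naive(0, 0)   << std::endl;  // nan
  std::cout << "guarded(0,0) = " << guarded(0, 0) << std::endl;  // finite
}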

View File

@ -245,9 +245,10 @@ until convergence
_HermOp(src_n,tmp);
// std::cout << GridLogMessage<< tmp<<std::endl; exit(0);
// std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl;
RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
// RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
RealD vnum = real(innerProduct(tmp,tmp)); // HermOp^2.
RealD vden = norm2(src_n);
RealD na = vnum/vden;
RealD na = std::sqrt(vnum/vden);
if (fabs(evalMaxApprox/na - 1.0) < 0.0001)
i=_MAX_ITER_IRL_MEVAPP_;
evalMaxApprox = na;
@ -255,6 +256,7 @@ until convergence
src_n = tmp;
}
}
std::cout << GridLogIRL << " Final evalMaxApprox " << evalMaxApprox << std::endl;
std::vector<RealD> lme(Nm);
std::vector<RealD> lme2(Nm);
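For reference, the change above replaces the Rayleigh-quotient estimate of the extreme eigenvalue with a norm ratio: with x = src_n and Hx = tmp, the old and new estimators are, schematically,
\lambda_{\rm old} \;=\; \frac{\langle x,\,Hx\rangle}{\langle x,\,x\rangle},
\qquad
\lambda_{\rm new} \;=\; \sqrt{\frac{\langle Hx,\,Hx\rangle}{\langle x,\,x\rangle}}
\;=\; \frac{\lVert Hx\rVert}{\lVert x\rVert}.
Both converge to |lambda_max| under the power iteration performed by this loop for Hermitian H; by Cauchy-Schwarz the new estimate is non-negative by construction and bounds the old one from above.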

View File

@ -97,7 +97,7 @@ public:
RealD scale;
ConjugateGradient<FineField> CG(1.0e-2,100,false);
ConjugateGradient<FineField> CG(1.0e-3,400,false);
FineField noise(FineGrid);
FineField Mn(FineGrid);
@ -110,7 +110,7 @@ public:
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
for(int i=0;i<1;i++){
for(int i=0;i<4;i++){
CG(hermop,noise,subspace[b]);
@ -146,7 +146,7 @@ public:
DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;
for(int i=0;i<3;i++){
for(int i=0;i<2;i++){
// void operator() (const Field &src, Field &psi){
#if 1
std::cout << GridLogMessage << " inverting on noise "<<std::endl;

View File

@ -441,8 +441,20 @@ public:
std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
}
#else
//////////////////////////////////////////////////////////////////////
// Galerkin projection of matrix
//////////////////////////////////////////////////////////////////////
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & Subspace)
{
CoarsenOperator(linop,Subspace,Subspace);
}
//////////////////////////////////////////////////////////////////////
// Petrov - Galerkin projection of matrix
//////////////////////////////////////////////////////////////////////
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & U,
Aggregation<Fobj,CComplex,nbasis> & V)
{
std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
GridBase *grid = FineGrid();
@ -458,11 +470,9 @@ public:
// Orthogonalise the subblocks over the basis
/////////////////////////////////////////////////////////////
CoarseScalar InnerProd(CoarseGrid());
blockOrthogonalise(InnerProd,Subspace.subspace);
blockOrthogonalise(InnerProd,V.subspace);
blockOrthogonalise(InnerProd,U.subspace);
// for(int s=0;s<Subspace.subspace.size();s++){
// std::cout << " subspace norm "<<norm2(Subspace.subspace[s])<<std::endl;
// }
const int npoint = geom.npoint;
Coordinate clatt = CoarseGrid()->GlobalDimensions();
@ -542,7 +552,7 @@ public:
std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
tphaseBZ-=usecond();
phaV = phaF[p]*Subspace.subspace[i];
phaV = phaF[p]*V.subspace[i];
tphaseBZ+=usecond();
/////////////////////////////////////////////////////////////////////
@ -555,7 +565,7 @@ public:
// std::cout << i << " " <<p << " MphaV "<<norm2(MphaV)<<" "<<norm2(phaV)<<std::endl;
tproj-=usecond();
blockProject(coarseInner,MphaV,Subspace.subspace);
blockProject(coarseInner,MphaV,U.subspace);
coarseInner = conjugate(pha[p]) * coarseInner;
ComputeProj[p] = coarseInner;
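For orientation, the two CoarsenOperator overloads above differ only in which aggregate basis appears on each side of the fine operator A: the Galerkin form reuses one subspace, while the Petrov-Galerkin form projects with a left basis U distinct from the right basis V (the phased vectors are built from V.subspace and blockProject contracts against U.subspace). Schematically, per stencil point,
(A_c)_{ij} \;=\; \langle u_i,\, A\, u_j\rangle \quad \text{(Galerkin)},
\qquad
(A_c)_{ij} \;=\; \langle u_i,\, A\, v_j\rangle \quad \text{(Petrov--Galerkin)}.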

View File

@ -69,7 +69,7 @@ public:
}
// FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
void construct(pointer __p, const _Tp& __val) { assert(0);};
void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { };
void destroy(pointer __p) { };
};

View File

@ -234,6 +234,9 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
}
void MemoryManager::EvictVictims(uint64_t bytes)
{
if(bytes>=DeviceMaxBytes) {
printf("EvictVictims bytes %ld DeviceMaxBytes %ld\n",bytes,DeviceMaxBytes);
}
assert(bytes<DeviceMaxBytes);
while(bytes+DeviceLRUBytes > DeviceMaxBytes){
if ( DeviceLRUBytes > 0){

View File

@ -149,7 +149,8 @@ public:
sizeof(obj),d*100+p);
}
CommsComplete(list);
if (!list.empty()) // avoid triggering assert in comms == none
CommsComplete(list);
for(int p=1;p<_processors[d];p++){
accum = accum + column[p];
}
@ -182,6 +183,7 @@ public:
int recv_from_rank,
int bytes);
int IsOffNode(int rank);
double StencilSendToRecvFrom(void *xmit,
int xmit_to_rank,int do_xmit,
void *recv,
@ -200,9 +202,9 @@ public:
void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
void *xmit,void *xmit_comp,
int xmit_to_rank,int do_xmit,
void *recv,
void *recv,void *recv_comp,
int recv_from_rank,int do_recv,
int xbytes,int rbytes,int dir);

View File

@ -260,32 +260,39 @@ CartesianCommunicator::~CartesianCommunicator()
}
#ifdef USE_GRID_REDUCTION
void CartesianCommunicator::GlobalSum(float &f){
FlightRecorder::StepLog("GlobalSumP2P");
CartesianCommunicator::GlobalSumP2P(f);
}
void CartesianCommunicator::GlobalSum(double &d)
{
FlightRecorder::StepLog("GlobalSumP2P");
CartesianCommunicator::GlobalSumP2P(d);
}
#else
void CartesianCommunicator::GlobalSum(float &f){
FlightRecorder::StepLog("AllReduce");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
FlightRecorder::StepLog("AllReduce");
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
#endif
void CartesianCommunicator::GlobalSum(uint32_t &u){
FlightRecorder::StepLog("AllReduce");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
FlightRecorder::StepLog("AllReduce");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){
FlightRecorder::StepLog("AllReduceVector");
int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
@ -388,11 +395,16 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
{
std::vector<CommsRequest_t> list;
double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
offbytes += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
offbytes += StencilSendToRecvFromBegin(list,xmit,xmit,dest,dox,recv,recv,from,dor,bytes,bytes,dir);
StencilSendToRecvFromComplete(list,dir);
return offbytes;
}
int CartesianCommunicator::IsOffNode(int rank)
{
int grank = ShmRanks[rank];
if ( grank == MPI_UNDEFINED ) return true;
else return false;
}
#ifdef ACCELERATOR_AWARE_MPI
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
@ -407,9 +419,9 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
return 0.0; // Do nothing -- no preparation required
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
void *xmit,void *xmit_comp,
int dest,int dox,
void *recv,
void *recv,void *recv_comp,
int from,int dor,
int xbytes,int rbytes,int dir)
{
@ -433,24 +445,35 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
if ( dor ) {
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
// std::cout << " StencilSendToRecvFrom "<<dir<<" MPI_Irecv "<<std::hex<<recv<<std::dec<<std::endl;
ierr=MPI_Irecv(recv_comp, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0);
list.push_back(rrq);
off_node_bytes+=rbytes;
}
#ifdef NVLINK_GET
else {
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
assert(shm!=NULL);
// std::cout << " StencilSendToRecvFrom "<<dir<<" CopyDeviceToDevice recv "<<std::hex<<recv<<" remote "<<shm <<std::dec<<std::endl;
acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
}
#endif
}
// This is a NVLINK PUT
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
ierr =MPI_Isend(xmit_comp, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list.push_back(xrq);
off_node_bytes+=xbytes;
} else {
#ifndef NVLINK_GET
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
#endif
}
}
return off_node_bytes;
@ -459,7 +482,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
int nreq=list.size();
/*finishes Get/Put*/
acceleratorCopySynchronise();
if (nreq==0) return;
@ -660,9 +683,9 @@ void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsReque
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
void *xmit,void *xmit_comp,
int dest,int dox,
void *recv,
void *recv,void *recv_comp,
int from,int dor,
int xbytes,int rbytes,int dir)
{
@ -746,26 +769,31 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
// int nreq=list.size();
acceleratorCopySynchronise(); // Complete all pending copy transfers D2D
// if (nreq==0) return;
// std::vector<MPI_Status> status(nreq);
// std::vector<MPI_Request> MpiRequests(nreq);
std::vector<MPI_Status> status;
std::vector<MPI_Request> MpiRequests;
for(int r=0;r<list.size();r++){
// Must check each Send buf is clear to reuse
if ( list[r].PacketType == InterNodeXmitISend ) MpiRequests.push_back(list[r].req);
// if ( list[r].PacketType == InterNodeRecv ) MpiRequests.push_back(list[r].req); // Already "Test" passed
}
// for(int r=0;r<nreq;r++){
// MpiRequests[r] = list[r].req;
// }
int nreq=MpiRequests.size();
if (nreq>0) {
status.resize(MpiRequests.size());
int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
assert(ierr==0);
}
// int ierr = MPI_Waitall(nreq,&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
// assert(ierr==0);
// for(int r=0;r<nreq;r++){
// if ( list[r].PacketType==InterNodeRecv ) {
// acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
// }
// }
acceleratorCopySynchronise(); // Complete all pending copy transfers D2D
list.resize(0); // Delete the list
this->HostBufferFreeAll(); // Clean up the buffer allocs
@ -780,6 +808,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
void CartesianCommunicator::StencilBarrier(void)
{
FlightRecorder::StepLog("NodeBarrier");
MPI_Barrier (ShmComm);
}
//void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
@ -787,11 +816,13 @@ void CartesianCommunicator::StencilBarrier(void)
//}
void CartesianCommunicator::Barrier(void)
{
FlightRecorder::StepLog("GridBarrier");
int ierr = MPI_Barrier(communicator);
assert(ierr==0);
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
FlightRecorder::StepLog("Broadcast");
int ierr=MPI_Bcast(data,
bytes,
MPI_BYTE,
@ -810,6 +841,7 @@ void CartesianCommunicator::BarrierWorld(void){
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
FlightRecorder::StepLog("BroadcastWorld");
int ierr= MPI_Bcast(data,
bytes,
MPI_BYTE,
@ -832,6 +864,7 @@ void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,
}
void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes)
{
FlightRecorder::StepLog("AllToAll");
// MPI is a pain and uses "int" arguments
// 64*64*64*128*16 == 500Million elements of data.
// When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.

View File

@ -91,7 +91,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
{
assert(0);
}
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(0);}
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(list.size()==0);}
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
@ -124,6 +124,8 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
dest=0;
}
int CartesianCommunicator::IsOffNode(int rank) { return false; }
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int xmit_to_rank,int dox,
void *recv,

View File

@ -137,7 +137,7 @@ public:
///////////////////////////////////////////////////
static void SharedMemoryAllocate(uint64_t bytes, int flags);
static void SharedMemoryFree(void);
static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
// static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
static void SharedMemoryZero(void *dest,size_t bytes);
};

View File

@ -542,38 +542,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
// Each MPI rank should allocate our own buffer
///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifndef ACCELERATOR_AWARE_MPI
printf("Host buffer allocate for GPU non-aware MPI\n");
#if 0
HostCommBuf= acceleratorAllocHost(bytes);
#else
// printf("Host buffer allocate for GPU non-aware MPI\n");
HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
#ifdef HAVE_NUMAIF_H
#warning "Moving host buffers to specific NUMA domain"
int numa;
char *numa_name=(char *)getenv("MPI_BUF_NUMA");
if(numa_name) {
unsigned long page_size = sysconf(_SC_PAGESIZE);
numa = atoi(numa_name);
unsigned long page_count = bytes/page_size;
std::vector<void *> pages(page_count);
std::vector<int> nodes(page_count,numa);
std::vector<int> status(page_count,-1);
for(unsigned long p=0;p<page_count;p++){
pages[p] =(void *) ((uint64_t) HostCommBuf + p*page_size);
}
int ret = move_pages(0,
page_count,
&pages[0],
&nodes[0],
&status[0],
MPOL_MF_MOVE);
printf("Host buffer move to numa domain %d : move_pages returned %d\n",numa,ret);
if (ret) perror(" move_pages failed for reason:");
}
#endif
acceleratorPin(HostCommBuf,bytes);
#endif
// acceleratorPin(HostCommBuf,bytes);
#endif
ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) {
@ -916,14 +887,14 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
bzero(dest,bytes);
#endif
}
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
{
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
acceleratorCopyToDevice(src,dest,bytes);
#else
bcopy(src,dest,bytes);
#endif
}
//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
//{
//#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
// acceleratorCopyToDevice(src,dest,bytes);
//#else
// bcopy(src,dest,bytes);
//#endif
//}
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality
@ -959,6 +930,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
// std::cerr << " SetCommunicator rank "<<r<<" comm "<<ShmCommBufs[r] <<std::endl;
}
ShmBufferFreeAll();
@ -989,7 +961,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
}
#endif
//SharedMemoryTest();
// SharedMemoryTest();
}
//////////////////////////////////////////////////////////////////
// On node barrier
@ -1011,19 +983,18 @@ void SharedMemory::SharedMemoryTest(void)
check[0]=GlobalSharedMemory::WorldNode;
check[1]=r;
check[2]=magic;
GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t));
acceleratorCopyToDevice(check,ShmCommBufs[r],3*sizeof(uint64_t));
}
}
ShmBarrier();
for(uint64_t r=0;r<ShmSize;r++){
ShmBarrier();
GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
ShmBarrier();
acceleratorCopyFromDevice(ShmCommBufs[r],check,3*sizeof(uint64_t));
assert(check[0]==GlobalSharedMemory::WorldNode);
assert(check[1]==r);
assert(check[2]==magic);
ShmBarrier();
}
ShmBarrier();
std::cout << GridLogDebug << " SharedMemoryTest has passed "<<std::endl;
}
void *SharedMemory::ShmBuffer(int rank)
@ -1039,11 +1010,13 @@ void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
{
int gpeer = ShmRanks[rank];
assert(gpeer!=ShmRank); // never send to self
// std::cout << "ShmBufferTranslate for rank " << rank<<" peer "<<gpeer<<std::endl;
if (gpeer == MPI_UNDEFINED){
return NULL;
} else {
uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
// std::cout << "ShmBufferTranslate : local,offset,remote "<<std::hex<<local_p<<" "<<offset<<" "<<remote<<std::dec<<std::endl;
return (void *) remote;
}
}

View File

@ -122,10 +122,10 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
{
acceleratorMemSet(dest,0,bytes);
}
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
{
acceleratorCopyToDevice(src,dest,bytes);
}
//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
//{
// acceleratorCopyToDevice(src,dest,bytes);
//}
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality

View File

@ -126,8 +126,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
#ifndef ACCELERATOR_AWARE_MPI
static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
#endif
int cb= (cbmask==0x2)? Odd : Even;
@ -244,7 +244,6 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi;
for(int s=0;s<Nsimd;s++){
send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size);

View File

@ -236,7 +236,7 @@ public:
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
vobj vtmp;
vtmp = r;
#if 0
#if 1
deviceVector<vobj> vvtmp(1);
acceleratorPut(vvtmp[0],vtmp);
vobj *vvtmp_p = & vvtmp[0];

View File

@ -55,7 +55,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));
//copy offsets to device
acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
acceleratorCopyToDeviceAsynch(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
@ -88,7 +88,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
exit(EXIT_FAILURE);
}
acceleratorCopyFromDeviceAsync(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
acceleratorCopyFromDeviceAsynch(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
//sync after copy
accelerator_barrier();

View File

@ -510,7 +510,6 @@ public:
grid->SendToRecvFromBegin(fwd_req,
(void *)&hsend_buf[d*buffer_size], xmit_to_rank,
(void *)&hrecv_buf[d*buffer_size], recv_from_rank, bytes, tag);
acceleratorCopyToDevice(&hrecv_buf[d*buffer_size],&recv_buf[d*buffer_size],bytes);
#endif
t_comms+=usecond()-t;
}
@ -531,7 +530,6 @@ public:
grid->SendToRecvFromBegin(bwd_req,
(void *)&hsend_buf[(d+depth)*buffer_size], recv_from_rank,
(void *)&hrecv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
acceleratorCopyToDevice(&hrecv_buf[(d+depth)*buffer_size],&recv_buf[(d+depth)*buffer_size],bytes);
#endif
t_comms+=usecond()-t;
}
@ -555,8 +553,13 @@ public:
t=usecond();
grid->CommsComplete(fwd_req);
#ifndef ACCELERATOR_AWARE_MPI
for ( int d=0;d < depth ; d ++ ) {
acceleratorCopyToDevice(&hrecv_buf[d*buffer_size],&recv_buf[d*buffer_size],bytes);
}
#endif
t_comms+= usecond() - t;
t=usecond();
for ( int d=0;d < depth ; d ++ ) {
ScatterSlice(recv_buf,to,nld-depth+d,dimension,plane*buffer_size); plane++;
@ -565,6 +568,11 @@ public:
t=usecond();
grid->CommsComplete(bwd_req);
#ifndef ACCELERATOR_AWARE_MPI
for ( int d=0;d < depth ; d ++ ) {
acceleratorCopyToDevice(&hrecv_buf[(d+depth)*buffer_size],&recv_buf[(d+depth)*buffer_size],bytes);
}
#endif
t_comms+= usecond() - t;
t=usecond();
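Note: relocating the acceleratorCopyToDevice calls above reflects the required ordering when MPI is not accelerator aware: the host receive buffer may only be copied back to the device after CommsComplete has returned, otherwise the copy races the receive. A hedged sketch of the full staging pattern, using only calls visible in this diff; the function wrapper and buffer names are illustrative.
#include <Grid/Grid.h>
using namespace Grid;

// Hedged sketch of the host-staging pattern for non-accelerator-aware MPI.
// Everything except the Grid calls that appear in the diff is illustrative.
template<class vobj>
void exchange_staged(GridBase *grid,
                     deviceVector<vobj> &send_buf, deviceVector<vobj> &recv_buf,
                     hostVector<vobj>   &hsend_buf, hostVector<vobj> &hrecv_buf,
                     int xmit_to_rank, int recv_from_rank, int bytes, int tag)
{
  std::vector<CommsRequest_t> reqs;
#ifndef ACCELERATOR_AWARE_MPI
  acceleratorCopyFromDevice(&send_buf[0], &hsend_buf[0], bytes);   // D2H stage of the send halo
  grid->SendToRecvFromBegin(reqs,
                            (void *)&hsend_buf[0], xmit_to_rank,
                            (void *)&hrecv_buf[0], recv_from_rank, bytes, tag);
  grid->CommsComplete(reqs);                                       // receive must finish first
  acceleratorCopyToDevice(&hrecv_buf[0], &recv_buf[0], bytes);     // H2D only after completion
#else
  grid->SendToRecvFromBegin(reqs,
                            (void *)&send_buf[0], xmit_to_rank,
                            (void *)&recv_buf[0], recv_from_rank, bytes, tag);
  grid->CommsComplete(reqs);
#endif
}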

View File

@ -0,0 +1,196 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion5D.h
Copyright (C) 2020 - 2025
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
Author: Nils Meyer <nils.meyer@ur.de>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/qcd/action/fermion/WilsonFermion5D.h>
#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
#include <Grid/qcd/action/fermion/CloverHelpers.h>
NAMESPACE_BEGIN(Grid);
// see Grid/qcd/action/fermion/CompactWilsonCloverFermion.h for description
template<class Impl, class CloverHelpers>
class CompactWilsonCloverFermion5D : public WilsonFermion5D<Impl>,
public WilsonCloverHelpers<Impl>,
public CompactWilsonCloverHelpers<Impl> {
/////////////////////////////////////////////
// Sizes
/////////////////////////////////////////////
public:
INHERIT_COMPACT_CLOVER_SIZES(Impl);
/////////////////////////////////////////////
// Type definitions
/////////////////////////////////////////////
public:
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
INHERIT_COMPACT_CLOVER_TYPES(Impl);
typedef WilsonFermion5D<Impl> WilsonBase;
typedef WilsonCloverHelpers<Impl> Helpers;
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
/////////////////////////////////////////////
// Constructors
/////////////////////////////////////////////
public:
CompactWilsonCloverFermion5D(GaugeField& _Umu,
GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
const RealD _mass,
const RealD _csw_r = 0.0,
const RealD _csw_t = 0.0,
const RealD _cF = 1.0,
const ImplParams& impl_p = ImplParams());
/////////////////////////////////////////////
// Member functions (implementing interface)
/////////////////////////////////////////////
public:
virtual void Instantiatable() {};
int ConstEE() override { return 0; };
int isTrivialEE() override { return 0; };
void Dhop(const FermionField& in, FermionField& out, int dag) override;
void DhopOE(const FermionField& in, FermionField& out, int dag) override;
void DhopEO(const FermionField& in, FermionField& out, int dag) override;
void DhopDir(const FermionField& in, FermionField& out, int dir, int disp) override;
void DhopDirAll(const FermionField& in, std::vector<FermionField>& out) /* override */;
void M(const FermionField& in, FermionField& out) override;
void Mdag(const FermionField& in, FermionField& out) override;
void Meooe(const FermionField& in, FermionField& out) override;
void MeooeDag(const FermionField& in, FermionField& out) override;
void Mooee(const FermionField& in, FermionField& out) override;
void MooeeDag(const FermionField& in, FermionField& out) override;
void MooeeInv(const FermionField& in, FermionField& out) override;
void MooeeInvDag(const FermionField& in, FermionField& out) override;
void Mdir(const FermionField& in, FermionField& out, int dir, int disp) override;
void MdirAll(const FermionField& in, std::vector<FermionField>& out) override;
void MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) override;
void MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
void MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
/////////////////////////////////////////////
// Member functions (internals)
/////////////////////////////////////////////
void MooeeInternal(const FermionField& in,
FermionField& out,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle);
/////////////////////////////////////////////
// Helpers
/////////////////////////////////////////////
void ImportGauge(const GaugeField& _Umu) override;
/////////////////////////////////////////////
// Helpers
/////////////////////////////////////////////
private:
template<class Field>
const MaskField* getCorrectMaskField(const Field &in) const {
if(in.Grid()->_isCheckerBoarded) {
if(in.Checkerboard() == Odd) {
return &this->BoundaryMaskOdd;
} else {
return &this->BoundaryMaskEven;
}
} else {
return &this->BoundaryMask;
}
}
template<class Field>
void ApplyBoundaryMask(Field& f) {
const MaskField* m = getCorrectMaskField(f); assert(m != nullptr);
assert(m != nullptr);
CompactHelpers::ApplyBoundaryMask(f, *m);
}
/////////////////////////////////////////////
// Member Data
/////////////////////////////////////////////
public:
RealD csw_r;
RealD csw_t;
RealD cF;
int n_rhs;
bool fixedBoundaries;
CloverDiagonalField Diagonal, DiagonalEven, DiagonalOdd;
CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;
CloverTriangleField Triangle, TriangleEven, TriangleOdd;
CloverTriangleField TriangleInv, TriangleInvEven, TriangleInvOdd;
FermionField Tmp;
MaskField BoundaryMask, BoundaryMaskEven, BoundaryMaskOdd;
};
NAMESPACE_END(Grid);

View File

@ -55,6 +55,7 @@ NAMESPACE_CHECK(Wilson);
NAMESPACE_CHECK(WilsonTM);
#include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h> // 4d compact wilson clover fermions
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h> // 5d compact wilson clover fermions
NAMESPACE_CHECK(WilsonClover);
#include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types
NAMESPACE_CHECK(Wilson5D);
@ -164,12 +165,17 @@ typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiS
// Compact Clover fermions
template <typename WImpl> using CompactWilsonClover = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>;
template <typename WImpl> using CompactWilsonClover5D = CompactWilsonCloverFermion5D<WImpl, CompactCloverHelpers<WImpl>>;
template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>;
typedef CompactWilsonClover<WilsonImplD2> CompactWilsonCloverFermionD2;
typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF;
typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD;
typedef CompactWilsonClover5D<WilsonImplD2> CompactWilsonCloverFermion5DD2;
typedef CompactWilsonClover5D<WilsonImplF> CompactWilsonCloverFermion5DF;
typedef CompactWilsonClover5D<WilsonImplD> CompactWilsonCloverFermion5DD;
typedef CompactWilsonExpClover<WilsonImplD2> CompactWilsonExpCloverFermionD2;
typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF;
typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD;
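A construction sketch for the new CompactWilsonCloverFermion5DD typedef declared above, using the constructor signature from CompactWilsonCloverFermion5D.h; the grid setup, RNG seeds and parameter values are illustrative assumptions, not taken from the diff.
#include <Grid/Grid.h>
using namespace Grid;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);

  const int Ls = 8;                                   // illustrative fifth dimension
  auto UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
                                                GridDefaultSimd(Nd, vComplexD::Nsimd()),
                                                GridDefaultMpi());
  auto UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  auto FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
  auto FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);

  LatticeGaugeFieldD Umu(UGrid);
  GridParallelRNG RNG4(UGrid);
  std::vector<int> seeds4({1, 2, 3, 4});
  RNG4.SeedFixedIntegers(seeds4);
  SU<Nc>::HotConfiguration(RNG4, Umu);                // random gauge field for illustration

  RealD mass = 0.1, csw_r = 1.0, csw_t = 1.0, cF = 1.0;  // illustrative parameter values
  CompactWilsonCloverFermion5DD Dwc5(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid,
                                     mass, csw_r, csw_t, cF);

  LatticeFermionD src(FGrid), res(FGrid);
  GridParallelRNG RNG5(FGrid);
  std::vector<int> seeds5({5, 6, 7, 8});
  RNG5.SeedFixedIntegers(seeds5);
  gaussian(RNG5, src);
  Dwc5.M(src, res);                                   // apply the 5D compact clover operator

  Grid_finalize();
}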

View File

@ -154,6 +154,12 @@ public:
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
void SloppyComms(int sloppy)
{
Stencil.SetSloppyComms(sloppy);
StencilEven.SetSloppyComms(sloppy);
StencilOdd.SetSloppyComms(sloppy);
}
// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;

View File

@ -179,6 +179,12 @@ public:
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
void SloppyComms(int sloppy)
{
Stencil.SetSloppyComms(sloppy);
StencilEven.SetSloppyComms(sloppy);
StencilOdd.SetSloppyComms(sloppy);
}
// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;

View File

@ -146,6 +146,12 @@ public:
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
void SloppyComms(int sloppy)
{
Stencil.SetSloppyComms(sloppy);
StencilEven.SetSloppyComms(sloppy);
StencilOdd.SetSloppyComms(sloppy);
}
// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;

View File

@ -32,209 +32,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////
// Wilson compressor will need FaceGather policies for:
// Periodic, Dirichlet, and partial Dirichlet for DWF
///////////////////////////////////////////////////////////////
const int dwf_compressor_depth=2;
#define DWF_COMPRESS
class FaceGatherPartialDWF
{
public:
#ifdef DWF_COMPRESS
static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);};
#else
static int PartialCompressionFactor(GridBase *grid) { return 1;}
#endif
template<class vobj,class cobj,class compressor>
static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
const Lattice<vobj> &rhs,
cobj *buffer,
compressor &compress,
int off,int so,int partial)
{
//DWF only hack: If a direction that is OFF node we use Partial Dirichlet
// Shrinks local and remote comms buffers
GridBase *Grid = rhs.Grid();
int Ls = Grid->_rdimensions[0];
#ifdef DWF_COMPRESS
int depth=dwf_compressor_depth;
#else
int depth=Ls/2;
#endif
std::pair<int,int> *table_v = & table[0];
auto rhs_v = rhs.View(AcceleratorRead);
int vol=table.size()/Ls;
accelerator_forNB( idx,table.size(), vobj::Nsimd(), {
Integer i=idx/Ls;
Integer s=idx%Ls;
Integer sc=depth+s-(Ls-depth);
if(s<depth) compress.Compress(buffer[off+i+s*vol],rhs_v[so+table_v[idx].second]);
if(s>=Ls-depth) compress.Compress(buffer[off+i+sc*vol],rhs_v[so+table_v[idx].second]);
});
rhs_v.ViewClose();
}
template<class decompressor,class Decompression>
static void DecompressFace(decompressor decompress,Decompression &dd)
{
auto Ls = dd.dims[0];
#ifdef DWF_COMPRESS
int depth=dwf_compressor_depth;
#else
int depth=Ls/2;
#endif
// Just pass in the Grid
auto kp = dd.kernel_p;
auto mp = dd.mpi_p;
int size= dd.buffer_size;
int vol= size/Ls;
accelerator_forNB(o,size,1,{
int idx=o/Ls;
int s=o%Ls;
if ( s < depth ) {
int oo=s*vol+idx;
kp[o]=mp[oo];
} else if ( s >= Ls-depth ) {
int sc = depth + s - (Ls-depth);
int oo=sc*vol+idx;
kp[o]=mp[oo];
} else {
kp[o] = Zero();//fill rest with zero if partial dirichlet
}
});
}
////////////////////////////////////////////////////////////////////////////////////////////
// Need to gather *interior portions* for ALL s-slices in simd directions
// Do the gather as need to treat SIMD lanes differently, and insert zeroes on receive side
// Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2.
////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj,class cobj,class compressor>
static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
compressor &compress,int type,int partial)
{
GridBase *Grid = rhs.Grid();
int Ls = Grid->_rdimensions[0];
#ifdef DWF_COMPRESS
int depth=dwf_compressor_depth;
#else
int depth = Ls/2;
#endif
// insertion of zeroes...
assert( (table.size()&0x1)==0);
int num=table.size()/2;
int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
auto rhs_v = rhs.View(AcceleratorRead);
auto p0=&pointers[0][0];
auto p1=&pointers[1][0];
auto tp=&table[0];
int nnum=num/Ls;
accelerator_forNB(j, num, vobj::Nsimd(), {
// Reorders both local and remote comms buffers
//
int s = j % Ls;
int sp1 = (s+depth)%Ls; // peri incremented s slice
int hxyz= j/Ls;
int xyz0= hxyz*2; // xyzt part of coor
int xyz1= hxyz*2+1;
int jj= hxyz + sp1*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice ....
int kk0= xyz0*Ls + s ; // s=0 goes to s=1
int kk1= xyz1*Ls + s ; // s=Ls-1 -> s=0
compress.CompressExchange(p0[jj],p1[jj],
rhs_v[so+tp[kk0 ].second], // Same s, consecutive xyz sites
rhs_v[so+tp[kk1 ].second],
type);
});
rhs_v.ViewClose();
}
// Merge routine is for SIMD faces
template<class decompressor,class Merger>
static void MergeFace(decompressor decompress,Merger &mm)
{
auto Ls = mm.dims[0];
#ifdef DWF_COMPRESS
int depth=dwf_compressor_depth;
#else
int depth = Ls/2;
#endif
int num= mm.buffer_size/2; // relate vol and Ls to buffer size
auto mp = &mm.mpointer[0];
auto vp0= &mm.vpointers[0][0]; // First arg is exchange first
auto vp1= &mm.vpointers[1][0];
auto type= mm.type;
int nnum = num/Ls;
accelerator_forNB(o,num,Merger::Nsimd,{
int s=o%Ls;
int hxyz=o/Ls; // xyzt related component
int xyz0=hxyz*2;
int xyz1=hxyz*2+1;
int sp = (s+depth)%Ls;
int jj= hxyz + sp*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice ....
int oo0= s+xyz0*Ls;
int oo1= s+xyz1*Ls;
// same ss0, ss1 pair goes to new layout
decompress.Exchange(mp[oo0],mp[oo1],vp0[jj],vp1[jj],type);
});
}
};
class FaceGatherDWFMixedBCs
{
public:
#ifdef DWF_COMPRESS
static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);};
#else
static int PartialCompressionFactor(GridBase *grid) {return 1;}
#endif
template<class vobj,class cobj,class compressor>
static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
const Lattice<vobj> &rhs,
cobj *buffer,
compressor &compress,
int off,int so,int partial)
{
// std::cout << " face gather simple DWF partial "<<partial <<std::endl;
if(partial) FaceGatherPartialDWF::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
else FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
}
template<class vobj,class cobj,class compressor>
static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
compressor &compress,int type,int partial)
{
// std::cout << " face gather exch DWF partial "<<partial <<std::endl;
if(partial) FaceGatherPartialDWF::Gather_plane_exchange(table,rhs,pointers,dimension, plane,cbmask,compress,type,partial);
else FaceGatherSimple::Gather_plane_exchange (table,rhs,pointers,dimension, plane,cbmask,compress,type,partial);
}
template<class decompressor,class Merger>
static void MergeFace(decompressor decompress,Merger &mm)
{
int partial = mm.partial;
// std::cout << " merge DWF partial "<<partial <<std::endl;
if ( partial ) FaceGatherPartialDWF::MergeFace(decompress,mm);
else FaceGatherSimple::MergeFace(decompress,mm);
}
template<class decompressor,class Decompression>
static void DecompressFace(decompressor decompress,Decompression &dd)
{
int partial = dd.partial;
// std::cout << " decompress DWF partial "<<partial <<std::endl;
if ( partial ) FaceGatherPartialDWF::DecompressFace(decompress,dd);
else FaceGatherSimple::DecompressFace(decompress,dd);
}
};
/////////////////////////////////////////////////////////////////////////////////////////////
// optimised versions supporting half precision too??? Deprecate
/////////////////////////////////////////////////////////////////////////////////////////////
@ -242,8 +39,7 @@ public:
//Could make FaceGather a template param, but then behaviour is runtime not compile time
template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
class WilsonCompressorTemplate : public FaceGatherDWFMixedBCs
// : public FaceGatherSimple
class WilsonCompressorTemplate : public FaceGatherSimple
{
public:
@ -485,7 +281,6 @@ public:
assert(this->u_comm_offset==this->_unified_buffer_size);
accelerator_barrier();
#ifdef NVLINK_GET
#warning "NVLINK_GET"
this->_grid->StencilBarrier(); // He can now get my local gather, I can get his
// Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check
// Or issue barrier AFTER the DMA is running

View File

@ -165,6 +165,12 @@ public:
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
void SloppyComms(int sloppy)
{
Stencil.SetSloppyComms(sloppy);
StencilEven.SetSloppyComms(sloppy);
StencilOdd.SetSloppyComms(sloppy);
}
// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;
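A hedged usage sketch of the new hook (the action name "Dw" and the surrounding solver call are placeholders, not taken from this patch): toggle reduced-precision halo exchange around an inner, sloppy solve and switch it back off for the correction step.
Dw.SloppyComms(1);              // halo payloads compressed: fp64->fp32, fp32->bf16-style
InnerSloppySolve(Dw, src, psi); // placeholder for whatever sloppy solver is in use
Dw.SloppyComms(0);              // restore full-precision comms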

View File

@ -91,13 +91,13 @@ public:
virtual void Mdag (const FermionField &in, FermionField &out){assert(0);};
// half checkerboard operations; leave unimplemented as abstract for now
virtual void Meooe (const FermionField &in, FermionField &out){assert(0);};
virtual void Mooee (const FermionField &in, FermionField &out){assert(0);};
virtual void MooeeInv (const FermionField &in, FermionField &out){assert(0);};
virtual void Meooe (const FermionField &in, FermionField &out);
virtual void Mooee (const FermionField &in, FermionField &out);
virtual void MooeeInv (const FermionField &in, FermionField &out);
virtual void MeooeDag (const FermionField &in, FermionField &out){assert(0);};
virtual void MooeeDag (const FermionField &in, FermionField &out){assert(0);};
virtual void MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
virtual void MeooeDag (const FermionField &in, FermionField &out);
virtual void MooeeDag (const FermionField &in, FermionField &out);
virtual void MooeeInvDag (const FermionField &in, FermionField &out);
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
@ -204,7 +204,14 @@ public:
DoubledGaugeField Umu;
DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd;
void SloppyComms(int sloppy)
{
Stencil.SetSloppyComms(sloppy);
StencilEven.SetSloppyComms(sloppy);
StencilOdd.SetSloppyComms(sloppy);
}
// Comms buffer
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;

View File

@ -0,0 +1,376 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion5DImplementation.h
Copyright (C) 2017 - 2025
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h>
NAMESPACE_BEGIN(Grid);
template<class Impl, class CloverHelpers>
CompactWilsonCloverFermion5D<Impl, CloverHelpers>::CompactWilsonCloverFermion5D(GaugeField& _Umu,
GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
const RealD _mass,
const RealD _csw_r,
const RealD _csw_t,
const RealD _cF,
const ImplParams& impl_p)
: WilsonBase(_Umu, FiveDimGrid, FiveDimRedBlackGrid, FourDimGrid, FourDimRedBlackGrid, _mass, impl_p)
, csw_r(_csw_r)
, csw_t(_csw_t)
, cF(_cF)
, fixedBoundaries(impl_p.boundary_phases[Nd-1] == 0.0)
, Diagonal(&FourDimGrid), Triangle(&FourDimGrid)
, DiagonalEven(&FourDimRedBlackGrid), TriangleEven(&FourDimRedBlackGrid)
, DiagonalOdd(&FourDimRedBlackGrid), TriangleOdd(&FourDimRedBlackGrid)
, DiagonalInv(&FourDimGrid), TriangleInv(&FourDimGrid)
, DiagonalInvEven(&FourDimRedBlackGrid), TriangleInvEven(&FourDimRedBlackGrid)
, DiagonalInvOdd(&FourDimRedBlackGrid), TriangleInvOdd(&FourDimRedBlackGrid)
, Tmp(&FiveDimGrid)
, BoundaryMask(&FiveDimGrid)
, BoundaryMaskEven(&FiveDimRedBlackGrid), BoundaryMaskOdd(&FiveDimRedBlackGrid)
{
assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3);
csw_r *= 0.5;
csw_t *= 0.5;
//if (clover_anisotropy.isAnisotropic)
// csw_r /= clover_anisotropy.xi_0;
ImportGauge(_Umu);
if (fixedBoundaries) {
this->BoundaryMaskEven.Checkerboard() = Even;
this->BoundaryMaskOdd.Checkerboard() = Odd;
CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
}
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Dhop(const FermionField& in, FermionField& out, int dag) {
WilsonBase::Dhop(in, out, dag);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopOE(const FermionField& in, FermionField& out, int dag) {
WilsonBase::DhopOE(in, out, dag);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopEO(const FermionField& in, FermionField& out, int dag) {
WilsonBase::DhopEO(in, out, dag);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
WilsonBase::DhopDir(in, out, dir, disp);
if(this->fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
WilsonBase::DhopDirAll(in, out);
if(this->fixedBoundaries) {
for(auto& o : out) ApplyBoundaryMask(o);
}
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::M(const FermionField& in, FermionField& out) {
out.Checkerboard() = in.Checkerboard();
WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
Mooee(in, Tmp);
axpy(out, 1.0, out, Tmp);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Mdag(const FermionField& in, FermionField& out) {
out.Checkerboard() = in.Checkerboard();
WilsonBase::Dhop(in, out, DaggerYes); // call base to save applying bc
MooeeDag(in, Tmp);
axpy(out, 1.0, out, Tmp);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Meooe(const FermionField& in, FermionField& out) {
WilsonBase::Meooe(in, out);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MeooeDag(const FermionField& in, FermionField& out) {
WilsonBase::MeooeDag(in, out);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Mooee(const FermionField& in, FermionField& out) {
if(in.Grid()->_isCheckerBoarded) {
if(in.Checkerboard() == Odd) {
MooeeInternal(in, out, DiagonalOdd, TriangleOdd);
} else {
MooeeInternal(in, out, DiagonalEven, TriangleEven);
}
} else {
MooeeInternal(in, out, Diagonal, Triangle);
}
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeDag(const FermionField& in, FermionField& out) {
Mooee(in, out); // blocks are hermitian
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInv(const FermionField& in, FermionField& out) {
if(in.Grid()->_isCheckerBoarded) {
if(in.Checkerboard() == Odd) {
MooeeInternal(in, out, DiagonalInvOdd, TriangleInvOdd);
} else {
MooeeInternal(in, out, DiagonalInvEven, TriangleInvEven);
}
} else {
MooeeInternal(in, out, DiagonalInv, TriangleInv);
}
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInvDag(const FermionField& in, FermionField& out) {
MooeeInv(in, out); // blocks are hermitian
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Mdir(const FermionField& in, FermionField& out, int dir, int disp) {
DhopDir(in, out, dir, disp);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MdirAll(const FermionField& in, std::vector<FermionField>& out) {
DhopDirAll(in, out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
assert(!fixedBoundaries); // TODO check for changes required for open bc
// NOTE: code copied from original clover term
conformable(X.Grid(), Y.Grid());
conformable(X.Grid(), force.Grid());
GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
GaugeField clover_force(force.Grid());
PropagatorField Lambda(force.Grid());
// Guido: Here we are hitting some performance issues:
// need to extract the components of the DoubledGaugeField
// for each call
// Possible solution
// Create a vector object to store them? (cons: wasting space)
std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
Impl::extractLinkField(U, this->Umu);
force = Zero();
// Derivative of the Wilson hopping term
this->DhopDeriv(force, X, Y, dag);
///////////////////////////////////////////////////////////
// Clover term derivative
///////////////////////////////////////////////////////////
Impl::outerProductImpl(Lambda, X, Y);
//std::cout << "Lambda:" << Lambda << std::endl;
Gamma::Algebra sigma[] = {
Gamma::Algebra::SigmaXY,
Gamma::Algebra::SigmaXZ,
Gamma::Algebra::SigmaXT,
Gamma::Algebra::MinusSigmaXY,
Gamma::Algebra::SigmaYZ,
Gamma::Algebra::SigmaYT,
Gamma::Algebra::MinusSigmaXZ,
Gamma::Algebra::MinusSigmaYZ,
Gamma::Algebra::SigmaZT,
Gamma::Algebra::MinusSigmaXT,
Gamma::Algebra::MinusSigmaYT,
Gamma::Algebra::MinusSigmaZT};
/*
sigma_{\mu \nu}=
| 0 sigma[0] sigma[1] sigma[2] |
| sigma[3] 0 sigma[4] sigma[5] |
| sigma[6] sigma[7] 0 sigma[8] |
| sigma[9] sigma[10] sigma[11] 0 |
*/
int count = 0;
clover_force = Zero();
for (int mu = 0; mu < 4; mu++)
{
force_mu = Zero();
for (int nu = 0; nu < 4; nu++)
{
if (mu == nu)
continue;
RealD factor;
if (nu == 4 || mu == 4)
{
factor = 2.0 * csw_t;
}
else
{
factor = 2.0 * csw_r;
}
PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok
force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu); // checked
count++;
}
pokeLorentz(clover_force, U[mu] * force_mu, mu);
}
//clover_force *= csw;
force += clover_force;
}
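As a cross-check of the sigma[] ordering used in MDeriv (illustrative only, not part of the patch), count walks the (mu,nu) table row-major with the diagonal skipped:
#include <cstdio>
int main(void) {
  int count = 0;
  for (int mu = 0; mu < 4; mu++)
    for (int nu = 0; nu < 4; nu++)
      if (mu != nu) printf("sigma[%2d] <-> (mu=%d, nu=%d)\n", count++, mu, nu);
  return 0;
}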
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
assert(0);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
assert(0);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInternal(const FermionField& in,
FermionField& out,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle) {
assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
out.Checkerboard() = in.Checkerboard();
conformable(in, out);
CompactHelpers::MooeeKernel(diagonal.oSites(), this->Ls, in, out, diagonal, triangle);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::ImportGauge(const GaugeField& _Umu) {
// NOTE: parts copied from original implementation
// Import gauge into base class
double t0 = usecond();
WilsonBase::ImportGauge(_Umu); // NOTE: called here and in wilson constructor -> performed twice, but can't avoid that
// Initialize temporary variables
double t1 = usecond();
conformable(_Umu.Grid(), this->GaugeGrid());
GridBase* grid = _Umu.Grid();
typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
CloverField TmpOriginal(grid);
CloverField TmpInverse(grid);
// Compute the field strength terms mu>nu
double t2 = usecond();
WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
// Compute the Clover Operator acting on Colour and Spin
// multiply here by the clover coefficients for the anisotropy
double t3 = usecond();
TmpOriginal = Helpers::fillCloverYZ(Bx) * csw_r;
TmpOriginal += Helpers::fillCloverXZ(By) * csw_r;
TmpOriginal += Helpers::fillCloverXY(Bz) * csw_r;
TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
// Instantiate the clover term
// - In case of the standard clover the mass term is added
// - In case of the exponential clover the clover term is exponentiated
double t4 = usecond();
CloverHelpers::InstantiateClover(TmpOriginal, TmpInverse, csw_t, 4.0 + this->M5 /*this->diag_mass*/);
// Convert the data layout of the clover term
double t5 = usecond();
CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
// Modify the clover term at the temporal boundaries in case of open boundary conditions
double t6 = usecond();
if(fixedBoundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, 4.0 + this->M5 /*this->diag_mass*/);
// Invert the Clover term
// In case of the exponential clover with (anti-)periodic boundary conditions exp(-Clover) saved
// in TmpInverse can be used. In all other cases the clover term has to be explicitly inverted.
// TODO: For now this inversion is explicitly done on the CPU
double t7 = usecond();
CloverHelpers::InvertClover(TmpInverse, Diagonal, Triangle, DiagonalInv, TriangleInv, fixedBoundaries);
// Fill the remaining clover fields
double t8 = usecond();
pickCheckerboard(Even, DiagonalEven, Diagonal);
pickCheckerboard(Even, TriangleEven, Triangle);
pickCheckerboard(Odd, DiagonalOdd, Diagonal);
pickCheckerboard(Odd, TriangleOdd, Triangle);
pickCheckerboard(Even, DiagonalInvEven, DiagonalInv);
pickCheckerboard(Even, TriangleInvEven, TriangleInv);
pickCheckerboard(Odd, DiagonalInvOdd, DiagonalInv);
pickCheckerboard(Odd, TriangleInvOdd, TriangleInv);
// Report timings
double t9 = usecond();
std::cout << GridLogDebug << "CompactWilsonCloverFermion5D::ImportGauge timings:" << std::endl;
std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
std::cout << GridLogDebug << "allocations = " << (t2 - t1) / 1e6 << std::endl;
std::cout << GridLogDebug << "field strength = " << (t3 - t2) / 1e6 << std::endl;
std::cout << GridLogDebug << "fill clover = " << (t4 - t3) / 1e6 << std::endl;
std::cout << GridLogDebug << "instantiate clover = " << (t5 - t4) / 1e6 << std::endl;
std::cout << GridLogDebug << "convert layout = " << (t6 - t5) / 1e6 << std::endl;
std::cout << GridLogDebug << "modify boundaries = " << (t7 - t6) / 1e6 << std::endl;
std::cout << GridLogDebug << "invert clover = " << (t8 - t7) / 1e6 << std::endl;
std::cout << GridLogDebug << "pick cbs = " << (t9 - t8) / 1e6 << std::endl;
std::cout << GridLogDebug << "total = " << (t9 - t0) / 1e6 << std::endl;
}
NAMESPACE_END(Grid);

View File

@ -14,6 +14,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Andrew Lawson <andrew.lawson1991@gmail.com>
Author: Vera Guelpers <V.M.Guelpers@soton.ac.uk>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -484,6 +485,54 @@ void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag
Dhop(in,out,dag); // -0.5 is included
axpy(out,4.0-M5,in,out);
}
template <class Impl>
void WilsonFermion5D<Impl>::Meooe(const FermionField &in, FermionField &out)
{
if (in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerNo);
} else {
DhopOE(in, out, DaggerNo);
}
}
template <class Impl>
void WilsonFermion5D<Impl>::MeooeDag(const FermionField &in, FermionField &out)
{
if (in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerYes);
} else {
DhopOE(in, out, DaggerYes);
}
}
template <class Impl>
void WilsonFermion5D<Impl>::Mooee(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard();
typename FermionField::scalar_type scal(4.0 + M5);
out = scal * in;
}
template <class Impl>
void WilsonFermion5D<Impl>::MooeeDag(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard();
Mooee(in, out);
}
template<class Impl>
void WilsonFermion5D<Impl>::MooeeInv(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard();
out = (1.0/(4.0 + M5))*in;
}
template<class Impl>
void WilsonFermion5D<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard();
MooeeInv(in,out);
}
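For reference, these additions implement the trivial even/odd blocks of the plain 5D Wilson operator: the diagonal block is site-local, M_ee = (4 + M5) * 1, so M_ee^{-1} = 1/(4 + M5) * 1 and the daggered variants coincide with the undaggered ones, while Meooe/MeooeDag simply dispatch to the appropriately checkerboarded Dhop.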
template<class Impl>
void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in, RealD mass,std::vector<double> twist)

View File

@ -63,7 +63,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
} else { \
chi = coalescedRead(buf[SE->_offset],lane); \
} \
acceleratorSynchronise(); \
acceleratorSynchronise(); \
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
Recon(result, Uchi);
@ -504,7 +504,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
autoView(st_v , st,AcceleratorRead);
if( interior && exterior ) {
// acceleratorFenceComputeStream();
acceleratorFenceComputeStream();
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
#ifndef GRID_CUDA

View File

@ -0,0 +1,45 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation5D.cc.master
Copyright (C) 2017 - 2025
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
Author: Mattia Bruno <mattia.bruno@cern.ch>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h>
#include <Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermion5DImplementation.h>
#include <Grid/qcd/action/fermion/CloverHelpers.h>
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class CompactWilsonCloverFermion5D<IMPLEMENTATION, CompactCloverHelpers<IMPLEMENTATION>>;
template class CompactWilsonCloverFermion5D<IMPLEMENTATION, CompactExpCloverHelpers<IMPLEMENTATION>>;
NAMESPACE_END(Grid);

View File

@ -0,0 +1 @@
../CompactWilsonCloverFermion5DInstantiation.cc.master

View File

@ -0,0 +1 @@
../CompactWilsonCloverFermion5DInstantiation.cc.master

View File

@ -62,7 +62,7 @@ do
done
done
CC_LIST="CompactWilsonCloverFermionInstantiation"
CC_LIST="CompactWilsonCloverFermionInstantiation CompactWilsonCloverFermion5DInstantiation"
for impl in $COMPACT_WILSON_IMPL_LIST
do

View File

@ -76,27 +76,27 @@ public:
return action;
};
virtual void deriv(const GaugeField &Umu,GaugeField & dSdU) {
virtual void deriv(const GaugeField &U, GaugeField &dSdU) {
//extend Ta to include Lorentz indexes
RealD factor_p = c_plaq/RealD(Nc)*0.5;
RealD factor_r = c_rect/RealD(Nc)*0.5;
GridBase *grid = Umu.Grid();
GridBase *grid = U.Grid();
std::vector<GaugeLinkField> U (Nd,grid);
std::vector<GaugeLinkField> Umu (Nd,grid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
Umu[mu] = PeekIndex<LorentzIndex>(U,mu);
}
std::vector<GaugeLinkField> RectStaple(Nd,grid), Staple(Nd,grid);
WilsonLoops<Gimpl>::StapleAndRectStapleAll(Staple, RectStaple, U, workspace);
WilsonLoops<Gimpl>::StapleAndRectStapleAll(Staple, RectStaple, Umu, workspace);
GaugeLinkField dSdU_mu(grid);
GaugeLinkField staple(grid);
for (int mu=0; mu < Nd; mu++){
dSdU_mu = Ta(U[mu]*Staple[mu])*factor_p;
dSdU_mu = dSdU_mu + Ta(U[mu]*RectStaple[mu])*factor_r;
dSdU_mu = Ta(Umu[mu]*Staple[mu])*factor_p;
dSdU_mu = dSdU_mu + Ta(Umu[mu]*RectStaple[mu])*factor_r;
PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
}

View File

@ -73,20 +73,23 @@ public:
// extend Ta to include Lorentz indexes
RealD factor = 0.5 * beta / RealD(Nc);
GridBase *grid = U.Grid();
GaugeLinkField Umu(U.Grid());
GaugeLinkField dSdU_mu(U.Grid());
GaugeLinkField dSdU_mu(grid);
std::vector<GaugeLinkField> Umu(Nd, grid);
for (int mu = 0; mu < Nd; mu++) {
Umu[mu] = PeekIndex<LorentzIndex>(U, mu);
}
Umu = PeekIndex<LorentzIndex>(U, mu);
for (int mu = 0; mu < Nd; mu++) {
// Staple in direction mu
WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu);
dSdU_mu = Ta(Umu * dSdU_mu) * factor;
WilsonLoops<Gimpl>::Staple(dSdU_mu, Umu, mu);
dSdU_mu = Ta(Umu[mu] * dSdU_mu) * factor;
PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
}
}
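In formula form, the loop evaluates dS/dU_mu = (beta / (2 Nc)) * Ta(U_mu Sigma_mu) for each direction mu, with Sigma_mu the nu-summed staple returned by WilsonLoops<Gimpl>::Staple.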
private:
RealD beta;
};

View File

@ -111,8 +111,8 @@ public:
};
void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) {
std::string config, rng;
this->build_filenames(traj, Params, config, rng);
std::string config, rng, smr;
this->build_filenames(traj, Params, config, smr, rng);
this->check_filename(rng);
this->check_filename(config);

View File

@ -75,7 +75,7 @@ public:
GridParallelRNG &pRNG) {
if ((traj % Params.saveInterval) == 0) {
std::string config, rng, smr;
this->build_filenames(traj, Params, config, rng);
this->build_filenames(traj, Params, config, smr, rng);
GridBase *grid = SmartConfig.get_U(false).Grid();
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
@ -102,7 +102,7 @@ public:
if ( Params.saveSmeared ) {
IldgWriter _IldgWriter(grid->IsBoss());
_IldgWriter.open(smr);
_IldgWriter.writeConfiguration<GaugeStats>(SmartConfig.get_U(true), traj, config, config);
_IldgWriter.writeConfiguration<GaugeStats>(SmartConfig.get_U(true), traj, smr, smr);
_IldgWriter.close();
std::cout << GridLogMessage << "Written ILDG Configuration on " << smr
@ -118,8 +118,8 @@ public:
void CheckpointRestore(int traj, GaugeField &U, GridSerialRNG &sRNG,
GridParallelRNG &pRNG) {
std::string config, rng;
this->build_filenames(traj, Params, config, rng);
std::string config, rng, smr;
this->build_filenames(traj, Params, config, smr, rng);
this->check_filename(rng);
this->check_filename(config);

View File

@ -107,8 +107,8 @@ class ScidacHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG,
GridParallelRNG &pRNG) {
std::string config, rng;
this->build_filenames(traj, Params, config, rng);
std::string config, rng, smr;
this->build_filenames(traj, Params, config, smr, rng);
this->check_filename(rng);
this->check_filename(config);

View File

@ -62,15 +62,15 @@ accelerator_inline int stencilIndex(int mu, int nu) {
/*! @brief structure holding the link treatment */
struct SmearingParameters{
SmearingParameters(){}
struct HISQSmearingParameters{
HISQSmearingParameters(){}
Real c_1; // 1 link
Real c_naik; // Naik term
Real c_3; // 3 link
Real c_5; // 5 link
Real c_7; // 7 link
Real c_lp; // 5 link Lepage
SmearingParameters(Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp)
HISQSmearingParameters(Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp)
: c_1(c1),
c_naik(cnaik),
c_3(c3),
@ -86,7 +86,7 @@ class Smear_HISQ : public Gimpl {
private:
GridCartesian* const _grid;
SmearingParameters _linkTreatment;
HISQSmearingParameters _linkTreatment;
public:
@ -117,7 +117,7 @@ public:
// IN--u_thin
void smear(GF& u_smr, GF& u_naik, GF& u_thin) const {
SmearingParameters lt = this->_linkTreatment;
HISQSmearingParameters lt = this->_linkTreatment;
auto grid = this->_grid;
// Create a padded cell of extra padding depth=1 and fill the padding.

View File

@ -252,6 +252,11 @@ void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const{
out = in;
RealD taus = 0.;
// Perform initial t=0 measurements
for(auto const &meas : this->functions)
meas.second(0,taus,out);
for (unsigned int step = 1; step <= Nstep; step++) { //step indicates the number of smearing steps applied at the time of measurement
auto start = std::chrono::high_resolution_clock::now();
evolve_step(out, taus);
@ -336,6 +341,11 @@ void WilsonFlowAdaptive<Gimpl>::smear(GaugeField& out, const GaugeField& in) con
RealD taus = 0.;
RealD eps = init_epsilon;
unsigned int step = 0;
// Perform initial t=0 measurements
for(auto const &meas : this->functions)
meas.second(step,taus,out);
do{
int step_success = evolve_step_adaptive(out, taus, eps);
step += step_success; //step will not be incremented if the integration step fails
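Both flow routines invoke each registered function as meas.second(step, flow time, field); a minimal sketch of a callback with that call shape (how it is registered into this->functions is assumed, not shown in this hunk):
// Inside code templated on Gimpl, as the flow classes are:
auto plaq_meas = [](int step, RealD t, const typename Gimpl::GaugeField &U) {
  RealD plaq = WilsonLoops<Gimpl>::avgPlaquette(U);
  std::cout << GridLogMessage << "[WilsonFlow] step " << step
            << " t=" << t << " plaq=" << plaq << std::endl;
};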

View File

@ -292,19 +292,21 @@ public:
//////////////////////////////////////////////////
// the sum over all nu-oriented staples for nu != mu on each site
//////////////////////////////////////////////////
static void Staple(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
static void Staple(GaugeMat &staple, const GaugeLorentz &U, int mu) {
GridBase *grid = Umu.Grid();
std::vector<GaugeMat> U(Nd, grid);
std::vector<GaugeMat> Umu(Nd, U.Grid());
for (int d = 0; d < Nd; d++) {
U[d] = PeekIndex<LorentzIndex>(Umu, d);
Umu[d] = PeekIndex<LorentzIndex>(U, d);
}
Staple(staple, U, mu);
Staple(staple, Umu, mu);
}
static void Staple(GaugeMat &staple, const std::vector<GaugeMat> &U, int mu) {
staple = Zero();
static void Staple(GaugeMat &staple, const std::vector<GaugeMat> &Umu, int mu) {
autoView(staple_v, staple, AcceleratorWrite);
accelerator_for(i, staple.Grid()->oSites(), Simd::Nsimd(), {
staple_v[i] = Zero();
});
for (int nu = 0; nu < Nd; nu++) {
@ -318,12 +320,12 @@ public:
// |
// __|
//
staple += Gimpl::ShiftStaple(
Gimpl::CovShiftForward(
U[nu], nu,
Umu[nu], nu,
Gimpl::CovShiftBackward(
U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))),
Umu[mu], mu, Gimpl::CovShiftIdentityBackward(Umu[nu], nu))),
mu);
// __
@ -333,8 +335,8 @@ public:
//
staple += Gimpl::ShiftStaple(
Gimpl::CovShiftBackward(U[nu], nu,
Gimpl::CovShiftBackward(U[mu], mu, U[nu])), mu);
Gimpl::CovShiftBackward(Umu[nu], nu,
Gimpl::CovShiftBackward(Umu[mu], mu, Umu[nu])), mu);
}
}
}

View File

@ -30,25 +30,26 @@
NAMESPACE_BEGIN(Grid);
uint64_t DslashFullCount;
uint64_t DslashPartialCount;
//uint64_t DslashPartialCount;
uint64_t DslashDirichletCount;
void DslashResetCounts(void)
{
DslashFullCount=0;
DslashPartialCount=0;
// DslashPartialCount=0;
DslashDirichletCount=0;
}
void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full)
{
dirichlet = DslashDirichletCount;
partial = DslashPartialCount;
partial = 0;
full = DslashFullCount;
}
void DslashLogFull(void) { DslashFullCount++;}
void DslashLogPartial(void) { DslashPartialCount++;}
//void DslashLogPartial(void) { DslashPartialCount++;}
void DslashLogDirichlet(void){ DslashDirichletCount++;}
deviceVector<unsigned char> StencilBuffer::DeviceCommBuf;
void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
int off,std::vector<std::pair<int,int> > & table)

View File

@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid);
// These can move into a params header and be given MacroMagic serialisation
struct DefaultImplParams {
Coordinate dirichlet; // Blocksize of dirichlet BCs
int partialDirichlet;
// int partialDirichlet;
DefaultImplParams() {
dirichlet.resize(0);
partialDirichlet=0;
// partialDirichlet=0;
};
};
@ -69,6 +69,12 @@ struct DefaultImplParams {
void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
int off,std::vector<std::pair<int,int> > & table);
class StencilBuffer
{
public:
static deviceVector<unsigned char> DeviceCommBuf; // placed in Stencil.cc
};
void DslashResetCounts(void);
void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full);
void DslashLogFull(void);
@ -113,8 +119,8 @@ class CartesianStencilAccelerator {
///////////////////////////////////////////////////
// If true, this is partially communicated per face
///////////////////////////////////////////////////
StencilVector _comms_partial_send;
StencilVector _comms_partial_recv;
// StencilVector _comms_partial_send;
// StencilVector _comms_partial_recv;
//
StencilVector _comm_buf_size;
StencilVector _permute_type;
@ -205,16 +211,16 @@ public:
struct Packet {
void * send_buf;
void * recv_buf;
#ifndef ACCELERATOR_AWARE_MPI
void * host_send_buf; // Allocate this if not MPI_CUDA_AWARE
void * host_recv_buf; // Allocate this if not MPI_CUDA_AWARE
#endif
void * compressed_send_buf;
void * compressed_recv_buf;
Integer to_rank;
Integer from_rank;
Integer do_send;
Integer do_recv;
Integer xbytes;
Integer rbytes;
Integer xbytes_compressed;
Integer rbytes_compressed;
};
struct Merge {
static constexpr int Nsimd = vobj::Nsimd();
@ -223,7 +229,7 @@ public:
std::vector<cobj *> vpointers;
Integer buffer_size;
Integer type;
Integer partial; // partial dirichlet BCs
// Integer partial; // partial dirichlet BCs
Coordinate dims;
};
struct Decompress {
@ -231,7 +237,7 @@ public:
cobj * kernel_p;
cobj * mpi_p;
Integer buffer_size;
Integer partial; // partial dirichlet BCs
// Integer partial; // partial dirichlet BCs
Coordinate dims;
};
struct CopyReceiveBuffer {
@ -252,9 +258,45 @@ public:
protected:
GridBase * _grid;
///////////////////////////////////////////////////
// Sloppy comms will make a second buffer upon comms
///////////////////////////////////////////////////
size_t device_heap_top; //
size_t device_heap_bytes;//
size_t device_heap_size; //
void *DeviceBufferMalloc(size_t bytes)
{
void *ptr = (void *)device_heap_top;
device_heap_top += bytes;
device_heap_bytes+= bytes;
if ( device_heap_bytes > device_heap_size ) {
std::cout << "DeviceBufferMalloc overflow bytes "<<bytes<<" heap bytes "<<device_heap_bytes<<" heap size "<<device_heap_size<<std::endl;
assert (device_heap_bytes <= device_heap_size);
}
return ptr;
}
void DeviceBufferFreeAll(void)
{
device_heap_size = _unified_buffer_size*sizeof(cobj);
// Resize up if necessary, never down
if ( StencilBuffer::DeviceCommBuf.size() < device_heap_size ) {
StencilBuffer::DeviceCommBuf.resize(device_heap_size);
}
device_heap_top =(size_t) &StencilBuffer::DeviceCommBuf[0];
device_heap_size = StencilBuffer::DeviceCommBuf.size();
device_heap_bytes=0;
}
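Taken together this is a bump allocator over one persistent device arena: DeviceBufferFreeAll resets the top-of-heap pointer to the start of StencilBuffer::DeviceCommBuf once per halo exchange, and each subsequent CompressPacket call carves its compressed send/recv scratch out of that arena via DeviceBufferMalloc; nothing is freed individually.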
public:
GridBase *Grid(void) const { return _grid; }
/////////////////////////////////////////////////////////
// Control reduced precision comms
/////////////////////////////////////////////////////////
int SloppyComms;
void SetSloppyComms(int sloppy) { SloppyComms = sloppy; };
////////////////////////////////////////////////////////////////////////
// Needed to conveniently communicate gparity parameters into GPU memory
// without adding parameters. Perhaps a template parameter to StencilView is
@ -268,7 +310,7 @@ public:
}
int face_table_computed;
int partialDirichlet;
// int partialDirichlet;
int fullDirichlet;
std::vector<deviceVector<std::pair<int,int> > > face_table ;
deviceVector<int> surface_list;
@ -361,24 +403,145 @@ public:
////////////////////////////////////////////////////////////////////////
// Non blocking send and receive. Necessarily parallel.
////////////////////////////////////////////////////////////////////////
void DecompressPacket(Packet &packet)
{
if ( !SloppyComms ) return;
if ( packet.do_recv && _grid->IsOffNode(packet.from_rank) ) {
typedef typename getPrecision<cobj>::real_scalar_type word;
uint64_t words = packet.rbytes/sizeof(word);
const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
const uint64_t outer = words/nsimd;
if(sizeof(word)==8) {
// Can either choose to represent as float vs double and prec change
// OR
// truncate the mantissa bfp16 style
double *dbuf =(double *) packet.recv_buf;
float *fbuf =(float *) packet.compressed_recv_buf;
accelerator_forNB(ss,outer,nsimd,{
int lane = acceleratorSIMTlane(nsimd);
dbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]; //conversion
});
} else if ( sizeof(word)==4){
// Can either choose to represent as half vs float and prec change
// OR
// truncate the mantissa bfp16 style
uint32_t *fbuf =(uint32_t *) packet.recv_buf;
uint16_t *hbuf =(uint16_t *) packet.compressed_recv_buf;
accelerator_forNB(ss,outer,nsimd,{
int lane = acceleratorSIMTlane(nsimd);
fbuf[ss*nsimd+lane] = ((uint32_t)hbuf[ss*nsimd+lane])<<16; //copy back and pad each word with zeroes
});
} else {
assert(0 && "unknown floating point precision");
}
}
}
void CompressPacket(Packet &packet)
{
packet.xbytes_compressed = packet.xbytes;
packet.compressed_send_buf = packet.send_buf;
packet.rbytes_compressed = packet.rbytes;
packet.compressed_recv_buf = packet.recv_buf;
if ( !SloppyComms ) {
return;
}
typedef typename getPrecision<cobj>::real_scalar_type word;
uint64_t words = packet.xbytes/sizeof(word);
const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
const uint64_t outer = words/nsimd;
if (packet.do_recv && _grid->IsOffNode(packet.from_rank) ) {
packet.rbytes_compressed = packet.rbytes/2;
packet.compressed_recv_buf = DeviceBufferMalloc(packet.rbytes_compressed);
// std::cout << " CompressPacket recv from "<<packet.from_rank<<" "<<std::hex<<packet.compressed_recv_buf<<std::dec<<std::endl;
}
//else {
// std::cout << " CompressPacket recv is uncompressed from "<<packet.from_rank<<" "<<std::hex<<packet.compressed_recv_buf<<std::dec<<std::endl;
// }
if (packet.do_send && _grid->IsOffNode(packet.to_rank) ) {
packet.xbytes_compressed = packet.xbytes/2;
packet.compressed_send_buf = DeviceBufferMalloc(packet.xbytes_compressed);
// std::cout << " CompressPacket send to "<<packet.to_rank<<" "<<std::hex<<packet.compressed_send_buf<<std::dec<<std::endl;
if(sizeof(word)==8) {
double *dbuf =(double *) packet.send_buf;
float *fbuf =(float *) packet.compressed_send_buf;
accelerator_forNB(ss,outer,nsimd,{
int lane = acceleratorSIMTlane(nsimd);
fbuf[ss*nsimd+lane] = dbuf[ss*nsimd+lane]; // convert fp64 to fp32
});
} else if ( sizeof(word)==4){
uint32_t *fbuf =(uint32_t *) packet.send_buf;
uint16_t *hbuf =(uint16_t *) packet.compressed_send_buf;
accelerator_forNB(ss,outer,nsimd,{
int lane = acceleratorSIMTlane(nsimd);
hbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]>>16; // convert as in Bagel/BFM ; bfloat16 ; s7e8 Intel patent
});
} else {
assert(0 && "unknown floating point precision");
}
}
// else {
// std::cout << " CompressPacket send is uncompressed to "<<packet.to_rank<<" "<<std::hex<<packet.compressed_send_buf<<std::dec<<std::endl;
// }
return;
}
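The two conversion kernels amount to a simple precision round-trip; a standalone CPU sketch (illustrative only, no Grid types) of the fp32 -> bf16-style truncation and the zero-padded restore:
#include <cstdint>
#include <cstdio>
#include <cstring>
int main(void) {
  float x = 3.14159265f;
  // send side: keep the top 16 bits (sign, 8-bit exponent, 7 mantissa bits)
  uint32_t bits;  std::memcpy(&bits, &x, sizeof(bits));
  uint16_t half16 = (uint16_t)(bits >> 16);
  // receive side: pad the low 16 bits with zeroes and reinterpret as float
  uint32_t restored = ((uint32_t)half16) << 16;
  float y;  std::memcpy(&y, &restored, sizeof(y));
  printf("sent %.8f, restored %.8f (mantissa truncated)\n", x, y);
  return 0;
}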
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
{
// std::cout << "Communicate Begin "<<std::endl;
// _grid->Barrier();
FlightRecorder::StepLog("Communicate begin");
///////////////////////////////////////////////
// All GPU kernel tasks must complete
// accelerator_barrier(); // All kernels should ALREADY be complete
// _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
// But the HaloGather had a barrier too.
// accelerator_barrier(); All kernels should ALREADY be complete
//Everyone is here, so noone running slow and still using receive buffer
_grid->StencilBarrier();
// But the HaloGather had a barrier too.
///////////////////////////////////////////////
if (SloppyComms) {
DeviceBufferFreeAll();
}
for(int i=0;i<Packets.size();i++){
this->CompressPacket(Packets[i]);
}
if (SloppyComms) {
accelerator_barrier();
#ifdef NVLINK_GET
_grid->StencilBarrier();
#endif
}
for(int i=0;i<Packets.size();i++){
// std::cout << "Communicate prepare "<<i<<std::endl;
// _grid->Barrier();
_grid->StencilSendToRecvFromPrepare(MpiReqs,
Packets[i].send_buf,
Packets[i].compressed_send_buf,
Packets[i].to_rank,Packets[i].do_send,
Packets[i].recv_buf,
Packets[i].compressed_recv_buf,
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes,Packets[i].rbytes,i);
Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i);
}
// std::cout << "Communicate PollDtoH "<<std::endl;
// _grid->Barrier();
@ -389,18 +552,22 @@ public:
// Starts intranode
for(int i=0;i<Packets.size();i++){
// std::cout << "Communicate Begin "<<i<<std::endl;
// _grid->Barrier();
_grid->StencilSendToRecvFromBegin(MpiReqs,
Packets[i].send_buf,
Packets[i].send_buf,Packets[i].compressed_send_buf,
Packets[i].to_rank,Packets[i].do_send,
Packets[i].recv_buf,
Packets[i].recv_buf,Packets[i].compressed_recv_buf,
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes,Packets[i].rbytes,i);
Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i);
// std::cout << "Communicate Begin started "<<i<<std::endl;
// _grid->Barrier();
}
FlightRecorder::StepLog("Communicate begin has finished");
// Get comms started then run checksums
// Having this PRIOR to the dslash seems to make Sunspot work... (!)
for(int i=0;i<Packets.size();i++){
if ( Packets[i].do_send )
FlightRecorder::xmitLog(Packets[i].send_buf,Packets[i].xbytes);
FlightRecorder::xmitLog(Packets[i].compressed_send_buf,Packets[i].xbytes_compressed);
}
}
@ -415,14 +582,15 @@ public:
// std::cout << "Communicate Complete Complete "<<std::endl;
// _grid->Barrier();
_grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
if ( this->partialDirichlet ) DslashLogPartial();
else if ( this->fullDirichlet ) DslashLogDirichlet();
// if ( this->partialDirichlet ) DslashLogPartial();
if ( this->fullDirichlet ) DslashLogDirichlet();
else DslashLogFull();
// acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
// accelerator_barrier();
for(int i=0;i<Packets.size();i++){
this->DecompressPacket(Packets[i]);
if ( Packets[i].do_recv )
FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
FlightRecorder::recvLog(Packets[i].compressed_recv_buf,Packets[i].rbytes_compressed,Packets[i].from_rank);
}
FlightRecorder::StepLog("Finish communicate complete");
}
@ -446,6 +614,7 @@ public:
Communicate();
CommsMergeSHM(compress);
CommsMerge(compress);
accelerator_barrier();
}
template<class compressor> int HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
@ -518,7 +687,6 @@ public:
}
accelerator_barrier(); // All my local gathers are complete
#ifdef NVLINK_GET
#warning "NVLINK_GET"
_grid->StencilBarrier(); // He can now get my local gather, I can get his
// Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check
// Or issue barrier AFTER the DMA is running
@ -617,7 +785,7 @@ public:
}
void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size,std::vector<Decompress> &dv) {
Decompress d;
d.partial = this->partialDirichlet;
// d.partial = this->partialDirichlet;
d.dims = _grid->_fdimensions;
d.kernel_p = k_p;
d.mpi_p = m_p;
@ -626,7 +794,7 @@ public:
}
void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
Merge m;
m.partial = this->partialDirichlet;
// m.partial = this->partialDirichlet;
m.dims = _grid->_fdimensions;
m.type = type;
m.mpointer = merge_p;
@ -690,6 +858,7 @@ public:
}
}
}
// std::cout << "BuildSurfaceList size is "<<surface_list_size<<std::endl;
surface_list.resize(surface_list_size);
std::vector<int> surface_list_host(surface_list_size);
int32_t ss=0;
@ -709,7 +878,7 @@ public:
}
}
acceleratorCopyToDevice(&surface_list_host[0],&surface_list[0],surface_list_size*sizeof(int));
std::cout << GridLogMessage<<"BuildSurfaceList size is "<<surface_list_size<<std::endl;
// std::cout << GridLogMessage<<"BuildSurfaceList size is "<<surface_list_size<<std::endl;
}
/// Introduce a block structure and switch off comms on boundaries
void DirichletBlock(const Coordinate &dirichlet_block)
@ -730,8 +899,8 @@ public:
int block = dirichlet_block[dimension];
this->_comms_send[ii] = comm_dim;
this->_comms_recv[ii] = comm_dim;
this->_comms_partial_send[ii] = 0;
this->_comms_partial_recv[ii] = 0;
// this->_comms_partial_send[ii] = 0;
// this->_comms_partial_recv[ii] = 0;
if ( block && comm_dim ) {
assert(abs(displacement) < ld );
// Quiesce communication across block boundaries
@ -752,10 +921,10 @@ public:
if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_send[ii] = 0;
if ( ( (ld*pc ) % block ) == 0 ) this->_comms_recv[ii] = 0;
}
if ( partialDirichlet ) {
this->_comms_partial_send[ii] = !this->_comms_send[ii];
this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
}
// if ( partialDirichlet ) {
// this->_comms_partial_send[ii] = !this->_comms_send[ii];
// this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
// }
}
}
}
@ -767,6 +936,7 @@ public:
Parameters p=Parameters(),
bool preserve_shm=false)
{
SloppyComms = 0;
face_table_computed=0;
_grid = grid;
this->parameters=p;
@ -784,7 +954,7 @@ public:
this->same_node.resize(npoints);
if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0);
partialDirichlet = p.partialDirichlet;
// partialDirichlet = p.partialDirichlet;
DirichletBlock(p.dirichlet); // comms send/recv set up
fullDirichlet=0;
for(int d=0;d<p.dirichlet.size();d++){
@ -801,8 +971,8 @@ public:
this->_entries_host_p = &_entries[0];
this->_entries_p = &_entries_device[0];
std::cout << GridLogMessage << " Stencil object allocated for "<<std::dec<<this->_osites
<<" sites table "<<std::hex<<this->_entries_p<< " GridPtr "<<_grid<<std::dec<<std::endl;
// std::cout << GridLogMessage << " Stencil object allocated for "<<std::dec<<this->_osites
// <<" sites table "<<std::hex<<this->_entries_p<< " GridPtr "<<_grid<<std::dec<<std::endl;
for(int ii=0;ii<npoints;ii++){
@ -865,7 +1035,7 @@ public:
/////////////////////////////////////////////////////////////////////////////////
const int Nsimd = grid->Nsimd();
// Allow for multiple stencils to exist simultaneously
// Allow for multiple stencils to be communicated simultaneously
if (!preserve_shm)
_grid->ShmBufferFreeAll();
@ -933,7 +1103,8 @@ public:
GridBase *grid=_grid;
const int Nsimd = grid->Nsimd();
int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
// int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
int comms_recv = this->_comms_recv[point];
int fd = _grid->_fdimensions[dimension];
int ld = _grid->_ldimensions[dimension];
int rd = _grid->_rdimensions[dimension];
@ -1122,8 +1293,8 @@ public:
int comms_send = this->_comms_send[point];
int comms_recv = this->_comms_recv[point];
int comms_partial_send = this->_comms_partial_send[point] ;
int comms_partial_recv = this->_comms_partial_recv[point] ;
// int comms_partial_send = this->_comms_partial_send[point] ;
// int comms_partial_recv = this->_comms_partial_recv[point] ;
assert(rhs.Grid()==_grid);
// conformable(_grid,rhs.Grid());
@ -1158,11 +1329,11 @@ public:
int rbytes;
if ( comms_send ) xbytes = bytes; // Full send
else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
// else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
else xbytes = 0; // full dirichlet
if ( comms_recv ) rbytes = bytes;
else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
// else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
else rbytes = 0;
int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
@ -1189,7 +1360,8 @@ public:
}
if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
// if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
if ( compress.DecompressionStep()&&comms_recv) {
recv_buf=u_simd_recv_buf[0];
} else {
recv_buf=this->u_recv_buf_p;
@ -1223,7 +1395,8 @@ public:
#endif
// std::cout << " GatherPlaneSimple partial send "<< comms_partial_send<<std::endl;
compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
// compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,0);
int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,xbytes,rbytes,cbmask);
if ( !duplicate ) { // Force comms for now
@ -1232,8 +1405,8 @@ public:
// Build a list of things to do after we synchronise GPUs
// Start comms now???
///////////////////////////////////////////////////////////
int do_send = (comms_send|comms_partial_send) && (!shm_send );
int do_recv = (comms_send|comms_partial_send) && (!shm_recv );
int do_send = (comms_send) && (!shm_send );
int do_recv = (comms_send) && (!shm_recv );
AddPacket((void *)&send_buf[comm_off],
(void *)&recv_buf[comm_off],
xmit_to_rank, do_send,
@ -1241,7 +1414,7 @@ public:
xbytes,rbytes);
}
if ( (compress.DecompressionStep() && comms_recv) || comms_partial_recv ) {
if ( (compress.DecompressionStep() && comms_recv) ) {
AddDecompress(&this->u_recv_buf_p[comm_off],
&recv_buf[comm_off],
words,Decompressions);
@ -1263,8 +1436,8 @@ public:
int comms_send = this->_comms_send[point];
int comms_recv = this->_comms_recv[point];
int comms_partial_send = this->_comms_partial_send[point] ;
int comms_partial_recv = this->_comms_partial_recv[point] ;
// int comms_partial_send = this->_comms_partial_send[point] ;
// int comms_partial_recv = this->_comms_partial_recv[point] ;
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
@ -1339,18 +1512,20 @@ public:
if ( comms_send ) xbytes = bytes;
else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
// else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
else xbytes = 0;
if ( comms_recv ) rbytes = bytes;
else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
// else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
else rbytes = 0;
// Gathers SIMD lanes for send and merge
// Different faces can be full comms or partial comms with multiple ranks per node
if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
// if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
if ( comms_send || comms_recv ) {
int partial = partialDirichlet;
// int partial = partialDirichlet;
int partial = 0;
compressor::Gather_plane_exchange(face_table[face_idx],rhs,
spointers,dimension,sx,cbmask,
compress,permute_type,partial );
@ -1416,7 +1591,8 @@ public:
if ( (bytes != rbytes) && (rbytes!=0) ){
acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero
}
int do_send = (comms_send|comms_partial_send) && (!shm_send );
// int do_send = (comms_send|comms_partial_send) && (!shm_send );
int do_send = (comms_send) && (!shm_send );
AddPacket((void *)sp,(void *)rp,
xmit_to_rank,do_send,
recv_from_rank,do_send,
@ -1430,7 +1606,8 @@ public:
}
}
// rpointer may be doing a remote read in the gather over SHM
if ( comms_recv|comms_partial_recv ) {
// if ( comms_recv|comms_partial_recv ) {
if ( comms_recv ) {
AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers);
}

View File

@ -67,7 +67,7 @@ void acceleratorInit(void)
printf("AcceleratorCudaInit[%d]: Device identifier: %s\n",rank, prop.name);
GPU_PROP_FMT(totalGlobalMem,"%lld");
GPU_PROP_FMT(totalGlobalMem,"%zu");
GPU_PROP(managedMemory);
GPU_PROP(isMultiGpuBoard);
GPU_PROP(warpSize);

View File

@ -215,7 +215,7 @@ inline void *acceleratorAllocHost(size_t bytes)
auto err = cudaMallocHost((void **)&ptr,bytes);
if( err != cudaSuccess ) {
ptr = (void *) NULL;
printf(" cudaMallocHost failed for %d %s \n",bytes,cudaGetErrorString(err));
printf(" cudaMallocHost failed for %zu %s \n",bytes,cudaGetErrorString(err));
assert(0);
}
return ptr;
@ -226,7 +226,7 @@ inline void *acceleratorAllocShared(size_t bytes)
auto err = cudaMallocManaged((void **)&ptr,bytes);
if( err != cudaSuccess ) {
ptr = (void *) NULL;
printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
printf(" cudaMallocManaged failed for %zu %s \n",bytes,cudaGetErrorString(err));
assert(0);
}
return ptr;
@ -237,24 +237,38 @@ inline void *acceleratorAllocDevice(size_t bytes)
auto err = cudaMalloc((void **)&ptr,bytes);
if( err != cudaSuccess ) {
ptr = (void *) NULL;
printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
printf(" cudaMalloc failed for %zu %s \n",bytes,cudaGetErrorString(err));
}
return ptr;
};
typedef int acceleratorEvent_t;
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
acceleratorCopyToDevice(from,to,bytes);
return 0;
}
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
acceleratorCopyFromDevice(from,to,bytes);
return 0;
}
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
{
cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
return 0;
}
inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
inline void acceleratorEventWait(acceleratorEvent_t ev)
{
//auto discard=cudaStreamSynchronize(ev);
}
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}
inline int acceleratorIsCommunicable(void *ptr)
@ -323,7 +337,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
cgh.parallel_for( \
sycl::nd_range<3>(global,local), \
[=] (sycl::nd_item<3> item) /*mutable*/ \
[[intel::reqd_sub_group_size(16)]] \
[[sycl::reqd_sub_group_size(16)]] \
{ \
auto iter1 = item.get_global_id(0); \
auto iter2 = item.get_global_id(1); \
@ -363,8 +377,8 @@ inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *t
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}
inline int acceleratorIsCommunicable(void *ptr)
@ -478,7 +492,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
inline void *acceleratorAllocHost(size_t bytes)
{
void *ptr=NULL;
auto err = hipMallocHost((void **)&ptr,bytes);
auto err = hipHostMalloc((void **)&ptr,bytes);
if( err != hipSuccess ) {
ptr = (void *) NULL;
fprintf(stderr," hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
@ -511,23 +525,35 @@ inline void *acceleratorAllocDevice(size_t bytes)
inline void acceleratorFreeHost(void *ptr){ auto discard=hipFree(ptr);};
inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
typedef int acceleratorEvent_t;
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
{
auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream);
return 0;
}
inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream);
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
acceleratorCopyToDevice(from,to,bytes);
return 0;
}
inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream);
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
acceleratorCopyFromDevice(from,to,bytes);
return 0;
}
inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); };
inline void acceleratorEventWait(acceleratorEvent_t ev)
{
// auto discard=hipStreamSynchronize(ev);
}
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}
#endif
inline void acceleratorPin(void *ptr,unsigned long bytes)
@ -564,6 +590,8 @@ inline void acceleratorPin(void *ptr,unsigned long bytes)
#undef GRID_SIMT
typedef int acceleratorEvent_t;
inline void acceleratorMem(void)
{
/*
@ -584,8 +612,13 @@ inline void acceleratorMem(void)
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); }
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); }
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { acceleratorCopyToDevice(from,to,bytes); return 0; }
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { acceleratorCopyFromDevice(from,to,bytes); return 0; }
inline void acceleratorEventWait(acceleratorEvent_t ev){}
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev); return 1;}
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); return 0;}
inline void acceleratorCopySynchronise(void) {};
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
@ -674,9 +707,9 @@ inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
acceleratorCopySynchronise();
}
template<class T> void acceleratorPut(T& dev,T&host)
template<class T> void acceleratorPut(T& dev,const T&host)
{
acceleratorCopyToDevice(&host,&dev,sizeof(T));
acceleratorCopyToDevice((void *)&host,&dev,sizeof(T));
}
template<class T> T acceleratorGet(T& dev)
{

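For orientation only (not part of the diff): a minimal sketch of how the event-returning asynchronous copy interface above is meant to be driven. The buffer size is illustrative, and on the CUDA/HIP paths the event type is a plain int placeholder, so the wait amounts to a stream synchronise.
size_t bytes = 1024*1024;                              // illustrative payload size
void *hostBuf = acceleratorAllocHost(bytes);           // pinned host memory
void *devBuf  = acceleratorAllocDevice(bytes);
acceleratorEvent_t ev = acceleratorCopyToDeviceAsynch(hostBuf,devBuf,bytes);
if ( !acceleratorEventIsComplete(ev) ) acceleratorEventWait(ev);
acceleratorCopySynchronise();                          // drain copyStream before reusing devBuf
acceleratorFreeDevice(devBuf);
acceleratorFreeHost(hostBuf);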

@ -73,9 +73,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define thread_critical DO_PRAGMA(omp critical)
#ifdef GRID_OMP
inline void thread_bcopy(void *from, void *to,size_t bytes)
inline void thread_bcopy(const void *from, void *to,size_t bytes)
{
uint64_t *ufrom = (uint64_t *)from;
const uint64_t *ufrom = (const uint64_t *)from;
uint64_t *uto = (uint64_t *)to;
assert(bytes%8==0);
uint64_t words=bytes/8;
@ -84,7 +84,7 @@ inline void thread_bcopy(void *from, void *to,size_t bytes)
});
}
#else
inline void thread_bcopy(void *from, void *to,size_t bytes)
inline void thread_bcopy(const void *from, void *to,size_t bytes)
{
bcopy(from,to,bytes);
}

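A small usage note, not in the diff: with the const-qualified signature, read-only source buffers can be handed to thread_bcopy directly, keeping the 8-byte-multiple requirement enforced by the assert.
static const uint64_t src[16] = {1,2,3,4};   // read-only source, 128 bytes
uint64_t dst[16];
thread_bcopy(src,dst,sizeof(src));           // byte count must be a multiple of 8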

@ -509,7 +509,14 @@ void Grid_init(int *argc,char ***argv)
Grid_default_latt,
Grid_default_mpi);
if( GridCmdOptionExists(*argv,*argv+*argc,"--flightrecorder") ){
std::cout << GridLogMessage <<" Enabling flight recorder " <<std::endl;
FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
FlightRecorder::PrintEntireLog = 1;
FlightRecorder::ChecksumComms = 1;
FlightRecorder::ChecksumCommsSend=1;
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
@ -631,12 +638,11 @@ void Grid_debug_handler_init(void)
sa.sa_flags = SA_SIGINFO;
// sigaction(SIGSEGV,&sa,NULL);
sigaction(SIGTRAP,&sa,NULL);
sigaction(SIGBUS,&sa,NULL);
// sigaction(SIGBUS,&sa,NULL);
// sigaction(SIGUSR2,&sa,NULL);
feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
sigaction(SIGFPE,&sa,NULL);
// feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
// sigaction(SIGFPE,&sa,NULL);
sigaction(SIGKILL,&sa,NULL);
sigaction(SIGILL,&sa,NULL);
@ -651,3 +657,4 @@ void Grid_debug_handler_init(void)
}
NAMESPACE_END(Grid);

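A minimal driver, not part of the diff, illustrating the new command-line hook: launching it with --flightrecorder should take the branch above and put the FlightRecorder into record mode with comms checksumming enabled. The skeleton program itself is an assumption.
#include <Grid/Grid.h>
using namespace Grid;
int main(int argc,char **argv)
{
  Grid_init(&argc,&argv);     // parses --flightrecorder along with the usual options
  std::cout << GridLogMessage << "flight recorder demo" << std::endl;
  Grid_finalize();
  return 0;
}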

@ -50,7 +50,7 @@ namespace Grid{
int64_t index64;
IndexFromCoorReversed(coor,index64,dims);
if ( index64>=2*1024*1024*1024LL ){
std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
// std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
}
assert(index64<2*1024*1024*1024LL);
index = (int) index64;

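As a side note on the guard above (not in the diff): the bound is exactly the signed 32-bit boundary, which is why the index can still be narrowed to int once the assert passes.
static_assert( 2LL*1024*1024*1024 == (1LL<<31), "guard sits at the signed 32-bit boundary");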

@ -66,6 +66,7 @@ namespace Grid{
};
}
template <class T> void writeFile(T& in, std::string const fname){
#ifdef HAVE_LIME
// Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
@ -73,7 +74,7 @@ template <class T> void writeFile(T& in, std::string const fname){
Grid::emptyUserRecord record;
Grid::ScidacWriter WR(in.Grid()->IsBoss());
WR.open(fname);
WR.writeScidacFieldRecord(in,record,0);
WR.writeScidacFieldRecord(in,record,0); // Lexico
WR.close();
#endif
// What is the appropriate way to throw error?
@ -107,8 +108,18 @@ int main(int argc, char **argv) {
for (int conf = CPar.StartConfiguration; conf <= CPar.EndConfiguration; conf+= CPar.Skip){
#if 0
CPNersc.CheckpointRestore(conf, Umu, sRNG, pRNG);
#else
// Don't require Grid format RNGs
FieldMetaData header;
std::string file, filesmr;
file = CPar.conf_path + "/" + CPar.conf_prefix + "." + std::to_string(conf);
filesmr = CPar.conf_path + "/" + CPar.conf_smr_prefix + "." + std::to_string(conf);
NerscIO::readConfiguration(Umu,header,file);
#endif
std::cout << std::setprecision(15);
std::cout << GridLogMessage << "Initial plaquette: "<< WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu) << std::endl;
@ -116,6 +127,7 @@ int main(int argc, char **argv) {
std::string file_post = CPar.conf_prefix + "." + std::to_string(conf);
WilsonFlow<PeriodicGimplR> WF(WFPar.step_size,WFPar.steps,WFPar.meas_interval);
WF.addMeasurement(WFPar.meas_interval_density, [&file_pre,&file_post,&conf](int step, RealD t, const typename PeriodicGimplR::GaugeField &U){
typedef typename PeriodicGimplR::GaugeLinkField GaugeMat;
@ -165,33 +177,48 @@ int main(int argc, char **argv) {
//double coeff = 2.0 / (1.0 * Nd * (Nd - 1)) / 3.0;
//Plq = coeff * Plq;
int tau = std::round(t);
std::string efile = file_pre + "E_dnsty_" + std::to_string(tau) + "_" + file_post;
writeFile(R,efile);
std::string tfile = file_pre + "Top_dnsty_" + std::to_string(tau) + "_" + file_post;
writeFile(qfield,tfile);
RealD WFlow_TC5Li = WilsonLoops<PeriodicGimplR>::TopologicalCharge5Li(U);
int tau = std::round(t);
std::string efile = file_pre + "E_dnsty_" + std::to_string(tau) + "_" + file_post;
// writeFile(R,efile);
std::string tfile = file_pre + "Top_dnsty_" + std::to_string(tau) + "_" + file_post;
// writeFile(qfield,tfile);
std::string ufile = file_pre + "U_" + std::to_string(tau) + "_" + file_post;
{
// PeriodicGimplR::GaugeField Ucopy = U;
// NerscIO::writeConfiguration(Ucopy,ufile);
}
RealD E = real(sum(R))/ RealD(U.Grid()->gSites());
RealD T = real( sum(qfield) );
Coordinate scoor; for (int mu=0; mu < Nd; mu++) scoor[mu] = 0;
RealD E0 = real(peekSite(R,scoor));
RealD T0 = real(peekSite(qfield,scoor));
std::cout << GridLogMessage << "[WilsonFlow] Saved energy density (clover) & topo. charge density: " << conf << " " << step << " " << tau << " "
<< "(E_avg,T_sum) " << E << " " << T << " (E, T at origin) " << E0 << " " << T0 << std::endl;
<< "(E_avg,T_sum) " << E << " " << T << " (E, T at origin) " << E0 << " " << T0 << " Q5Li "<< WFlow_TC5Li << std::endl;
});
int t=WFPar.maxTau;
WF.smear(Uflow, Umu);
// NerscIO::writeConfiguration(Uflow,filesmr);
RealD WFlow_plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(Uflow);
RealD WFlow_TC = WilsonLoops<PeriodicGimplR>::TopologicalCharge(Uflow);
RealD WFlow_TC5Li = WilsonLoops<PeriodicGimplR>::TopologicalCharge5Li(Uflow);
RealD WFlow_T0 = WF.energyDensityPlaquette(t,Uflow); // t
RealD WFlow_EC = WF.energyDensityCloverleaf(t,Uflow);
std::cout << GridLogMessage << "Plaquette "<< conf << " " << WFlow_plaq << std::endl;
std::cout << GridLogMessage << "T0 "<< conf << " " << WFlow_T0 << std::endl;
std::cout << GridLogMessage << "TC0 "<< conf << " " << WFlow_EC << std::endl;
std::cout << GridLogMessage << "TopologicalCharge "<< conf << " " << WFlow_TC << std::endl;
std::cout << GridLogMessage << "Plaquette "<< conf << " " << WFlow_plaq << std::endl;
std::cout << GridLogMessage << "T0 "<< conf << " " << WFlow_T0 << std::endl;
std::cout << GridLogMessage << "TC0 "<< conf << " " << WFlow_EC << std::endl;
std::cout << GridLogMessage << "TopologicalCharge "<< conf << " " << WFlow_TC << std::endl;
std::cout << GridLogMessage << "TopologicalCharge5Li "<< conf << " " << WFlow_TC5Li<< std::endl;
std::cout<< GridLogMessage << " Admissibility check:\n";
const double sp_adm = 0.067; // admissible threshold

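A minimal sketch (not part of the diff) of the measurement pattern above, assuming UGrid from the surrounding setup and a NERSC-format gauge file; the file name is illustrative.
LatticeGaugeField Umu(UGrid);
FieldMetaData header;
NerscIO::readConfiguration(Umu,header,"ckpoint_lat.1000");    // hypothetical file name
RealD plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu);
RealD q5li = WilsonLoops<PeriodicGimplR>::TopologicalCharge5Li(Umu);
std::cout << GridLogMessage << "Plaquette " << plaq << " Q5Li " << q5li << std::endl;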

@ -25,13 +25,20 @@ directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#if Nc == 3
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
#include <Grid/qcd/smearing/JacobianAction.h>
#endif
using namespace Grid;
int main(int argc, char **argv)
{
#if Nc != 3
#warning FTHMC2p1f will not work for Nc != 3
std::cout << "This program will currently only work for Nc == 3." << std::endl;
#else
std::cout << std::setprecision(12);
Grid_init(&argc, &argv);
@ -220,7 +227,6 @@ int main(int argc, char **argv)
TheHMC.Run(SmearingPolicy); // for smearing
Grid_finalize();
#endif
} // main


@ -24,14 +24,22 @@ See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#if Nc == 3
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
#include <Grid/qcd/smearing/JacobianAction.h>
#endif
using namespace Grid;
int main(int argc, char **argv)
{
#if Nc != 3
#warning FTHMC2p1f_3GeV will not work for Nc != 3
std::cout << "This program will currently only work for Nc == 3." << std::endl;
#else
std::cout << std::setprecision(12);
Grid_init(&argc, &argv);
@ -220,6 +228,7 @@ int main(int argc, char **argv)
TheHMC.Run(SmearingPolicy); // for smearing
Grid_finalize();
#endif
} // main


@ -25,13 +25,20 @@ directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#if Nc == 3
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
#include <Grid/qcd/smearing/JacobianAction.h>
#endif
using namespace Grid;
int main(int argc, char **argv)
{
#if Nc != 3
#warning HMC2p1f_3GeV will not work for Nc != 3
std::cout << "This program will currently only work for Nc == 3." << std::endl;
#else
std::cout << std::setprecision(12);
Grid_init(&argc, &argv);
@ -220,6 +227,7 @@ int main(int argc, char **argv)
TheHMC.Run(SmearingPolicy); // for smearing
Grid_finalize();
#endif
} // main


@ -201,8 +201,7 @@ int main(int argc, char **argv) {
Params.dirichlet=NonDirichlet;
ParamsDir.dirichlet=Dirichlet;
ParamsDir.partialDirichlet=0;
std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;
// ParamsDir.partialDirichlet=0;
// double StoppingCondition = 1e-14;
// double MDStoppingCondition = 1e-9;
@ -298,11 +297,11 @@ int main(int argc, char **argv) {
if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet;
else ParamsDen.dirichlet = NonDirichlet;
if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
else ParamsNum.partialDirichlet = 0;
// if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
// else ParamsNum.partialDirichlet = 0;
if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
else ParamsDen.partialDirichlet = 0;
// if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
// else ParamsDen.partialDirichlet = 0;
Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));


@ -333,9 +333,9 @@ int main(int argc, char **argv) {
ParamsF.dirichlet=NonDirichlet;
ParamsDir.dirichlet=Dirichlet;
ParamsDirF.dirichlet=Dirichlet;
ParamsDir.partialDirichlet=1;
ParamsDirF.partialDirichlet=1;
std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;
// ParamsDir.partialDirichlet=1;
// ParamsDirF.partialDirichlet=1;
// std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;
// double StoppingCondition = 1e-14;
// double MDStoppingCondition = 1e-9;
@ -481,21 +481,21 @@ int main(int argc, char **argv) {
if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet;
else ParamsDen.dirichlet = NonDirichlet;
if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
else ParamsNum.partialDirichlet = 0;
// if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
// else ParamsNum.partialDirichlet = 0;
if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
else ParamsDen.partialDirichlet = 0;
// if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
// else ParamsDen.partialDirichlet = 0;
Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));
ParamsDenF.dirichlet = ParamsDen.dirichlet;
ParamsDenF.partialDirichlet = ParamsDen.partialDirichlet;
// ParamsDenF.partialDirichlet = ParamsDen.partialDirichlet;
DenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsDenF));
ParamsNumF.dirichlet = ParamsNum.dirichlet;
ParamsNumF.partialDirichlet = ParamsNum.partialDirichlet;
// ParamsNumF.partialDirichlet = ParamsNum.partialDirichlet;
NumeratorsF.push_back (new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[h],M5,b,c, ParamsNumF));
LinOpD.push_back(new LinearOperatorD(*Denominators[h]));

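The replacement pattern, sketched here for reference and not taken from the diff at this point: the partialDirichlet switch is retired in favour of the dirichlet block parameter alone, and the benchmarks further down pair this with an explicit SloppyComms toggle on the operator. Grids, mass, M5, Umu and the Dirichlet coordinate are assumed from the surrounding setup.
FermionAction::ImplParams Params;
Params.dirichlet = Dirichlet;        // block coordinate; no partialDirichlet field is set
FermionAction Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,Params);
Dw.SloppyComms(true);                // reduced-precision halo exchange
Dw.ImportGauge(Umu);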

@ -166,18 +166,18 @@ int main (int argc, char ** argv)
}
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking concurrent STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking sequential STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
header();
for(int lat=8;lat<=maxlat;lat+=4){
for(int Ls=8;Ls<=8;Ls*=2){
Coordinate latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
lat*mpi_layout[2],
lat*mpi_layout[3]});
lat*mpi_layout[1],
lat*mpi_layout[2],
lat*mpi_layout[3]});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
RealD Nrank = Grid._Nprocessors;
@ -193,101 +193,6 @@ int main (int argc, char ** argv)
rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
}
int ncomm;
double dbytes;
for(int i=0;i<Nloop;i++){
double start=usecond();
dbytes=0;
ncomm=0;
std::vector<CommsRequest_t> requests;
for(int mu=0;mu<4;mu++){
if (mpi_layout[mu]>1 ) {
ncomm++;
int comm_proc=1;
int xmit_to_rank;
int recv_from_rank;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
dbytes+=
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu][0],
xmit_to_rank,1,
(void *)&rbuf[mu][0],
recv_from_rank,1,
bytes,bytes,mu);
comm_proc = mpi_layout[mu]-1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
dbytes+=
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu+4][0],
xmit_to_rank,1,
(void *)&rbuf[mu+4][0],
recv_from_rank,1,
bytes,bytes,mu+4);
}
}
Grid.StencilSendToRecvFromComplete(requests,0);
Grid.Barrier();
double stop=usecond();
t_time[i] = stop-start; // microseconds
}
timestat.statistics(t_time);
dbytes=dbytes*ppn;
double xbytes = dbytes*0.5;
// double rbytes = dbytes*0.5;
double bidibytes = dbytes;
std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
<<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
<<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
<<xbytes/timestat.max <<" "<< xbytes/timestat.min
<< "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
<< bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
}
}
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking sequential STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
header();
for(int lat=8;lat<=maxlat;lat+=4){
for(int Ls=8;Ls<=8;Ls*=2){
Coordinate latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
lat*mpi_layout[2],
lat*mpi_layout[3]});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
RealD Nrank = Grid._Nprocessors;
RealD Nnode = Grid.NodeCount();
RealD ppn = Nrank/Nnode;
std::vector<HalfSpinColourVectorD *> xbuf(8);
std::vector<HalfSpinColourVectorD *> rbuf(8);
Grid.ShmBufferFreeAll();
uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
for(int d=0;d<8;d++){
xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
}
int ncomm;
double dbytes;
for(int i=0;i<Nloop;i++){
@ -296,45 +201,34 @@ int main (int argc, char ** argv)
std::vector<CommsRequest_t> requests;
dbytes=0;
ncomm=0;
for(int mu=0;mu<4;mu++){
for(int dir=0;dir<8;dir++) {
double tbytes;
int mu =dir % 4;
if (mpi_layout[mu]>1 ) {
ncomm++;
int comm_proc=1;
int xmit_to_rank;
int recv_from_rank;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
dbytes+=
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu][0],
xmit_to_rank,1,
(void *)&rbuf[mu][0],
recv_from_rank,1,
bytes,bytes,mu);
Grid.StencilSendToRecvFromComplete(requests,mu);
requests.resize(0);
if ( dir == mu ) {
int comm_proc=1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
} else {
int comm_proc = mpi_layout[mu]-1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
}
int tid = omp_get_thread_num();
tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,1,
(void *)&rbuf[dir][0], recv_from_rank,1, bytes,tid);
comm_proc = mpi_layout[mu]-1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
dbytes+=
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu+4][0],
xmit_to_rank,1,
(void *)&rbuf[mu+4][0],
recv_from_rank,1,
bytes,bytes,mu+4);
Grid.StencilSendToRecvFromComplete(requests,mu+4);
requests.resize(0);
dbytes+=tbytes;
}
}
}
Grid.Barrier();
double stop=usecond();
t_time[i] = stop-start; // microseconds
}
timestat.statistics(t_time);

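For the numbers printed above, the bandwidth arithmetic is worth spelling out; this restatement is not in the diff. Times are in microseconds, so bytes per microsecond is numerically MB/s, and the quoted error propagates the timing spread through the ratio.
double xbytes    = dbytes*0.5;                                          // one direction of the traffic
double bidibytes = dbytes;                                              // both directions
double rate      = xbytes/timestat.mean;                                // bytes/us (~MB/s)
double rate_err  = xbytes*timestat.err/(timestat.mean*timestat.mean);   // d(x/t) ~ x*dt/t^2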

@ -32,18 +32,18 @@
using namespace std;
using namespace Grid;
template<class d>
struct scal {
d internal;
////////////////////////
/// Move to domains ////
////////////////////////
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
void Benchmark(int Ls, Coordinate Dirichlet,bool Sloppy);
int main (int argc, char ** argv)
{
@ -52,39 +52,108 @@ int main (int argc, char ** argv)
int threads = GridThread::GetThreads();
Coordinate latt4 = GridDefaultLatt();
int Ls=8;
for(int i=0;i<argc;i++)
int Ls=16;
for(int i=0;i<argc;i++) {
if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls;
}
}
//////////////////
// With comms
//////////////////
Coordinate Dirichlet(Nd+1,0);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,false);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,true);
//////////////////
// Domain decomposed
//////////////////
/*
Coordinate latt4 = GridDefaultLatt();
Coordinate mpi = GridDefaultMpi();
Coordinate CommDim(Nd);
Coordinate shm;
GlobalSharedMemory::GetShmDims(mpi,shm);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
// std::cout << GridLogMessage<< " Testing without internode communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
Dirichlet[0] = 0;
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
Benchmark(Ls,Dirichlet,false);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
Benchmark(Ls,Dirichlet,true);
*/
Grid_finalize();
exit(0);
}
void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
{
Coordinate latt4 = GridDefaultLatt();
GridLogLayout();
long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
#undef SINGLE
#ifdef SINGLE
typedef vComplexF Simd;
typedef LatticeFermionF FermionField;
typedef LatticeGaugeFieldF GaugeField;
typedef LatticeColourMatrixF ColourMatrixField;
typedef DomainWallFermionF FermionAction;
#else
typedef vComplexD Simd;
typedef LatticeFermionD FermionField;
typedef LatticeGaugeFieldD GaugeField;
typedef LatticeColourMatrixD ColourMatrixField;
typedef DomainWallFermionD FermionAction;
#endif
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
LatticeFermion src (FGrid); random(RNG5,src);
FermionField src (FGrid); random(RNG5,src);
#if 0
src = Zero();
{
@ -100,46 +169,39 @@ int main (int argc, char ** argv)
src = src*N2;
#endif
LatticeFermion result(FGrid); result=Zero();
LatticeFermion ref(FGrid); ref=Zero();
LatticeFermion tmp(FGrid);
LatticeFermion err(FGrid);
FermionField result(FGrid); result=Zero();
FermionField ref(FGrid); ref=Zero();
FermionField tmp(FGrid);
FermionField err(FGrid);
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
LatticeGaugeField Umu(UGrid);
GaugeField Umu(UGrid);
GaugeField UmuCopy(UGrid);
SU<Nc>::HotConfiguration(RNG4,Umu);
// SU<Nc>::ColdConfiguration(Umu);
UmuCopy=Umu;
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
#if 0
Umu=1.0;
for(int mu=0;mu<Nd;mu++){
LatticeColourMatrix ttmp(UGrid);
ttmp = PeekIndex<LorentzIndex>(Umu,mu);
// if (mu !=2 ) ttmp = 0;
// ttmp = ttmp* pow(10.0,mu);
PokeIndex<LorentzIndex>(Umu,ttmp,mu);
}
std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
#endif
////////////////////////////////////
// Apply BCs
////////////////////////////////////
Coordinate Block(4);
for(int d=0;d<4;d++) Block[d]= Dirichlet[d+1];
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block5 " << Dirichlet << std::endl;
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block4 " << Block << std::endl;
DirichletFilter<GaugeField> Filter(Block);
Filter.applyFilter(Umu);
////////////////////////////////////
// Naive wilson implementation
////////////////////////////////////
// replicate across fifth dimension
LatticeGaugeField Umu5d(FGrid);
std::vector<LatticeColourMatrix> U(4,FGrid);
{
autoView( Umu5d_v, Umu5d, CpuWrite);
autoView( Umu_v , Umu , CpuRead);
for(int ss=0;ss<Umu.Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
Umu5d_v[Ls*ss+s] = Umu_v[ss];
}
}
}
std::vector<ColourMatrixField> U(4,UGrid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
}
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
if (1)
@ -147,10 +209,28 @@ int main (int argc, char ** argv)
ref = Zero();
for(int mu=0;mu<Nd;mu++){
tmp = U[mu]*Cshift(src,mu+1,1);
tmp = Cshift(src,mu+1,1);
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
}
}
}
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
tmp =adj(U[mu])*src;
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
}
@ -167,11 +247,9 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionD::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(vComplex)<< " B"<<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(Simd)<< " B"<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
@ -181,9 +259,15 @@ int main (int argc, char ** argv)
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
DomainWallFermionD Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
int ncall =1000;
FermionAction::ImplParams p;
p.dirichlet=Dirichlet;
FermionAction Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
Dw.SloppyComms(sloppy);
Dw.ImportGauge(Umu);
int ncall =300;
RealD n2e;
if (1) {
FGrid->Barrier();
Dw.Dhop(src,result,0);
@ -198,8 +282,8 @@ int main (int argc, char ** argv)
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=single_site_flops*volume*ncall;
auto nsimd = vComplex::Nsimd();
auto simdwidth = sizeof(vComplex);
auto nsimd = Simd::Nsimd();
auto simdwidth = sizeof(Simd);
// RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
@ -208,28 +292,27 @@ int main (int argc, char ** argv)
double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
// std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
std::cout<<GridLogMessage << "RF GiB/s (base 2) = "<< 1000000. * data_rf/((t1-t0))<<std::endl;
std::cout<<GridLogMessage << "mem GiB/s (base 2) = "<< 1000000. * data_mem/((t1-t0))<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
//exit(0);
n2e = norm2(err);
std::cout<<GridLogMessage << "norm diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
if(( norm2(err)>1.0e-4) ) {
/*
std::cout << "RESULT\n " << result<<std::endl;
std::cout << "REF \n " << ref <<std::endl;
std::cout << "ERR \n " << err <<std::endl;
*/
if(( n2e>1.0e-4) ) {
std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
FGrid->Barrier();
std::cout<<GridLogMessage << "RESULT" << std::endl;
// std::cout << result<<std::endl;
std::cout << norm2(result)<<std::endl;
std::cout<<GridLogMessage << "REF" << std::endl;
std::cout << norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "ERR" << std::endl;
std::cout << norm2(err)<<std::endl;
FGrid->Barrier();
exit(-1);
}
assert (norm2(err)< 1.0e-4 );
assert (n2e< 1.0e-4 );
}
if (1)
@ -238,16 +321,30 @@ int main (int argc, char ** argv)
for(int mu=0;mu<Nd;mu++){
// ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
tmp = U[mu]*Cshift(src,mu+1,1);
tmp = Cshift(src,mu+1,1);
{
autoView( ref_v, ref, CpuWrite);
autoView( tmp_v, tmp, CpuRead);
for(int i=0;i<ref_v.size();i++){
ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ;
autoView( U_v , U[mu] , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
int i=s+Ls*ss;
ref_v[i]+= U_v[ss]*(tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]); ;
}
}
}
tmp =adj(U[mu])*src;
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
// tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu+1,-1);
{
autoView( ref_v, ref, CpuWrite);
@ -259,27 +356,27 @@ int main (int argc, char ** argv)
}
ref = -0.5*ref;
}
// dump=1;
Dw.Dhop(src,result,1);
Dw.Dhop(src,result,DaggerYes);
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm dag ref "<< norm2(ref)<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm dag diff "<< norm2(err)<<std::endl;
if((norm2(err)>1.0e-4)){
/*
std::cout<< "DAG RESULT\n " <<ref << std::endl;
std::cout<< "DAG sRESULT\n " <<result << std::endl;
std::cout<< "DAG ERR \n " << err <<std::endl;
*/
}
LatticeFermion src_e (FrbGrid);
LatticeFermion src_o (FrbGrid);
LatticeFermion r_e (FrbGrid);
LatticeFermion r_o (FrbGrid);
LatticeFermion r_eo (FGrid);
n2e= norm2(err);
std::cout<<GridLogMessage << "norm dag diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
assert((n2e)<1.0e-4);
FermionField src_e (FrbGrid);
FermionField src_o (FrbGrid);
FermionField r_e (FrbGrid);
FermionField r_o (FrbGrid);
FermionField r_eo (FGrid);
std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
pickCheckerboard(Even,src_e,src);
@ -291,10 +388,8 @@ int main (int argc, char ** argv)
// S-direction is INNERMOST and takes no part in the parity.
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionD::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermion::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
@ -308,13 +403,7 @@ int main (int argc, char ** argv)
Dw.DhopEO(src_o,r_e,DaggerNo);
double t0=usecond();
for(int i=0;i<ncall;i++){
#ifdef CUDA_PROFILE
if(i==10) cudaProfilerStart();
#endif
Dw.DhopEO(src_o,r_e,DaggerNo);
#ifdef CUDA_PROFILE
if(i==20) cudaProfilerStop();
#endif
}
double t1=usecond();
FGrid->Barrier();
@ -338,14 +427,9 @@ int main (int argc, char ** argv)
setCheckerboard(r_eo,r_e);
err = r_eo-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
if((norm2(err)>1.0e-4)){
/*
std::cout<< "Deo RESULT\n " <<r_eo << std::endl;
std::cout<< "Deo REF\n " <<result << std::endl;
std::cout<< "Deo ERR \n " << err <<std::endl;
*/
}
n2e= norm2(err);
std::cout<<GridLogMessage << "norm diff "<< n2e<<std::endl;
assert(n2e<1.0e-4);
pickCheckerboard(Even,src_e,err);
pickCheckerboard(Odd,src_o,err);
@ -354,6 +438,4 @@ int main (int argc, char ** argv)
assert(norm2(src_e)<1.0e-4);
assert(norm2(src_o)<1.0e-4);
Grid_finalize();
exit(0);
}

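A compact restatement (not part of the diff) of the flop-rate bookkeeping used in the Dhop timings above; since usecond() returns microseconds, flops per microsecond is already Mflop/s.
long unsigned int single_site_flops = 8*Nc*(7+16*Nc);    // Wilson dslash flops per 4d site
double volume = Ls; for(int mu=0;mu<Nd;mu++) volume = volume*latt4[mu];
double flops  = single_site_flops*volume*ncall;           // ncall applications of Dhop
double mflops = flops/(t1-t0);                            // t0,t1 from usecond()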

@ -43,7 +43,7 @@ Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaT
};
void Benchmark(int Ls, Coordinate Dirichlet);
void Benchmark(int Ls, Coordinate Dirichlet,bool Sloppy);
int main (int argc, char ** argv)
{
@ -69,11 +69,19 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet);
Benchmark(Ls,Dirichlet,false);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,true);
//////////////////
// Domain decomposed
//////////////////
/*
Coordinate latt4 = GridDefaultLatt();
Coordinate mpi = GridDefaultMpi();
Coordinate CommDim(Nd);
@ -81,42 +89,35 @@ int main (int argc, char ** argv)
GlobalSharedMemory::GetShmDims(mpi,shm);
//////////////////////
// Node level
//////////////////////
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing without internode communication " <<std::endl;
// std::cout << GridLogMessage<< " Testing without internode communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
// Dirichlet[0] = 0;
// Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
// Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
// Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
// Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
Dirichlet[0] = 0;
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
Benchmark(Ls,Dirichlet);
Benchmark(Ls,Dirichlet,false);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing without intranode communication " <<std::endl;
std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
// Dirichlet[0] = 0;
// Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
// Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
// Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
// Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
Benchmark(Ls,Dirichlet);
Benchmark(Ls,Dirichlet,true);
*/
Grid_finalize();
exit(0);
}
void Benchmark(int Ls, Coordinate Dirichlet)
void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
{
Coordinate latt4 = GridDefaultLatt();
GridLogLayout();
@ -132,21 +133,13 @@ void Benchmark(int Ls, Coordinate Dirichlet)
typedef LatticeGaugeFieldF GaugeField;
typedef LatticeColourMatrixF ColourMatrixField;
typedef DomainWallFermionF FermionAction;
#endif
#ifdef DOUBLE
#else
typedef vComplexD Simd;
typedef LatticeFermionD FermionField;
typedef LatticeGaugeFieldD GaugeField;
typedef LatticeColourMatrixD ColourMatrixField;
typedef DomainWallFermionD FermionAction;
#endif
#ifdef DOUBLE2
typedef vComplexD2 Simd;
typedef LatticeFermionD2 FermionField;
typedef LatticeGaugeFieldD2 GaugeField;
typedef LatticeColourMatrixD2 ColourMatrixField;
typedef DomainWallFermionD2 FermionAction;
#endif
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@ -269,6 +262,7 @@ void Benchmark(int Ls, Coordinate Dirichlet)
FermionAction::ImplParams p;
p.dirichlet=Dirichlet;
FermionAction Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
Dw.SloppyComms(sloppy);
Dw.ImportGauge(Umu);
int ncall =300;


@ -1,465 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_dwf.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#ifdef GRID_CUDA
#define CUDA_PROFILE
#endif
#ifdef CUDA_PROFILE
#include <cuda_profiler_api.h>
#endif
using namespace std;
using namespace Grid;
////////////////////////
/// Move to domains ////
////////////////////////
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
void Benchmark(int Ls, Coordinate Dirichlet, int partial);
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
int threads = GridThread::GetThreads();
int Ls=8;
for(int i=0;i<argc;i++) {
if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls;
}
}
//////////////////
// With comms
//////////////////
Coordinate Dirichlet(Nd+1,0);
for(auto partial : {0}) {
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,partial);
}
//////////////////
// Domain decomposed
//////////////////
Coordinate latt4 = GridDefaultLatt();
Coordinate mpi = GridDefaultMpi();
Coordinate CommDim(Nd);
//Coordinate shm({2,1,1,1});
Coordinate shm;
GlobalSharedMemory::GetShmDims(mpi,shm);
std::cout <<GridLogMessage << " Shared memory MPI decomp is " <<shm<<std::endl;
//////////////////////
// Node level
//////////////////////
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
// for(int d=0;d<Nd;d++) CommDim[d]= 1;
Dirichlet[0] = 0;
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
for(auto partial : {0,1}) {
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing without internode communication partial dirichlet="<<partial <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,partial);
}
for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
Dirichlet[0] = 0;
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
for(auto partial : {0,1}) {
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing without intranode communication; partial dirichlet= "<<partial <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,partial);
}
Grid_finalize();
exit(0);
}
void Benchmark(int Ls, Coordinate Dirichlet, int partial)
{
Coordinate latt4 = GridDefaultLatt();
GridLogLayout();
long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
#define SINGLE
#ifdef SINGLE
typedef vComplexF Simd;
typedef LatticeFermionF FermionField;
typedef LatticeGaugeFieldF GaugeField;
typedef LatticeColourMatrixF ColourMatrixField;
typedef DomainWallFermionF FermionAction;
#endif
#ifdef DOUBLE
typedef vComplexD Simd;
typedef LatticeFermionD FermionField;
typedef LatticeGaugeFieldD GaugeField;
typedef LatticeColourMatrixD ColourMatrixField;
typedef DomainWallFermionD FermionAction;
#endif
#ifdef DOUBLE2
typedef vComplexD2 Simd;
typedef LatticeFermionD2 FermionField;
typedef LatticeGaugeFieldD2 GaugeField;
typedef LatticeColourMatrixD2 ColourMatrixField;
typedef DomainWallFermionD2 FermionAction;
#endif
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
FermionField src (FGrid); random(RNG5,src);
#if 0
src = Zero();
{
Coordinate origin({0,0,0,latt4[2]-1,0});
SpinColourVectorF tmp;
tmp=Zero();
tmp()(0)(0)=Complex(-2.0,0.0);
std::cout << " source site 0 " << tmp<<std::endl;
pokeSite(tmp,src,origin);
}
#else
RealD N2 = 1.0/::sqrt(norm2(src));
src = src*N2;
#endif
FermionField result(FGrid); result=Zero();
FermionField ref(FGrid); ref=Zero();
FermionField tmp(FGrid);
FermionField err(FGrid);
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
GaugeField Umu(UGrid);
GaugeField UmuFull(UGrid);
GaugeField UmuCopy(UGrid);
SU<Nc>::HotConfiguration(RNG4,Umu);
UmuCopy=Umu;
UmuFull=Umu;
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
////////////////////////////////////
// Apply BCs
////////////////////////////////////
Coordinate Block(4);
for(int d=0;d<4;d++) Block[d]= Dirichlet[d+1];
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block5 " << Dirichlet << std::endl;
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block4 " << Block << std::endl;
DirichletFilter<GaugeField> Filter(Block);
Filter.applyFilter(Umu);
if(!partial) Filter.applyFilter(UmuCopy);
////////////////////////////////////
// Naive wilson implementation
////////////////////////////////////
std::vector<ColourMatrixField> U(4,UGrid);
std::vector<ColourMatrixField> Ucopy(4,UGrid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
Ucopy[mu] = PeekIndex<LorentzIndex>(UmuCopy,mu);
}
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
if (1)
{
ref = Zero();
for(int mu=0;mu<Nd;mu++){
int depth=dwf_compressor_depth;
tmp = Cshift(src,mu+1,1);
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
if ( (s<depth) || (s>=Ls-depth)){
tmp_v[Ls*ss+s] = Ucopy_v[ss]*tmp_v[Ls*ss+s];
} else {
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
}
}
}
}
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
if ( (s<depth) || (s>=Ls-depth)){
tmp_v[Ls*ss+s] = adj(Ucopy_v[ss])*src_v[Ls*ss+s];
} else {
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
}
tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
}
ref = -0.5*ref;
}
RealD mass=0.1;
RealD M5 =1.8;
RealD NP = UGrid->_Nprocessors;
RealD NN = UGrid->NodeCount();
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
std::cout << GridLogMessage <<"* BCs for Dirichlet Block4 " << Block << std::endl;
std::cout << GridLogMessage <<"* Partial Dirichlet BC = " << partial << std::endl;
std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(Simd)<< " B"<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
FermionAction::ImplParams p;
p.dirichlet=Dirichlet;
p.partialDirichlet=partial;
FermionAction Dw(UmuFull,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
int ncall =1;
RealD n2e;
if (1) {
FGrid->Barrier();
Dw.Dhop(src,result,0);
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.Dhop(src,result,0);
}
double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=single_site_flops*volume*ncall;
auto nsimd = Simd::Nsimd();
auto simdwidth = sizeof(Simd);
// RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
// mem: Nd Wilson * Ls, Nd gauge, Nc colors
double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
err = ref-result;
n2e = norm2(err);
std::cout<<GridLogMessage << "norm diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
if(( n2e>1.0e-4) ) {
std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
FGrid->Barrier();
DumpSliceNorm("s-slice ref ",ref,1);
DumpSliceNorm("s-slice res ",result,1);
DumpSliceNorm("s-slice error ",err,1);
exit(-1);
}
assert (n2e< 1.0e-4 );
}
if (1)
{ // Naive wilson dag implementation
ref = Zero();
for(int mu=0;mu<Nd;mu++){
int depth=dwf_compressor_depth;
tmp = Cshift(src,mu+1,1);
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
if ( (s<depth) || (s>=Ls-depth)){
tmp_v[Ls*ss+s] = Ucopy_v[ss]*tmp_v[Ls*ss+s];
} else {
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
}
}
}
}
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
if ( (s<depth) || (s>=Ls-depth)){
tmp_v[Ls*ss+s] = adj(Ucopy_v[ss])*src_v[Ls*ss+s];
} else {
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
}
tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
}
ref = -0.5*ref;
}
Dw.Dhop(src,result,DaggerYes);
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm dag ref "<< norm2(ref)<<std::endl;
err = ref-result;
n2e= norm2(err);
std::cout<<GridLogMessage << "norm dag diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
assert((n2e)<1.0e-4);
FermionField src_e (FrbGrid);
FermionField src_o (FrbGrid);
FermionField r_e (FrbGrid);
FermionField r_o (FrbGrid);
FermionField r_eo (FGrid);
std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
// S-direction is INNERMOST and takes no part in the parity.
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermion::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
{
FGrid->Barrier();
Dw.DhopEO(src_o,r_e,DaggerNo);
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.DhopEO(src_o,r_e,DaggerNo);
}
double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(single_site_flops*volume*ncall)/2.0;
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NN<<std::endl;
}
Dw.DhopEO(src_o,r_e,DaggerNo);
Dw.DhopOE(src_e,r_o,DaggerNo);
Dw.Dhop (src ,result,DaggerNo);
std::cout<<GridLogMessage << "r_e"<<norm2(r_e)<<std::endl;
std::cout<<GridLogMessage << "r_o"<<norm2(r_o)<<std::endl;
std::cout<<GridLogMessage << "res"<<norm2(result)<<std::endl;
setCheckerboard(r_eo,r_o);
setCheckerboard(r_eo,r_e);
err = r_eo-result;
n2e= norm2(err);
std::cout<<GridLogMessage << "norm diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
assert(n2e<1.0e-4);
pickCheckerboard(Even,src_e,err);
pickCheckerboard(Odd,src_o,err);
std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl;
assert(norm2(src_e)<1.0e-4);
assert(norm2(src_o)<1.0e-4);
}


@ -492,17 +492,18 @@ public:
}
FGrid->Barrier();
double t1=usecond();
uint64_t ncall = 500;
FGrid->Broadcast(0,&ncall,sizeof(ncall));
uint64_t no = 50;
uint64_t ni = 100;
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
time_statistics timestat;
std::vector<double> t_time(ncall);
for(uint64_t i=0;i<ncall;i++){
std::vector<double> t_time(no);
for(uint64_t i=0;i<no;i++){
t0=usecond();
Dw.DhopEO(src_o,r_e,DaggerNo);
for(uint64_t j=0;j<ni;j++){
Dw.DhopEO(src_o,r_e,DaggerNo);
}
t1=usecond();
t_time[i] = t1-t0;
}
@ -520,11 +521,11 @@ public:
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
mf_hi = flops/timestat.min;
mf_lo = flops/timestat.max;
mf_hi = flops/timestat.min*ni;
mf_lo = flops/timestat.max*ni;
mf_err= flops/timestat.min * timestat.err/timestat.mean;
mflops = flops/timestat.mean;
mflops = flops/timestat.mean*ni;
mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops;
@ -535,6 +536,7 @@ public:
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call "<< timestat.mean/ni<<std::endl;
}
@ -654,17 +656,19 @@ public:
}
FGrid->Barrier();
double t1=usecond();
uint64_t ncall = 500;
FGrid->Broadcast(0,&ncall,sizeof(ncall));
uint64_t no = 50;
uint64_t ni = 100;
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
time_statistics timestat;
std::vector<double> t_time(ncall);
for(uint64_t i=0;i<ncall;i++){
std::vector<double> t_time(no);
for(uint64_t i=0;i<no;i++){
t0=usecond();
Ds.DhopEO(src_o,r_e,DaggerNo);
for(uint64_t j=0;j<ni;j++){
Ds.DhopEO(src_o,r_e,DaggerNo);
}
t1=usecond();
t_time[i] = t1-t0;
}
@ -675,11 +679,11 @@ public:
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
mf_hi = flops/timestat.min;
mf_lo = flops/timestat.max;
mf_hi = flops/timestat.min*ni;
mf_lo = flops/timestat.max*ni;
mf_err= flops/timestat.min * timestat.err/timestat.mean;
mflops = flops/timestat.mean;
mflops = flops/timestat.mean*ni;
mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops;
@ -689,6 +693,7 @@ public:
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call "<< timestat.mean/ni<<std::endl;
}
@ -792,19 +797,18 @@ public:
Dc.M(src,r);
}
FGrid->Barrier();
double t1=usecond();
uint64_t ncall = 500;
FGrid->Broadcast(0,&ncall,sizeof(ncall));
uint64_t ni = 100;
uint64_t no = 50;
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
time_statistics timestat;
std::vector<double> t_time(ncall);
for(uint64_t i=0;i<ncall;i++){
t0=usecond();
Dc.M(src,r);
t1=usecond();
std::vector<double> t_time(no);
for(uint64_t i=0;i<no;i++){
double t0=usecond();
for(uint64_t j=0;j<ni;j++){
Dc.M(src,r);
}
double t1=usecond();
t_time[i] = t1-t0;
}
FGrid->Barrier();
@ -814,20 +818,21 @@ public:
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
mf_hi = flops/timestat.min;
mf_lo = flops/timestat.max;
mf_hi = flops/timestat.min*ni;
mf_lo = flops/timestat.max*ni;
mf_err= flops/timestat.min * timestat.err/timestat.mean;
mflops = flops/timestat.mean;
mflops = flops/timestat.mean*ni;
mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops;
if ( mflops>mflops_best ) mflops_best = mflops;
if ( mflops<mflops_worst) mflops_worst= mflops;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<" "<<timestat.mean<<" us"<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per node "<< mflops/NN<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov us per call "<< timestat.mean/ni<<std::endl;
}
@ -872,7 +877,7 @@ int main (int argc, char ** argv)
int do_dslash=1;
int sel=4;
std::vector<int> L_list({8,12,16,24});
std::vector<int> L_list({8,12,16,24,32});
int selm1=sel-1;
std::vector<double> clover;
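Editorial sketch (not part of the patch): the three timing hunks in this file all make the same change, replacing one timed call per sample with ni back-to-back calls per sample over no samples, then rescaling the per-call figures by ni. In outline, with do_call, flops_per_call and usec_now as hypothetical stand-ins for the Grid operator, its flop count per call and usecond():

// Minimal sketch of the ni-inner / no-outer timing pattern (placeholder names,
// not Grid code): each sample times ni consecutive calls, so per-call figures
// must be rescaled by ni.
#include <algorithm>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <numeric>
#include <vector>

static double usec_now() {
  using namespace std::chrono;
  return duration<double, std::micro>(steady_clock::now().time_since_epoch()).count();
}

void time_kernel(void (*do_call)(), double flops_per_call) {
  const uint64_t no = 50;                  // outer samples kept for statistics
  const uint64_t ni = 100;                 // calls timed per sample
  std::vector<double> t_time(no);
  for (uint64_t i = 0; i < no; i++) {
    double t0 = usec_now();
    for (uint64_t j = 0; j < ni; j++) do_call();
    t_time[i] = usec_now() - t0;           // microseconds for ni calls
  }
  const double mean = std::accumulate(t_time.begin(), t_time.end(), 0.0) / no;
  const double tmin = *std::min_element(t_time.begin(), t_time.end());
  const double tmax = *std::max_element(t_time.begin(), t_time.end());
  // flops_per_call is in flop and times are in microseconds, so flop/us = Mflop/s;
  // multiply by ni because each sample covers ni calls.
  std::printf("mflop/s = %.1f (best %.1f, worst %.1f), us per call %.1f\n",
              flops_per_call / mean * ni,
              flops_per_call / tmin * ni,
              flops_per_call / tmax * ni,
              mean / ni);
}

Dividing by timestat.mean alone would understate the rate by a factor of ni, hence the *ni factors and the new "us per call" line (timestat.mean/ni) in the hunks above.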

View File

@ -151,7 +151,7 @@ AC_ARG_ENABLE([tracing],
case ${ac_TRACING} in
nvtx)
AC_DEFINE([GRID_TRACING_NVTX],[1],[use NVTX])
LIBS="${LIBS} -lnvToolsExt64_1"
LIBS="${LIBS} -lnvToolsExt"
;;
roctx)
AC_DEFINE([GRID_TRACING_ROCTX],[1],[use ROCTX])

View File

@ -93,10 +93,13 @@ int main(int argc, char ** argv)
Real coeff = (width*width) / Real(4*Iterations);
chi=kronecker;
// chi = (1-p^2/2N)^N kronecker
for(int n = 0; n < Iterations; ++n) {
Laplacian.M(chi,psi);
chi = chi - coeff*psi;
RealD n2 = norm2(chi);
chi = chi * (1.0/std::sqrt(n2));
}
std::cout << " Wuppertal smeared operator is chi = \n" << chi <<std::endl;

View File

@ -1,18 +1,19 @@
#Ahead of time compile for PVC
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib"
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/"
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc"
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/ -fPIC"
#JIT compile
#export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
#export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions "
../../configure \
../configure \
--enable-simd=GPU \
--enable-reduction=grid \
--enable-gen-simd-width=64 \
--enable-comms=mpi-auto \
--enable-debug \
--prefix $HOME/gpt-install \
--disable-gparity \
--disable-fermion-reps \
--with-lime=$CLIME \

View File

@ -0,0 +1,22 @@
CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
../../configure --enable-comms=mpi-auto \
--with-lime=$CLIME \
--enable-unified=no \
--enable-shm=nvlink \
--enable-tracing=none \
--enable-accelerator=hip \
--enable-gen-simd-width=64 \
--disable-gparity \
--disable-fermion-reps \
--enable-simd=GPU \
--with-gmp=$OLCF_GMP_ROOT \
--with-fftw=$FFTW_DIR/.. \
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
--disable-fermion-reps \
CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas"

View File

@ -0,0 +1,16 @@
echo spack
. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
#module load cce/15.0.1
module load rocm/6.3.1
module load cray-fftw
module load craype-accel-amd-gfx90a
export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
#Ugly hacks to get down level software working on current system
#export LD_LIBRARY_PATH=/opt/cray/libfabric/1.20.1/lib64/:$LD_LIBRARY_PATH
#export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
#ln -s /opt/rocm-6.0.0/lib/libamdhip64.so.6 .

View File

@ -30,14 +30,10 @@ source ${root}/sourceme.sh
export OMP_NUM_THREADS=7
export MPICH_GPU_SUPPORT_ENABLED=1
export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
for vol in 32.32.32.64
#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
#64.64.32.96
for vol in 64.64.32.64
do
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.ov.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.ov.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.seq.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.seq.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol -Ls 16
done

View File

@ -3,20 +3,19 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
--with-lime=$CLIME \
--enable-unified=no \
--enable-shm=nvlink \
--enable-tracing=timer \
--enable-tracing=none \
--enable-accelerator=hip \
--enable-gen-simd-width=64 \
--disable-gparity \
--disable-fermion-reps \
--enable-simd=GPU \
--enable-accelerator-cshift \
--with-gmp=$OLCF_GMP_ROOT \
--with-fftw=$FFTW_DIR/.. \
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
--disable-fermion-reps \
CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas"
CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas"

View File

@ -1,12 +1,25 @@
echo spack
. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
spack load c-lime
module load emacs
module load PrgEnv-gnu
module load rocm/6.0.0
module load cray-mpich
module load gmp
module load cce/15.0.1
module load rocm/5.3.0
module load cray-fftw
module load craype-accel-amd-gfx90a
#Ugly hacks to get down level software working on current system
export LD_LIBRARY_PATH=/opt/cray/libfabric/1.20.1/lib64/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
ln -s /opt/rocm-6.0.0/lib/libamdhip64.so.6 .
#echo spack load c-lime
#spack load c-lime
#module load emacs
##module load PrgEnv-gnu
##module load cray-mpich
##module load cray-fftw
##module load craype-accel-amd-gfx90a
##export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
#Hack for lib
#export LD_LIBRARY_PATH=`pwd`:$LD_LIBRARY_PATH
##export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH

View File

@ -0,0 +1,273 @@
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
SLURM detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA GH200 120GB
AcceleratorCudaInit[0]: totalGlobalMem: 102005473280
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 1
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0009:01:00.0
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 4
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x4002c0000000 - 40033fffffff for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=3737a24096282ea179607fc879814710860a0de6: (HEAD -> develop, origin/develop, origin/HEAD) clean
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : This rank is running on host jpbo-119-30.jupiter.internal
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 81604378624 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent host allocations: SMALL 8 LARGE 2 HUGE 0
Grid : Message : MemoryManager::Init() cache pool for recent device allocations: SMALL 16 LARGE 8 Huge 0
Grid : Message : MemoryManager::Init() cache pool for recent shared allocations: SMALL 16 LARGE 8 Huge 0
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 0.303000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 0.309000 s : Testing with full communication
Grid : Message : 0.312000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 0.313000 s : Grid Layout
Grid : Message : 0.313000 s : Global lattice size : 32 32 64 64
Grid : Message : 0.319000 s : OpenMP threads : 4
Grid : Message : 0.320000 s : MPI tasks : 1 1 2 2
Grid : Message : 0.129590 s : Initialising 4d RNG
Grid : Message : 0.764790 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 0.764920 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 0.942440 s : Initialising 5d RNG
Grid : Message : 1.149388 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.149404 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
local rank 1 device 0 bus id: 0019:01:00.0
local rank 2 device 0 bus id: 0029:01:00.0
local rank 3 device 0 bus id: 0039:01:00.0
Grid : Message : 43.893114 s : Drawing gauge field
Grid : Message : 54.574150 s : Random gauge initialised
Grid : Message : 54.574170 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 54.574172 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 54.580032 s : Setting up Cshift based reference
Grid : Message : 60.407451 s : *****************************************************************
Grid : Message : 60.407469 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 60.407470 s : *****************************************************************
Grid : Message : 60.407471 s : *****************************************************************
Grid : Message : 60.407472 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 60.407473 s : * Vectorising space-time by 8
Grid : Message : 60.407475 s : * VComplex size is 64 B
Grid : Message : 60.407477 s : * Using Overlapped Comms/Compute
Grid : Message : 60.407479 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 60.407480 s : *****************************************************************
Grid : Message : 61.102178 s : Called warmup
Grid : Message : 62.177160 s : Called Dw 300 times in 1074958 us
Grid : Message : 62.177198 s : mflop/s = 24721998.6
Grid : Message : 62.177201 s : mflop/s per rank = 6180499.64
Grid : Message : 62.177204 s : mflop/s per node = 24721998.6
Grid : Message : 62.182696 s : norm diff 5.8108784e-14 Line 306
Grid : Message : 71.328862 s : ----------------------------------------------------------------
Grid : Message : 71.328884 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 71.328885 s : ----------------------------------------------------------------
Grid : Message : 71.328886 s : Called DwDag
Grid : Message : 71.328887 s : norm dag result 4.12810493
Grid : Message : 71.329493 s : norm dag ref 4.12810493
Grid : Message : 71.331967 s : norm dag diff 3.40632318e-14 Line 377
Grid : Message : 71.394727 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 71.803650 s : src_e0.500003185
Grid : Message : 71.819727 s : src_o0.499996882
Grid : Message : 71.821991 s : *********************************************************
Grid : Message : 71.821993 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 71.821995 s : * Vectorising space-time by 8
Grid : Message : 71.821998 s : * Using Overlapped Comms/Compute
Grid : Message : 71.822002 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 71.822003 s : *********************************************************
Grid : Message : 72.377054 s : Deo mflop/s = 24065467
Grid : Message : 72.377071 s : Deo mflop/s per rank 6016366.75
Grid : Message : 72.377074 s : Deo mflop/s per node 24065467
Grid : Message : 72.624877 s : r_e2.06377678
Grid : Message : 72.625198 s : r_o2.06381058
Grid : Message : 72.625507 s : res4.12758736
Grid : Message : 73.759140 s : norm diff 0
Grid : Message : 73.868204 s : norm diff even 0
Grid : Message : 73.907201 s : norm diff odd 0
Grid : Message : 74.414580 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 74.414582 s : Testing without internode communication
Grid : Message : 74.414584 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 74.414586 s : Grid Layout
Grid : Message : 74.414586 s : Global lattice size : 32 32 64 64
Grid : Message : 74.414594 s : OpenMP threads : 4
Grid : Message : 74.414595 s : MPI tasks : 1 1 2 2
Grid : Message : 74.679364 s : Initialising 4d RNG
Grid : Message : 74.742332 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 74.742343 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 74.759525 s : Initialising 5d RNG
Grid : Message : 75.812412 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 75.812429 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 119.252016 s : Drawing gauge field
Grid : Message : 129.919846 s : Random gauge initialised
Grid : Message : 129.919863 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 129.919865 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 129.923611 s : Setting up Cshift based reference
Grid : Message : 135.522878 s : *****************************************************************
Grid : Message : 135.522897 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 135.522899 s : *****************************************************************
Grid : Message : 135.522899 s : *****************************************************************
Grid : Message : 135.522900 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 135.522901 s : * Vectorising space-time by 8
Grid : Message : 135.522903 s : * VComplex size is 64 B
Grid : Message : 135.522905 s : * Using Overlapped Comms/Compute
Grid : Message : 135.522907 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 135.522908 s : *****************************************************************
Grid : Message : 136.151202 s : Called warmup
Grid : Message : 137.224721 s : Called Dw 300 times in 1073490 us
Grid : Message : 137.224748 s : mflop/s = 24755806
Grid : Message : 137.224751 s : mflop/s per rank = 6188951.49
Grid : Message : 137.224753 s : mflop/s per node = 24755806
Grid : Message : 137.235239 s : norm diff 5.8108784e-14 Line 306
Grid : Message : 146.451686 s : ----------------------------------------------------------------
Grid : Message : 146.451708 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 146.451710 s : ----------------------------------------------------------------
Grid : Message : 146.451712 s : Called DwDag
Grid : Message : 146.451714 s : norm dag result 4.12810493
Grid : Message : 146.452323 s : norm dag ref 4.12810493
Grid : Message : 146.454799 s : norm dag diff 3.40632318e-14 Line 377
Grid : Message : 146.498557 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 146.940894 s : src_e0.500003185
Grid : Message : 146.953676 s : src_o0.499996882
Grid : Message : 146.955927 s : *********************************************************
Grid : Message : 146.955929 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 146.955932 s : * Vectorising space-time by 8
Grid : Message : 146.955936 s : * Using Overlapped Comms/Compute
Grid : Message : 146.955938 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 146.955941 s : *********************************************************
Grid : Message : 147.511975 s : Deo mflop/s = 24036256.5
Grid : Message : 147.511989 s : Deo mflop/s per rank 6009064.13
Grid : Message : 147.511991 s : Deo mflop/s per node 24036256.5
Grid : Message : 147.522100 s : r_e2.06377678
Grid : Message : 147.522433 s : r_o2.06381058
Grid : Message : 147.522745 s : res4.12758736
Grid : Message : 148.229848 s : norm diff 0
Grid : Message : 149.233474 s : norm diff even 0
Grid : Message : 149.235815 s : norm diff odd 0
Grid : Message : 149.960985 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 149.960990 s : Testing without intranode communication
Grid : Message : 149.960991 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 149.960995 s : Grid Layout
Grid : Message : 149.960995 s : Global lattice size : 32 32 64 64
Grid : Message : 149.961003 s : OpenMP threads : 4
Grid : Message : 149.961004 s : MPI tasks : 1 1 2 2
Grid : Message : 150.155810 s : Initialising 4d RNG
Grid : Message : 150.800200 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 150.800340 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 150.973420 s : Initialising 5d RNG
Grid : Message : 151.131117 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 151.131136 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 193.933765 s : Drawing gauge field
Grid : Message : 204.611551 s : Random gauge initialised
Grid : Message : 204.611574 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 204.611576 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 204.615265 s : Setting up Cshift based reference
Grid : Message : 210.117788 s : *****************************************************************
Grid : Message : 210.117807 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 210.117809 s : *****************************************************************
Grid : Message : 210.117810 s : *****************************************************************
Grid : Message : 210.117812 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 210.117813 s : * Vectorising space-time by 8
Grid : Message : 210.117814 s : * VComplex size is 64 B
Grid : Message : 210.117817 s : * Using Overlapped Comms/Compute
Grid : Message : 210.117818 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 210.117819 s : *****************************************************************
Grid : Message : 210.714641 s : Called warmup
Grid : Message : 211.892227 s : Called Dw 300 times in 1177557 us
Grid : Message : 211.892252 s : mflop/s = 22568003.2
Grid : Message : 211.892255 s : mflop/s per rank = 5642000.8
Grid : Message : 211.892257 s : mflop/s per node = 22568003.2
Grid : Message : 211.896037 s : norm diff 5.8108784e-14 Line 306
Grid : Message : 220.751375 s : ----------------------------------------------------------------
Grid : Message : 220.751406 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 220.751409 s : ----------------------------------------------------------------
Grid : Message : 220.751411 s : Called DwDag
Grid : Message : 220.751412 s : norm dag result 4.12810493
Grid : Message : 220.753307 s : norm dag ref 4.12810493
Grid : Message : 220.755796 s : norm dag diff 3.40632318e-14 Line 377
Grid : Message : 220.813226 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 221.697800 s : src_e0.500003185
Grid : Message : 221.890920 s : src_o0.499996882
Grid : Message : 221.913430 s : *********************************************************
Grid : Message : 221.913450 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 221.913480 s : * Vectorising space-time by 8
Grid : Message : 221.913500 s : * Using Overlapped Comms/Compute
Grid : Message : 221.913530 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 221.913550 s : *********************************************************
Grid : Message : 221.645213 s : Deo mflop/s = 24114032
Grid : Message : 221.645228 s : Deo mflop/s per rank 6028508.01
Grid : Message : 221.645231 s : Deo mflop/s per node 24114032
Grid : Message : 221.656021 s : r_e2.06377678
Grid : Message : 221.656389 s : r_o2.06381058
Grid : Message : 221.656698 s : res4.12758736
Grid : Message : 222.110075 s : norm diff 0
Grid : Message : 222.857692 s : norm diff even 0
Grid : Message : 222.875763 s : norm diff odd 0
Grid : Message : 223.598127 s : *******************************************
Grid : Message : 223.598145 s : ******* Grid Finalize ******
Grid : Message : 223.598146 s : *******************************************

View File

@ -0,0 +1,286 @@
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
SLURM detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA GH200 120GB
AcceleratorCudaInit[0]: totalGlobalMem: 102005473280
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 1
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0009:01:00.0
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 16
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x4002a0000000 - 40031fffffff for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=3737a24096282ea179607fc879814710860a0de6: (HEAD -> develop, origin/develop, origin/HEAD) clean
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : This rank is running on host jpbo-012-11.jupiter.internal
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 81604378624 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent host allocations: SMALL 8 LARGE 2 HUGE 0
Grid : Message : MemoryManager::Init() cache pool for recent device allocations: SMALL 16 LARGE 8 Huge 0
Grid : Message : MemoryManager::Init() cache pool for recent shared allocations: SMALL 16 LARGE 8 Huge 0
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 0.834000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 0.838000 s : Testing with full communication
Grid : Message : 0.839000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 0.840000 s : Grid Layout
Grid : Message : 0.840000 s : Global lattice size : 64 64 64 64
Grid : Message : 0.846000 s : OpenMP threads : 4
Grid : Message : 0.846000 s : MPI tasks : 2 2 2 2
Grid : Message : 0.165970 s : Initialising 4d RNG
Grid : Message : 0.787270 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 0.787340 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 0.960410 s : Initialising 5d RNG
Grid : Message : 1.142344 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.142352 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
local rank 2 device 0 bus id: 0029:01:00.0
local rank 3 device 0 bus id: 0039:01:00.0
local rank 1 device 0 bus id: 0019:01:00.0
Grid : Message : 44.657270 s : Drawing gauge field
Grid : Message : 55.247733 s : Random gauge initialised
Grid : Message : 55.247745 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 55.247747 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 55.253053 s : Setting up Cshift based reference
Grid : Message : 62.191747 s : *****************************************************************
Grid : Message : 62.191767 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 62.191768 s : *****************************************************************
Grid : Message : 62.191769 s : *****************************************************************
Grid : Message : 62.191769 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 62.191769 s : * Vectorising space-time by 8
Grid : Message : 62.191770 s : * VComplex size is 64 B
Grid : Message : 62.191771 s : * Using Overlapped Comms/Compute
Grid : Message : 62.191771 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 62.191772 s : *****************************************************************
Grid : Message : 62.857568 s : Called warmup
Grid : Message : 65.581790 s : Called Dw 300 times in 2200540 us
Grid : Message : 65.582120 s : mflop/s = 48306525
Grid : Message : 65.582140 s : mflop/s per rank = 3019157.81
Grid : Message : 65.582150 s : mflop/s per node = 12076631.3
Grid : Message : 65.637550 s : norm diff 5.80156793e-14 Line 306
Grid : Message : 75.122153 s : ----------------------------------------------------------------
Grid : Message : 75.122166 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 75.122167 s : ----------------------------------------------------------------
Grid : Message : 75.122167 s : Called DwDag
Grid : Message : 75.122167 s : norm dag result 4.12801829
Grid : Message : 75.123295 s : norm dag ref 4.12801829
Grid : Message : 75.125890 s : norm dag diff 3.42093991e-14 Line 377
Grid : Message : 75.188462 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 75.605683 s : src_e0.500004005
Grid : Message : 75.617824 s : src_o0.499996067
Grid : Message : 75.620089 s : *********************************************************
Grid : Message : 75.620091 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 75.620093 s : * Vectorising space-time by 8
Grid : Message : 75.620094 s : * Using Overlapped Comms/Compute
Grid : Message : 75.620095 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 75.620096 s : *********************************************************
Grid : Message : 76.732272 s : Deo mflop/s = 48068252.4
Grid : Message : 76.732283 s : Deo mflop/s per rank 3004265.77
Grid : Message : 76.732285 s : Deo mflop/s per node 12017063.1
Grid : Message : 76.749317 s : r_e2.06443136
Grid : Message : 76.749652 s : r_o2.06378451
Grid : Message : 76.749955 s : res4.12821587
Grid : Message : 77.198827 s : norm diff 0
Grid : Message : 77.981760 s : norm diff even 0
Grid : Message : 78.455900 s : norm diff odd 0
Grid : Message : 78.539333 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 78.539337 s : Testing without internode communication
Grid : Message : 78.539338 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 78.539339 s : Grid Layout
Grid : Message : 78.539339 s : Global lattice size : 64 64 64 64
Grid : Message : 78.539347 s : OpenMP threads : 4
Grid : Message : 78.539348 s : MPI tasks : 2 2 2 2
Grid : Message : 78.798501 s : Initialising 4d RNG
Grid : Message : 78.862916 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 78.862925 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 78.879916 s : Initialising 5d RNG
Grid : Message : 79.941271 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 79.941280 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 124.586264 s : Drawing gauge field
Grid : Message : 135.338090 s : Random gauge initialised
Grid : Message : 135.338102 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 135.338103 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 135.341266 s : Setting up Cshift based reference
Grid : Message : 142.604280 s : *****************************************************************
Grid : Message : 142.604450 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 142.604460 s : *****************************************************************
Grid : Message : 142.604470 s : *****************************************************************
Grid : Message : 142.604480 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 142.604480 s : * Vectorising space-time by 8
Grid : Message : 142.604500 s : * VComplex size is 64 B
Grid : Message : 142.604510 s : * Using Overlapped Comms/Compute
Grid : Message : 142.604510 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 142.604520 s : *****************************************************************
Grid : Message : 142.686034 s : Called warmup
Grid : Message : 144.868543 s : Called Dw 300 times in 2182483 us
Grid : Message : 144.868559 s : mflop/s = 48706194.1
Grid : Message : 144.868561 s : mflop/s per rank = 3044137.13
Grid : Message : 144.868562 s : mflop/s per node = 12176548.5
Grid : Message : 144.887595 s : norm diff 5.80156793e-14 Line 306
Grid : Message : 153.622978 s : ----------------------------------------------------------------
Grid : Message : 153.622994 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 153.622995 s : ----------------------------------------------------------------
Grid : Message : 153.622995 s : Called DwDag
Grid : Message : 153.622996 s : norm dag result 4.12801829
Grid : Message : 153.623604 s : norm dag ref 4.12801829
Grid : Message : 153.626098 s : norm dag diff 3.42093991e-14 Line 377
Grid : Message : 153.691426 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 154.148319 s : src_e0.500004005
Grid : Message : 154.151454 s : src_o0.499996067
Grid : Message : 154.153722 s : *********************************************************
Grid : Message : 154.153724 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 154.153725 s : * Vectorising space-time by 8
Grid : Message : 154.153726 s : * Using Overlapped Comms/Compute
Grid : Message : 154.153727 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 154.153728 s : *********************************************************
Grid : Message : 155.200671 s : Deo mflop/s = 51121022.4
Grid : Message : 155.200682 s : Deo mflop/s per rank 3195063.9
Grid : Message : 155.200684 s : Deo mflop/s per node 12780255.6
Grid : Message : 155.217204 s : r_e2.06443136
Grid : Message : 155.217550 s : r_o2.06378451
Grid : Message : 155.217869 s : res4.12821587
Grid : Message : 155.673744 s : norm diff 0
Grid : Message : 156.463329 s : norm diff even 0
Grid : Message : 156.878866 s : norm diff odd 0
Grid : Message : 157.620761 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 157.620764 s : Testing without intranode communication
Grid : Message : 157.620765 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 157.620766 s : Grid Layout
Grid : Message : 157.620766 s : Global lattice size : 64 64 64 64
Grid : Message : 157.620773 s : OpenMP threads : 4
Grid : Message : 157.620774 s : MPI tasks : 2 2 2 2
Grid : Message : 157.671479 s : Initialising 4d RNG
Grid : Message : 157.738691 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 157.738698 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 157.755651 s : Initialising 5d RNG
Grid : Message : 158.848676 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 158.848685 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 202.465158 s : Drawing gauge field
Grid : Message : 213.214546 s : Random gauge initialised
Grid : Message : 213.214561 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 213.214563 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 213.217711 s : Setting up Cshift based reference
Grid : Message : 219.662772 s : *****************************************************************
Grid : Message : 219.662786 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 219.662787 s : *****************************************************************
Grid : Message : 219.662788 s : *****************************************************************
Grid : Message : 219.662788 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 219.662789 s : * Vectorising space-time by 8
Grid : Message : 219.662790 s : * VComplex size is 64 B
Grid : Message : 219.662791 s : * Using Overlapped Comms/Compute
Grid : Message : 219.662791 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 219.662791 s : *****************************************************************
Grid : Message : 220.425592 s : Called warmup
Grid : Message : 222.536249 s : Called Dw 300 times in 2110597 us
Grid : Message : 222.536267 s : mflop/s = 50365105.5
Grid : Message : 222.536269 s : mflop/s per rank = 3147819.09
Grid : Message : 222.536270 s : mflop/s per node = 12591276.4
Grid : Message : 222.541053 s : norm diff 5.80156793e-14 Line 306
Grid : Message : 232.135901 s : ----------------------------------------------------------------
Grid : Message : 232.135915 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 232.135916 s : ----------------------------------------------------------------
Grid : Message : 232.135917 s : Called DwDag
Grid : Message : 232.135918 s : norm dag result 4.12801829
Grid : Message : 232.151938 s : norm dag ref 4.12801829
Grid : Message : 232.154451 s : norm dag diff 3.42093991e-14 Line 377
Grid : Message : 232.216117 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 232.630529 s : src_e0.500004005
Grid : Message : 232.643197 s : src_o0.499996067
Grid : Message : 232.645527 s : *********************************************************
Grid : Message : 232.645529 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 232.645532 s : * Vectorising space-time by 8
Grid : Message : 232.645533 s : * Using Overlapped Comms/Compute
Grid : Message : 232.645534 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 232.645535 s : *********************************************************
Grid : Message : 233.774184 s : Deo mflop/s = 47432091.9
Grid : Message : 233.774194 s : Deo mflop/s per rank 2964505.74
Grid : Message : 233.774196 s : Deo mflop/s per node 11858023
Grid : Message : 233.791552 s : r_e2.06443136
Grid : Message : 233.791899 s : r_o2.06378451
Grid : Message : 233.792204 s : res4.12821587
Grid : Message : 234.230783 s : norm diff 0
Grid : Message : 235.162780 s : norm diff even 0
Grid : Message : 235.291950 s : norm diff odd 0
Grid : Message : 235.765411 s : *******************************************
Grid : Message : 235.765424 s : ******* Grid Finalize ******
Grid : Message : 235.765425 s : *******************************************

View File

@ -0,0 +1,57 @@
#!/bin/sh
#SBATCH --account=jureap14
#SBATCH --nodes=1
#SBATCH --ntasks=4
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=64
#SBATCH --time=2:00:00
#SBATCH --partition=booster
#SBATCH --gres=gpu:4
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
OPT="--comms-overlap"
source ../sourceme.sh
cat << EOF > bind_gpu
#!/bin/bash
export GPU_MAP=(0 1 2 3)
export NUMA_MAP=(0 1 2 3)
export NIC_MAP=(0 1 2 3)
export GPU=\$SLURM_LOCALID
export NUMA=\$SLURM_LOCALID
export NIC=\$SLURM_LOCALID
export CUDA_VISIBLE_DEVICES=\$GPU
export UCX_NET_DEVICES=mlx5_\${NIC}:1
echo RANK \$SLURM_LOCALID using NUMA \$NUMA GPU \$GPU NIC \$UCX_NET_DEVICES
exec numactl -m \$NUMA -N \$NUMA \$*
EOF
chmod +x ./bind_gpu
srun --cpu-bind=no -N 1 -n $SLURM_NTASKS \
./bind_gpu ./Benchmark_dwf_fp32 \
$OPT \
--mpi 1.1.2.2 \
--accelerator-threads 8 \
--grid 32.32.64.64 \
--shm 2048 > dwf.1node.perf
srun --cpu-bind=no -N 1 -n $SLURM_NTASKS \
./bind_gpu ./Benchmark_comms_host_device \
--mpi 1.1.2.2 \
--accelerator-threads 8 \
--grid 32.32.64.64 \
--shm 2048 > comms.1node.perf

View File

@ -0,0 +1,57 @@
#!/bin/sh
#SBATCH --account=jureap14
#SBATCH --nodes=4
#SBATCH --ntasks=16
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=64
#SBATCH --time=2:00:00
#SBATCH --partition=booster
#SBATCH --gres=gpu:4
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
OPT="--comms-overlap"
source ../sourceme.sh
cat << EOF > bind_gpu
#!/bin/bash
export GPU_MAP=(0 1 2 3)
export NUMA_MAP=(0 1 2 3)
export NIC_MAP=(0 1 2 3)
export GPU=\$SLURM_LOCALID
export NUMA=\$SLURM_LOCALID
export NIC=\$SLURM_LOCALID
export CUDA_VISIBLE_DEVICES=\$GPU
export UCX_NET_DEVICES=mlx5_\${NIC}:1
echo RANK \$SLURM_LOCALID using NUMA \$NUMA GPU \$GPU NIC \$UCX_NET_DEVICES
exec numactl -m \$NUMA -N \$NUMA \$*
EOF
chmod +x ./bind_gpu
srun --cpu-bind=no -N 4 -n $SLURM_NTASKS \
./bind_gpu ./Benchmark_dwf_fp32 \
$OPT \
--mpi 2.2.2.2 \
--accelerator-threads 8 \
--grid 64.64.64.64 \
--shm 2048 > dwf.4node.perf
srun --cpu-bind=no -N 4 -n $SLURM_NTASKS \
./bind_gpu ./Benchmark_comms_host_device \
--mpi 2.2.2.2 \
--accelerator-threads 8 \
--grid 32.32.64.64 \
--shm 2048 > comms.4node.perf

View File

@ -0,0 +1,16 @@
export CXX=nvcc
export OPENMPI=/p/software/default/stages/2025/software/OpenMPI/5.0.5-NVHPC-24.9-CUDA-12/
export LDFLAGS="-cudart shared -L${OPENMPI}/lib"
export CXXFLAGS="-ccbin clang++ -gencode arch=compute_90,code=sm_90 -std=c++17 -cudart shared -lcublas -lmpi -I${OPENMPI}/include"
../../configure \
--enable-comms=mpi \
--enable-simd=GPU \
--enable-gen-simd-width=64 \
--enable-shm=nvlink \
--enable-accelerator=cuda \
--with-lime=$CLIME \
--disable-gparity \
--disable-fermion-reps \
--disable-unified

View File

@ -0,0 +1,10 @@
CLIME=$HOME/install/
module load Clang
module load CUDA
module load FFTW
module load OpenSSL
module load MPFR
module load NVHPC
module load UCX
module load OpenMPI
ulimit -c 0

systems/WorkArounds.txt (new file, 206 lines)
View File

@ -0,0 +1,206 @@
The purpose of this file is to collate all non-obvious known magic shell variables
and compiler flags required for either correctness or performance on various systems.
A repository of work-arounds.
Contents:
1. Interconnect + MPI
2. Compilation
3. Profiling
************************
* 1. INTERCONNECT + MPI
************************
--------------------------------------------------------------------
MPI2-IO correctness: force OpenMPI to use the MPICH romio implementation for parallel I/O
--------------------------------------------------------------------
export OMPI_MCA_io=romio321
--------------------------------------
ROMIO fail with > 2GB per node read (32 bit issue)
--------------------------------------
Use later MPICH
https://github.com/paboyle/Grid/issues/381
https://github.com/pmodels/mpich/commit/3a479ab0
--------------------------------------------------------------------
Slingshot: Frontier and Perlmutter libfabric slow down
and physical memory fragmentation
--------------------------------------------------------------------
export FI_MR_CACHE_MONITOR=disabled
or
export FI_MR_CACHE_MONITOR=kdreg2
--------------------------------------------------------------------
Perlmutter
--------------------------------------------------------------------
export MPICH_RDMA_ENABLED_CUDA=1
export MPICH_GPU_IPC_ENABLED=1
export MPICH_GPU_EAGER_REGISTER_HOST_MEM=0
export MPICH_GPU_NO_ASYNC_MEMCPY=0
--------------------------------------------------------------------
Frontier/LumiG
--------------------------------------------------------------------
Hiding ROCR_VISIBLE_DEVICES triggers SDMA engines to be used for GPU-GPU
cat << EOF > select_gpu
#!/bin/bash
export MPICH_GPU_SUPPORT_ENABLED=1
export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
export GPU_MAP=(0 1 2 3 7 6 5 4)
export NUMA_MAP=(3 3 1 1 2 2 0 0)
export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]}
export HIP_VISIBLE_DEVICES=\$GPU
unset ROCR_VISIBLE_DEVICES
echo RANK \$SLURM_LOCALID using GPU \$GPU
exec numactl -m \$NUMA -N \$NUMA \$*
EOF
chmod +x ./select_gpu
srun ./select_gpu BINARY
--------------------------------------------------------------------
Mellanox performance with A100 GPU (Tursa, Booster, Leonardo)
--------------------------------------------------------------------
export OMPI_MCA_btl=^uct,openib
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
--------------------------------------------------------------------
Mellanox + A100 correctness (Tursa, Booster, Leonardo)
--------------------------------------------------------------------
export UCX_MEMTYPE_CACHE=n
--------------------------------------------------------------------
MPICH/Aurora/PVC correctness and performance
--------------------------------------------------------------------
https://github.com/pmodels/mpich/issues/7302
--enable-cuda-aware-mpi=no
--enable-unified=no
Grid's internal D-H-H-D pipeline mode, avoid device memory in MPI
Do not use SVM
Ideally use MPICH with fix to issue 7302:
https://github.com/pmodels/mpich/pull/7312
Ideally:
MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE=generic
Alternatives:
export MPIR_CVAR_NOLOCAL=1
export MPIR_CVAR_CH4_IPC_GPU_P2P_THRESHOLD=1000000000
--------------------------------------------------------------------
MPICH/Aurora/PVC correctness and performance
--------------------------------------------------------------------
Broken:
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
This gives good performance without requiring
--enable-cuda-aware-mpi=no
But is an open issue reported by James Osborn
https://github.com/pmodels/mpich/issues/7139
Possibly resolved but unclear if in the installed software yet.
************************
* 2. COMPILATION
************************
--------------------------------------------------------------------
G++ compiler breakage / graveyard
--------------------------------------------------------------------
9.3.0, 10.3.1,
https://github.com/paboyle/Grid/issues/290
https://github.com/paboyle/Grid/issues/264
Working (-) Broken (X):
4.9.0 -
4.9.1 -
5.1.0 X
5.2.0 X
5.3.0 X
5.4.0 X
6.1.0 X
6.2.0 X
6.3.0 -
7.1.0 -
8.0.0 (HEAD) -
https://github.com/paboyle/Grid/issues/100
--------------------------------------------------------------------
AMD GPU nodes :
--------------------------------------------------------------------
multiple ROCM versions broken; use 5.3.0
manifests itself as wrong results in fp32
https://github.com/paboyle/Grid/issues/464
--------------------------------------------------------------------
Aurora/PVC
--------------------------------------------------------------------
SYCL ahead of time compilation (fixes rare runtime JIT errors and gives faster runtime, PB)
SYCL slow link and relocatable code issues (Christoph Lehner)
Opt large register file required for good performance in fp64
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc"
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -fPIC"
--------------------------------------------------------------------
Aurora/PVC useful extra options
--------------------------------------------------------------------
Host only sanitizer:
-Xarch_host -fsanitize=leak
-Xarch_host -fsanitize=address
Deterministic MPI reduction:
export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
************************
* 3. Visual profile tools
************************
--------------------------------------------------------------------
Frontier/rocprof
--------------------------------------------------------------------
--------------------------------------------------------------------
Aurora/unitrace
--------------------------------------------------------------------
--------------------------------------------------------------------
Tursa/nsight-sys
--------------------------------------------------------------------

View File

@ -1,2 +1,14 @@
CXXFLAGS=-I/opt/local/include LDFLAGS=-L/opt/local/lib/ CXX=c++-13 MPICXX=mpicxx ../../configure --enable-simd=GEN --enable-comms=mpi-auto --enable-unified=yes --prefix $HOME/QCD/GridInstall --with-lime=/Users/peterboyle/QCD/SciDAC/install/ --with-openssl=$BREW --disable-fermion-reps --disable-gparity --disable-debug
CXX=mpicxx ../../configure \
--enable-simd=GEN \
--enable-comms=mpi-auto \
--enable-Sp=yes \
--enable-unified=yes \
--prefix /Users/peterboyle/QCD/vtk/Grid/install \
--with-lime=$CLIME \
--with-openssl=$OPENSSL \
--with-gmp=$GMP \
--with-mpfr=$MPFR \
--disable-debug

View File

@ -0,0 +1,32 @@
#!/bin/bash
#SBATCH --partition lqcd
#SBATCH --time=00:50:00
#SBATCH -A lqcdtest
#SBATCH -q lqcd
#SBATCH --exclusive
#SBATCH --nodes=1
#SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=64
#SBATCH --qos lqcd
source sourceme.sh
export PLACES=(1:16:4 1:32:2 0:64:1);
export THR=(16 32 64)
for t in 2
do
export OMP_NUM_THREADS=${THR[$t]}
export OMP_PLACES=${PLACES[$t]}
export thr=${THR[$t]}
#for vol in 24.24.24.24 32.32.32.32 48.48.48.96
for vol in 48.48.48.96
do
srun -N1 -n1 ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid $vol --dslash-asm --shm 8192 > $vol.1node.thr$thr
done
#srun -N1 -n1 ./benchmarks/Benchmark_usqcd --mpi 1.1.1.1 --grid $vol > usqcd.1node.thr$thr
done

View File

@ -0,0 +1,36 @@
#!/bin/bash
#SBATCH --partition lqcd
#SBATCH --time=00:50:00
#SBATCH -A lqcdtest
#SBATCH -q lqcd
#SBATCH --exclusive
#SBATCH --nodes=2
#SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054
#SBATCH --ntasks=2
#SBATCH --cpus-per-task=64
#SBATCH --qos lqcd
source sourceme.sh
export PLACES=(1:16:4 1:32:2 0:64:1);
export THR=(16 32 64)
nodes=2
mpi=1.1.1.2
for t in 2
do
export OMP_NUM_THREADS=${THR[$t]}
export OMP_PLACES=${PLACES[$t]}
export thr=${THR[$t]}
#srun -N$nodes -n$nodes ./benchmarks/Benchmark_usqcd --mpi $mpi --grid 32.32.32.32 > usqcd.n$nodes.thr$thr
for vol in 64.64.64.128
do
srun -N$nodes -n$nodes ./benchmarks/Benchmark_dwf_fp32 --mpi $mpi --grid $vol --dslash-asm --comms-overlap --shm 8192 > $vol.n$nodes.overlap.thr$thr
done
done

View File

@ -0,0 +1,29 @@
spack load c-lime
spack load fftw
spack load hdf5+cxx
export FFTW=`spack find --paths fftw | grep ^fftw | awk '{print $2}' `
export HDF5=`spack find --paths hdf5+cxx | grep ^hdf5 | awk '{print $2}' `
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
../../configure \
--enable-comms=mpi-auto \
--enable-unified=yes \
--enable-shm=shmopen \
--enable-shm-fast-path=shmopen \
--enable-accelerator=none \
--enable-simd=AVX512 \
--with-lime=$CLIME \
--with-hdf5=$HDF5 \
--with-fftw=$FFTW \
--disable-fermion-reps \
--disable-gparity \
CXX=clang++ \
MPICXX=mpicxx \
LIBS=-llime \
LDFLAGS=-L$CLIME/lib/ \
CXXFLAGS="-std=c++17 -fPIE"

View File

@ -0,0 +1,5 @@
source $HOME/spack/share/spack/setup-env.sh
spack load llvm@17.0.4
export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-17.0.4-laufdrcip63ivkadmtgoepwmj3dtztdu/lib:$LD_LIBRARY_PATH
module load openmpi/4.1.8
spack load c-lime

View File

@ -0,0 +1,17 @@
../../src/Grid/configure \
--prefix /home/pab/NPR/install \
--enable-comms=mpi-auto \
--enable-simd=AVX2 \
--enable-shm=none \
--enable-debug \
--with-lime=$CLIME \
--with-hdf5=$HDF5 \
--with-fftw=$FFTW \
--with-gmp=$GMP \
--with-mpfr=$MPFR \
--disable-gparity \
--disable-fermion-reps \
CXX=clang++ \
MPICXX=mpicxx \
CXXFLAGS="-std=c++17 "

View File

@ -0,0 +1,28 @@
source $HOME/spack/share/spack/setup-env.sh
spack load llvm@12
spack load autoconf%clang@12.0.1
spack load automake%clang@12.0.1
spack load c-lime%clang@12.0.1
spack load fftw%clang@12.0.1
spack load gmp%clang@12.0.1
spack load mpfr%clang@12.0.1
spack load openmpi%clang@12.0.1
spack load openssl%clang@12.0.1
spack load hdf5+cxx%clang@12.0.1
spack load cmake%clang@12.0.1
export FFTW=`spack find --paths fftw%clang@12.0.1 | grep ^fftw | awk '{print $2}' `
export HDF5=`spack find --paths hdf5+cxx%clang@12.0.1 | grep ^hdf5 | awk '{print $2}' `
export CLIME=`spack find --paths c-lime%clang@12.0.1 | grep ^c-lime | awk '{print $2}' `
export MPFR=`spack find --paths mpfr%clang@12.0.1 | grep ^mpfr | awk '{print $2}' `
export LLVM=`spack find --paths llvm@12 | grep ^llvm | awk '{print $2}' `
export OPENSSL=`spack find --paths openssl%clang@12.0.1 | grep openssl | awk '{print $2}' `
export GMP=`spack find --paths gmp%clang@12.0.1 | grep ^gmp | awk '{print $2}' `
export TCLAP=`spack find --paths tclap%clang@12.0.1 | grep ^tclap | awk '{print $2}' `
export LD_LIBRARY_PATH=${TCLAP}/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$MPFR/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$GMP/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$FFTW/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$LLVM/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$LLVM/lib/x86_64-unknown-linux-gnu/:$LD_LIBRARY_PATH
ulimit -s 81920
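A small sanity check one might append when sourcing the script above, to catch spack lookups that resolved to nothing (purely illustrative):

for v in FFTW HDF5 CLIME MPFR LLVM OPENSSL GMP TCLAP; do
  eval "p=\${$v}"
  [ -d "$p" ] || echo "WARNING: \$$v did not resolve (got '$p')"
done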

View File

@@ -0,0 +1,19 @@
cd
git clone https://github.com/spack/spack.git
source $HOME/spack/share/spack/setup-env.sh
spack install llvm@12
spack install autoconf%clang@12.0.1
spack install automake%clang@12.0.1
spack install c-lime%clang@12.0.1
spack install fftw%clang@12.0.1
spack install gmp%clang@12.0.1
spack install mpfr%clang@12.0.1
spack install openmpi%clang@12.0.1
spack install openssl%clang@12.0.1
spack install hdf5+cxx%clang@12.0.1
spack install cmake%clang@12.0.1
spack install tclap%clang@12.0.1
spack install emacs%clang@12.0.1
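Once these installs finish, the spack load script shown earlier in this diff picks the same packages up at login; a quick way to confirm everything was built against clang 12 (illustrative):

spack find -l %clang@12.0.1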

View File

@@ -62,7 +62,7 @@ int VerifyOnDevice(const FermionField &res, FermionField &ref)
if (((random()&0xF)==0)&&injection) {
uint64_t sF = random()%(NN);
int lane=0;
printf("Error injection site %ld on rank %d\n",sF,res.Grid()->ThisRank());
printf("Error injection site %ld on rank %d\n",(long)sF,res.Grid()->ThisRank());
auto vv = acceleratorGet(res_v[sF]);
double *dd = (double *)&vv;
*dd=M_PI;

View File

@@ -195,8 +195,8 @@ int main (int argc, char ** argv)
int Nk=nrhs;
int Nm=Nk*3;
int Nk=36;
int Nm=144;
// int Nk=36;
// int Nm=144;
int Nstop=Nk;
int Nconv_test_interval=1;

View File

@@ -47,20 +47,20 @@ public:
void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
void Op (const Field &in, Field &out){
std::cout << "Op: PVdag M "<<std::endl;
// std::cout << "Op: PVdag M "<<std::endl;
Field tmp(in.Grid());
_Mat.M(in,tmp);
_PV.Mdag(tmp,out);
}
void AdjOp (const Field &in, Field &out){
std::cout << "AdjOp: Mdag PV "<<std::endl;
// std::cout << "AdjOp: Mdag PV "<<std::endl;
Field tmp(in.Grid());
_PV.M(in,tmp);
_Mat.Mdag(tmp,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
void HermOp(const Field &in, Field &out){
std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
// std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
Field tmp(in.Grid());
// _Mat.M(in,tmp);
// _PV.Mdag(tmp,out);
@@ -83,14 +83,14 @@ public:
void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
void Op (const Field &in, Field &out){
std::cout << "Op: PVdag M "<<std::endl;
// std::cout << "Op: PVdag M "<<std::endl;
Field tmp(in.Grid());
_Mat.M(in,tmp);
_PV.Mdag(tmp,out);
out = out + shift * in;
}
void AdjOp (const Field &in, Field &out){
std::cout << "AdjOp: Mdag PV "<<std::endl;
// std::cout << "AdjOp: Mdag PV "<<std::endl;
Field tmp(in.Grid());
_PV.M(tmp,out);
_Mat.Mdag(in,tmp);
@@ -98,7 +98,7 @@ public:
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
void HermOp(const Field &in, Field &out){
std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
// std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
Field tmp(in.Grid());
Op(in,tmp);
AdjOp(tmp,out);

View File

@@ -54,6 +54,7 @@ const RealD M5 = 1.8;
int main(int argc, char** argv)
{
#ifdef ENABLE_GPARITY
Grid_init(&argc, &argv);
int threads = GridThread::GetThreads();
@@ -106,6 +107,6 @@ int main(int argc, char** argv)
Meofa.refresh(Umu,sRNG, RNG5);
printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
}
#endif
return 0;
}

View File

@@ -56,6 +56,7 @@ const RealD M5 = 1.8;
int main(int argc, char** argv)
{
#ifdef ENABLE_GPARITY
Grid_init(&argc, &argv);
int threads = GridThread::GetThreads();
@@ -106,6 +107,6 @@ int main(int argc, char** argv)
Meofa.refresh(Umu, sRNG, RNG5);
printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
}
#endif
return 0;
}

View File

@@ -33,6 +33,7 @@ using namespace std;
using namespace Grid;
// This is to optimize the SIMD
/*
template<class vobj> void gpermute(vobj & inout,int perm){
vobj tmp=inout;
if (perm & 0x1 ) { permute(inout,tmp,0); tmp=inout;}
@@ -40,7 +41,7 @@ template<class vobj> void gpermute(vobj & inout,int perm){
if (perm & 0x4 ) { permute(inout,tmp,2); tmp=inout;}
if (perm & 0x8 ) { permute(inout,tmp,3); tmp=inout;}
}
*/
int main (int argc, char ** argv)
{

View File

@@ -153,7 +153,7 @@ public:
t=usecond();
{
autoView( gStaple_v , gStaple, AcceleratorWrite);
auto gStencil_v = gStencil.View();
auto gStencil_v = gStencil.View(AcceleratorRead);
autoView( Ug_mu_v , Ug_mu, AcceleratorRead);
autoView( Ug_nu_v , Ug_nu, AcceleratorRead);
@@ -389,7 +389,7 @@ public:
GeneralLocalStencil gStencil(ggrid,shifts);
{
autoView( gStaple_v , gStaple, AcceleratorWrite);
auto gStencil_v = gStencil.View();
auto gStencil_v = gStencil.View(AcceleratorRead);
typedef LatticeView<typename GaugeMat::vector_object> GaugeViewType;
size_t vsize = Nd*sizeof(GaugeViewType);

View File

@@ -83,6 +83,7 @@ std::vector<RealD> jack_stats(const std::vector<RealD>& data)
int main(int argc, char **argv)
{
#ifdef ENABLE_GPARITY
Grid_init(&argc, &argv);
// Initialize spacetime grid
@@ -206,4 +207,5 @@ int main(int argc, char **argv)
std::cout << std::endl << "EOFA: rw = " << eofa_result[0] << " +/- " << eofa_result[1] << std::endl;
Grid_finalize();
#endif
}

View File

@@ -85,6 +85,7 @@ std::vector<RealD> jack_stats(const std::vector<RealD>& data)
int main(int argc, char **argv)
{
#ifdef ENABLE_GPARITY
Grid_init(&argc, &argv);
// Initialize spacetime grid
@@ -215,4 +216,5 @@ int main(int argc, char **argv)
std::cout << std::endl << "EOFA: rw = " << eofa_result[0] << " +/- " << eofa_result[1] << std::endl;
Grid_finalize();
#endif
}

View File

@@ -35,6 +35,7 @@ using namespace Grid;
int main (int argc, char ** argv)
{
#ifdef ENABLE_GPARITY
Grid_init(&argc,&argv);
Coordinate latt_size = GridDefaultLatt();
@@ -244,4 +245,5 @@ int main (int argc, char ** argv)
std::cout<< GridLogMessage << "Done" <<std::endl;
Grid_finalize();
#endif
}

View File

@@ -38,6 +38,7 @@ typedef typename FermionAction::FermionField FermionField;
int main (int argc, char** argv)
{
#ifdef ENABLE_GPARITY
Grid_init(&argc, &argv);
Coordinate latt_size = GridDefaultLatt();
@@ -173,4 +174,5 @@ int main (int argc, char** argv)
std::cout << GridLogMessage << "Done" << std::endl;
Grid_finalize();
#endif
}

View File

@@ -35,6 +35,7 @@ using namespace Grid;
int main (int argc, char ** argv)
{
#ifdef ENABLE_GPARITY
Grid_init(&argc,&argv);
Coordinate latt_size = GridDefaultLatt();
@@ -204,4 +205,5 @@ int main (int argc, char ** argv)
assert( fabs(real(Sprime-S-dSpred)) < 1.0 ) ;
std::cout<< GridLogMessage << "Done" <<std::endl;
Grid_finalize();
#endif
}

View File

@@ -32,6 +32,7 @@ using namespace std;
using namespace Grid;
//Here we test the G-parity action and force between the 1f (doubled-lattice) and 2f approaches
#ifdef ENABLE_GPARITY
void copyConjGauge(LatticeGaugeFieldD &Umu_1f, const LatticeGaugeFieldD &Umu_2f, const int nu){
@@ -444,3 +445,7 @@ int main (int argc, char ** argv)
assert(0);
}
}
#else
int main (int argc, char ** argv){};
#endif

View File

@@ -32,6 +32,7 @@ using namespace Grid;
int main (int argc, char ** argv)
{
#ifdef ENABLE_GPARITY
Grid_init(&argc,&argv);
Coordinate latt_size = GridDefaultLatt();
@@ -155,4 +156,5 @@ int main (int argc, char ** argv)
std::cout<< GridLogMessage << "Done" <<std::endl;
Grid_finalize();
#endif
}

View File

@@ -30,9 +30,10 @@ See the full license in the file "LICENSE" in the top level distribution directo
#include <Grid/Grid.h>
#ifdef ENABLE_GPARITY
using namespace std;
using namespace Grid;
;
typedef GparityWilsonImplD FermionImplPolicyD;
typedef GparityMobiusEOFAFermionD FermionActionD;
@@ -231,3 +232,7 @@ int main (int argc, char** argv)
std::cout << GridLogMessage << "Done" << std::endl;
Grid_finalize();
}
#else
int main(int argc,char ** argv) { return 0;};
#endif

View File

@@ -31,14 +31,14 @@ See the full license in the file "LICENSE" in the top level distribution directo
using namespace std;
using namespace Grid;
;
typedef GparityWilsonImplR FermionImplPolicy;
typedef GparityMobiusEOFAFermionD FermionAction;
typedef typename FermionAction::FermionField FermionField;
int main (int argc, char** argv)
{
#ifdef ENABLE_GPARITY
Grid_init(&argc, &argv);
Coordinate latt_size = GridDefaultLatt();
@@ -171,4 +171,5 @@ int main (int argc, char** argv)
std::cout << GridLogMessage << "Done" << std::endl;
Grid_finalize();
#endif
}

Some files were not shown because too many files have changed in this diff.