mirror of https://github.com/paboyle/Grid.git synced 2025-08-01 12:17:06 +01:00

Compare commits


3 Commits

Author        SHA1        Message                                                              Date
Chulwoo Jung  8419cc5c64  specflow evec I/O added,                                             2025-07-11 15:57:23 -04:00
Chulwoo Jung  2cc6deb8e0  Merge branch 'develop' of https://github.com/paboyle/Grid into ic2   2025-04-25 10:48:41 -04:00
Chulwoo Jung  19d0590579  Checking in for merging                                              2025-04-25 10:48:22 -04:00
64 changed files with 1352 additions and 2583 deletions

View File

@@ -51,13 +51,11 @@ directory
#pragma nv_diag_suppress cast_to_qualified_type
//disables nvcc specific warning in many files
#pragma nv_diag_suppress esa_on_defaulted_function_ignored
#pragma nv_diag_suppress declared_but_not_referenced
#pragma nv_diag_suppress extra_semicolon
#else
//disables nvcc specific warning in json.hpp
#pragma diag_suppress unsigned_compare_with_zero
#pragma diag_suppress cast_to_qualified_type
#pragma diag_suppress declared_but_not_referenced
//disables nvcc specific warning in many files
#pragma diag_suppress esa_on_defaulted_function_ignored
#pragma diag_suppress extra_semicolon

View File

@@ -269,9 +269,7 @@ public:
RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo);
Linop.HermOp(T0,y);
grid->Barrier();
axpby(T1,xscale,mscale,y,in);
grid->Barrier();
// sum = .5 c[0] T0 + c[1] T1
// out = ()*T0 + Coeffs[1]*T1;
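Aside, not taken from the patch: the two scale factors in this hunk implement the affine map the Chebyshev recurrence needs, sending the operator spectrum [lo,hi] onto [-1,1]. With axpby(T1,xscale,mscale,y,in) and y = H·in this is

    T1 = (2·H − (hi+lo))·in / (hi − lo),   so an eigenvalue λ of H maps to (2λ − (hi+lo))/(hi − lo) ∈ [−1,1].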

View File

@@ -65,7 +65,6 @@ NAMESPACE_BEGIN(Grid);
#endif
enum GridBLASOperation_t { GridBLAS_OP_N, GridBLAS_OP_T, GridBLAS_OP_C } ;
enum GridBLASPrecision_t { GridBLAS_PRECISION_DEFAULT, GridBLAS_PRECISION_16F, GridBLAS_PRECISION_16BF, GridBLAS_PRECISION_TF32 };
class GridBLAS {
public:
@@ -98,21 +97,7 @@ public:
gridblasInit=1;
}
}
#ifdef GRID_CUDA
cublasComputeType_t toDataType(GridBLASPrecision_t p) {
switch (p) {
case GridBLAS_PRECISION_16F:
return CUBLAS_COMPUTE_32F_FAST_16F;
case GridBLAS_PRECISION_16BF:
return CUBLAS_COMPUTE_32F_FAST_16BF;
case GridBLAS_PRECISION_TF32:
return CUBLAS_COMPUTE_32F_FAST_TF32;
default:
assert(0);
}
}
#endif
// Force construct once
GridBLAS() { Init(); };
~GridBLAS() { };
@@ -153,10 +138,8 @@ public:
deviceVector<ComplexD*> &Amk, // pointer list to matrices
deviceVector<ComplexD*> &Bkn,
ComplexD beta,
deviceVector<ComplexD*> &Cmn,
GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT)
deviceVector<ComplexD*> &Cmn)
{
assert(precision == GridBLAS_PRECISION_DEFAULT);
gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
m,n,k,
alpha,
@@ -218,10 +201,8 @@ public:
deviceVector<ComplexD*> &Amk, // pointer list to matrices
deviceVector<ComplexD*> &Bkn,
ComplexD beta,
deviceVector<ComplexD*> &Cmn,
GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT)
deviceVector<ComplexD*> &Cmn)
{
assert(precision == GridBLAS_PRECISION_DEFAULT);
RealD t2=usecond();
int32_t batchCount = Amk.size();
assert(Bkn.size()==batchCount);
@@ -467,8 +448,7 @@ public:
deviceVector<ComplexF*> &Amk, // pointer list to matrices
deviceVector<ComplexF*> &Bkn,
ComplexF beta,
deviceVector<ComplexF*> &Cmn,
GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT)
deviceVector<ComplexF*> &Cmn)
{
RealD t2=usecond();
int32_t batchCount = Amk.size();
@@ -493,7 +473,6 @@ public:
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);
#ifdef GRID_HIP
assert(precision == GridBLAS_PRECISION_DEFAULT);
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
@@ -524,67 +503,50 @@ public:
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
cublasStatus_t err;
if (precision == GridBLAS_PRECISION_DEFAULT) {
err = cublasCgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(cuComplex *) &alpha_p[0],
(cuComplex **)&Amk[0], lda,
(cuComplex **)&Bkn[0], ldb,
(cuComplex *) &beta_p[0],
(cuComplex **)&Cmn[0], ldc,
batchCount);
} else {
cublasComputeType_t compute_precision = toDataType(precision);
err = cublasGemmBatchedEx(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(void *) &alpha_p[0],
(void **)&Amk[0], CUDA_C_32F, lda,
(void **)&Bkn[0], CUDA_C_32F, ldb,
(void *) &beta_p[0],
(void **)&Cmn[0], CUDA_C_32F, ldc,
batchCount, compute_precision, CUBLAS_GEMM_DEFAULT);
}
auto err = cublasCgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(cuComplex *) &alpha_p[0],
(cuComplex **)&Amk[0], lda,
(cuComplex **)&Bkn[0], ldb,
(cuComplex *) &beta_p[0],
(cuComplex **)&Cmn[0], ldc,
batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
assert(precision == GridBLAS_PRECISION_DEFAULT);
int64_t m64=m;
int64_t n64=n;
int64_t k64=k;
int64_t lda64=lda;
int64_t ldb64=ldb;
int64_t ldc64=ldc;
int64_t batchCount64=batchCount;
oneapi::mkl::transpose iOpA;
oneapi::mkl::transpose iOpB;
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
&iOpA,
&iOpB,
&m64,&n64,&k64,
(ComplexF *) &alpha_p[0],
(const ComplexF **)&Amk[0], (const int64_t *)&lda64,
(const ComplexF **)&Bkn[0], (const int64_t *)&ldb64,
(ComplexF *) &beta_p[0],
(ComplexF **)&Cmn[0], (const int64_t *)&ldc64,
(int64_t)1,&batchCount64,std::vector<sycl::event>());
synchronise();
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
assert(precision == GridBLAS_PRECISION_DEFAULT);
// Need a default/reference implementation; use Eigen
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
@@ -984,336 +946,6 @@ public:
RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
}
/*
Inverse and Determinant
- CPU version uses Eigen
- GPU version uses LAPACK-compatible getrf / getri
Design comment: Eigen does not expose getrf / getri in a LAPACK compatible manner.
Overhead to go through getrf / getri for CPU version too large.
Current interface therefore only guarantees the inverse and determinant
functions on all platforms but not the getrf / getri ones.
*/
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
void inverseBatched(int64_t n,
deviceVector<ComplexD*> &Ann,
deviceVector<ComplexD*> &Cnn) {
int64_t batchCount = Ann.size();
assert(batchCount == Cnn.size());
thread_for(p,batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAnn(Ann[p],n,n);
Eigen::Map<Eigen::MatrixXcd> eCnn(Cnn[p],n,n);
eCnn = eAnn.inverse();
});
}
void inverseBatched(int64_t n,
deviceVector<ComplexF*> &Ann,
deviceVector<ComplexF*> &Cnn) {
int64_t batchCount = Ann.size();
assert(batchCount == Cnn.size());
thread_for(p,batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAnn(Ann[p],n,n);
Eigen::Map<Eigen::MatrixXcf> eCnn(Cnn[p],n,n);
eCnn = eAnn.inverse();
});
}
void determinantBatched(int64_t n,
deviceVector<ComplexD*> &Ann,
deviceVector<ComplexD*> &C) {
int64_t batchCount = Ann.size();
assert(batchCount == C.size());
thread_for(p,batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAnn(Ann[p],n,n);
*C[p] = eAnn.determinant();
});
}
void determinantBatched(int64_t n,
deviceVector<ComplexF*> &Ann,
deviceVector<ComplexF*> &C) {
int64_t batchCount = Ann.size();
assert(batchCount == C.size());
thread_for(p,batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAnn(Ann[p],n,n);
*C[p] = eAnn.determinant();
});
}
#else
#ifdef GRID_SYCL
template<typename T>
void getrfBatchedSYCL(int64_t n,
deviceVector<T*> &Ann,
deviceVector<int64_t> &ipiv,
deviceVector<int64_t> &info) {
int64_t batchCount = Ann.size();
static deviceVector<T> scratchpad;
int64_t sp_size = oneapi::mkl::lapack::getrf_batch_scratchpad_size<T>(*gridblasHandle, &n, &n, &n, (int64_t)1, &batchCount);
if (sp_size > scratchpad.size())
scratchpad.resize(sp_size);
static deviceVector<int64_t*> _ipiv;
if (batchCount > _ipiv.size())
_ipiv.resize(batchCount);
int64_t** p_ipiv = &_ipiv[0];
int64_t* pipiv = &ipiv[0];
accelerator_for(i, batchCount, 1, { p_ipiv[i] = &pipiv[i*n]; });
oneapi::mkl::lapack::getrf_batch(*gridblasHandle,
&n, &n,
(T **)&Ann[0],
&n,
(int64_t**)&_ipiv[0],
(int64_t)1, &batchCount,
(T*)&scratchpad[0], (int64_t)scratchpad.size(),
std::vector<sycl::event>());
synchronise();
}
#endif
void getrfBatched(int64_t n,
deviceVector<ComplexD*> &Ann,
deviceVector<int64_t> &ipiv,
deviceVector<int64_t> &info)
{
int64_t batchCount = Ann.size();
assert(ipiv.size()==batchCount*n);
assert(info.size()==batchCount);
#ifdef GRID_HIP
auto err = hipblasZgetrfBatched(gridblasHandle,(int)n,
(hipblasDoubleComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(int*) &info[0],
(int)batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
auto err = cublasZgetrfBatched(gridblasHandle, (int)n,
(cuDoubleComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(int*) &info[0],
(int)batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
getrfBatchedSYCL(n, Ann, ipiv, info);
#endif
}
void getrfBatched(int64_t n,
deviceVector<ComplexF*> &Ann,
deviceVector<int64_t> &ipiv,
deviceVector<int64_t> &info)
{
int64_t batchCount = Ann.size();
assert(ipiv.size()==batchCount*n);
assert(info.size()==batchCount);
#ifdef GRID_HIP
auto err = hipblasCgetrfBatched(gridblasHandle,(int)n,
(hipblasComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(int*) &info[0],
(int)batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
auto err = cublasCgetrfBatched(gridblasHandle, (int)n,
(cuComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(int*) &info[0],
(int)batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
getrfBatchedSYCL(n, Ann, ipiv, info);
#endif
}
#ifdef GRID_SYCL
template<typename T>
void getriBatchedSYCL(int64_t n,
deviceVector<T*> &Ann,
deviceVector<int64_t> &ipiv,
deviceVector<int64_t> &info,
deviceVector<T*> &Cnn) {
int64_t batchCount = Ann.size();
static deviceVector<T> scratchpad;
int64_t sp_size = oneapi::mkl::lapack::getri_batch_scratchpad_size<T>(*gridblasHandle, &n, &n, (int64_t)1, &batchCount);
if (sp_size > scratchpad.size())
scratchpad.resize(sp_size);
static deviceVector<int64_t*> _ipiv;
if (batchCount > _ipiv.size())
_ipiv.resize(batchCount);
int64_t** p_ipiv = &_ipiv[0];
int64_t* pipiv = &ipiv[0];
accelerator_for(i, batchCount, 1, { p_ipiv[i] = &pipiv[i*n]; });
oneapi::mkl::lapack::getri_batch(*gridblasHandle,
&n,
(T **)&Ann[0],
&n,
(int64_t**)p_ipiv,
(int64_t)1, &batchCount,
(T *)&scratchpad[0], (int64_t)scratchpad.size(),
std::vector<sycl::event>());
synchronise();
T** pA = &Ann[0];
T** pC = &Cnn[0];
accelerator_for(i, batchCount*n*n, 1, {
auto j = i / batchCount;
auto k = i % batchCount;
pC[k][j] = pA[k][j];
});
}
#endif
void getriBatched(int64_t n,
deviceVector<ComplexD*> &Ann,
deviceVector<int64_t> &ipiv,
deviceVector<int64_t> &info,
deviceVector<ComplexD*> &Cnn)
{
int64_t batchCount = Ann.size();
assert(ipiv.size()==batchCount*n);
assert(info.size()==batchCount);
assert(Cnn.size()==batchCount);
#ifdef GRID_HIP
auto err = hipblasZgetriBatched(gridblasHandle,(int)n,
(hipblasDoubleComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(hipblasDoubleComplex **)&Cnn[0], (int)n,
(int*) &info[0],
(int)batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
auto err = cublasZgetriBatched(gridblasHandle, (int)n,
(cuDoubleComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(cuDoubleComplex **)&Cnn[0], (int)n,
(int*) &info[0],
(int)batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
getriBatchedSYCL(n, Ann, ipiv, info, Cnn);
#endif
}
void getriBatched(int64_t n,
deviceVector<ComplexF*> &Ann,
deviceVector<int64_t> &ipiv,
deviceVector<int64_t> &info,
deviceVector<ComplexF*> &Cnn)
{
int64_t batchCount = Ann.size();
assert(ipiv.size()==batchCount*n);
assert(info.size()==batchCount);
assert(Cnn.size()==batchCount);
#ifdef GRID_HIP
auto err = hipblasCgetriBatched(gridblasHandle,(int)n,
(hipblasComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(hipblasComplex **)&Cnn[0], (int)n,
(int*) &info[0],
(int)batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
auto err = cublasCgetriBatched(gridblasHandle, (int)n,
(cuComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(cuComplex **)&Cnn[0], (int)n,
(int*) &info[0],
(int)batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
getriBatchedSYCL(n, Ann, ipiv, info, Cnn);
#endif
}
template<typename dtype>
void inverseBatched(int64_t n,
deviceVector<dtype*> &Ann, // this will be overwritten with LU decomposition
deviceVector<dtype*> &Cnn // this will be overwritten with the inverse
) {
int64_t batchCount = Ann.size();
RealD t0 = usecond();
deviceVector<int64_t> ipiv(batchCount*n);
deviceVector<int64_t> info(batchCount);
//RealD t1 = usecond();
getrfBatched(n, Ann, ipiv, info);
// test info for non-invertibility? set to nan if yes?
getriBatched(n, Ann, ipiv, info, Cnn);
//synchronise();
//RealD t2 = usecond();
//std::cout << GridLogMessage << "Temp " << t1-t0 << " rf/ri " << t2-t1 << std::endl;
}
template<typename dtype>
void determinantBatched(int64_t n,
deviceVector<dtype*> &Ann, // this will be overwritten with LU decomposition
deviceVector<dtype*> &C // this will be overwritten with determinant
) {
int64_t batchCount = Ann.size();
//RealD t0 = usecond();
deviceVector<int64_t> ipiv(batchCount*n);
deviceVector<int64_t> info(batchCount);
dtype** pAnn = (dtype**)&Ann[0];
dtype** pC = (dtype**)&C[0];
#if defined(GRID_CUDA) || defined(GRID_HIP)
int* pipiv = (int*)&ipiv[0];
#else
int64_t* pipiv = (int64_t*)&ipiv[0];
#endif
//RealD t1 = usecond();
getrfBatched(n, Ann, ipiv, info);
//RealD t2 = usecond();
accelerator_for(i,batchCount,1,{
dtype det = 1.0;
for (int64_t j=0;j<n;j++) {
det *= pAnn[i][n*j + j];
// branchless signs
det *= (pipiv[i*n + j] == j+1) ? (1.0) : (-1.0);
}
*pC[i] = det;
});
//RealD t3 = usecond();
//std::cout << GridLogMessage << "Temp " << t1 - t0 << " rf/ri " << t2-t1 << "final" << t3 - t2 << std::endl;
}
#endif
template<class CComplex>
double benchmark(int M, int N, int K, int BATCH)
{
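For orientation, a minimal usage sketch of the batched-inverse interface touched above (illustrative only: the sizes, the fill of A and the acceleratorPut pointer-list pattern are assumptions modelled on the benchmark() routine in this header, not code from the patch):

// Sketch: invert a batch of n x n complex matrices; Ann/Cnn hold one device pointer per matrix.
// On the GPU paths Ann is overwritten by its LU factors, as the comments in the header note.
int64_t n = 8, batchCount = 4;
deviceVector<ComplexD>  A   (batchCount*n*n);   // column-major matrices, back to back (filled elsewhere)
deviceVector<ComplexD>  Ainv(batchCount*n*n);
deviceVector<ComplexD*> Ann(batchCount);
deviceVector<ComplexD*> Cnn(batchCount);
for(int64_t b=0;b<batchCount;b++){
  ComplexD *a = &A[b*n*n];
  ComplexD *c = &Ainv[b*n*n];
  acceleratorPut(Ann[b],a);                     // build the device pointer lists
  acceleratorPut(Cnn[b],c);
}
GridBLAS BLAS;
BLAS.inverseBatched(n,Ann,Cnn);                 // Cnn[b] <- inverse of the b-th matrix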

View File

@@ -183,7 +183,6 @@ public:
int recv_from_rank,
int bytes);
int IsOffNode(int rank);
double StencilSendToRecvFrom(void *xmit,
int xmit_to_rank,int do_xmit,
void *recv,
@@ -202,9 +201,9 @@ public:
void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,void *xmit_comp,
void *xmit,
int xmit_to_rank,int do_xmit,
void *recv,void *recv_comp,
void *recv,
int recv_from_rank,int do_recv,
int xbytes,int rbytes,int dir);

View File

@@ -32,10 +32,6 @@ NAMESPACE_BEGIN(Grid);
Grid_MPI_Comm CartesianCommunicator::communicator_world;
#ifdef GRID_CHECKSUM_COMMS
extern void * Grid_backtrace_buffer[_NBACKTRACE];
uint64_t checksum_index = 1;
#endif
////////////////////////////////////////////
// First initialise of comms system
@@ -264,39 +260,32 @@ CartesianCommunicator::~CartesianCommunicator()
}
#ifdef USE_GRID_REDUCTION
void CartesianCommunicator::GlobalSum(float &f){
FlightRecorder::StepLog("GlobalSumP2P");
CartesianCommunicator::GlobalSumP2P(f);
}
void CartesianCommunicator::GlobalSum(double &d)
{
FlightRecorder::StepLog("GlobalSumP2P");
CartesianCommunicator::GlobalSumP2P(d);
}
#else
void CartesianCommunicator::GlobalSum(float &f){
FlightRecorder::StepLog("AllReduce float");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
FlightRecorder::StepLog("AllReduce double");
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
#endif
void CartesianCommunicator::GlobalSum(uint32_t &u){
FlightRecorder::StepLog("AllReduce uint32_t");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
FlightRecorder::StepLog("AllReduce uint64_t");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){
FlightRecorder::StepLog("AllReduceVector");
int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
@@ -305,31 +294,26 @@ void CartesianCommunicator::GlobalXOR(uint32_t &u){
assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint64_t &u){
FlightRecorder::StepLog("GlobalXOR");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalMax(float &f)
{
FlightRecorder::StepLog("GlobalMax");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_MAX,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalMax(double &d)
{
FlightRecorder::StepLog("GlobalMax");
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
FlightRecorder::StepLog("GlobalSumVector(float *)");
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
FlightRecorder::StepLog("GlobalSumVector(double *)");
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
@@ -404,16 +388,11 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
{
std::vector<CommsRequest_t> list;
double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
offbytes += StencilSendToRecvFromBegin(list,xmit,xmit,dest,dox,recv,recv,from,dor,bytes,bytes,dir);
offbytes += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
StencilSendToRecvFromComplete(list,dir);
return offbytes;
}
int CartesianCommunicator::IsOffNode(int rank)
{
int grank = ShmRanks[rank];
if ( grank == MPI_UNDEFINED ) return true;
else return false;
}
#ifdef ACCELERATOR_AWARE_MPI
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
@@ -428,9 +407,9 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
return 0.0; // Do nothing -- no preparation required
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,void *xmit_comp,
void *xmit,
int dest,int dox,
void *recv,void *recv_comp,
void *recv,
int from,int dor,
int xbytes,int rbytes,int dir)
{
@@ -454,8 +433,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
if ( dor ) {
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
// std::cout << " StencilSendToRecvFrom "<<dir<<" MPI_Irecv "<<std::hex<<recv<<std::dec<<std::endl;
ierr=MPI_Irecv(recv_comp, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0);
list.push_back(rrq);
off_node_bytes+=rbytes;
@@ -464,7 +442,6 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
else {
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
assert(shm!=NULL);
// std::cout << " StencilSendToRecvFrom "<<dir<<" CopyDeviceToDevice recv "<<std::hex<<recv<<" remote "<<shm <<std::dec<<std::endl;
acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
}
#endif
@@ -473,7 +450,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
ierr =MPI_Isend(xmit_comp, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list.push_back(xrq);
off_node_bytes+=xbytes;
@@ -572,11 +549,6 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
* - post device - host send buffer transfer asynch
*/
#ifdef GRID_CHECKSUM_COMMS
rbytes += 8;
xbytes += 8;
#endif
if ( dor ) {
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
@@ -589,7 +561,6 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
srq.req = rrq;
srq.host_buf = host_recv;
srq.device_buf = recv;
srq.tag = tag;
list.push_back(srq);
off_node_bytes+=rbytes;
}
@@ -603,16 +574,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
host_xmit = this->HostBufferMalloc(xbytes);
CommsRequest_t srq;
#ifdef GRID_CHECKSUM_COMMS
uint64_t xbytes_data = xbytes - 8;
srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes_data); // Make this Asynch
assert(xbytes % 8 == 0);
// flip one bit so that a zero buffer is not consistent
uint64_t xsum = checksum_gpu((uint64_t*)xmit, xbytes_data / 8) ^ (checksum_index + 1 + 1000 * tag);
*(uint64_t*)(((char*)host_xmit) + xbytes_data) = xsum;
#else
srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
#endif
// ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
// assert(ierr==0);
@@ -654,11 +616,7 @@ void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequ
if ( flag ) {
// std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl;
#ifdef GRID_CHECKSUM_COMMS
acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes - 8);
#else
acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes);
#endif
list[idx].PacketType=InterNodeReceiveHtoD;
} else {
pending ++;
@@ -711,9 +669,9 @@ void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsReque
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,void *xmit_comp,
void *xmit,
int dest,int dox,
void *recv,void *recv_comp,
void *recv,
int from,int dor,
int xbytes,int rbytes,int dir)
{
@@ -821,40 +779,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
// acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
// }
// }
#ifdef GRID_CHECKSUM_COMMS
for(int r=0;r<list.size();r++){
if ( list[r].PacketType == InterNodeReceiveHtoD ) {
uint64_t rbytes_data = list[r].bytes - 8;
uint64_t expected_cs = *(uint64_t*)(((char*)list[r].host_buf) + rbytes_data);
uint64_t computed_cs = checksum_gpu((uint64_t*)list[r].device_buf, rbytes_data / 8) ^ (checksum_index + 1 + 1000 * list[r].tag); //
if (expected_cs != computed_cs) {
// TODO: error message, backtrace, quit
fprintf(stderr, "GRID_CHECKSUM_COMMS error:\n");
fprintf(stderr, " processor = %d\n", (int)_processor);
for(int d=0;d<_processors.size();d++)
fprintf(stderr, " processor_coord[%d] = %d\n", d, _processor_coor[d]);
fprintf(stderr, " hostname: %s\n", GridHostname());
fprintf(stderr, " expected_cs: %ld\n", expected_cs);
fprintf(stderr, " computed_cs: %ld\n", computed_cs);
fprintf(stderr, " dest: %d\n", list[r].dest);
fprintf(stderr, " tag: %d\n", list[r].tag);
fprintf(stderr, " commdir: %d\n", list[r].commdir);
fprintf(stderr, " bytes: %ld\n", (uint64_t)list[r].bytes);
fflush(stderr);
// backtrace
int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE);
backtrace_symbols_fd(Grid_backtrace_buffer,symbols, 2);
exit(1);
}
}
}
checksum_index += 1;
#endif
list.resize(0); // Delete the list
this->HostBufferFreeAll(); // Clean up the buffer allocs
@@ -869,7 +794,6 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
void CartesianCommunicator::StencilBarrier(void)
{
FlightRecorder::StepLog("NodeBarrier");
MPI_Barrier (ShmComm);
}
//void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
@@ -877,13 +801,11 @@ void CartesianCommunicator::StencilBarrier(void)
//}
void CartesianCommunicator::Barrier(void)
{
FlightRecorder::StepLog("GridBarrier");
int ierr = MPI_Barrier(communicator);
assert(ierr==0);
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
FlightRecorder::StepLog("Broadcast");
int ierr=MPI_Bcast(data,
bytes,
MPI_BYTE,
@@ -897,13 +819,11 @@ int CartesianCommunicator::RankWorld(void){
return r;
}
void CartesianCommunicator::BarrierWorld(void){
FlightRecorder::StepLog("BarrierWorld");
int ierr = MPI_Barrier(communicator_world);
assert(ierr==0);
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
FlightRecorder::StepLog("BroadcastWorld");
int ierr= MPI_Bcast(data,
bytes,
MPI_BYTE,
@@ -926,7 +846,6 @@ void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,
}
void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes)
{
FlightRecorder::StepLog("AllToAll");
// MPI is a pain and uses "int" arguments
// 64*64*64*128*16 == 500Million elements of data.
// When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.

View File

@@ -124,8 +124,6 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
dest=0;
}
int CartesianCommunicator::IsOffNode(int rank) { return false; }
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int xmit_to_rank,int dox,
void *recv,

View File

@@ -43,6 +43,10 @@ Author: Christoph Lehner <christoph@lhnr.de>
#define GRID_SYCL_LEVEL_ZERO_IPC
#define SHM_SOCKETS
#else
#ifdef HAVE_NUMAIF_H
#warning " Using NUMAIF "
#include <numaif.h>
#endif
#endif
#include <syscall.h>
#endif
@@ -539,21 +543,49 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifndef ACCELERATOR_AWARE_MPI
// printf("Host buffer allocate for GPU non-aware MPI\n");
#if 0
HostCommBuf= acceleratorAllocHost(bytes);
#else
HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
#if 0
#warning "Moving host buffers to specific NUMA domain"
int numa;
char *numa_name=(char *)getenv("MPI_BUF_NUMA");
if(numa_name) {
unsigned long page_size = sysconf(_SC_PAGESIZE);
numa = atoi(numa_name);
unsigned long page_count = bytes/page_size;
std::vector<void *> pages(page_count);
std::vector<int> nodes(page_count,numa);
std::vector<int> status(page_count,-1);
for(unsigned long p=0;p<page_count;p++){
pages[p] =(void *) ((uint64_t) HostCommBuf + p*page_size);
}
int ret = move_pages(0,
page_count,
&pages[0],
&nodes[0],
&status[0],
MPOL_MF_MOVE);
printf("Host buffer move to numa domain %d : move_pages returned %d\n",numa,ret);
if (ret) perror(" move_pages failed for reason:");
}
#endif
acceleratorPin(HostCommBuf,bytes);
#endif
#endif
ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) {
std::cerr << "SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
exit(EXIT_FAILURE);
}
if ( WorldRank == 0 ){
std::cout << Mheader " acceleratorAllocDevice "<< bytes
std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl;
}
SharedMemoryZero(ShmCommBuf,bytes);
if ( WorldRank == 0 ){
std::cout<< Mheader "Setting up IPC"<<std::endl;
}
std::cout<< "Setting up IPC"<<std::endl;
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Loop over ranks/gpu's on our node
///////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -584,6 +616,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
if ( err != ZE_RESULT_SUCCESS ) {
std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE);
} else {
std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
}
memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
handle.pid = getpid();
@@ -642,12 +676,12 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef SHM_SOCKETS
myfd=UnixSockets::RecvFileDescriptor();
#else
// std::cout<<"mapping seeking remote pid/fd "
// <<handle.pid<<"/"
// <<handle.fd<<std::endl;
std::cout<<"mapping seeking remote pid/fd "
<<handle.pid<<"/"
<<handle.fd<<std::endl;
int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
// std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
// int myfd = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
myfd = syscall(438,pidfd,handle.fd,0);
int err_t = errno;
@@ -657,7 +691,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
assert(0);
}
#endif
// std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
memcpy((void *)&ihandle,(void *)&handle.ze,sizeof(ihandle));
memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));
@@ -666,6 +700,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl; std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} else {
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle pointer is "<<std::hex<<thisBuf<<std::dec<<std::endl;
} }
assert(thisBuf!=nullptr); assert(thisBuf!=nullptr);
} }
@@ -746,7 +783,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
WorldShmCommBufs[r] =ptr;
// std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
}
std::cout<< Mheader " Intra-node IPC setup is complete "<<std::endl;
_ShmAlloc=1;
_ShmAllocBytes = bytes;
};
@@ -954,7 +990,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
}
#endif
// SharedMemoryTest();
SharedMemoryTest();
}
//////////////////////////////////////////////////////////////////
// On node barrier
@@ -1003,13 +1039,11 @@ void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
{
int gpeer = ShmRanks[rank];
assert(gpeer!=ShmRank); // never send to self
// std::cout << "ShmBufferTranslate for rank " << rank<<" peer "<<gpeer<<std::endl;
if (gpeer == MPI_UNDEFINED){
return NULL;
} else {
uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
// std::cout << "ShmBufferTranslate : local,offset,remote "<<std::hex<<local_p<<" "<<offset<<" "<<remote<<std::dec<<std::endl;
return (void *) remote;
}
}
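The translation just above assumes every rank allocates its shared comms segment with identical size and internal layout; reduced to a single illustrative line (names as in the hunk, not code from the patch):

// remote pointer = peer's base + (my pointer - my base)
void *remote = (void *)((uint64_t)ShmCommBufs[gpeer] + ((uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank]));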

View File

@@ -122,10 +122,10 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
{
acceleratorMemSet(dest,0,bytes);
}
//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
//{
// acceleratorCopyToDevice(src,dest,bytes);
//}
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
{
acceleratorCopyToDevice(src,dest,bytes);
}
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality

View File

@@ -202,7 +202,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,deviceVector<
{
auto buffer_p = & buffer[0];
auto table = MapCshiftTable();
autoView( rhs_v, rhs, AcceleratorWriteDiscard);
autoView( rhs_v, rhs, AcceleratorWrite);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
});
@@ -228,7 +228,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
if(cbmask ==0x3 ) {
int _slice_stride = rhs.Grid()->_slice_stride[dimension];
int _slice_block = rhs.Grid()->_slice_block[dimension];
autoView( rhs_v , rhs, AcceleratorWriteDiscard);
autoView( rhs_v , rhs, AcceleratorWrite);
accelerator_for(nn,e1*e2,1,{
int n = nn%e1;
int b = nn/e1;
@@ -302,7 +302,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
{
auto table = MapCshiftTable();
autoView(rhs_v , rhs, AcceleratorRead);
autoView(lhs_v , lhs, AcceleratorWriteDiscard);
autoView(lhs_v , lhs, AcceleratorWrite);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
});

View File

@@ -29,12 +29,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef _GRID_CSHIFT_MPI_H_
#define _GRID_CSHIFT_MPI_H_
NAMESPACE_BEGIN(Grid);
#ifdef GRID_CHECKSUM_COMMS
extern uint64_t checksum_index;
#endif
const int Cshift_verbose=0;
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
{
@@ -130,9 +126,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
#ifndef ACCELERATOR_AWARE_MPI
int pad = (8 + sizeof(vobj) - 1) / sizeof(vobj);
static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size+pad);
static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size+pad);
static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
#endif
int cb= (cbmask==0x2)? Odd : Even;
@@ -148,11 +143,9 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc==0) {
FlightRecorder::StepLog("Cshift_Copy_plane");
tcopy-=usecond();
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
tcopy+=usecond();
FlightRecorder::StepLog("Cshift_Copy_plane_complete");
} else {
int words = buffer_size;
@@ -160,11 +153,9 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int bytes = words * sizeof(vobj);
FlightRecorder::StepLog("Cshift_Gather_plane");
tgather-=usecond();
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
tgather+=usecond();
FlightRecorder::StepLog("Cshift_Gather_plane_complete");
// int rank = grid->_processor;
int recv_from_rank;
@@ -175,7 +166,6 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
tcomms-=usecond();
grid->Barrier();
FlightRecorder::StepLog("Cshift_SendRecv");
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
@@ -185,46 +175,17 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
#else
// bouncy bouncy
acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
#ifdef GRID_CHECKSUM_COMMS
assert(bytes % 8 == 0);
checksum_index++;
uint64_t xsum = checksum_gpu((uint64_t*)&send_buf[0], bytes / 8) ^ (1 + checksum_index);
*(uint64_t*)(((char*)&hsend_buf[0]) + bytes) = xsum;
bytes += 8;
#endif
grid->SendToRecvFrom((void *)&hsend_buf[0],
xmit_to_rank,
(void *)&hrecv_buf[0],
recv_from_rank,
bytes);
#ifdef GRID_CHECKSUM_COMMS
bytes -= 8;
acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
uint64_t expected_cs = *(uint64_t*)(((char*)&hrecv_buf[0]) + bytes);
uint64_t computed_cs = checksum_gpu((uint64_t*)&recv_buf[0], bytes / 8) ^ (1 + checksum_index);
std::cout << GridLogComms<< " Cshift: "
<<" dim"<<dimension
<<" shift "<<shift
<< " rank "<< grid->ThisRank()
<<" Coor "<<grid->ThisProcessorCoor()
<<" send "<<xsum<<" to "<<xmit_to_rank
<<" recv "<<computed_cs<<" from "<<recv_from_rank
<<std::endl;
assert(expected_cs == computed_cs);
#else
acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
#endif
#endif
FlightRecorder::StepLog("Cshift_SendRecv_complete");
xbytes+=bytes;
grid->Barrier();
tcomms+=usecond();
FlightRecorder::StepLog("Cshift_barrier_complete");
tscatter-=usecond();
Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
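The GRID_CHECKSUM_COMMS branch visible on one side of this hunk frames every message as the payload followed by an 8-byte checksum trailer. A self-contained host-side sketch of that framing, with hypothetical helper names (not Grid API) and a plain XOR standing in for the GPU checksum shown later in this compare:

#include <cstdint>
#include <cstring>

// Hypothetical illustration of the "payload + 8-byte trailer" framing; bytes must be a multiple
// of 8, mirroring the assert(bytes % 8 == 0) in the checksummed branch.
static uint64_t checksum64(const uint64_t *v, uint64_t words) {   // simplest stand-in: plain XOR
  uint64_t cs = 0;
  for (uint64_t i = 0; i < words; i++) cs ^= v[i];
  return cs;
}
static void pack_with_trailer(const void *payload, uint64_t bytes, char *wire, uint64_t salt) {
  std::memcpy(wire, payload, bytes);                                      // wire must hold bytes + 8
  uint64_t cs = checksum64((const uint64_t *)payload, bytes / 8) ^ salt;  // salt mixes in checksum_index / tag
  std::memcpy(wire + bytes, &cs, 8);
}
static bool verify_trailer(const void *payload, uint64_t bytes, const char *wire, uint64_t salt) {
  uint64_t expected;
  std::memcpy(&expected, wire + bytes, 8);
  return expected == (checksum64((const uint64_t *)payload, bytes / 8) ^ salt);
}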
@@ -288,16 +249,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
recv_buf_extract[s].resize(buffer_size);
}
#ifndef ACCELERATOR_AWARE_MPI
#ifdef GRID_CHECKSUM_COMMS
buffer_size += (8 + sizeof(vobj) - 1) / sizeof(vobj);
#endif
static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
#ifdef GRID_CHECKSUM_COMMS
buffer_size -= (8 + sizeof(vobj) - 1) / sizeof(vobj);
#endif
hostVector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
hostVector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
#endif
int bytes = buffer_size*sizeof(scalar_object);
@@ -360,37 +313,12 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
#else
// bouncy bouncy
acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes);
#ifdef GRID_CHECKSUM_COMMS
assert(bytes % 8 == 0);
checksum_index++;
uint64_t xsum = checksum_gpu((uint64_t*)send_buf_extract_mpi, bytes / 8) ^ (1 + checksum_index);
*(uint64_t*)(((char*)&hsend_buf[0]) + bytes) = xsum;
bytes += 8;
#endif
grid->SendToRecvFrom((void *)&hsend_buf[0],
xmit_to_rank,
(void *)&hrecv_buf[0],
recv_from_rank,
bytes);
#ifdef GRID_CHECKSUM_COMMS
bytes -= 8;
acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
uint64_t expected_cs = *(uint64_t*)(((char*)&hrecv_buf[0]) + bytes);
uint64_t computed_cs = checksum_gpu((uint64_t*)recv_buf_extract_mpi, bytes / 8) ^ (1 + checksum_index);
std::cout << GridLogComms<< " Cshift_comms_simd: "
<<" dim"<<dimension
<<" shift "<<shift
<< " rank "<< grid->ThisRank()
<<" Coor "<<grid->ThisProcessorCoor()
<<" send "<<xsum<<" to "<<xmit_to_rank
<<" recv "<<computed_cs<<" from "<<recv_from_rank
<<std::endl;
assert(expected_cs == computed_cs);
#else
acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
#endif
#endif
xbytes+=bytes;

View File

@@ -236,7 +236,7 @@ public:
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
vobj vtmp;
vtmp = r;
#if 1
#if 0
deviceVector<vobj> vvtmp(1);
acceleratorPut(vvtmp[0],vtmp);
vobj *vvtmp_p = & vvtmp[0];

View File

@@ -325,8 +325,8 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
assert(ok);
}
FlightRecorder::StepLog("Start global sum");
grid->GlobalSumP2P(nrm);
// grid->GlobalSum(nrm);
// grid->GlobalSumP2P(nrm);
grid->GlobalSum(nrm);
FlightRecorder::StepLog("Finished global sum");
// std::cout << " norm "<< nrm << " p2p norm "<<nrmck<<std::endl;
FlightRecorder::ReductionLog(local,real(nrm));

View File

@@ -87,25 +87,6 @@ template<class Word> Word svm_xor(Word *vec,uint64_t L)
theGridAccelerator->wait();
return ret;
}
template<class Word> Word checksum_gpu(Word *vec,uint64_t L)
{
Word identity; identity=0;
Word ret = 0;
{
sycl::buffer<Word, 1> abuff(&ret, {1});
theGridAccelerator->submit([&](sycl::handler &cgh) {
auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
cgh.parallel_for(sycl::range<1>{L},
Reduction,
[=] (sycl::id<1> index, auto &sum) {
auto l = index % 61;
sum ^= vec[index]<<l | vec[index]>>(64-l);
});
});
}
theGridAccelerator->wait();
return ret;
}
NAMESPACE_END(Grid);
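The checksum_gpu SYCL reduction in this hunk computes a rotate-and-XOR digest over 64-bit words; a host-side reference sketch of the same idea (assumption: the l==0 guard is added here to avoid an undefined 64-bit shift, it is not present in the kernel above):

#include <cstdint>

// Each word is rotated left by (index % 61) bits and XORed into the accumulator.
static uint64_t checksum_host(const uint64_t *vec, uint64_t L) {
  uint64_t ret = 0;
  for (uint64_t i = 0; i < L; i++) {
    unsigned l = i % 61;
    uint64_t rot = l ? ((vec[i] << l) | (vec[i] >> (64 - l))) : vec[i];
    ret ^= rot;
  }
  return ret;
}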

View File

@@ -106,47 +106,6 @@ public:
}
};
#ifdef GRID_LOG_VIEWS
// Little autoscope assister
template<class View>
class ViewCloser
{
View v; // Take a copy of view and call view close when I go out of scope automatically
const char* filename; int line, mode;
public:
ViewCloser(View &_v, const char* _filename, int _line, int _mode) :
v(_v), filename(_filename), line(_line), mode(_mode) {
switch (mode){
case AcceleratorRead:
case AcceleratorWrite:
case CpuRead:
case CpuWrite:
ViewLogger::Log(filename, line, 1, mode, &v[0], v.size() * sizeof(v[0]));
break;
}
};
~ViewCloser() {
switch (mode) {
case AcceleratorWriteDiscard:
case AcceleratorWrite:
case CpuWrite:
ViewLogger::Log(filename, line, -1, mode, &v[0], v.size() * sizeof(v[0]));
break;
}
v.ViewClose();
}
};
#define autoView(l_v,l,mode) \
auto l_v = l.View(mode); \
ViewCloser<decltype(l_v)> _autoView##l_v(l_v,__FILE__,__LINE__,mode);
#else
// Little autoscope assister
template<class View>
class ViewCloser
@@ -160,7 +119,6 @@ class ViewCloser
#define autoView(l_v,l,mode) \
auto l_v = l.View(mode); \
ViewCloser<decltype(l_v)> _autoView##l_v(l_v);
#endif
/////////////////////////////////////////////////////////////////////////////////////////
// Lattice expression types used by ET to assemble the AST
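Either variant of the autoView macro above is used the same way: it opens a view for the enclosing scope and the ViewCloser destructor closes it (and, with GRID_LOG_VIEWS, logs the open and close via ViewLogger). A minimal usage sketch; the Lattice field and the loop body are placeholders, not code from this patch:

// The view opens here and is closed by ~ViewCloser() at the end of the scope.
{
  autoView(field_v, field, CpuWrite);          // field: some Lattice<vobj>
  for(int64_t ss=0; ss<field.Grid()->oSites(); ss++) field_v[ss] = Zero();
}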

View File

@@ -69,7 +69,6 @@ GridLogger GridLogMemory (1, "Memory", GridLogColours, "NORMAL");
GridLogger GridLogTracing(1, "Tracing", GridLogColours, "NORMAL");
GridLogger GridLogDebug (1, "Debug", GridLogColours, "PURPLE");
GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
GridLogger GridLogComms (1, "Comms", GridLogColours, "BLUE");
GridLogger GridLogDslash (1, "Dslash", GridLogColours, "BLUE");
GridLogger GridLogIterative (1, "Iterative", GridLogColours, "BLUE");
GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
@@ -85,7 +84,6 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
GridLogDebug.Active(0);
GridLogPerformance.Active(0);
GridLogDslash.Active(0);
GridLogComms.Active(0);
GridLogIntegrator.Active(1);
GridLogColours.Active(0);
GridLogHMC.Active(1);
@@ -99,7 +97,6 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1); if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1);
if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1); if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1);
if (logstreams[i] == std::string("Dslash")) GridLogDslash.Active(1); if (logstreams[i] == std::string("Dslash")) GridLogDslash.Active(1);
if (logstreams[i] == std::string("Comms")) GridLogComms.Active(1);
if (logstreams[i] == std::string("NoIntegrator"))GridLogIntegrator.Active(0); if (logstreams[i] == std::string("NoIntegrator"))GridLogIntegrator.Active(0);
if (logstreams[i] == std::string("NoHMC")) GridLogHMC.Active(0); if (logstreams[i] == std::string("NoHMC")) GridLogHMC.Active(0);
if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1); if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1);

View File

@@ -180,7 +180,6 @@ extern GridLogger GridLogError;
extern GridLogger GridLogWarning;
extern GridLogger GridLogMessage;
extern GridLogger GridLogDebug;
extern GridLogger GridLogComms;
extern GridLogger GridLogPerformance;
extern GridLogger GridLogDslash;
extern GridLogger GridLogIterative;

View File

@@ -154,12 +154,6 @@ public:
StencilImpl Stencil; StencilImpl Stencil;
StencilImpl StencilEven; StencilImpl StencilEven;
StencilImpl StencilOdd; StencilImpl StencilOdd;
void SloppyComms(int sloppy)
{
Stencil.SetSloppyComms(sloppy);
StencilEven.SetSloppyComms(sloppy);
StencilOdd.SetSloppyComms(sloppy);
}
// Copy of the gauge field , with even and odd subsets // Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu; DoubledGaugeField Umu;

View File

@@ -179,12 +179,6 @@ public:
StencilImpl Stencil; StencilImpl Stencil;
StencilImpl StencilEven; StencilImpl StencilEven;
StencilImpl StencilOdd; StencilImpl StencilOdd;
void SloppyComms(int sloppy)
{
Stencil.SetSloppyComms(sloppy);
StencilEven.SetSloppyComms(sloppy);
StencilOdd.SetSloppyComms(sloppy);
}
// Copy of the gauge field , with even and odd subsets // Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu; DoubledGaugeField Umu;

View File

@@ -146,12 +146,6 @@ public:
StencilImpl Stencil; StencilImpl Stencil;
StencilImpl StencilEven; StencilImpl StencilEven;
StencilImpl StencilOdd; StencilImpl StencilOdd;
void SloppyComms(int sloppy)
{
Stencil.SetSloppyComms(sloppy);
StencilEven.SetSloppyComms(sloppy);
StencilOdd.SetSloppyComms(sloppy);
}
// Copy of the gauge field , with even and odd subsets // Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu; DoubledGaugeField Umu;

View File

@@ -32,6 +32,209 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////
// Wilson compressor will need FaceGather policies for:
// Periodic, Dirichlet, and partial Dirichlet for DWF
///////////////////////////////////////////////////////////////
const int dwf_compressor_depth=2;
#define DWF_COMPRESS
class FaceGatherPartialDWF
{
public:
#ifdef DWF_COMPRESS
static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);};
#else
static int PartialCompressionFactor(GridBase *grid) { return 1;}
#endif
template<class vobj,class cobj,class compressor>
static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
const Lattice<vobj> &rhs,
cobj *buffer,
compressor &compress,
int off,int so,int partial)
{
//DWF only hack: If a direction that is OFF node we use Partial Dirichlet
// Shrinks local and remote comms buffers
GridBase *Grid = rhs.Grid();
int Ls = Grid->_rdimensions[0];
#ifdef DWF_COMPRESS
int depth=dwf_compressor_depth;
#else
int depth=Ls/2;
#endif
std::pair<int,int> *table_v = & table[0];
auto rhs_v = rhs.View(AcceleratorRead);
int vol=table.size()/Ls;
accelerator_forNB( idx,table.size(), vobj::Nsimd(), {
Integer i=idx/Ls;
Integer s=idx%Ls;
Integer sc=depth+s-(Ls-depth);
if(s<depth) compress.Compress(buffer[off+i+s*vol],rhs_v[so+table_v[idx].second]);
if(s>=Ls-depth) compress.Compress(buffer[off+i+sc*vol],rhs_v[so+table_v[idx].second]);
});
rhs_v.ViewClose();
}
template<class decompressor,class Decompression>
static void DecompressFace(decompressor decompress,Decompression &dd)
{
auto Ls = dd.dims[0];
#ifdef DWF_COMPRESS
int depth=dwf_compressor_depth;
#else
int depth=Ls/2;
#endif
// Just pass in the Grid
auto kp = dd.kernel_p;
auto mp = dd.mpi_p;
int size= dd.buffer_size;
int vol= size/Ls;
accelerator_forNB(o,size,1,{
int idx=o/Ls;
int s=o%Ls;
if ( s < depth ) {
int oo=s*vol+idx;
kp[o]=mp[oo];
} else if ( s >= Ls-depth ) {
int sc = depth + s - (Ls-depth);
int oo=sc*vol+idx;
kp[o]=mp[oo];
} else {
kp[o] = Zero();//fill rest with zero if partial dirichlet
}
});
}
////////////////////////////////////////////////////////////////////////////////////////////
// Need to gather *interior portions* for ALL s-slices in simd directions
// Do the gather as need to treat SIMD lanes differently, and insert zeroes on receive side
// Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2.
////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj,class cobj,class compressor>
static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
compressor &compress,int type,int partial)
{
GridBase *Grid = rhs.Grid();
int Ls = Grid->_rdimensions[0];
#ifdef DWF_COMPRESS
int depth=dwf_compressor_depth;
#else
int depth = Ls/2;
#endif
// insertion of zeroes...
assert( (table.size()&0x1)==0);
int num=table.size()/2;
int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
auto rhs_v = rhs.View(AcceleratorRead);
auto p0=&pointers[0][0];
auto p1=&pointers[1][0];
auto tp=&table[0];
int nnum=num/Ls;
accelerator_forNB(j, num, vobj::Nsimd(), {
// Reorders both local and remote comms buffers
//
int s = j % Ls;
int sp1 = (s+depth)%Ls; // peri incremented s slice
int hxyz= j/Ls;
int xyz0= hxyz*2; // xyzt part of coor
int xyz1= hxyz*2+1;
int jj= hxyz + sp1*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice ....
int kk0= xyz0*Ls + s ; // s=0 goes to s=1
int kk1= xyz1*Ls + s ; // s=Ls-1 -> s=0
compress.CompressExchange(p0[jj],p1[jj],
rhs_v[so+tp[kk0 ].second], // Same s, consecutive xyz sites
rhs_v[so+tp[kk1 ].second],
type);
});
rhs_v.ViewClose();
}
// Merge routine is for SIMD faces
template<class decompressor,class Merger>
static void MergeFace(decompressor decompress,Merger &mm)
{
auto Ls = mm.dims[0];
#ifdef DWF_COMPRESS
int depth=dwf_compressor_depth;
#else
int depth = Ls/2;
#endif
int num= mm.buffer_size/2; // relate vol and Ls to buffer size
auto mp = &mm.mpointer[0];
auto vp0= &mm.vpointers[0][0]; // First arg is exchange first
auto vp1= &mm.vpointers[1][0];
auto type= mm.type;
int nnum = num/Ls;
accelerator_forNB(o,num,Merger::Nsimd,{
int s=o%Ls;
int hxyz=o/Ls; // xyzt related component
int xyz0=hxyz*2;
int xyz1=hxyz*2+1;
int sp = (s+depth)%Ls;
int jj= hxyz + sp*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice ....
int oo0= s+xyz0*Ls;
int oo1= s+xyz1*Ls;
// same ss0, ss1 pair goes to new layout
decompress.Exchange(mp[oo0],mp[oo1],vp0[jj],vp1[jj],type);
});
}
};
class FaceGatherDWFMixedBCs
{
public:
#ifdef DWF_COMPRESS
static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);};
#else
static int PartialCompressionFactor(GridBase *grid) {return 1;}
#endif
template<class vobj,class cobj,class compressor>
static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
const Lattice<vobj> &rhs,
cobj *buffer,
compressor &compress,
int off,int so,int partial)
{
// std::cout << " face gather simple DWF partial "<<partial <<std::endl;
if(partial) FaceGatherPartialDWF::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
else FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
}
template<class vobj,class cobj,class compressor>
static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
compressor &compress,int type,int partial)
{
// std::cout << " face gather exch DWF partial "<<partial <<std::endl;
if(partial) FaceGatherPartialDWF::Gather_plane_exchange(table,rhs,pointers,dimension, plane,cbmask,compress,type,partial);
else FaceGatherSimple::Gather_plane_exchange (table,rhs,pointers,dimension, plane,cbmask,compress,type,partial);
}
template<class decompressor,class Merger>
static void MergeFace(decompressor decompress,Merger &mm)
{
int partial = mm.partial;
// std::cout << " merge DWF partial "<<partial <<std::endl;
if ( partial ) FaceGatherPartialDWF::MergeFace(decompress,mm);
else FaceGatherSimple::MergeFace(decompress,mm);
}
template<class decompressor,class Decompression>
static void DecompressFace(decompressor decompress,Decompression &dd)
{
int partial = dd.partial;
// std::cout << " decompress DWF partial "<<partial <<std::endl;
if ( partial ) FaceGatherPartialDWF::DecompressFace(decompress,dd);
else FaceGatherSimple::DecompressFace(decompress,dd);
}
};
///////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////
// optimised versions supporting half precision too??? Deprecate // optimised versions supporting half precision too??? Deprecate
///////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////
@@ -39,7 +242,8 @@ NAMESPACE_BEGIN(Grid);
//Could make FaceGather a template param, but then behaviour is runtime not compile time //Could make FaceGather a template param, but then behaviour is runtime not compile time
template<class _HCspinor,class _Hspinor,class _Spinor, class projector> template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
class WilsonCompressorTemplate : public FaceGatherSimple class WilsonCompressorTemplate : public FaceGatherDWFMixedBCs
// : public FaceGatherSimple
{ {
public: public:
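The FaceGatherPartialDWF class added above sends only the first and last depth s-slices of each face (a factor Ls/(2*depth) saving, matching PartialCompressionFactor) and zero-fills the interior slices on the receiving side. A standalone sketch of just that slice bookkeeping, with the spatial index dropped for clarity (not Grid code):

    // Sketch only: one fifth-dimension column of a face, compressed and restored.
    #include <vector>
    #include <cstdio>

    int main() {
      const int Ls = 12, depth = 2;                 // dwf_compressor_depth = 2 in the hunk
      std::vector<double> face(Ls);
      for (int s = 0; s < Ls; s++) face[s] = 100 + s;

      // Compress: pack 2*depth slices onto the wire instead of Ls
      std::vector<double> wire(2 * depth, 0.0);
      for (int s = 0; s < Ls; s++) {
        if (s < depth)       wire[s] = face[s];                         // low boundary slices
        if (s >= Ls - depth) wire[depth + s - (Ls - depth)] = face[s];  // high boundary slices
      }

      // Decompress: boundary slices restored, interior zero-filled
      std::vector<double> out(Ls, 0.0);
      for (int s = 0; s < Ls; s++) {
        if (s < depth)          out[s] = wire[s];
        else if (s >= Ls-depth) out[s] = wire[depth + s - (Ls - depth)];
        else                    out[s] = 0.0;       // "fill rest with zero if partial dirichlet"
      }
      for (int s = 0; s < Ls; s++)
        std::printf("s=%2d  in=%5.1f  out=%5.1f\n", s, face[s], out[s]);
    }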

View File

@@ -165,12 +165,6 @@ public:
StencilImpl Stencil; StencilImpl Stencil;
StencilImpl StencilEven; StencilImpl StencilEven;
StencilImpl StencilOdd; StencilImpl StencilOdd;
void SloppyComms(int sloppy)
{
Stencil.SetSloppyComms(sloppy);
StencilEven.SetSloppyComms(sloppy);
StencilOdd.SetSloppyComms(sloppy);
}
// Copy of the gauge field , with even and odd subsets // Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu; DoubledGaugeField Umu;

View File

@@ -204,14 +204,7 @@ public:
DoubledGaugeField Umu; DoubledGaugeField Umu;
DoubledGaugeField UmuEven; DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd; DoubledGaugeField UmuOdd;
void SloppyComms(int sloppy)
{
Stencil.SetSloppyComms(sloppy);
StencilEven.SetSloppyComms(sloppy);
StencilOdd.SetSloppyComms(sloppy);
}
// Comms buffer // Comms buffer
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf; // std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;

View File

@@ -57,7 +57,7 @@ public:
{ {
// RealD eps = 1.0; // RealD eps = 1.0;
// std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl; std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl;
std::vector<Coeff_t> zgamma(this->Ls); std::vector<Coeff_t> zgamma(this->Ls);
for(int s=0;s<this->Ls;s++){ for(int s=0;s<this->Ls;s++){
zgamma[s] = gamma[s]; zgamma[s] = gamma[s];

View File

@@ -535,7 +535,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
{ {
autoView(U_v , U,AcceleratorRead); autoView(U_v , U,AcceleratorRead);
autoView(in_v , in,AcceleratorRead); autoView(in_v , in,AcceleratorRead);
autoView(out_v,out,AcceleratorWriteDiscard); autoView(out_v,out,AcceleratorWrite);
autoView(st_v , st,AcceleratorRead); autoView(st_v , st,AcceleratorRead);
KERNEL_CALL_ID(GenericDhopSite); KERNEL_CALL_ID(GenericDhopSite);
} }

View File

@@ -118,7 +118,7 @@ protected:
GaugeK); // derivative of SmearBase GaugeK); // derivative of SmearBase
return SigmaK; return SigmaK;
} }
public:
/*! @brief Returns smeared configuration at level 'Level' */ /*! @brief Returns smeared configuration at level 'Level' */
const GaugeField &get_smeared_conf(int Level) const const GaugeField &get_smeared_conf(int Level) const
{ {

View File

@@ -819,7 +819,7 @@ public:
} // if smearingLevels = 0 do nothing } // if smearingLevels = 0 do nothing
} }
public: private:
//==================================================================== //====================================================================
// Override base clas here to mask it // Override base clas here to mask it
virtual void fill_smearedSet(GaugeField &U) virtual void fill_smearedSet(GaugeField &U)

View File

@@ -252,11 +252,6 @@ void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const{
out = in; out = in;
RealD taus = 0.; RealD taus = 0.;
// Perform initial t=0 measurements
for(auto const &meas : this->functions)
meas.second(0,taus,out);
for (unsigned int step = 1; step <= Nstep; step++) { //step indicates the number of smearing steps applied at the time of measurement for (unsigned int step = 1; step <= Nstep; step++) { //step indicates the number of smearing steps applied at the time of measurement
auto start = std::chrono::high_resolution_clock::now(); auto start = std::chrono::high_resolution_clock::now();
evolve_step(out, taus); evolve_step(out, taus);
@@ -341,11 +336,6 @@ void WilsonFlowAdaptive<Gimpl>::smear(GaugeField& out, const GaugeField& in) con
RealD taus = 0.; RealD taus = 0.;
RealD eps = init_epsilon; RealD eps = init_epsilon;
unsigned int step = 0; unsigned int step = 0;
// Perform initial t=0 measurements
for(auto const &meas : this->functions)
meas.second(step,taus,out);
do{ do{
int step_success = evolve_step_adaptive(out, taus, eps); int step_success = evolve_step_adaptive(out, taus, eps);
step += step_success; //step will not be incremented if the integration step fails step += step_success; //step will not be incremented if the integration step fails

View File

@@ -30,26 +30,25 @@
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
uint64_t DslashFullCount; uint64_t DslashFullCount;
//uint64_t DslashPartialCount; uint64_t DslashPartialCount;
uint64_t DslashDirichletCount; uint64_t DslashDirichletCount;
void DslashResetCounts(void) void DslashResetCounts(void)
{ {
DslashFullCount=0; DslashFullCount=0;
// DslashPartialCount=0; DslashPartialCount=0;
DslashDirichletCount=0; DslashDirichletCount=0;
} }
void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full) void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full)
{ {
dirichlet = DslashDirichletCount; dirichlet = DslashDirichletCount;
partial = 0; partial = DslashPartialCount;
full = DslashFullCount; full = DslashFullCount;
} }
void DslashLogFull(void) { DslashFullCount++;} void DslashLogFull(void) { DslashFullCount++;}
//void DslashLogPartial(void) { DslashPartialCount++;} void DslashLogPartial(void) { DslashPartialCount++;}
void DslashLogDirichlet(void){ DslashDirichletCount++;} void DslashLogDirichlet(void){ DslashDirichletCount++;}
deviceVector<unsigned char> StencilBuffer::DeviceCommBuf;
void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask, void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
int off,std::vector<std::pair<int,int> > & table) int off,std::vector<std::pair<int,int> > & table)

View File

@@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid);
// These can move into a params header and be given MacroMagic serialisation // These can move into a params header and be given MacroMagic serialisation
struct DefaultImplParams { struct DefaultImplParams {
Coordinate dirichlet; // Blocksize of dirichlet BCs Coordinate dirichlet; // Blocksize of dirichlet BCs
// int partialDirichlet; int partialDirichlet;
DefaultImplParams() { DefaultImplParams() {
dirichlet.resize(0); dirichlet.resize(0);
// partialDirichlet=0; partialDirichlet=0;
}; };
}; };
@@ -69,12 +69,6 @@ struct DefaultImplParams {
void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask, void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
int off,std::vector<std::pair<int,int> > & table); int off,std::vector<std::pair<int,int> > & table);
class StencilBuffer
{
public:
static deviceVector<unsigned char> DeviceCommBuf; // placed in Stencil.cc
};
void DslashResetCounts(void); void DslashResetCounts(void);
void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full); void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full);
void DslashLogFull(void); void DslashLogFull(void);
@@ -119,8 +113,8 @@ class CartesianStencilAccelerator {
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// If true, this is partially communicated per face // If true, this is partially communicated per face
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// StencilVector _comms_partial_send; StencilVector _comms_partial_send;
// StencilVector _comms_partial_recv; StencilVector _comms_partial_recv;
// //
StencilVector _comm_buf_size; StencilVector _comm_buf_size;
StencilVector _permute_type; StencilVector _permute_type;
@@ -192,11 +186,6 @@ public:
void ViewClose(void) { } void ViewClose(void) { }
#ifdef GRID_LOG_VIEWS
size_t size() { return 0; };
uint64_t & operator[](size_t i) { static uint64_t v=0; return v; };
#endif
}; };
//////////////////////////////////////// ////////////////////////////////////////
@@ -216,16 +205,16 @@ public:
struct Packet { struct Packet {
void * send_buf; void * send_buf;
void * recv_buf; void * recv_buf;
void * compressed_send_buf; #ifndef ACCELERATOR_AWARE_MPI
void * compressed_recv_buf; void * host_send_buf; // Allocate this if not MPI_CUDA_AWARE
void * host_recv_buf; // Allocate this if not MPI_CUDA_AWARE
#endif
Integer to_rank; Integer to_rank;
Integer from_rank; Integer from_rank;
Integer do_send; Integer do_send;
Integer do_recv; Integer do_recv;
Integer xbytes; Integer xbytes;
Integer rbytes; Integer rbytes;
Integer xbytes_compressed;
Integer rbytes_compressed;
}; };
struct Merge { struct Merge {
static constexpr int Nsimd = vobj::Nsimd(); static constexpr int Nsimd = vobj::Nsimd();
@@ -234,7 +223,7 @@ public:
std::vector<cobj *> vpointers; std::vector<cobj *> vpointers;
Integer buffer_size; Integer buffer_size;
Integer type; Integer type;
// Integer partial; // partial dirichlet BCs Integer partial; // partial dirichlet BCs
Coordinate dims; Coordinate dims;
}; };
struct Decompress { struct Decompress {
@@ -242,7 +231,7 @@ public:
cobj * kernel_p; cobj * kernel_p;
cobj * mpi_p; cobj * mpi_p;
Integer buffer_size; Integer buffer_size;
// Integer partial; // partial dirichlet BCs Integer partial; // partial dirichlet BCs
Coordinate dims; Coordinate dims;
}; };
struct CopyReceiveBuffer { struct CopyReceiveBuffer {
@@ -263,45 +252,9 @@ public:
protected: protected:
GridBase * _grid; GridBase * _grid;
///////////////////////////////////////////////////
// Sloppy comms will make a second buffer upon comms
///////////////////////////////////////////////////
size_t device_heap_top; //
size_t device_heap_bytes;//
size_t device_heap_size; //
void *DeviceBufferMalloc(size_t bytes)
{
void *ptr = (void *)device_heap_top;
device_heap_top += bytes;
device_heap_bytes+= bytes;
if ( device_heap_bytes > device_heap_size ) {
std::cout << "DeviceBufferMalloc overflow bytes "<<bytes<<" heap bytes "<<device_heap_bytes<<" heap size "<<device_heap_size<<std::endl;
assert (device_heap_bytes <= device_heap_size);
}
return ptr;
}
void DeviceBufferFreeAll(void)
{
device_heap_size = _unified_buffer_size*sizeof(cobj);
// Resize up if necessary, never down
if ( StencilBuffer::DeviceCommBuf.size() < device_heap_size ) {
StencilBuffer::DeviceCommBuf.resize(device_heap_size);
}
device_heap_top =(size_t) &StencilBuffer::DeviceCommBuf[0];
device_heap_size = StencilBuffer::DeviceCommBuf.size();
device_heap_bytes=0;
}
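DeviceBufferMalloc/DeviceBufferFreeAll above implement a bump allocator over a single device slab: allocation just advances a pointer, and "free" is one reset of the whole slab before the next communication phase. A host-side sketch of the same pattern, with a plain std::vector standing in for StencilBuffer::DeviceCommBuf (not Grid code):

    // Sketch only: bump allocation over one reusable slab.
    #include <cassert>
    #include <cstddef>
    #include <vector>

    struct BumpHeap {
      std::vector<unsigned char> slab;   // stands in for StencilBuffer::DeviceCommBuf
      size_t top = 0;

      void reset(size_t required) {      // analogue of DeviceBufferFreeAll
        if (slab.size() < required) slab.resize(required);  // grow, never shrink
        top = 0;
      }
      void *alloc(size_t bytes) {        // analogue of DeviceBufferMalloc
        assert(top + bytes <= slab.size() && "bump heap overflow");
        void *p = &slab[top];
        top += bytes;
        return p;
      }
    };

    int main() {
      BumpHeap h;
      h.reset(1 << 20);                  // sized for this communication phase
      void *send = h.alloc(4096);        // e.g. a compressed send buffer
      void *recv = h.alloc(8192);        // e.g. a compressed recv buffer
      (void)send; (void)recv;
      h.reset(1 << 20);                  // next phase reuses the same slab
    }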
public: public:
GridBase *Grid(void) const { return _grid; } GridBase *Grid(void) const { return _grid; }
/////////////////////////////////////////////////////////
// Control reduced precision comms
/////////////////////////////////////////////////////////
int SloppyComms;
void SetSloppyComms(int sloppy) { SloppyComms = sloppy; };
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Needed to conveniently communicate gparity parameters into GPU memory // Needed to conveniently communicate gparity parameters into GPU memory
// without adding parameters. Perhaps a template parameter to StenciView is // without adding parameters. Perhaps a template parameter to StenciView is
@@ -315,7 +268,7 @@ public:
} }
int face_table_computed; int face_table_computed;
// int partialDirichlet; int partialDirichlet;
int fullDirichlet; int fullDirichlet;
std::vector<deviceVector<std::pair<int,int> > > face_table ; std::vector<deviceVector<std::pair<int,int> > > face_table ;
deviceVector<int> surface_list; deviceVector<int> surface_list;
@@ -408,145 +361,24 @@ public:
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Non blocking send and receive. Necessarily parallel. // Non blocking send and receive. Necessarily parallel.
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
void DecompressPacket(Packet &packet)
{
if ( !SloppyComms ) return;
if ( packet.do_recv && _grid->IsOffNode(packet.from_rank) ) {
typedef typename getPrecision<cobj>::real_scalar_type word;
uint64_t words = packet.rbytes/sizeof(word);
const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
const uint64_t outer = words/nsimd;
if(sizeof(word)==8) {
// Can either choose to represent as float vs double and prec change
// OR
// truncate the mantissa bfp16 style
double *dbuf =(double *) packet.recv_buf;
float *fbuf =(float *) packet.compressed_recv_buf;
accelerator_forNB(ss,outer,nsimd,{
int lane = acceleratorSIMTlane(nsimd);
dbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]; //conversion
});
} else if ( sizeof(word)==4){
// Can either choose to represent as half vs float and prec change
// OR
// truncate the mantissa bfp16 style
uint32_t *fbuf =(uint32_t *) packet.recv_buf;
uint16_t *hbuf =(uint16_t *) packet.compressed_recv_buf;
accelerator_forNB(ss,outer,nsimd,{
int lane = acceleratorSIMTlane(nsimd);
fbuf[ss*nsimd+lane] = ((uint32_t)hbuf[ss*nsimd+lane])<<16; //copy back and pad each word with zeroes
});
} else {
assert(0 && "unknown floating point precision");
}
}
}
void CompressPacket(Packet &packet)
{
packet.xbytes_compressed = packet.xbytes;
packet.compressed_send_buf = packet.send_buf;
packet.rbytes_compressed = packet.rbytes;
packet.compressed_recv_buf = packet.recv_buf;
if ( !SloppyComms ) {
return;
}
typedef typename getPrecision<cobj>::real_scalar_type word;
uint64_t words = packet.xbytes/sizeof(word);
const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
const uint64_t outer = words/nsimd;
if (packet.do_recv && _grid->IsOffNode(packet.from_rank) ) {
packet.rbytes_compressed = packet.rbytes/2;
packet.compressed_recv_buf = DeviceBufferMalloc(packet.rbytes_compressed);
// std::cout << " CompressPacket recv from "<<packet.from_rank<<" "<<std::hex<<packet.compressed_recv_buf<<std::dec<<std::endl;
}
//else {
// std::cout << " CompressPacket recv is uncompressed from "<<packet.from_rank<<" "<<std::hex<<packet.compressed_recv_buf<<std::dec<<std::endl;
// }
if (packet.do_send && _grid->IsOffNode(packet.to_rank) ) {
packet.xbytes_compressed = packet.xbytes/2;
packet.compressed_send_buf = DeviceBufferMalloc(packet.xbytes_compressed);
// std::cout << " CompressPacket send to "<<packet.to_rank<<" "<<std::hex<<packet.compressed_send_buf<<std::dec<<std::endl;
if(sizeof(word)==8) {
double *dbuf =(double *) packet.send_buf;
float *fbuf =(float *) packet.compressed_send_buf;
accelerator_forNB(ss,outer,nsimd,{
int lane = acceleratorSIMTlane(nsimd);
fbuf[ss*nsimd+lane] = dbuf[ss*nsimd+lane]; // convert fp64 to fp32
});
} else if ( sizeof(word)==4){
uint32_t *fbuf =(uint32_t *) packet.send_buf;
uint16_t *hbuf =(uint16_t *) packet.compressed_send_buf;
accelerator_forNB(ss,outer,nsimd,{
int lane = acceleratorSIMTlane(nsimd);
hbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]>>16; // convert as in Bagel/BFM ; bfloat16 ; s7e8 Intel patent
});
} else {
assert(0 && "unknown floating point precision");
}
}
// else {
// std::cout << " CompressPacket send is uncompressed to "<<packet.to_rank<<" "<<std::hex<<packet.compressed_send_buf<<std::dec<<std::endl;
// }
return;
}
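The CompressPacket/DecompressPacket pair above halves off-node traffic by sending fp64 data as fp32, and fp32 data as bfloat16 obtained by truncating the low 16 bits and zero-padding them on receive, as the s7e8 comment notes. A standalone host-side sketch of those two conversions (not Grid code, which applies them in accelerator kernels over whole buffers):

    // Sketch only: lossy "sloppy comms" conversions, truncation not rounding.
    #include <cstdint>
    #include <cstring>
    #include <cstdio>

    static uint16_t fp32_to_bf16(float f) {
      uint32_t u; std::memcpy(&u, &f, sizeof(u));
      return (uint16_t)(u >> 16);        // keep sign, 8-bit exponent, top 7 mantissa bits
    }
    static float bf16_to_fp32(uint16_t h) {
      uint32_t u = ((uint32_t)h) << 16;  // pad the dropped mantissa bits with zeroes
      float f; std::memcpy(&f, &u, sizeof(f));
      return f;
    }

    int main() {
      double d = 3.141592653589793;
      float  as_fp32 = (float)d;                       // fp64 path: ordinary conversion
      float  as_bf16 = bf16_to_fp32(fp32_to_bf16(3.14159265f)); // fp32 path: bf16 truncation
      std::printf("fp64->fp32 %.9f   fp32->bf16->fp32 %.9f\n", as_fp32, as_bf16);
    }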
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs) void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
{ {
// std::cout << "Communicate Begin "<<std::endl;
// _grid->Barrier();
FlightRecorder::StepLog("Communicate begin"); FlightRecorder::StepLog("Communicate begin");
///////////////////////////////////////////////
// All GPU kernel tasks must complete // All GPU kernel tasks must complete
// accelerator_barrier(); All kernels should ALREADY be complete // accelerator_barrier(); // All kernels should ALREADY be complete
//Everyone is here, so noone running slow and still using receive buffer // _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
_grid->StencilBarrier(); // But the HaloGather had a barrier too.
// But the HaloGather had a barrier too.
///////////////////////////////////////////////
if (SloppyComms) {
DeviceBufferFreeAll();
}
for(int i=0;i<Packets.size();i++){
this->CompressPacket(Packets[i]);
}
if (SloppyComms) {
accelerator_barrier();
#ifdef NVLINK_GET
_grid->StencilBarrier();
#endif
}
for(int i=0;i<Packets.size();i++){ for(int i=0;i<Packets.size();i++){
// std::cout << "Communicate prepare "<<i<<std::endl; // std::cout << "Communicate prepare "<<i<<std::endl;
// _grid->Barrier(); // _grid->Barrier();
_grid->StencilSendToRecvFromPrepare(MpiReqs, _grid->StencilSendToRecvFromPrepare(MpiReqs,
Packets[i].compressed_send_buf, Packets[i].send_buf,
Packets[i].to_rank,Packets[i].do_send, Packets[i].to_rank,Packets[i].do_send,
Packets[i].compressed_recv_buf, Packets[i].recv_buf,
Packets[i].from_rank,Packets[i].do_recv, Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i); Packets[i].xbytes,Packets[i].rbytes,i);
} }
// std::cout << "Communicate PollDtoH "<<std::endl; // std::cout << "Communicate PollDtoH "<<std::endl;
// _grid->Barrier(); // _grid->Barrier();
@@ -557,22 +389,18 @@ public:
// Starts intranode // Starts intranode
for(int i=0;i<Packets.size();i++){ for(int i=0;i<Packets.size();i++){
// std::cout << "Communicate Begin "<<i<<std::endl; // std::cout << "Communicate Begin "<<i<<std::endl;
// _grid->Barrier();
_grid->StencilSendToRecvFromBegin(MpiReqs, _grid->StencilSendToRecvFromBegin(MpiReqs,
Packets[i].send_buf,Packets[i].compressed_send_buf, Packets[i].send_buf,
Packets[i].to_rank,Packets[i].do_send, Packets[i].to_rank,Packets[i].do_send,
Packets[i].recv_buf,Packets[i].compressed_recv_buf, Packets[i].recv_buf,
Packets[i].from_rank,Packets[i].do_recv, Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i); Packets[i].xbytes,Packets[i].rbytes,i);
// std::cout << "Communicate Begin started "<<i<<std::endl;
// _grid->Barrier();
} }
FlightRecorder::StepLog("Communicate begin has finished");
// Get comms started then run checksums // Get comms started then run checksums
// Having this PRIOR to the dslash seems to make Sunspot work... (!) // Having this PRIOR to the dslash seems to make Sunspot work... (!)
for(int i=0;i<Packets.size();i++){ for(int i=0;i<Packets.size();i++){
if ( Packets[i].do_send ) if ( Packets[i].do_send )
FlightRecorder::xmitLog(Packets[i].compressed_send_buf,Packets[i].xbytes_compressed); FlightRecorder::xmitLog(Packets[i].send_buf,Packets[i].xbytes);
} }
} }
@@ -587,15 +415,14 @@ public:
// std::cout << "Communicate Complete Complete "<<std::endl; // std::cout << "Communicate Complete Complete "<<std::endl;
// _grid->Barrier(); // _grid->Barrier();
_grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
// if ( this->partialDirichlet ) DslashLogPartial(); if ( this->partialDirichlet ) DslashLogPartial();
if ( this->fullDirichlet ) DslashLogDirichlet(); else if ( this->fullDirichlet ) DslashLogDirichlet();
else DslashLogFull(); else DslashLogFull();
// acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete // acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
// accelerator_barrier(); // accelerator_barrier();
for(int i=0;i<Packets.size();i++){ for(int i=0;i<Packets.size();i++){
this->DecompressPacket(Packets[i]);
if ( Packets[i].do_recv ) if ( Packets[i].do_recv )
FlightRecorder::recvLog(Packets[i].compressed_recv_buf,Packets[i].rbytes_compressed,Packets[i].from_rank); FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
} }
FlightRecorder::StepLog("Finish communicate complete"); FlightRecorder::StepLog("Finish communicate complete");
} }
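CommunicateBegin/CommunicateComplete above follow the usual split of non-blocking halo exchange: post all sends and receives, return so compute can overlap, then wait for everything later. A plain-MPI sketch of that structure, with a hypothetical ToyPacket in place of the Packet struct and of Grid's StencilSendToRecvFrom machinery:

    // Sketch only: begin/complete split with non-blocking MPI.
    #include <mpi.h>
    #include <vector>

    struct ToyPacket {
      void *send_buf; void *recv_buf;
      int   to_rank,  from_rank;
      int   xbytes,   rbytes;
    };

    void CommunicateBegin(std::vector<ToyPacket> &pkts, std::vector<MPI_Request> &reqs) {
      for (auto &p : pkts) {             // post everything, return without waiting
        MPI_Request r;
        MPI_Irecv(p.recv_buf, p.rbytes, MPI_CHAR, p.from_rank, 0, MPI_COMM_WORLD, &r);
        reqs.push_back(r);
        MPI_Isend(p.send_buf, p.xbytes, MPI_CHAR, p.to_rank,   0, MPI_COMM_WORLD, &r);
        reqs.push_back(r);
      }
    }

    void CommunicateComplete(std::vector<MPI_Request> &reqs) {
      MPI_Waitall((int)reqs.size(), reqs.data(), MPI_STATUSES_IGNORE);
      reqs.clear();                      // halo data is now safe to consume
    }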
@@ -790,7 +617,7 @@ public:
} }
void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size,std::vector<Decompress> &dv) { void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size,std::vector<Decompress> &dv) {
Decompress d; Decompress d;
// d.partial = this->partialDirichlet; d.partial = this->partialDirichlet;
d.dims = _grid->_fdimensions; d.dims = _grid->_fdimensions;
d.kernel_p = k_p; d.kernel_p = k_p;
d.mpi_p = m_p; d.mpi_p = m_p;
@@ -799,7 +626,7 @@ public:
} }
void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) { void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
Merge m; Merge m;
// m.partial = this->partialDirichlet; m.partial = this->partialDirichlet;
m.dims = _grid->_fdimensions; m.dims = _grid->_fdimensions;
m.type = type; m.type = type;
m.mpointer = merge_p; m.mpointer = merge_p;
@@ -904,8 +731,8 @@ public:
int block = dirichlet_block[dimension]; int block = dirichlet_block[dimension];
this->_comms_send[ii] = comm_dim; this->_comms_send[ii] = comm_dim;
this->_comms_recv[ii] = comm_dim; this->_comms_recv[ii] = comm_dim;
// this->_comms_partial_send[ii] = 0; this->_comms_partial_send[ii] = 0;
// this->_comms_partial_recv[ii] = 0; this->_comms_partial_recv[ii] = 0;
if ( block && comm_dim ) { if ( block && comm_dim ) {
assert(abs(displacement) < ld ); assert(abs(displacement) < ld );
// Quiesce communication across block boundaries // Quiesce communication across block boundaries
@@ -926,10 +753,10 @@ public:
if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_send[ii] = 0; if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_send[ii] = 0;
if ( ( (ld*pc ) % block ) == 0 ) this->_comms_recv[ii] = 0; if ( ( (ld*pc ) % block ) == 0 ) this->_comms_recv[ii] = 0;
} }
// if ( partialDirichlet ) { if ( partialDirichlet ) {
// this->_comms_partial_send[ii] = !this->_comms_send[ii]; this->_comms_partial_send[ii] = !this->_comms_send[ii];
// this->_comms_partial_recv[ii] = !this->_comms_recv[ii]; this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
// } }
} }
} }
} }
@@ -941,7 +768,6 @@ public:
Parameters p=Parameters(), Parameters p=Parameters(),
bool preserve_shm=false) bool preserve_shm=false)
{ {
SloppyComms = 0;
face_table_computed=0; face_table_computed=0;
_grid = grid; _grid = grid;
this->parameters=p; this->parameters=p;
@@ -959,7 +785,7 @@ public:
this->same_node.resize(npoints); this->same_node.resize(npoints);
if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0); if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0);
// partialDirichlet = p.partialDirichlet; partialDirichlet = p.partialDirichlet;
DirichletBlock(p.dirichlet); // comms send/recv set up DirichletBlock(p.dirichlet); // comms send/recv set up
fullDirichlet=0; fullDirichlet=0;
for(int d=0;d<p.dirichlet.size();d++){ for(int d=0;d<p.dirichlet.size();d++){
@@ -1040,7 +866,7 @@ public:
///////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////
const int Nsimd = grid->Nsimd(); const int Nsimd = grid->Nsimd();
// Allow for multiple stencils to be communicated simultaneously // Allow for multiple stencils to exist simultaneously
if (!preserve_shm) if (!preserve_shm)
_grid->ShmBufferFreeAll(); _grid->ShmBufferFreeAll();
@@ -1108,8 +934,7 @@ public:
GridBase *grid=_grid; GridBase *grid=_grid;
const int Nsimd = grid->Nsimd(); const int Nsimd = grid->Nsimd();
// int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ; int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
int comms_recv = this->_comms_recv[point];
int fd = _grid->_fdimensions[dimension]; int fd = _grid->_fdimensions[dimension];
int ld = _grid->_ldimensions[dimension]; int ld = _grid->_ldimensions[dimension];
int rd = _grid->_rdimensions[dimension]; int rd = _grid->_rdimensions[dimension];
@@ -1298,8 +1123,8 @@ public:
int comms_send = this->_comms_send[point]; int comms_send = this->_comms_send[point];
int comms_recv = this->_comms_recv[point]; int comms_recv = this->_comms_recv[point];
// int comms_partial_send = this->_comms_partial_send[point] ; int comms_partial_send = this->_comms_partial_send[point] ;
// int comms_partial_recv = this->_comms_partial_recv[point] ; int comms_partial_recv = this->_comms_partial_recv[point] ;
assert(rhs.Grid()==_grid); assert(rhs.Grid()==_grid);
// conformable(_grid,rhs.Grid()); // conformable(_grid,rhs.Grid());
@@ -1334,11 +1159,11 @@ public:
int rbytes; int rbytes;
if ( comms_send ) xbytes = bytes; // Full send if ( comms_send ) xbytes = bytes; // Full send
// else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid); else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
else xbytes = 0; // full dirichlet else xbytes = 0; // full dirichlet
if ( comms_recv ) rbytes = bytes; if ( comms_recv ) rbytes = bytes;
// else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid); else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
else rbytes = 0; else rbytes = 0;
int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
@@ -1365,8 +1190,7 @@ public:
} }
// if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) { if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
if ( compress.DecompressionStep()&&comms_recv) {
recv_buf=u_simd_recv_buf[0]; recv_buf=u_simd_recv_buf[0];
} else { } else {
recv_buf=this->u_recv_buf_p; recv_buf=this->u_recv_buf_p;
@@ -1400,8 +1224,7 @@ public:
#endif #endif
// std::cout << " GatherPlaneSimple partial send "<< comms_partial_send<<std::endl; // std::cout << " GatherPlaneSimple partial send "<< comms_partial_send<<std::endl;
// compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send); compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,0);
int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,xbytes,rbytes,cbmask); int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,xbytes,rbytes,cbmask);
if ( !duplicate ) { // Force comms for now if ( !duplicate ) { // Force comms for now
@@ -1410,8 +1233,8 @@ public:
// Build a list of things to do after we synchronise GPUs // Build a list of things to do after we synchronise GPUs
// Start comms now??? // Start comms now???
/////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////
int do_send = (comms_send) && (!shm_send ); int do_send = (comms_send|comms_partial_send) && (!shm_send );
int do_recv = (comms_send) && (!shm_recv ); int do_recv = (comms_send|comms_partial_send) && (!shm_recv );
AddPacket((void *)&send_buf[comm_off], AddPacket((void *)&send_buf[comm_off],
(void *)&recv_buf[comm_off], (void *)&recv_buf[comm_off],
xmit_to_rank, do_send, xmit_to_rank, do_send,
@@ -1419,7 +1242,7 @@ public:
xbytes,rbytes); xbytes,rbytes);
} }
if ( (compress.DecompressionStep() && comms_recv) ) { if ( (compress.DecompressionStep() && comms_recv) || comms_partial_recv ) {
AddDecompress(&this->u_recv_buf_p[comm_off], AddDecompress(&this->u_recv_buf_p[comm_off],
&recv_buf[comm_off], &recv_buf[comm_off],
words,Decompressions); words,Decompressions);
@@ -1441,8 +1264,8 @@ public:
int comms_send = this->_comms_send[point]; int comms_send = this->_comms_send[point];
int comms_recv = this->_comms_recv[point]; int comms_recv = this->_comms_recv[point];
// int comms_partial_send = this->_comms_partial_send[point] ; int comms_partial_send = this->_comms_partial_send[point] ;
// int comms_partial_recv = this->_comms_partial_recv[point] ; int comms_partial_recv = this->_comms_partial_recv[point] ;
int fd = _grid->_fdimensions[dimension]; int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension]; int rd = _grid->_rdimensions[dimension];
@@ -1517,20 +1340,18 @@ public:
if ( comms_send ) xbytes = bytes; if ( comms_send ) xbytes = bytes;
// else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid); else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
else xbytes = 0; else xbytes = 0;
if ( comms_recv ) rbytes = bytes; if ( comms_recv ) rbytes = bytes;
// else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid); else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
else rbytes = 0; else rbytes = 0;
// Gathers SIMD lanes for send and merge // Gathers SIMD lanes for send and merge
// Different faces can be full comms or partial comms with multiple ranks per node // Different faces can be full comms or partial comms with multiple ranks per node
// if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) { if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
if ( comms_send || comms_recv ) {
// int partial = partialDirichlet; int partial = partialDirichlet;
int partial = 0;
compressor::Gather_plane_exchange(face_table[face_idx],rhs, compressor::Gather_plane_exchange(face_table[face_idx],rhs,
spointers,dimension,sx,cbmask, spointers,dimension,sx,cbmask,
compress,permute_type,partial ); compress,permute_type,partial );
@@ -1596,8 +1417,7 @@ public:
if ( (bytes != rbytes) && (rbytes!=0) ){ if ( (bytes != rbytes) && (rbytes!=0) ){
acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero
} }
// int do_send = (comms_send|comms_partial_send) && (!shm_send ); int do_send = (comms_send|comms_partial_send) && (!shm_send );
int do_send = (comms_send) && (!shm_send );
AddPacket((void *)sp,(void *)rp, AddPacket((void *)sp,(void *)rp,
xmit_to_rank,do_send, xmit_to_rank,do_send,
recv_from_rank,do_send, recv_from_rank,do_send,
@@ -1611,8 +1431,7 @@ public:
} }
} }
// rpointer may be doing a remote read in the gather over SHM // rpointer may be doing a remote read in the gather over SHM
// if ( comms_recv|comms_partial_recv ) { if ( comms_recv|comms_partial_recv ) {
if ( comms_recv ) {
AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers); AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers);
} }

View File

@@ -67,7 +67,7 @@ void acceleratorInit(void)
printf("AcceleratorCudaInit[%d]: Device identifier: %s\n",rank, prop.name); printf("AcceleratorCudaInit[%d]: Device identifier: %s\n",rank, prop.name);
GPU_PROP_FMT(totalGlobalMem,"%zu"); GPU_PROP_FMT(totalGlobalMem,"%lld");
GPU_PROP(managedMemory); GPU_PROP(managedMemory);
GPU_PROP(isMultiGpuBoard); GPU_PROP(isMultiGpuBoard);
GPU_PROP(warpSize); GPU_PROP(warpSize);
@@ -240,7 +240,7 @@ void acceleratorInit(void)
char hostname[HOST_NAME_MAX+1]; char hostname[HOST_NAME_MAX+1];
gethostname(hostname, HOST_NAME_MAX+1); gethostname(hostname, HOST_NAME_MAX+1);
if ( rank==0 ) printf("AcceleratorSyclInit world_rank %d is host %s \n",world_rank,hostname); if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname);
auto devices = sycl::device::get_devices(); auto devices = sycl::device::get_devices();
for(int d = 0;d<devices.size();d++){ for(int d = 0;d<devices.size();d++){

View File

@@ -215,7 +215,7 @@ inline void *acceleratorAllocHost(size_t bytes)
auto err = cudaMallocHost((void **)&ptr,bytes); auto err = cudaMallocHost((void **)&ptr,bytes);
if( err != cudaSuccess ) { if( err != cudaSuccess ) {
ptr = (void *) NULL; ptr = (void *) NULL;
printf(" cudaMallocHost failed for %zu %s \n",bytes,cudaGetErrorString(err)); printf(" cudaMallocHost failed for %d %s \n",bytes,cudaGetErrorString(err));
assert(0); assert(0);
} }
return ptr; return ptr;
@@ -226,7 +226,7 @@ inline void *acceleratorAllocShared(size_t bytes)
auto err = cudaMallocManaged((void **)&ptr,bytes); auto err = cudaMallocManaged((void **)&ptr,bytes);
if( err != cudaSuccess ) { if( err != cudaSuccess ) {
ptr = (void *) NULL; ptr = (void *) NULL;
printf(" cudaMallocManaged failed for %zu %s \n",bytes,cudaGetErrorString(err)); printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
assert(0); assert(0);
} }
return ptr; return ptr;
@@ -237,7 +237,7 @@ inline void *acceleratorAllocDevice(size_t bytes)
auto err = cudaMalloc((void **)&ptr,bytes); auto err = cudaMalloc((void **)&ptr,bytes);
if( err != cudaSuccess ) { if( err != cudaSuccess ) {
ptr = (void *) NULL; ptr = (void *) NULL;
printf(" cudaMalloc failed for %zu %s \n",bytes,cudaGetErrorString(err)); printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
} }
return ptr; return ptr;
}; };
@@ -251,7 +251,7 @@ inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { c
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);} inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);} inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
acceleratorCopyToDevice(from,to,bytes); acceleratorCopyToDevice(to,from,bytes, cudaMemcpyHostToDevice);
return 0; return 0;
} }
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
@@ -337,7 +337,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
cgh.parallel_for( \ cgh.parallel_for( \
sycl::nd_range<3>(global,local), \ sycl::nd_range<3>(global,local), \
[=] (sycl::nd_item<3> item) /*mutable*/ \ [=] (sycl::nd_item<3> item) /*mutable*/ \
[[sycl::reqd_sub_group_size(16)]] \ [[intel::reqd_sub_group_size(16)]] \
{ \ { \
auto iter1 = item.get_global_id(0); \ auto iter1 = item.get_global_id(0); \
auto iter2 = item.get_global_id(1); \ auto iter2 = item.get_global_id(1); \

View File

@@ -28,6 +28,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */ /* END LEGAL */
#pragma once #pragma once
#ifndef MIN
#define MIN(x,y) ((x)>(y)?(y):(x))
#endif
// Introduce a class to gain deterministic bit reproducible reduction. // Introduce a class to gain deterministic bit reproducible reduction.
// make static; perhaps just a namespace is required. // make static; perhaps just a namespace is required.
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);

View File

@@ -372,53 +372,4 @@ void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
} }
} }
#ifdef GRID_LOG_VIEWS
bool ViewLogger::Enabled = false;
std::vector<ViewLogger::Entry_t> ViewLogger::LogVector;
void ViewLogger::Begin() { Enabled = true; LogVector.resize(0); }
void ViewLogger::End() { Enabled = false; }
void ViewLogger::Log(const char* filename, int line, int index, int mode, void* data, uint64_t bytes)
{
if (!Enabled)
return;
size_t i = LogVector.size();
LogVector.resize(i + 1);
auto & n = LogVector[i];
n.filename = filename;
n.line = line;
n.index = index;
if (bytes < sizeof(uint64_t)) {
n.head = n.tail = 0;
} else {
switch (mode) {
case AcceleratorRead:
case AcceleratorWrite:
case AcceleratorWriteDiscard:
acceleratorCopyFromDevice((char*)data, &n.head, sizeof(uint64_t));
acceleratorCopyFromDevice((char*)data + bytes - sizeof(uint64_t), &n.tail, sizeof(uint64_t));
break;
case CpuRead:
case CpuWrite:
//case CpuWriteDiscard:
n.head = *(uint64_t*)data;
n.tail = *(uint64_t*)((char*)data + bytes - sizeof(uint64_t));
break;
}
}
}
#endif
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
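The removed ViewLogger records only the first and last 64-bit words of each viewed buffer, a cheap fingerprint that lets two runs be compared without storing whole fields. A standalone sketch of that head/tail capture for a host buffer (not Grid code):

    // Sketch only: head/tail fingerprint of a buffer.
    #include <cstdint>
    #include <cstring>
    #include <cstdio>
    #include <vector>

    struct ViewFingerprint { uint64_t head, tail; };

    ViewFingerprint fingerprint(const void *data, uint64_t bytes) {
      ViewFingerprint n{0, 0};
      if (bytes >= sizeof(uint64_t)) {
        std::memcpy(&n.head, data, sizeof(uint64_t));
        std::memcpy(&n.tail, (const char *)data + bytes - sizeof(uint64_t), sizeof(uint64_t));
      }
      return n;
    }

    int main() {
      std::vector<double> field(64, 1.5);
      auto f = fingerprint(field.data(), field.size() * sizeof(double));
      std::printf("head %016lx tail %016lx\n", (unsigned long)f.head, (unsigned long)f.tail);
    }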

View File

@@ -42,22 +42,5 @@ class FlightRecorder {
static void xmitLog(void *,uint64_t bytes); static void xmitLog(void *,uint64_t bytes);
static void recvLog(void *,uint64_t bytes,int rank); static void recvLog(void *,uint64_t bytes,int rank);
}; };
#ifdef GRID_LOG_VIEWS
class ViewLogger {
struct Entry_t {
const char* filename;
int line;
int index;
uint64_t head, tail;
};
public:
static bool Enabled;
static std::vector<Entry_t> LogVector;
static void Begin();
static void End();
static void Log(const char* filename, int line, int index, int mode, void* data, uint64_t bytes);
};
#endif
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@@ -46,14 +46,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <cstdlib> #include <cstdlib>
#include <memory> #include <memory>
#include <Grid/Grid.h> #include <Grid/Grid.h>
#include <Grid/util/CompilerCompatible.h> #include <Grid/util/CompilerCompatible.h>
#ifdef HAVE_UNWIND
#include <libunwind.h>
#endif
#include <fenv.h> #include <fenv.h>
#ifdef __APPLE__ #ifdef __APPLE__
@@ -299,20 +295,6 @@ void GridBanner(void)
std::cout << std::setprecision(9); std::cout << std::setprecision(9);
} }
//Some file local variables
static int fileno_stdout;
static int fileno_stderr;
static int signal_delay;
class dlRegion {
public:
uint64_t start;
uint64_t end;
uint64_t size;
uint64_t offset;
std::string name;
};
std::vector<dlRegion> dlMap;
void Grid_init(int *argc,char ***argv) void Grid_init(int *argc,char ***argv)
{ {
@@ -365,19 +347,6 @@ void Grid_init(int *argc,char ***argv)
if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){ if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
Grid_debug_handler_init(); Grid_debug_handler_init();
} }
// Sleep n-seconds at end of handler
if( GridCmdOptionExists(*argv,*argv+*argc,"--signal-delay") ){
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--signal-delay");
GridCmdOptionInt(arg,signal_delay);
}
// periodic wakeup with stack trace printed
if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-heartbeat") ){
Grid_debug_heartbeat();
}
// periodic wakeup with empty handler (interrupts some system calls)
if( GridCmdOptionExists(*argv,*argv+*argc,"--heartbeat") ){
Grid_heartbeat();
}
#if defined(A64FX) #if defined(A64FX)
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){ if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){
@@ -416,52 +385,26 @@ void Grid_init(int *argc,char ***argv)
} else { } else {
FILE *fp; FILE *fp;
std::ostringstream fname; std::ostringstream fname;
int rank = CartesianCommunicator::RankWorld();
int radix=64;
char* root = getenv("GRID_STDOUT_ROOT");
if (root) {
fname << root ;
mkdir(fname.str().c_str(), S_IRWXU );
fname << "/";
}
fname << (rank/radix)*radix ;
mkdir(fname.str().c_str(), S_IRWXU );
fname << "/";
fname<<"Grid.stdout."; fname<<"Grid.stdout.";
fname<<CartesianCommunicator::RankWorld(); fname<<CartesianCommunicator::RankWorld();
fp=freopen(fname.str().c_str(),"w",stdout); fp=freopen(fname.str().c_str(),"w",stdout);
assert(fp!=(FILE *)NULL); assert(fp!=(FILE *)NULL);
std::ostringstream ename; std::ostringstream ename;
if (root){
ename << root << "/";
}
ename << (rank/radix)*radix << "/";
ename<<"Grid.stderr."; ename<<"Grid.stderr.";
ename<<CartesianCommunicator::RankWorld(); ename<<CartesianCommunicator::RankWorld();
fp=freopen(ename.str().c_str(),"w",stderr); fp=freopen(ename.str().c_str(),"w",stderr);
assert(fp!=(FILE *)NULL); assert(fp!=(FILE *)NULL);
} }
fileno_stdout = fileno(stdout);
fileno_stderr = fileno(stderr) ;
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
// OK to use GridLogMessage etc from here on // OK to use GridLogMessage etc from here on
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
std::cout << GridLogMessage << "================================================ "<<std::endl; std::cout << GridLogMessage << "================================================ "<<std::endl;
std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl; std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl;
std::cout << GridLogMessage << "================================================ "<<std::endl; std::cout << GridLogMessage << "================================================ "<<std::endl;
{
gethostname(hostname, HOST_NAME_MAX+1); gethostname(hostname, HOST_NAME_MAX+1);
time_t mytime; std::cout << GridLogMessage << "This rank is running on host "<< hostname<<std::endl;
struct tm *info;
char buffer[80];
time(&mytime);
info = localtime(&mytime);
strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", info);
std::cout << GridLogMessage << "This rank is running on host "<< hostname<<" at local time "<<buffer<<std::endl;
}
///////////////////////////////////////////////////////// /////////////////////////////////////////////////////////
// Reporting // Reporting
@@ -478,47 +421,6 @@ void Grid_init(int *argc,char ***argv)
MemoryProfiler::stats = &dbgMemStats; MemoryProfiler::stats = &dbgMemStats;
} }
/////////////////////////////////////////////////////////
// LD.so space
/////////////////////////////////////////////////////////
#ifndef __APPLE__
{
// Provides mapping of .so files
FILE *f = fopen("/proc/self/maps", "r");
if (f) {
char line[256];
while (fgets(line, sizeof(line), f)) {
if (strstr(line, "r-xp")) {
dlRegion region;
uint32_t major, minor, inode;
uint64_t start,end,offset;
char path[PATH_MAX];
sscanf(line,"%lx-%lx r-xp %lx %x:%x %d %s",
&start,&end,&offset,
&major,&minor,&inode,path);
region.start=start;
region.end =end;
region.offset=offset;
region.name = std::string(path);
region.size = region.end-region.start;
dlMap.push_back(region);
// std::cout << GridLogMessage<< line;
}
}
fclose(f);
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--dylib-map") ){
std::cout << GridLogMessage << "================================================ "<<std::endl;
std::cout << GridLogMessage<< " Dynamic library map: " <<std::endl;
std::cout << GridLogMessage << "================================================ "<<std::endl;
for(int r=0;r<dlMap.size();r++){
auto region = dlMap[r];
std::cout << GridLogMessage<<" "<<region.name<<std::hex<<region.start<<"-"<<region.end<<" sz "<<region.size<<std::dec<<std::endl;
}
std::cout << GridLogMessage << "================================================ "<<std::endl;
}
}
#endif
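The /proc/self/maps scan above records every executable (r-xp) mapping so that a raw instruction pointer can later be reported as library plus offset in a signal backtrace. A standalone, Linux-only sketch of the same scan (not Grid code):

    // Sketch only: collect executable mappings of the current process.
    #include <cstdio>
    #include <cstring>
    #include <string>
    #include <vector>

    struct Region { unsigned long start, end, offset; std::string name; };

    int main() {
      std::vector<Region> map;
      if (FILE *f = std::fopen("/proc/self/maps", "r")) {
        char line[512];
        while (std::fgets(line, sizeof(line), f)) {
          if (!std::strstr(line, "r-xp")) continue;       // executable segments only
          Region r; unsigned maj, min; unsigned long inode; char path[4096] = "";
          std::sscanf(line, "%lx-%lx r-xp %lx %x:%x %lu %4095s",
                      &r.start, &r.end, &r.offset, &maj, &min, &inode, path);
          r.name = path;
          map.push_back(r);
        }
        std::fclose(f);
      }
      for (auto &r : map)
        std::printf("%s %lx-%lx\n", r.name.c_str(), r.start, r.end);
    }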
//////////////////////////////////// ////////////////////////////////////
// Logging // Logging
//////////////////////////////////// ////////////////////////////////////
@@ -551,19 +453,14 @@ void Grid_init(int *argc,char ***argv)
std::cout<<GridLogMessage<<" --shm-hugepages : use explicit huge pages in mmap call "<<std::endl; std::cout<<GridLogMessage<<" --shm-hugepages : use explicit huge pages in mmap call "<<std::endl;
std::cout<<GridLogMessage<<" --device-mem M : Size of device software cache for lattice fields (MB) "<<std::endl; std::cout<<GridLogMessage<<" --device-mem M : Size of device software cache for lattice fields (MB) "<<std::endl;
std::cout<<GridLogMessage<<std::endl; std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"Verbose:"<<std::endl; std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl;
std::cout<<GridLogMessage<<std::endl; std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<" --log list : comma separated list from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl; std::cout<<GridLogMessage<<" --log list : comma separated list from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
std::cout<<GridLogMessage<<" --notimestamp : suppress millisecond resolution stamps"<<std::endl;
std::cout<<GridLogMessage<<" --decomposition : report on default omp,mpi and simd decomposition"<<std::endl; std::cout<<GridLogMessage<<" --decomposition : report on default omp,mpi and simd decomposition"<<std::endl;
std::cout<<GridLogMessage<<"Debug:"<<std::endl; std::cout<<GridLogMessage<<" --debug-signals : catch sigsegv and print a blame report"<<std::endl;
std::cout<<GridLogMessage<<" --dylib-map : print dynamic library map, useful for interpreting signal backtraces "<<std::endl; std::cout<<GridLogMessage<<" --debug-stdout : print stdout from EVERY node"<<std::endl;
std::cout<<GridLogMessage<<" --heartbeat : periodic itimer wakeup (interrupts stuck system calls!) "<<std::endl;
std::cout<<GridLogMessage<<" --signal-delay n : pause for n seconds after signal handling (useful to get ALL nodes in stuck state) "<<std::endl;
std::cout<<GridLogMessage<<" --debug-stdout : print stdout from EVERY node to file Grid.stdout/err.rank "<<std::endl;
std::cout<<GridLogMessage<<" --debug-signals : catch sigsegv and print a blame report, handle SIGHUP with a backtrace to stderr"<<std::endl;
std::cout<<GridLogMessage<<" --debug-heartbeat : periodically report backtrace "<<std::endl;
std::cout<<GridLogMessage<<" --debug-mem : print Grid allocator activity"<<std::endl; std::cout<<GridLogMessage<<" --debug-mem : print Grid allocator activity"<<std::endl;
std::cout<<GridLogMessage<<" --notimestamp : suppress millisecond resolution stamps"<<std::endl;
std::cout<<GridLogMessage<<std::endl; std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"Performance:"<<std::endl; std::cout<<GridLogMessage<<"Performance:"<<std::endl;
std::cout<<GridLogMessage<<std::endl; std::cout<<GridLogMessage<<std::endl;
@@ -658,56 +555,17 @@ void GridLogLayout() {
} }
void * Grid_backtrace_buffer[_NBACKTRACE]; void * Grid_backtrace_buffer[_NBACKTRACE];
#define SIGLOG(A) ::write(fileno_stderr,A,strlen(A));
void sig_print_dig(uint32_t dig) void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
{ {
const char *digits[] = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "a", "b", "c", "d", "e", "f" }; fprintf(stderr,"Signal handler on host %s\n",hostname);
if ( dig>=0 && dig< 16){ fprintf(stderr,"FlightRecorder step %d stage %s \n",
SIGLOG(digits[dig]); FlightRecorder::StepLoggingCounter,
} FlightRecorder::StepName);
} fprintf(stderr,"Caught signal %d\n",si->si_signo);
void sig_print_uint(uint32_t A) fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr);
{ fprintf(stderr," code %d\n",si->si_code);
int dig; // x86 64bit
int nz=0;
#define DIGIT(DIV) dig = (A/DIV)%10 ; if(dig|nz) sig_print_dig(dig); nz = nz|dig;
DIGIT(1000000000); // Catches 4BN = 2^32
DIGIT(100000000);
DIGIT(10000000);
DIGIT(1000000);
DIGIT(100000);
DIGIT(10000);
DIGIT(1000);
DIGIT(100);
DIGIT(10);
DIGIT(1);
if (nz==0) SIGLOG("0");
}
void sig_print_hex(uint64_t A)
{
int nz=0;
int dig;
#define NIBBLE(A) dig = A ; if(dig|nz) sig_print_dig(dig); nz = nz|dig;
SIGLOG("0x");
NIBBLE((A>>(15*4))&0xF);
NIBBLE((A>>(14*4))&0xF);
NIBBLE((A>>(13*4))&0xF);
NIBBLE((A>>(12*4))&0xF);
NIBBLE((A>>(11*4))&0xF);
NIBBLE((A>>(10*4))&0xF);
NIBBLE((A>>(9*4))&0xF);
NIBBLE((A>>(8*4))&0xF);
NIBBLE((A>>(7*4))&0xF);
NIBBLE((A>>(6*4))&0xF);
NIBBLE((A>>(5*4))&0xF);
NIBBLE((A>>(4*4))&0xF);
NIBBLE((A>>(3*4))&0xF);
NIBBLE((A>>(2*4))&0xF);
NIBBLE((A>>4)&0xF);
sig_print_dig(A&0xF);
}
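The hand-rolled sig_print_uint/sig_print_hex helpers above exist because a signal handler may only rely on async-signal-safe calls such as write(2); printf and std::cout are off limits there. A standalone sketch of the same write()-only formatting (not Grid code):

    // Sketch only: format a hex value without stdio, suitable for a signal handler.
    #include <unistd.h>
    #include <cstdint>
    #include <cstring>

    static void sig_write(const char *s) {
      ssize_t r = write(STDERR_FILENO, s, strlen(s)); (void)r;
    }

    static void sig_write_hex(uint64_t v) {
      const char *digits = "0123456789abcdef";
      char buf[2 + 16 + 1] = "0x";
      int n = 2;
      bool nz = false;
      for (int shift = 60; shift >= 0; shift -= 4) {
        int d = (v >> shift) & 0xF;
        if (d || nz || shift == 0) { buf[n++] = digits[d]; nz = true; }  // skip leading zeroes
      }
      buf[n] = 0;
      sig_write(buf);
    }

    int main() {                   // demo outside a handler; the point is the calls used
      sig_write("address ");
      sig_write_hex(0xdeadbeef);
      sig_write("\n");
    }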
/*
#ifdef __linux__ #ifdef __linux__
#ifdef __x86_64__ #ifdef __x86_64__
ucontext_t * uc= (ucontext_t *)ptr; ucontext_t * uc= (ucontext_t *)ptr;
@@ -715,158 +573,81 @@ void sig_print_hex(uint64_t A)
fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip); fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip);
#endif #endif
#endif #endif
*/ fflush(stderr);
void Grid_generic_handler(int sig,siginfo_t *si,void * ptr) BACKTRACEFP(stderr);
{ fprintf(stderr,"Called backtrace\n");
SIGLOG("Signal handler on host "); fflush(stdout);
SIGLOG(hostname); fflush(stderr);
SIGLOG(" process id ");
sig_print_uint((uint32_t)getpid());
SIGLOG("\n");
SIGLOG("FlightRecorder step ");
sig_print_uint(FlightRecorder::StepLoggingCounter);
SIGLOG(" stage ");
SIGLOG(FlightRecorder::StepName);
SIGLOG("\n");
SIGLOG("Caught signal ");
sig_print_uint(si->si_signo);
SIGLOG("\n");
SIGLOG(" mem address ");
sig_print_hex((uint64_t)si->si_addr);
SIGLOG("\n");
SIGLOG(" code ");
sig_print_uint(si->si_code);
SIGLOG("\n");
ucontext_t *uc= (ucontext_t *)ptr;
SIGLOG("Backtrace:\n");
#ifdef HAVE_UNWIND
// Debug cross check on offsets
// int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE);
// backtrace_symbols_fd(Grid_backtrace_buffer,symbols,fileno_stderr);
unw_cursor_t cursor;
unw_word_t ip, off;
if (!unw_init_local(&cursor, uc) ) {
SIGLOG(" frame IP function\n");
int level = 0;
int ret = 0;
while(1) {
char name[128];
if (level >= _NBACKTRACE) return;
unw_get_reg(&cursor, UNW_REG_IP, &ip);
sig_print_uint(level); SIGLOG(" ");
sig_print_hex(ip); SIGLOG(" ");
for(int r=0;r<dlMap.size();r++){
if((ip>=dlMap[r].start) &&(ip<dlMap[r].end)){
SIGLOG(dlMap[r].name.c_str());
SIGLOG("+");
sig_print_hex((ip-dlMap[r].start));
break;
}
}
SIGLOG("\n");
Grid_backtrace_buffer[level]=(void *)ip;
level++;
ret = unw_step(&cursor);
if (ret <= 0) {
return;
}
}
}
#else
  // Known async-signal-unsafe
int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE);
backtrace_symbols_fd(Grid_backtrace_buffer,symbols,fileno_stderr);
#endif
}
void Grid_heartbeat_signal_handler(int sig,siginfo_t *si,void * ptr)
{
Grid_generic_handler(sig,si,ptr);
SIGLOG("\n");
}
void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
{
Grid_generic_handler(sig,si,ptr);
if (signal_delay) {
SIGLOG("Adding extra signal delay ");
sig_print_uint(signal_delay);
SIGLOG(" s\n");
usleep( (uint64_t) signal_delay*1000LL*1000LL);
}
SIGLOG("\n");
return; return;
} }
void Grid_fatal_signal_handler(int sig,siginfo_t *si,void * ptr) void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
{ {
Grid_generic_handler(sig,si,ptr); fprintf(stderr,"Signal handler on host %s\n",hostname);
SIGLOG("\n"); fprintf(stderr,"Caught signal %d\n",si->si_signo);
fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr);
fprintf(stderr," code %d\n",si->si_code);
// Linux/Posix
#ifdef __linux__
// And x86 64bit
#ifdef __x86_64__
ucontext_t * uc= (ucontext_t *)ptr;
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip);
#define REG(A) fprintf(stderr," %s %lx\n",#A,sc-> A);
REG(rdi);
REG(rsi);
REG(rbp);
REG(rbx);
REG(rdx);
REG(rax);
REG(rcx);
REG(rsp);
REG(rip);
REG(r8);
REG(r9);
REG(r10);
REG(r11);
REG(r12);
REG(r13);
REG(r14);
REG(r15);
#endif
#endif
fflush(stderr);
BACKTRACEFP(stderr);
fprintf(stderr,"Called backtrace\n");
fflush(stdout);
fflush(stderr);
exit(0); exit(0);
return; return;
}; };
void Grid_empty_signal_handler(int sig,siginfo_t *si,void * ptr)
{
// SIGLOG("heartbeat signal handled\n");
return;
}
void Grid_debug_heartbeat(void)
{
struct sigaction sa_ping;
sigemptyset (&sa_ping.sa_mask);
sa_ping.sa_sigaction= Grid_usr_signal_handler;
sa_ping.sa_flags = SA_SIGINFO;
sigaction(SIGALRM,&sa_ping,NULL);
// repeating 10s heartbeat
struct itimerval it_val;
it_val.it_value.tv_sec = 10;
it_val.it_value.tv_usec = 0;
it_val.it_interval = it_val.it_value;
setitimer(ITIMER_REAL, &it_val, NULL);
}
void Grid_heartbeat(void)
{
struct sigaction sa_ping;
sigemptyset (&sa_ping.sa_mask);
sa_ping.sa_sigaction= Grid_empty_signal_handler;
sa_ping.sa_flags = SA_SIGINFO;
sigaction(SIGALRM,&sa_ping,NULL);
// repeating 10s heartbeat
struct itimerval it_val;
it_val.it_value.tv_sec = 10;
it_val.it_value.tv_usec = 1000;
it_val.it_interval = it_val.it_value;
setitimer(ITIMER_REAL, &it_val, NULL);
}
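// A minimal standalone sketch (POSIX assumed, handler body illustrative) of
// the heartbeat wiring above: ITIMER_REAL delivers SIGALRM at the chosen
// interval, and because the handler is installed without SA_RESTART the
// delivery alone interrupts blocking system calls, which is what --heartbeat
// relies on to wake up stuck nodes.
#include <signal.h>
#include <sys/time.h>
#include <unistd.h>

static void on_alarm(int, siginfo_t *, void *) { /* async-signal-safe work only */ }

int main() {
  struct sigaction sa = {};
  sigemptyset(&sa.sa_mask);
  sa.sa_sigaction = on_alarm;
  sa.sa_flags     = SA_SIGINFO;
  sigaction(SIGALRM, &sa, NULL);

  struct itimerval it;
  it.it_value.tv_sec  = 10;                 // first tick after 10 s
  it.it_value.tv_usec = 0;
  it.it_interval      = it.it_value;        // then every 10 s
  setitimer(ITIMER_REAL, &it, NULL);

  for (;;) pause();                         // returns (EINTR) on every tick
}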
void Grid_exit_handler(void) void Grid_exit_handler(void)
{ {
BACKTRACEFP(stdout); // BACKTRACEFP(stdout);
fflush(stdout); // fflush(stdout);
} }
void Grid_debug_handler_init(void) void Grid_debug_handler_init(void)
{ {
struct sigaction sa; struct sigaction sa;
sigemptyset (&sa.sa_mask); sigemptyset (&sa.sa_mask);
sa.sa_sigaction= Grid_fatal_signal_handler; sa.sa_sigaction= Grid_sa_signal_handler;
sa.sa_flags = SA_SIGINFO; sa.sa_flags = SA_SIGINFO;
// sigaction(SIGSEGV,&sa,NULL);
sigaction(SIGTRAP,&sa,NULL); sigaction(SIGTRAP,&sa,NULL);
sigaction(SIGILL,&sa,NULL);
#ifndef GRID_SYCL
sigaction(SIGSEGV,&sa,NULL); // SYCL is using SIGSEGV
sigaction(SIGBUS,&sa,NULL); sigaction(SIGBUS,&sa,NULL);
feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO); // sigaction(SIGUSR2,&sa,NULL);
sigaction(SIGFPE,&sa,NULL);
#endif
// Non terminating SIGHUP handler feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
sigaction(SIGFPE,&sa,NULL);
sigaction(SIGKILL,&sa,NULL);
sigaction(SIGILL,&sa,NULL);
// Non terminating SIGUSR1/2 handler
struct sigaction sa_ping; struct sigaction sa_ping;
sigemptyset (&sa_ping.sa_mask); sigemptyset (&sa_ping.sa_mask);
sa_ping.sa_sigaction= Grid_usr_signal_handler; sa_ping.sa_sigaction= Grid_usr_signal_handler;
@@ -38,11 +38,7 @@ char * GridHostname(void);
// internal, controled with --handle // internal, controled with --handle
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr); void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr);
void Grid_empty_signal_handler(int sig,siginfo_t *si,void * ptr);
void Grid_debug_handler_init(void); void Grid_debug_handler_init(void);
void Grid_debug_heartbeat(void);
void Grid_heartbeat(void);
void Grid_quiesce_nodes(void); void Grid_quiesce_nodes(void);
void Grid_unquiesce_nodes(void); void Grid_unquiesce_nodes(void);
@@ -66,7 +66,6 @@ namespace Grid{
}; };
} }
template <class T> void writeFile(T& in, std::string const fname){ template <class T> void writeFile(T& in, std::string const fname){
#ifdef HAVE_LIME #ifdef HAVE_LIME
// Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111 // Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
@@ -74,7 +73,7 @@ template <class T> void writeFile(T& in, std::string const fname){
Grid::emptyUserRecord record; Grid::emptyUserRecord record;
Grid::ScidacWriter WR(in.Grid()->IsBoss()); Grid::ScidacWriter WR(in.Grid()->IsBoss());
WR.open(fname); WR.open(fname);
WR.writeScidacFieldRecord(in,record,0); // Lexico WR.writeScidacFieldRecord(in,record,0);
WR.close(); WR.close();
#endif #endif
// What is the appropriate way to throw error? // What is the appropriate way to throw error?
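// Hypothetical usage of the writeFile helper above (field type, grid pointer
// and filename are placeholders); it only does anything when Grid is built
// with Lime/SciDAC support:
//   LatticeComplexD qfield(UGrid);
//   writeFile(qfield, std::string("Top_dnsty_10.ensemble"));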
@@ -108,18 +107,8 @@ int main(int argc, char **argv) {
for (int conf = CPar.StartConfiguration; conf <= CPar.EndConfiguration; conf+= CPar.Skip){ for (int conf = CPar.StartConfiguration; conf <= CPar.EndConfiguration; conf+= CPar.Skip){
#if 0
CPNersc.CheckpointRestore(conf, Umu, sRNG, pRNG); CPNersc.CheckpointRestore(conf, Umu, sRNG, pRNG);
#else
// Don't require Grid format RNGs
FieldMetaData header;
std::string file, filesmr;
file = CPar.conf_path + "/" + CPar.conf_prefix + "." + std::to_string(conf);
filesmr = CPar.conf_path + "/" + CPar.conf_smr_prefix + "." + std::to_string(conf);
NerscIO::readConfiguration(Umu,header,file);
#endif
std::cout << std::setprecision(15); std::cout << std::setprecision(15);
std::cout << GridLogMessage << "Initial plaquette: "<< WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu) << std::endl; std::cout << GridLogMessage << "Initial plaquette: "<< WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu) << std::endl;
@@ -127,7 +116,6 @@ int main(int argc, char **argv) {
std::string file_post = CPar.conf_prefix + "." + std::to_string(conf); std::string file_post = CPar.conf_prefix + "." + std::to_string(conf);
WilsonFlow<PeriodicGimplR> WF(WFPar.step_size,WFPar.steps,WFPar.meas_interval); WilsonFlow<PeriodicGimplR> WF(WFPar.step_size,WFPar.steps,WFPar.meas_interval);
WF.addMeasurement(WFPar.meas_interval_density, [&file_pre,&file_post,&conf](int step, RealD t, const typename PeriodicGimplR::GaugeField &U){ WF.addMeasurement(WFPar.meas_interval_density, [&file_pre,&file_post,&conf](int step, RealD t, const typename PeriodicGimplR::GaugeField &U){
typedef typename PeriodicGimplR::GaugeLinkField GaugeMat; typedef typename PeriodicGimplR::GaugeLinkField GaugeMat;
@@ -177,48 +165,33 @@ int main(int argc, char **argv) {
//double coeff = 2.0 / (1.0 * Nd * (Nd - 1)) / 3.0; //double coeff = 2.0 / (1.0 * Nd * (Nd - 1)) / 3.0;
//Plq = coeff * Plq; //Plq = coeff * Plq;
RealD WFlow_TC5Li = WilsonLoops<PeriodicGimplR>::TopologicalCharge5Li(U);
int tau = std::round(t); int tau = std::round(t);
std::string efile = file_pre + "E_dnsty_" + std::to_string(tau) + "_" + file_post; std::string efile = file_pre + "E_dnsty_" + std::to_string(tau) + "_" + file_post;
// writeFile(R,efile); writeFile(R,efile);
std::string tfile = file_pre + "Top_dnsty_" + std::to_string(tau) + "_" + file_post; std::string tfile = file_pre + "Top_dnsty_" + std::to_string(tau) + "_" + file_post;
// writeFile(qfield,tfile); writeFile(qfield,tfile);
std::string ufile = file_pre + "U_" + std::to_string(tau) + "_" + file_post;
{
// PeriodicGimplR::GaugeField Ucopy = U;
// NerscIO::writeConfiguration(Ucopy,ufile);
}
RealD E = real(sum(R))/ RealD(U.Grid()->gSites()); RealD E = real(sum(R))/ RealD(U.Grid()->gSites());
RealD T = real( sum(qfield) ); RealD T = real( sum(qfield) );
Coordinate scoor; for (int mu=0; mu < Nd; mu++) scoor[mu] = 0; Coordinate scoor; for (int mu=0; mu < Nd; mu++) scoor[mu] = 0;
RealD E0 = real(peekSite(R,scoor)); RealD E0 = real(peekSite(R,scoor));
RealD T0 = real(peekSite(qfield,scoor)); RealD T0 = real(peekSite(qfield,scoor));
std::cout << GridLogMessage << "[WilsonFlow] Saved energy density (clover) & topo. charge density: " << conf << " " << step << " " << tau << " " std::cout << GridLogMessage << "[WilsonFlow] Saved energy density (clover) & topo. charge density: " << conf << " " << step << " " << tau << " "
<< "(E_avg,T_sum) " << E << " " << T << " (E, T at origin) " << E0 << " " << T0 << " Q5Li "<< WFlow_TC5Li << std::endl; << "(E_avg,T_sum) " << E << " " << T << " (E, T at origin) " << E0 << " " << T0 << std::endl;
}); });
int t=WFPar.maxTau; int t=WFPar.maxTau;
WF.smear(Uflow, Umu); WF.smear(Uflow, Umu);
// NerscIO::writeConfiguration(Uflow,filesmr);
RealD WFlow_plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(Uflow); RealD WFlow_plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(Uflow);
RealD WFlow_TC = WilsonLoops<PeriodicGimplR>::TopologicalCharge(Uflow); RealD WFlow_TC = WilsonLoops<PeriodicGimplR>::TopologicalCharge(Uflow);
RealD WFlow_TC5Li = WilsonLoops<PeriodicGimplR>::TopologicalCharge5Li(Uflow);
RealD WFlow_T0 = WF.energyDensityPlaquette(t,Uflow); // t RealD WFlow_T0 = WF.energyDensityPlaquette(t,Uflow); // t
RealD WFlow_EC = WF.energyDensityCloverleaf(t,Uflow); RealD WFlow_EC = WF.energyDensityCloverleaf(t,Uflow);
std::cout << GridLogMessage << "Plaquette "<< conf << " " << WFlow_plaq << std::endl; std::cout << GridLogMessage << "Plaquette "<< conf << " " << WFlow_plaq << std::endl;
std::cout << GridLogMessage << "T0 "<< conf << " " << WFlow_T0 << std::endl; std::cout << GridLogMessage << "T0 "<< conf << " " << WFlow_T0 << std::endl;
std::cout << GridLogMessage << "TC0 "<< conf << " " << WFlow_EC << std::endl; std::cout << GridLogMessage << "TC0 "<< conf << " " << WFlow_EC << std::endl;
std::cout << GridLogMessage << "TopologicalCharge "<< conf << " " << WFlow_TC << std::endl; std::cout << GridLogMessage << "TopologicalCharge "<< conf << " " << WFlow_TC << std::endl;
std::cout << GridLogMessage << "TopologicalCharge5Li "<< conf << " " << WFlow_TC5Li<< std::endl;
std::cout<< GridLogMessage << " Admissibility check:\n"; std::cout<< GridLogMessage << " Admissibility check:\n";
const double sp_adm = 0.067; // admissible threshold const double sp_adm = 0.067; // admissible threshold
@@ -201,7 +201,8 @@ int main(int argc, char **argv) {
Params.dirichlet=NonDirichlet; Params.dirichlet=NonDirichlet;
ParamsDir.dirichlet=Dirichlet; ParamsDir.dirichlet=Dirichlet;
// ParamsDir.partialDirichlet=0; ParamsDir.partialDirichlet=0;
std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;
// double StoppingCondition = 1e-14; // double StoppingCondition = 1e-14;
// double MDStoppingCondition = 1e-9; // double MDStoppingCondition = 1e-9;
@@ -297,11 +298,11 @@ int main(int argc, char **argv) {
if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet; if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet;
else ParamsDen.dirichlet = NonDirichlet; else ParamsDen.dirichlet = NonDirichlet;
// if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1; if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
// else ParamsNum.partialDirichlet = 0; else ParamsNum.partialDirichlet = 0;
// if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1; if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
// else ParamsDen.partialDirichlet = 0; else ParamsDen.partialDirichlet = 0;
Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum)); Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen)); Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));
@@ -333,9 +333,9 @@ int main(int argc, char **argv) {
ParamsF.dirichlet=NonDirichlet; ParamsF.dirichlet=NonDirichlet;
ParamsDir.dirichlet=Dirichlet; ParamsDir.dirichlet=Dirichlet;
ParamsDirF.dirichlet=Dirichlet; ParamsDirF.dirichlet=Dirichlet;
// ParamsDir.partialDirichlet=1; ParamsDir.partialDirichlet=1;
// ParamsDirF.partialDirichlet=1; ParamsDirF.partialDirichlet=1;
// std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl; std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;
// double StoppingCondition = 1e-14; // double StoppingCondition = 1e-14;
// double MDStoppingCondition = 1e-9; // double MDStoppingCondition = 1e-9;
@@ -481,21 +481,21 @@ int main(int argc, char **argv) {
if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet; if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet;
else ParamsDen.dirichlet = NonDirichlet; else ParamsDen.dirichlet = NonDirichlet;
// if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1; if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
// else ParamsNum.partialDirichlet = 0; else ParamsNum.partialDirichlet = 0;
// if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1; if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
// else ParamsDen.partialDirichlet = 0; else ParamsDen.partialDirichlet = 0;
Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum)); Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen)); Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));
ParamsDenF.dirichlet = ParamsDen.dirichlet; ParamsDenF.dirichlet = ParamsDen.dirichlet;
// ParamsDenF.partialDirichlet = ParamsDen.partialDirichlet; ParamsDenF.partialDirichlet = ParamsDen.partialDirichlet;
DenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsDenF)); DenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsDenF));
ParamsNumF.dirichlet = ParamsNum.dirichlet; ParamsNumF.dirichlet = ParamsNum.dirichlet;
// ParamsNumF.partialDirichlet = ParamsNum.partialDirichlet; ParamsNumF.partialDirichlet = ParamsNum.partialDirichlet;
NumeratorsF.push_back (new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[h],M5,b,c, ParamsNumF)); NumeratorsF.push_back (new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[h],M5,b,c, ParamsNumF));
LinOpD.push_back(new LinearOperatorD(*Denominators[h])); LinOpD.push_back(new LinearOperatorD(*Denominators[h]));
@@ -166,18 +166,18 @@ int main (int argc, char ** argv)
} }
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking sequential STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl; std::cout<<GridLogMessage << "= Benchmarking concurrent STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
header(); header();
for(int lat=8;lat<=maxlat;lat+=4){ for(int lat=8;lat<=maxlat;lat+=4){
for(int Ls=8;Ls<=8;Ls*=2){ for(int Ls=8;Ls<=8;Ls*=2){
Coordinate latt_size ({lat*mpi_layout[0], Coordinate latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1], lat*mpi_layout[1],
lat*mpi_layout[2], lat*mpi_layout[2],
lat*mpi_layout[3]}); lat*mpi_layout[3]});
GridCartesian Grid(latt_size,simd_layout,mpi_layout); GridCartesian Grid(latt_size,simd_layout,mpi_layout);
RealD Nrank = Grid._Nprocessors; RealD Nrank = Grid._Nprocessors;
@@ -193,6 +193,101 @@ int main (int argc, char ** argv)
rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes); rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
} }
int ncomm;
double dbytes;
for(int i=0;i<Nloop;i++){
double start=usecond();
dbytes=0;
ncomm=0;
std::vector<CommsRequest_t> requests;
for(int mu=0;mu<4;mu++){
if (mpi_layout[mu]>1 ) {
ncomm++;
int comm_proc=1;
int xmit_to_rank;
int recv_from_rank;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
dbytes+=
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu][0],
xmit_to_rank,1,
(void *)&rbuf[mu][0],
recv_from_rank,1,
bytes,bytes,mu);
comm_proc = mpi_layout[mu]-1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
dbytes+=
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu+4][0],
xmit_to_rank,1,
(void *)&rbuf[mu+4][0],
recv_from_rank,1,
bytes,bytes,mu+4);
}
}
Grid.StencilSendToRecvFromComplete(requests,0);
Grid.Barrier();
double stop=usecond();
t_time[i] = stop-start; // microseconds
}
timestat.statistics(t_time);
dbytes=dbytes*ppn;
double xbytes = dbytes*0.5;
// double rbytes = dbytes*0.5;
double bidibytes = dbytes;
std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
<<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
<<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
<<xbytes/timestat.max <<" "<< xbytes/timestat.min
<< "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
<< bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
}
}
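// Rough scale of the messages timed above, assuming HalfSpinColourVectorD is
// 2 spin x 3 colour x complex<double> = 96 B per site: lat=8, Ls=8 gives
// 8*8*8*8*96 B ~ 384 KiB per buffer, growing to ~24 MiB at lat=32. t_time is
// in microseconds, so the printed xbytes/mean ratio is bytes/us, i.e. MB/s.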
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking sequential STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
header();
for(int lat=8;lat<=maxlat;lat+=4){
for(int Ls=8;Ls<=8;Ls*=2){
Coordinate latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
lat*mpi_layout[2],
lat*mpi_layout[3]});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
RealD Nrank = Grid._Nprocessors;
RealD Nnode = Grid.NodeCount();
RealD ppn = Nrank/Nnode;
std::vector<HalfSpinColourVectorD *> xbuf(8);
std::vector<HalfSpinColourVectorD *> rbuf(8);
Grid.ShmBufferFreeAll();
uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
for(int d=0;d<8;d++){
xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
}
int ncomm; int ncomm;
double dbytes; double dbytes;
for(int i=0;i<Nloop;i++){ for(int i=0;i<Nloop;i++){
@@ -201,35 +296,45 @@ int main (int argc, char ** argv)
std::vector<CommsRequest_t> requests; std::vector<CommsRequest_t> requests;
dbytes=0; dbytes=0;
ncomm=0; ncomm=0;
for(int mu=0;mu<4;mu++){
for(int dir=0;dir<8;dir++) {
double tbytes;
int mu =dir % 4;
if (mpi_layout[mu]>1 ) { if (mpi_layout[mu]>1 ) {
ncomm++; ncomm++;
int comm_proc=1;
int xmit_to_rank; int xmit_to_rank;
int recv_from_rank; int recv_from_rank;
if ( dir == mu ) {
int comm_proc=1; Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); dbytes+=
} else { Grid.StencilSendToRecvFromBegin(requests,
int comm_proc = mpi_layout[mu]-1; (void *)&xbuf[mu][0],
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); xmit_to_rank,1,
} (void *)&rbuf[mu][0],
// int tid = omp_get_thread_num(); recv_from_rank,1,
int tid = 0; bytes,bytes,mu);
tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,1, Grid.StencilSendToRecvFromComplete(requests,mu);
(void *)&rbuf[dir][0], recv_from_rank,1, bytes,tid); requests.resize(0);
dbytes+=tbytes; comm_proc = mpi_layout[mu]-1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
dbytes+=
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu+4][0],
xmit_to_rank,1,
(void *)&rbuf[mu+4][0],
recv_from_rank,1,
bytes,bytes,mu+4);
Grid.StencilSendToRecvFromComplete(requests,mu+4);
requests.resize(0);
} }
} }
Grid.Barrier(); Grid.Barrier();
double stop=usecond(); double stop=usecond();
t_time[i] = stop-start; // microseconds t_time[i] = stop-start; // microseconds
} }
timestat.statistics(t_time); timestat.statistics(t_time);
@@ -32,18 +32,18 @@
using namespace std; using namespace std;
using namespace Grid; using namespace Grid;
//////////////////////// template<class d>
/// Move to domains //// struct scal {
//////////////////////// d internal;
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
}; };
void Benchmark(int Ls, Coordinate Dirichlet,bool Sloppy); Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
int main (int argc, char ** argv) int main (int argc, char ** argv)
{ {
@@ -52,108 +52,39 @@ int main (int argc, char ** argv)
int threads = GridThread::GetThreads(); int threads = GridThread::GetThreads();
int Ls=16; Coordinate latt4 = GridDefaultLatt();
for(int i=0;i<argc;i++) { int Ls=8;
for(int i=0;i<argc;i++)
if(std::string(argv[i]) == "-Ls"){ if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls; std::stringstream ss(argv[i+1]); ss >> Ls;
} }
}
//////////////////
// With comms
//////////////////
Coordinate Dirichlet(Nd+1,0);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,false);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,true);
//////////////////
// Domain decomposed
//////////////////
/*
Coordinate latt4 = GridDefaultLatt();
Coordinate mpi = GridDefaultMpi();
Coordinate CommDim(Nd);
Coordinate shm;
GlobalSharedMemory::GetShmDims(mpi,shm);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
// std::cout << GridLogMessage<< " Testing without internode communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
Dirichlet[0] = 0;
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
Benchmark(Ls,Dirichlet,false);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
Benchmark(Ls,Dirichlet,true);
*/
Grid_finalize();
exit(0);
}
void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
{
Coordinate latt4 = GridDefaultLatt();
GridLogLayout(); GridLogLayout();
long unsigned int single_site_flops = 8*Nc*(7+16*Nc); long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8}); GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
#undef SINGLE
#ifdef SINGLE
typedef vComplexF Simd;
typedef LatticeFermionF FermionField;
typedef LatticeGaugeFieldF GaugeField;
typedef LatticeColourMatrixF ColourMatrixField;
typedef DomainWallFermionF FermionAction;
#else
typedef vComplexD Simd;
typedef LatticeFermionD FermionField;
typedef LatticeGaugeFieldD GaugeField;
typedef LatticeColourMatrixD ColourMatrixField;
typedef DomainWallFermionD FermionAction;
#endif
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl; std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG")); GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl; std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG")); GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
LatticeFermion src (FGrid); random(RNG5,src);
FermionField src (FGrid); random(RNG5,src);
#if 0 #if 0
src = Zero(); src = Zero();
{ {
@@ -169,39 +100,46 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
src = src*N2; src = src*N2;
#endif #endif
FermionField result(FGrid); result=Zero();
FermionField ref(FGrid); ref=Zero(); LatticeFermion result(FGrid); result=Zero();
FermionField tmp(FGrid); LatticeFermion ref(FGrid); ref=Zero();
FermionField err(FGrid); LatticeFermion tmp(FGrid);
LatticeFermion err(FGrid);
std::cout << GridLogMessage << "Drawing gauge field" << std::endl; std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
GaugeField Umu(UGrid); LatticeGaugeField Umu(UGrid);
GaugeField UmuCopy(UGrid);
SU<Nc>::HotConfiguration(RNG4,Umu); SU<Nc>::HotConfiguration(RNG4,Umu);
// SU<Nc>::ColdConfiguration(Umu);
UmuCopy=Umu;
std::cout << GridLogMessage << "Random gauge initialised " << std::endl; std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
#if 0
Umu=1.0;
for(int mu=0;mu<Nd;mu++){
LatticeColourMatrix ttmp(UGrid);
ttmp = PeekIndex<LorentzIndex>(Umu,mu);
// if (mu !=2 ) ttmp = 0;
// ttmp = ttmp* pow(10.0,mu);
PokeIndex<LorentzIndex>(Umu,ttmp,mu);
}
std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
#endif
////////////////////////////////////
// Apply BCs
////////////////////////////////////
Coordinate Block(4);
for(int d=0;d<4;d++) Block[d]= Dirichlet[d+1];
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block5 " << Dirichlet << std::endl;
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block4 " << Block << std::endl;
DirichletFilter<GaugeField> Filter(Block);
Filter.applyFilter(Umu);
//////////////////////////////////// ////////////////////////////////////
// Naive wilson implementation // Naive wilson implementation
//////////////////////////////////// ////////////////////////////////////
std::vector<ColourMatrixField> U(4,UGrid); // replicate across fifth dimension
for(int mu=0;mu<Nd;mu++){ LatticeGaugeField Umu5d(FGrid);
U[mu] = PeekIndex<LorentzIndex>(Umu,mu); std::vector<LatticeColourMatrix> U(4,FGrid);
{
autoView( Umu5d_v, Umu5d, CpuWrite);
autoView( Umu_v , Umu , CpuRead);
for(int ss=0;ss<Umu.Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
Umu5d_v[Ls*ss+s] = Umu_v[ss];
}
}
}
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
} }
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl; std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
if (1) if (1)
@@ -209,28 +147,10 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
ref = Zero(); ref = Zero();
for(int mu=0;mu<Nd;mu++){ for(int mu=0;mu<Nd;mu++){
tmp = Cshift(src,mu+1,1); tmp = U[mu]*Cshift(src,mu+1,1);
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
}
}
}
ref=ref + tmp - Gamma(Gmu[mu])*tmp; ref=ref + tmp - Gamma(Gmu[mu])*tmp;
{ tmp =adj(U[mu])*src;
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
tmp =Cshift(tmp,mu+1,-1); tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp + Gamma(Gmu[mu])*tmp; ref=ref + tmp + Gamma(Gmu[mu])*tmp;
} }
@@ -247,9 +167,11 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl; std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl; std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionD::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl; std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(Simd)<< " B"<<std::endl; std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(vComplex)<< " B"<<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
#ifdef GRID_OMP #ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl; if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl; if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
@@ -259,15 +181,9 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl; if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
FermionAction::ImplParams p; DomainWallFermionD Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
p.dirichlet=Dirichlet; int ncall =1000;
FermionAction Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
Dw.SloppyComms(sloppy);
Dw.ImportGauge(Umu);
int ncall =300;
RealD n2e;
if (1) { if (1) {
FGrid->Barrier(); FGrid->Barrier();
Dw.Dhop(src,result,0); Dw.Dhop(src,result,0);
@@ -282,8 +198,8 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=single_site_flops*volume*ncall; double flops=single_site_flops*volume*ncall;
auto nsimd = Simd::Nsimd(); auto nsimd = vComplex::Nsimd();
auto simdwidth = sizeof(Simd); auto simdwidth = sizeof(vComplex);
// RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors // RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.); double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
@@ -292,27 +208,28 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.); double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl; std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
// std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl; std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl; std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl; std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
std::cout<<GridLogMessage << "RF GiB/s (base 2) = "<< 1000000. * data_rf/((t1-t0))<<std::endl;
std::cout<<GridLogMessage << "mem GiB/s (base 2) = "<< 1000000. * data_mem/((t1-t0))<<std::endl;
err = ref-result; err = ref-result;
n2e = norm2(err); std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
std::cout<<GridLogMessage << "norm diff "<< n2e<< " Line "<<__LINE__ <<std::endl; //exit(0);
if(( n2e>1.0e-4) ) { if(( norm2(err)>1.0e-4) ) {
/*
std::cout << "RESULT\n " << result<<std::endl;
std::cout << "REF \n " << ref <<std::endl;
std::cout << "ERR \n " << err <<std::endl;
*/
std::cout<<GridLogMessage << "WRONG RESULT" << std::endl; std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
FGrid->Barrier(); FGrid->Barrier();
std::cout<<GridLogMessage << "RESULT" << std::endl;
// std::cout << result<<std::endl;
std::cout << norm2(result)<<std::endl;
std::cout<<GridLogMessage << "REF" << std::endl;
std::cout << norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "ERR" << std::endl;
std::cout << norm2(err)<<std::endl;
FGrid->Barrier();
exit(-1); exit(-1);
} }
assert (n2e< 1.0e-4 ); assert (norm2(err)< 1.0e-4 );
} }
if (1) if (1)
@@ -321,30 +238,16 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
for(int mu=0;mu<Nd;mu++){ for(int mu=0;mu<Nd;mu++){
// ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x // ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
tmp = Cshift(src,mu+1,1); tmp = U[mu]*Cshift(src,mu+1,1);
{ {
autoView( ref_v, ref, CpuWrite); autoView( ref_v, ref, CpuWrite);
autoView( tmp_v, tmp, CpuRead); autoView( tmp_v, tmp, CpuRead);
autoView( U_v , U[mu] , CpuRead); for(int i=0;i<ref_v.size();i++){
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){ ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ;
for(int s=0;s<Ls;s++){
int i=s+Ls*ss;
ref_v[i]+= U_v[ss]*(tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]); ;
}
} }
} }
{ tmp =adj(U[mu])*src;
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
// tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu+1,-1); tmp =Cshift(tmp,mu+1,-1);
{ {
autoView( ref_v, ref, CpuWrite); autoView( ref_v, ref, CpuWrite);
@@ -356,27 +259,27 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
} }
ref = -0.5*ref; ref = -0.5*ref;
} }
// dump=1;
Dw.Dhop(src,result,DaggerYes); Dw.Dhop(src,result,1);
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl; std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
std::cout<<GridLogMessage << "Called DwDag"<<std::endl; std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl; std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm dag ref "<< norm2(ref)<<std::endl; std::cout<<GridLogMessage << "norm dag ref "<< norm2(ref)<<std::endl;
err = ref-result; err = ref-result;
n2e= norm2(err); std::cout<<GridLogMessage << "norm dag diff "<< norm2(err)<<std::endl;
std::cout<<GridLogMessage << "norm dag diff "<< n2e<< " Line "<<__LINE__ <<std::endl; if((norm2(err)>1.0e-4)){
/*
std::cout<< "DAG RESULT\n " <<ref << std::endl;
std::cout<< "DAG sRESULT\n " <<result << std::endl;
std::cout<< "DAG ERR \n " << err <<std::endl;
*/
}
LatticeFermion src_e (FrbGrid);
LatticeFermion src_o (FrbGrid);
LatticeFermion r_e (FrbGrid);
LatticeFermion r_o (FrbGrid);
LatticeFermion r_eo (FGrid);
assert((n2e)<1.0e-4);
FermionField src_e (FrbGrid);
FermionField src_o (FrbGrid);
FermionField r_e (FrbGrid);
FermionField r_o (FrbGrid);
FermionField r_eo (FGrid);
std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl; std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
pickCheckerboard(Even,src_e,src); pickCheckerboard(Even,src_e,src);
@@ -388,8 +291,10 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
// S-direction is INNERMOST and takes no part in the parity. // S-direction is INNERMOST and takes no part in the parity.
std::cout << GridLogMessage<< "*********************************************************" <<std::endl; std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermion::DhopEO "<<std::endl; std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionD::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl; std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
#ifdef GRID_OMP #ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl; if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl; if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
@@ -403,7 +308,13 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
Dw.DhopEO(src_o,r_e,DaggerNo); Dw.DhopEO(src_o,r_e,DaggerNo);
double t0=usecond(); double t0=usecond();
for(int i=0;i<ncall;i++){ for(int i=0;i<ncall;i++){
#ifdef CUDA_PROFILE
if(i==10) cudaProfilerStart();
#endif
Dw.DhopEO(src_o,r_e,DaggerNo); Dw.DhopEO(src_o,r_e,DaggerNo);
#ifdef CUDA_PROFILE
if(i==20) cudaProfilerStop();
#endif
} }
double t1=usecond(); double t1=usecond();
FGrid->Barrier(); FGrid->Barrier();
@@ -427,9 +338,14 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
setCheckerboard(r_eo,r_e); setCheckerboard(r_eo,r_e);
err = r_eo-result; err = r_eo-result;
n2e= norm2(err); std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
std::cout<<GridLogMessage << "norm diff "<< n2e<<std::endl; if((norm2(err)>1.0e-4)){
assert(n2e<1.0e-4); /*
std::cout<< "Deo RESULT\n " <<r_eo << std::endl;
std::cout<< "Deo REF\n " <<result << std::endl;
std::cout<< "Deo ERR \n " << err <<std::endl;
*/
}
pickCheckerboard(Even,src_e,err); pickCheckerboard(Even,src_e,err);
pickCheckerboard(Odd,src_o,err); pickCheckerboard(Odd,src_o,err);
@@ -438,4 +354,6 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
assert(norm2(src_e)<1.0e-4); assert(norm2(src_e)<1.0e-4);
assert(norm2(src_o)<1.0e-4); assert(norm2(src_o)<1.0e-4);
Grid_finalize();
exit(0);
} }
@@ -43,7 +43,7 @@ Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaT Gamma::Algebra::GammaT
}; };
void Benchmark(int Ls, Coordinate Dirichlet,bool Sloppy); void Benchmark(int Ls, Coordinate Dirichlet);
int main (int argc, char ** argv) int main (int argc, char ** argv)
{ {
@@ -69,19 +69,11 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage<< " Testing with full communication " <<std::endl; std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl; std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,false); Benchmark(Ls,Dirichlet);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,true);
////////////////// //////////////////
// Domain decomposed // Domain decomposed
////////////////// //////////////////
/*
Coordinate latt4 = GridDefaultLatt(); Coordinate latt4 = GridDefaultLatt();
Coordinate mpi = GridDefaultMpi(); Coordinate mpi = GridDefaultMpi();
Coordinate CommDim(Nd); Coordinate CommDim(Nd);
@@ -89,35 +81,42 @@ int main (int argc, char ** argv)
GlobalSharedMemory::GetShmDims(mpi,shm); GlobalSharedMemory::GetShmDims(mpi,shm);
//////////////////////
// Node level
//////////////////////
std::cout << "\n\n\n\n\n\n" <<std::endl; std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl; std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
// std::cout << GridLogMessage<< " Testing without internode communication " <<std::endl; std::cout << GridLogMessage<< " Testing without internode communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl; std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0; for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
Dirichlet[0] = 0; // Dirichlet[0] = 0;
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0]; // Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1]; // Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2]; // Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3]; // Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
Benchmark(Ls,Dirichlet,false); Benchmark(Ls,Dirichlet);
std::cout << "\n\n\n\n\n\n" <<std::endl; std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl; std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl; std::cout << GridLogMessage<< " Testing without intranode communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl; std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0; for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
// Dirichlet[0] = 0;
// Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
// Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
// Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
// Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
Benchmark(Ls,Dirichlet,true); Benchmark(Ls,Dirichlet);
*/
Grid_finalize(); Grid_finalize();
exit(0); exit(0);
} }
void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy) void Benchmark(int Ls, Coordinate Dirichlet)
{ {
Coordinate latt4 = GridDefaultLatt(); Coordinate latt4 = GridDefaultLatt();
GridLogLayout(); GridLogLayout();
@@ -133,13 +132,21 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
typedef LatticeGaugeFieldF GaugeField; typedef LatticeGaugeFieldF GaugeField;
typedef LatticeColourMatrixF ColourMatrixField; typedef LatticeColourMatrixF ColourMatrixField;
typedef DomainWallFermionF FermionAction; typedef DomainWallFermionF FermionAction;
#else #endif
#ifdef DOUBLE
typedef vComplexD Simd; typedef vComplexD Simd;
typedef LatticeFermionD FermionField; typedef LatticeFermionD FermionField;
typedef LatticeGaugeFieldD GaugeField; typedef LatticeGaugeFieldD GaugeField;
typedef LatticeColourMatrixD ColourMatrixField; typedef LatticeColourMatrixD ColourMatrixField;
typedef DomainWallFermionD FermionAction; typedef DomainWallFermionD FermionAction;
#endif #endif
#ifdef DOUBLE2
typedef vComplexD2 Simd;
typedef LatticeFermionD2 FermionField;
typedef LatticeGaugeFieldD2 GaugeField;
typedef LatticeColourMatrixD2 ColourMatrixField;
typedef DomainWallFermionD2 FermionAction;
#endif
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi()); GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@@ -262,7 +269,6 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
FermionAction::ImplParams p; FermionAction::ImplParams p;
p.dirichlet=Dirichlet; p.dirichlet=Dirichlet;
FermionAction Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p); FermionAction Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
Dw.SloppyComms(sloppy);
Dw.ImportGauge(Umu); Dw.ImportGauge(Umu);
int ncall =300; int ncall =300;
@@ -0,0 +1,465 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_dwf.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#ifdef GRID_CUDA
#define CUDA_PROFILE
#endif
#ifdef CUDA_PROFILE
#include <cuda_profiler_api.h>
#endif
using namespace std;
using namespace Grid;
////////////////////////
/// Move to domains ////
////////////////////////
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
void Benchmark(int Ls, Coordinate Dirichlet, int partial);
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
int threads = GridThread::GetThreads();
int Ls=8;
for(int i=0;i<argc;i++) {
if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls;
}
}
//////////////////
// With comms
//////////////////
Coordinate Dirichlet(Nd+1,0);
for(auto partial : {0}) {
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,partial);
}
//////////////////
// Domain decomposed
//////////////////
Coordinate latt4 = GridDefaultLatt();
Coordinate mpi = GridDefaultMpi();
Coordinate CommDim(Nd);
//Coordinate shm({2,1,1,1});
Coordinate shm;
GlobalSharedMemory::GetShmDims(mpi,shm);
std::cout <<GridLogMessage << " Shared memory MPI decomp is " <<shm<<std::endl;
//////////////////////
// Node level
//////////////////////
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
// for(int d=0;d<Nd;d++) CommDim[d]= 1;
Dirichlet[0] = 0;
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
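  // Worked example with a hypothetical geometry: latt4={32,32,32,32},
  // mpi={2,2,2,2}, shm={2,1,1,1} gives mpi/shm={1,2,2,2}, so CommDim={0,1,1,1}
  // and Dirichlet={0, 0,16,16,16}. The x direction stays open because its MPI
  // decomposition lives inside the shared-memory node; y,z,t are blocked at
  // the node-local extent of 16 sites.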
for(auto partial : {0,1}) {
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing without internode communication partial dirichlet="<<partial <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,partial);
}
for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
Dirichlet[0] = 0;
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
for(auto partial : {0,1}) {
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing without intranode communication; partial dirichlet= "<<partial <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,partial);
}
Grid_finalize();
exit(0);
}
void Benchmark(int Ls, Coordinate Dirichlet, int partial)
{
Coordinate latt4 = GridDefaultLatt();
GridLogLayout();
long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
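  // For Nc=3 this evaluates to 8*3*(7+48) = 1320 flops per site, the standard
  // operation count for one Wilson dslash application.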
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
#define SINGLE
#ifdef SINGLE
typedef vComplexF Simd;
typedef LatticeFermionF FermionField;
typedef LatticeGaugeFieldF GaugeField;
typedef LatticeColourMatrixF ColourMatrixField;
typedef DomainWallFermionF FermionAction;
#endif
#ifdef DOUBLE
typedef vComplexD Simd;
typedef LatticeFermionD FermionField;
typedef LatticeGaugeFieldD GaugeField;
typedef LatticeColourMatrixD ColourMatrixField;
typedef DomainWallFermionD FermionAction;
#endif
#ifdef DOUBLE2
typedef vComplexD2 Simd;
typedef LatticeFermionD2 FermionField;
typedef LatticeGaugeFieldD2 GaugeField;
typedef LatticeColourMatrixD2 ColourMatrixField;
typedef DomainWallFermionD2 FermionAction;
#endif
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
FermionField src (FGrid); random(RNG5,src);
#if 0
src = Zero();
{
Coordinate origin({0,0,0,latt4[2]-1,0});
SpinColourVectorF tmp;
tmp=Zero();
tmp()(0)(0)=Complex(-2.0,0.0);
std::cout << " source site 0 " << tmp<<std::endl;
pokeSite(tmp,src,origin);
}
#else
RealD N2 = 1.0/::sqrt(norm2(src));
src = src*N2;
#endif
FermionField result(FGrid); result=Zero();
FermionField ref(FGrid); ref=Zero();
FermionField tmp(FGrid);
FermionField err(FGrid);
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
GaugeField Umu(UGrid);
GaugeField UmuFull(UGrid);
GaugeField UmuCopy(UGrid);
SU<Nc>::HotConfiguration(RNG4,Umu);
UmuCopy=Umu;
UmuFull=Umu;
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
////////////////////////////////////
// Apply BCs
////////////////////////////////////
Coordinate Block(4);
for(int d=0;d<4;d++) Block[d]= Dirichlet[d+1];
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block5 " << Dirichlet << std::endl;
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block4 " << Block << std::endl;
DirichletFilter<GaugeField> Filter(Block);
Filter.applyFilter(Umu);
if(!partial) Filter.applyFilter(UmuCopy);
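  // Note: Umu always gets the Dirichlet filter, while UmuCopy stays unfiltered
  // when partial!=0. The Cshift reference below uses UmuCopy on the outer
  // s-slices (s<depth or s>=Ls-depth, depth=dwf_compressor_depth), so under
  // partial Dirichlet only those boundary slices keep coupling across the
  // block; the interior slices always see the cut links.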
////////////////////////////////////
// Naive wilson implementation
////////////////////////////////////
std::vector<ColourMatrixField> U(4,UGrid);
std::vector<ColourMatrixField> Ucopy(4,UGrid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
Ucopy[mu] = PeekIndex<LorentzIndex>(UmuCopy,mu);
}
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
if (1)
{
ref = Zero();
for(int mu=0;mu<Nd;mu++){
int depth=dwf_compressor_depth;
tmp = Cshift(src,mu+1,1);
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
if ( (s<depth) || (s>=Ls-depth)){
tmp_v[Ls*ss+s] = Ucopy_v[ss]*tmp_v[Ls*ss+s];
} else {
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
}
}
}
}
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
if ( (s<depth) || (s>=Ls-depth)){
tmp_v[Ls*ss+s] = adj(Ucopy_v[ss])*src_v[Ls*ss+s];
} else {
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
}
tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
}
ref = -0.5*ref;
}
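// The loop above is a Cshift-based reference for the 4d hopping term, applied
// slice by slice in s:
//   Dhop psi(x) = -1/2 sum_mu [ U_mu(x) (1 - gamma_mu) psi(x+mu)
//                             + U_mu^dag(x-mu) (1 + gamma_mu) psi(x-mu) ]
// with Ucopy used on s-slices within the compressor depth of either wall and
// U in the bulk, mirroring the partial-Dirichlet link choice made above.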
RealD mass=0.1;
RealD M5 =1.8;
RealD NP = UGrid->_Nprocessors;
RealD NN = UGrid->NodeCount();
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
std::cout << GridLogMessage <<"* BCs for Dirichlet Block4 " << Block << std::endl;
std::cout << GridLogMessage <<"* Partial Dirichlet BC = " << partial << std::endl;
std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(Simd)<< " B"<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
FermionAction::ImplParams p;
p.dirichlet=Dirichlet;
p.partialDirichlet=partial;
FermionAction Dw(UmuFull,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
int ncall =1;
RealD n2e;
if (1) {
FGrid->Barrier();
Dw.Dhop(src,result,0);
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.Dhop(src,result,0);
}
double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=single_site_flops*volume*ncall;
auto nsimd = Simd::Nsimd();
auto simdwidth = sizeof(Simd);
// RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
// mem: Nd Wilson * Ls, Nd gauge, Nc colors
double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
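// Two crude traffic models for the same kernel: data_rf charges the 2*Nd
// gauge links once per 5d site (no reuse across Ls), while data_mem charges
// them once per 4d site only (perfect reuse across the Ls slices, hence the
// volume/Ls factor); both charge (2*Nd+1) spinors per 5d site, written as
// Nd*Nc complex numbers per spinor since Ns==Nd here. simdwidth/nsimd is the
// size of one scalar complex, so both figures come out in GiB. The flop count
// uses the per-site Wilson Dslash figure single_site_flops defined earlier in
// the file (conventionally 1320 for Nc=3).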
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
err = ref-result;
n2e = norm2(err);
std::cout<<GridLogMessage << "norm diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
if(( n2e>1.0e-4) ) {
std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
FGrid->Barrier();
DumpSliceNorm("s-slice ref ",ref,1);
DumpSliceNorm("s-slice res ",result,1);
DumpSliceNorm("s-slice error ",err,1);
exit(-1);
}
assert (n2e< 1.0e-4 );
}
if (1)
{ // Naive wilson dag implementation
ref = Zero();
for(int mu=0;mu<Nd;mu++){
int depth=dwf_compressor_depth;
tmp = Cshift(src,mu+1,1);
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
if ( (s<depth) || (s>=Ls-depth)){
tmp_v[Ls*ss+s] = Ucopy_v[ss]*tmp_v[Ls*ss+s];
} else {
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
}
}
}
}
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
if ( (s<depth) || (s>=Ls-depth)){
tmp_v[Ls*ss+s] = adj(Ucopy_v[ss])*src_v[Ls*ss+s];
} else {
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
}
tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
}
ref = -0.5*ref;
}
Dw.Dhop(src,result,DaggerYes);
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm dag ref "<< norm2(ref)<<std::endl;
err = ref-result;
n2e= norm2(err);
std::cout<<GridLogMessage << "norm dag diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
assert((n2e)<1.0e-4);
FermionField src_e (FrbGrid);
FermionField src_o (FrbGrid);
FermionField r_e (FrbGrid);
FermionField r_o (FrbGrid);
FermionField r_eo (FGrid);
std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
// S-direction is INNERMOST and takes no part in the parity.
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermion::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
{
FGrid->Barrier();
Dw.DhopEO(src_o,r_e,DaggerNo);
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.DhopEO(src_o,r_e,DaggerNo);
}
double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(single_site_flops*volume*ncall)/2.0;
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NN<<std::endl;
}
Dw.DhopEO(src_o,r_e,DaggerNo);
Dw.DhopOE(src_e,r_o,DaggerNo);
Dw.Dhop (src ,result,DaggerNo);
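// The hopping term only couples opposite parities, so r_e = Deo src_o and
// r_o = Doe src_e; recombining the two checkerboards below via
// setCheckerboard must reproduce Dhop acting on the full field up to
// rounding, which is what the norm-diff checks that follow assert.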
std::cout<<GridLogMessage << "r_e"<<norm2(r_e)<<std::endl;
std::cout<<GridLogMessage << "r_o"<<norm2(r_o)<<std::endl;
std::cout<<GridLogMessage << "res"<<norm2(result)<<std::endl;
setCheckerboard(r_eo,r_o);
setCheckerboard(r_eo,r_e);
err = r_eo-result;
n2e= norm2(err);
std::cout<<GridLogMessage << "norm diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
assert(n2e<1.0e-4);
pickCheckerboard(Even,src_e,err);
pickCheckerboard(Odd,src_o,err);
std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl;
assert(norm2(src_e)<1.0e-4);
assert(norm2(src_o)<1.0e-4);
}

View File

@@ -873,7 +873,7 @@ int main (int argc, char ** argv)
int do_su4=0; int do_su4=0;
int do_memory=1; int do_memory=1;
int do_comms =1; int do_comms =1;
int do_blas =1; int do_blas =0;
int do_dslash=1; int do_dslash=1;
int sel=4; int sel=4;

View File

@@ -86,7 +86,6 @@ AC_ARG_WITH([gmp],
[try this for a non-standard install prefix of the GMP library])], [try this for a non-standard install prefix of the GMP library])],
[AM_CXXFLAGS="-I$with_gmp/include $AM_CXXFLAGS"] [AM_CXXFLAGS="-I$with_gmp/include $AM_CXXFLAGS"]
[AM_LDFLAGS="-L$with_gmp/lib $AM_LDFLAGS"]) [AM_LDFLAGS="-L$with_gmp/lib $AM_LDFLAGS"])
AC_ARG_WITH([mpfr], AC_ARG_WITH([mpfr],
[AS_HELP_STRING([--with-mpfr=prefix], [AS_HELP_STRING([--with-mpfr=prefix],
[try this for a non-standard install prefix of the MPFR library])], [try this for a non-standard install prefix of the MPFR library])],
@@ -107,13 +106,6 @@ AC_ARG_WITH([lime],
[AM_CXXFLAGS="-I$with_lime/include $AM_CXXFLAGS"] [AM_CXXFLAGS="-I$with_lime/include $AM_CXXFLAGS"]
[AM_LDFLAGS="-L$with_lime/lib $AM_LDFLAGS"]) [AM_LDFLAGS="-L$with_lime/lib $AM_LDFLAGS"])
############### LIBUNWIND
AC_ARG_WITH([unwind],
[AS_HELP_STRING([--with-unwind=prefix],
[try this for a non-standard install prefix of the libunwind library])],
[AM_CXXFLAGS="-I$with_unwind/include $AM_CXXFLAGS"]
[AM_LDFLAGS="-L$with_unwind/lib $AM_LDFLAGS"])
############### OpenSSL ############### OpenSSL
AC_ARG_WITH([openssl], AC_ARG_WITH([openssl],
[AS_HELP_STRING([--with-openssl=prefix], [AS_HELP_STRING([--with-openssl=prefix],
@@ -222,7 +214,7 @@ esac
############### Symplectic group ############### Symplectic group
AC_ARG_ENABLE([Sp], AC_ARG_ENABLE([Sp],
[AS_HELP_STRING([--enable-Sp=yes|no],[enable gauge group Sp2n])], [AC_HELP_STRING([--enable-Sp=yes|no], [enable gauge group Sp2n])],
[ac_ENABLE_SP=${enable_Sp}], [ac_ENABLE_SP=no]) [ac_ENABLE_SP=${enable_Sp}], [ac_ENABLE_SP=no])
AM_CONDITIONAL(BUILD_SP, [ test "${ac_ENABLE_SP}X" == "yesX" ]) AM_CONDITIONAL(BUILD_SP, [ test "${ac_ENABLE_SP}X" == "yesX" ])
@@ -263,28 +255,6 @@ case ${ac_ACCELERATOR_AWARE_MPI} in
*);; *);;
esac esac
############### CHECKSUM COMMS
AC_ARG_ENABLE([checksum-comms],
[AS_HELP_STRING([--enable-checksum-comms=yes|no],[checksum all communication])],
[ac_CHECKSUM_COMMS=${enable_checksum_comms}], [ac_CHECKSUM_COMMS=yes])
case ${ac_CHECKSUM_COMMS} in
yes)
AC_DEFINE([GRID_CHECKSUM_COMMS],[1],[checksum all communication]);;
*);;
esac
############### LOG VIEWS
AC_ARG_ENABLE([log-views],
[AS_HELP_STRING([--enable-log-views=yes|no],[log information on all view open/close])],
[ac_LOG_VIEWS=${enable_log_views}], [ac_LOG_VIEWS=yes])
case ${ac_LOG_VIEWS} in
yes)
AC_DEFINE([GRID_LOG_VIEWS],[1],[log information on all view open/close]);;
*);;
esac
############### SYCL/CUDA/HIP/none ############### SYCL/CUDA/HIP/none
AC_ARG_ENABLE([accelerator], AC_ARG_ENABLE([accelerator],
[AS_HELP_STRING([--enable-accelerator=cuda|sycl|hip|none],[enable none,cuda,sycl,hip acceleration])], [AS_HELP_STRING([--enable-accelerator=cuda|sycl|hip|none],[enable none,cuda,sycl,hip acceleration])],
@@ -403,16 +373,6 @@ AC_SEARCH_LIBS([limeCreateReader], [lime],
[have_lime=true], [have_lime=true],
[AC_MSG_WARN(LIME library was not found in your system.)]) [AC_MSG_WARN(LIME library was not found in your system.)])
AC_SEARCH_LIBS([unw_backtrace], [unwind],
[AC_DEFINE([HAVE_UNWIND], [1], [Define to 1 if you have the `libunwind' library])]
[have_unwind=true],
[AC_MSG_WARN(libunwind library was not found in your system.)])
AC_SEARCH_LIBS([_Ux86_64_step], [unwind-x86_64],
[AC_DEFINE([HAVE_UNWIND_X86_64], [1], [Define to 1 if you have the `libunwind-x86_64' library])]
[have_unwind_x86_64=true],
[AC_MSG_WARN(libunwind library was not found in your system.)])
AC_SEARCH_LIBS([SHA256_Init], [crypto], AC_SEARCH_LIBS([SHA256_Init], [crypto],
[AC_DEFINE([HAVE_CRYPTO], [1], [Define to 1 if you have the `OpenSSL' library])] [AC_DEFINE([HAVE_CRYPTO], [1], [Define to 1 if you have the `OpenSSL' library])]
[have_crypto=true], [have_crypto=true],

View File

@@ -1,8 +1,7 @@
#!/bin/bash #!/bin/bash
##PBS -q EarlyAppAccess
#PBS -q debug #PBS -q debug
#PBS -l filesystems=flare
#PBS -l filesystems=home
#PBS -l select=2 #PBS -l select=2
#PBS -l walltime=00:20:00 #PBS -l walltime=00:20:00
#PBS -A LatticeQCD_aesp_CNDA #PBS -A LatticeQCD_aesp_CNDA
@@ -15,18 +14,26 @@ cp $PBS_NODEFILE nodefile
export OMP_NUM_THREADS=4 export OMP_NUM_THREADS=4
export MPICH_OFI_NIC_POLICY=GPU export MPICH_OFI_NIC_POLICY=GPU
export MPICH_CH4_SHM=XPMEM
export MPIR_CVAR_DEBUG_SUMMARY=1 #export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
export MPICH_DBG_LEVEL=VERBOSE #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
export MPICH_DBG_CLASS=ALL #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
# #
# Local vol 16.16.16.32 # Local vol 16.16.16.32
# #
#VOL=32.64.64.96 #VOL=32.64.64.96
mpiexec -np 1 -ppn 1 -envall mpivars
for VOL in 32.32.32.96 for VOL in 32.32.32.96 32.64.64.96
do do
for AT in 32 for AT in 32
do do

View File

@@ -1,28 +1,26 @@
export MPFR=`spack find --paths mpfr | grep ^mpfr | awk '{print $2}' ` #Ahead of time compile for PVC
export GMP=`spack find --paths gmp | grep ^gmp | awk '{print $2}' `
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
export UNWIND=`spack find --paths libunwind | grep ^libunwind | awk '{print $2}' `
../../configure \ export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc"
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/ -fPIC"
#JIT compile
#export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
#export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions "
../configure \
--enable-simd=GPU \ --enable-simd=GPU \
--enable-reduction=grid \
--enable-gen-simd-width=64 \ --enable-gen-simd-width=64 \
--enable-comms=mpi-auto \ --enable-comms=mpi-auto \
--enable-debug \
--prefix $HOME/gpt-install \
--disable-gparity \ --disable-gparity \
--disable-fermion-reps \ --disable-fermion-reps \
--enable-shm=nvlink \
--enable-checksum-comms=yes \
--enable-log-views=yes \
--enable-accelerator=sycl \
--enable-accelerator-aware-mpi=no \
--enable-unified=no \
--with-lime=$CLIME \ --with-lime=$CLIME \
--with-gmp=$GMP \ --enable-shm=nvlink \
--with-mpfr=$MPFR \ --enable-accelerator=sycl \
--with-unwind=$UNWIND \ --enable-accelerator-aware-mpi=no\
--enable-unified=no \
MPICXX=mpicxx \ MPICXX=mpicxx \
CXX=icpx \ CXX=icpx
LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc -lnuma" \
CXXFLAGS="-fPIC -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel"

View File

@@ -1,13 +1,16 @@
#module load oneapi/release/2023.12.15.001
#module load mpich/icc-all-debug-pmix-gpu/52.2
#module load mpich-config/mode/deterministic
#module load intel_compute_runtime/release/821.35
module load pti-gpu
source ~/spack/share/spack/setup-env.sh
spack load c-lime
spack load openssl
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
export HTTP_PROXY=http://proxy.alcf.anl.gov:3128 export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128 export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
export http_proxy=http://proxy.alcf.anl.gov:3128 export http_proxy=http://proxy.alcf.anl.gov:3128
export https_proxy=http://proxy.alcf.anl.gov:3128 export https_proxy=http://proxy.alcf.anl.gov:3128
git config --global http.proxy http://proxy.alcf.anl.gov:3128 git config --global http.proxy http://proxy.alcf.anl.gov:3128
source ~/spack/share/spack/setup-env.sh
spack load c-lime
spack load openssl@3.3.1%gcc@12.2.0
spack load unwind
export UNWIND=`spack find --paths libunwind | grep ^libunwind | awk '{print $2}' `
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"

View File

@@ -1,273 +0,0 @@
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
SLURM detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA GH200 120GB
AcceleratorCudaInit[0]: totalGlobalMem: 102005473280
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 1
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0009:01:00.0
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 4
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x4002c0000000 - 40033fffffff for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=3737a24096282ea179607fc879814710860a0de6: (HEAD -> develop, origin/develop, origin/HEAD) clean
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : This rank is running on host jpbo-119-30.jupiter.internal
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 81604378624 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent host allocations: SMALL 8 LARGE 2 HUGE 0
Grid : Message : MemoryManager::Init() cache pool for recent device allocations: SMALL 16 LARGE 8 Huge 0
Grid : Message : MemoryManager::Init() cache pool for recent shared allocations: SMALL 16 LARGE 8 Huge 0
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 0.303000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 0.309000 s : Testing with full communication
Grid : Message : 0.312000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 0.313000 s : Grid Layout
Grid : Message : 0.313000 s : Global lattice size : 32 32 64 64
Grid : Message : 0.319000 s : OpenMP threads : 4
Grid : Message : 0.320000 s : MPI tasks : 1 1 2 2
Grid : Message : 0.129590 s : Initialising 4d RNG
Grid : Message : 0.764790 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 0.764920 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 0.942440 s : Initialising 5d RNG
Grid : Message : 1.149388 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.149404 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
local rank 1 device 0 bus id: 0019:01:00.0
local rank 2 device 0 bus id: 0029:01:00.0
local rank 3 device 0 bus id: 0039:01:00.0
Grid : Message : 43.893114 s : Drawing gauge field
Grid : Message : 54.574150 s : Random gauge initialised
Grid : Message : 54.574170 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 54.574172 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 54.580032 s : Setting up Cshift based reference
Grid : Message : 60.407451 s : *****************************************************************
Grid : Message : 60.407469 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 60.407470 s : *****************************************************************
Grid : Message : 60.407471 s : *****************************************************************
Grid : Message : 60.407472 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 60.407473 s : * Vectorising space-time by 8
Grid : Message : 60.407475 s : * VComplex size is 64 B
Grid : Message : 60.407477 s : * Using Overlapped Comms/Compute
Grid : Message : 60.407479 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 60.407480 s : *****************************************************************
Grid : Message : 61.102178 s : Called warmup
Grid : Message : 62.177160 s : Called Dw 300 times in 1074958 us
Grid : Message : 62.177198 s : mflop/s = 24721998.6
Grid : Message : 62.177201 s : mflop/s per rank = 6180499.64
Grid : Message : 62.177204 s : mflop/s per node = 24721998.6
Grid : Message : 62.182696 s : norm diff 5.8108784e-14 Line 306
Grid : Message : 71.328862 s : ----------------------------------------------------------------
Grid : Message : 71.328884 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 71.328885 s : ----------------------------------------------------------------
Grid : Message : 71.328886 s : Called DwDag
Grid : Message : 71.328887 s : norm dag result 4.12810493
Grid : Message : 71.329493 s : norm dag ref 4.12810493
Grid : Message : 71.331967 s : norm dag diff 3.40632318e-14 Line 377
Grid : Message : 71.394727 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 71.803650 s : src_e0.500003185
Grid : Message : 71.819727 s : src_o0.499996882
Grid : Message : 71.821991 s : *********************************************************
Grid : Message : 71.821993 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 71.821995 s : * Vectorising space-time by 8
Grid : Message : 71.821998 s : * Using Overlapped Comms/Compute
Grid : Message : 71.822002 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 71.822003 s : *********************************************************
Grid : Message : 72.377054 s : Deo mflop/s = 24065467
Grid : Message : 72.377071 s : Deo mflop/s per rank 6016366.75
Grid : Message : 72.377074 s : Deo mflop/s per node 24065467
Grid : Message : 72.624877 s : r_e2.06377678
Grid : Message : 72.625198 s : r_o2.06381058
Grid : Message : 72.625507 s : res4.12758736
Grid : Message : 73.759140 s : norm diff 0
Grid : Message : 73.868204 s : norm diff even 0
Grid : Message : 73.907201 s : norm diff odd 0
Grid : Message : 74.414580 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 74.414582 s : Testing without internode communication
Grid : Message : 74.414584 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 74.414586 s : Grid Layout
Grid : Message : 74.414586 s : Global lattice size : 32 32 64 64
Grid : Message : 74.414594 s : OpenMP threads : 4
Grid : Message : 74.414595 s : MPI tasks : 1 1 2 2
Grid : Message : 74.679364 s : Initialising 4d RNG
Grid : Message : 74.742332 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 74.742343 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 74.759525 s : Initialising 5d RNG
Grid : Message : 75.812412 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 75.812429 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 119.252016 s : Drawing gauge field
Grid : Message : 129.919846 s : Random gauge initialised
Grid : Message : 129.919863 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 129.919865 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 129.923611 s : Setting up Cshift based reference
Grid : Message : 135.522878 s : *****************************************************************
Grid : Message : 135.522897 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 135.522899 s : *****************************************************************
Grid : Message : 135.522899 s : *****************************************************************
Grid : Message : 135.522900 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 135.522901 s : * Vectorising space-time by 8
Grid : Message : 135.522903 s : * VComplex size is 64 B
Grid : Message : 135.522905 s : * Using Overlapped Comms/Compute
Grid : Message : 135.522907 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 135.522908 s : *****************************************************************
Grid : Message : 136.151202 s : Called warmup
Grid : Message : 137.224721 s : Called Dw 300 times in 1073490 us
Grid : Message : 137.224748 s : mflop/s = 24755806
Grid : Message : 137.224751 s : mflop/s per rank = 6188951.49
Grid : Message : 137.224753 s : mflop/s per node = 24755806
Grid : Message : 137.235239 s : norm diff 5.8108784e-14 Line 306
Grid : Message : 146.451686 s : ----------------------------------------------------------------
Grid : Message : 146.451708 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 146.451710 s : ----------------------------------------------------------------
Grid : Message : 146.451712 s : Called DwDag
Grid : Message : 146.451714 s : norm dag result 4.12810493
Grid : Message : 146.452323 s : norm dag ref 4.12810493
Grid : Message : 146.454799 s : norm dag diff 3.40632318e-14 Line 377
Grid : Message : 146.498557 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 146.940894 s : src_e0.500003185
Grid : Message : 146.953676 s : src_o0.499996882
Grid : Message : 146.955927 s : *********************************************************
Grid : Message : 146.955929 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 146.955932 s : * Vectorising space-time by 8
Grid : Message : 146.955936 s : * Using Overlapped Comms/Compute
Grid : Message : 146.955938 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 146.955941 s : *********************************************************
Grid : Message : 147.511975 s : Deo mflop/s = 24036256.5
Grid : Message : 147.511989 s : Deo mflop/s per rank 6009064.13
Grid : Message : 147.511991 s : Deo mflop/s per node 24036256.5
Grid : Message : 147.522100 s : r_e2.06377678
Grid : Message : 147.522433 s : r_o2.06381058
Grid : Message : 147.522745 s : res4.12758736
Grid : Message : 148.229848 s : norm diff 0
Grid : Message : 149.233474 s : norm diff even 0
Grid : Message : 149.235815 s : norm diff odd 0
Grid : Message : 149.960985 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 149.960990 s : Testing without intranode communication
Grid : Message : 149.960991 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 149.960995 s : Grid Layout
Grid : Message : 149.960995 s : Global lattice size : 32 32 64 64
Grid : Message : 149.961003 s : OpenMP threads : 4
Grid : Message : 149.961004 s : MPI tasks : 1 1 2 2
Grid : Message : 150.155810 s : Initialising 4d RNG
Grid : Message : 150.800200 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 150.800340 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 150.973420 s : Initialising 5d RNG
Grid : Message : 151.131117 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 151.131136 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 193.933765 s : Drawing gauge field
Grid : Message : 204.611551 s : Random gauge initialised
Grid : Message : 204.611574 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 204.611576 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 204.615265 s : Setting up Cshift based reference
Grid : Message : 210.117788 s : *****************************************************************
Grid : Message : 210.117807 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 210.117809 s : *****************************************************************
Grid : Message : 210.117810 s : *****************************************************************
Grid : Message : 210.117812 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 210.117813 s : * Vectorising space-time by 8
Grid : Message : 210.117814 s : * VComplex size is 64 B
Grid : Message : 210.117817 s : * Using Overlapped Comms/Compute
Grid : Message : 210.117818 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 210.117819 s : *****************************************************************
Grid : Message : 210.714641 s : Called warmup
Grid : Message : 211.892227 s : Called Dw 300 times in 1177557 us
Grid : Message : 211.892252 s : mflop/s = 22568003.2
Grid : Message : 211.892255 s : mflop/s per rank = 5642000.8
Grid : Message : 211.892257 s : mflop/s per node = 22568003.2
Grid : Message : 211.896037 s : norm diff 5.8108784e-14 Line 306
Grid : Message : 220.751375 s : ----------------------------------------------------------------
Grid : Message : 220.751406 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 220.751409 s : ----------------------------------------------------------------
Grid : Message : 220.751411 s : Called DwDag
Grid : Message : 220.751412 s : norm dag result 4.12810493
Grid : Message : 220.753307 s : norm dag ref 4.12810493
Grid : Message : 220.755796 s : norm dag diff 3.40632318e-14 Line 377
Grid : Message : 220.813226 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 221.697800 s : src_e0.500003185
Grid : Message : 221.890920 s : src_o0.499996882
Grid : Message : 221.913430 s : *********************************************************
Grid : Message : 221.913450 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 221.913480 s : * Vectorising space-time by 8
Grid : Message : 221.913500 s : * Using Overlapped Comms/Compute
Grid : Message : 221.913530 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 221.913550 s : *********************************************************
Grid : Message : 221.645213 s : Deo mflop/s = 24114032
Grid : Message : 221.645228 s : Deo mflop/s per rank 6028508.01
Grid : Message : 221.645231 s : Deo mflop/s per node 24114032
Grid : Message : 221.656021 s : r_e2.06377678
Grid : Message : 221.656389 s : r_o2.06381058
Grid : Message : 221.656698 s : res4.12758736
Grid : Message : 222.110075 s : norm diff 0
Grid : Message : 222.857692 s : norm diff even 0
Grid : Message : 222.875763 s : norm diff odd 0
Grid : Message : 223.598127 s : *******************************************
Grid : Message : 223.598145 s : ******* Grid Finalize ******
Grid : Message : 223.598146 s : *******************************************

View File

@@ -1,286 +0,0 @@
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
SLURM detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA GH200 120GB
AcceleratorCudaInit[0]: totalGlobalMem: 102005473280
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 1
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0009:01:00.0
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 16
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x4002a0000000 - 40031fffffff for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=3737a24096282ea179607fc879814710860a0de6: (HEAD -> develop, origin/develop, origin/HEAD) clean
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : This rank is running on host jpbo-012-11.jupiter.internal
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 81604378624 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent host allocations: SMALL 8 LARGE 2 HUGE 0
Grid : Message : MemoryManager::Init() cache pool for recent device allocations: SMALL 16 LARGE 8 Huge 0
Grid : Message : MemoryManager::Init() cache pool for recent shared allocations: SMALL 16 LARGE 8 Huge 0
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 0.834000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 0.838000 s : Testing with full communication
Grid : Message : 0.839000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 0.840000 s : Grid Layout
Grid : Message : 0.840000 s : Global lattice size : 64 64 64 64
Grid : Message : 0.846000 s : OpenMP threads : 4
Grid : Message : 0.846000 s : MPI tasks : 2 2 2 2
Grid : Message : 0.165970 s : Initialising 4d RNG
Grid : Message : 0.787270 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 0.787340 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 0.960410 s : Initialising 5d RNG
Grid : Message : 1.142344 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.142352 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
local rank 2 device 0 bus id: 0029:01:00.0
local rank 3 device 0 bus id: 0039:01:00.0
local rank 1 device 0 bus id: 0019:01:00.0
Grid : Message : 44.657270 s : Drawing gauge field
Grid : Message : 55.247733 s : Random gauge initialised
Grid : Message : 55.247745 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 55.247747 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 55.253053 s : Setting up Cshift based reference
Grid : Message : 62.191747 s : *****************************************************************
Grid : Message : 62.191767 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 62.191768 s : *****************************************************************
Grid : Message : 62.191769 s : *****************************************************************
Grid : Message : 62.191769 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 62.191769 s : * Vectorising space-time by 8
Grid : Message : 62.191770 s : * VComplex size is 64 B
Grid : Message : 62.191771 s : * Using Overlapped Comms/Compute
Grid : Message : 62.191771 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 62.191772 s : *****************************************************************
Grid : Message : 62.857568 s : Called warmup
Grid : Message : 65.581790 s : Called Dw 300 times in 2200540 us
Grid : Message : 65.582120 s : mflop/s = 48306525
Grid : Message : 65.582140 s : mflop/s per rank = 3019157.81
Grid : Message : 65.582150 s : mflop/s per node = 12076631.3
Grid : Message : 65.637550 s : norm diff 5.80156793e-14 Line 306
Grid : Message : 75.122153 s : ----------------------------------------------------------------
Grid : Message : 75.122166 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 75.122167 s : ----------------------------------------------------------------
Grid : Message : 75.122167 s : Called DwDag
Grid : Message : 75.122167 s : norm dag result 4.12801829
Grid : Message : 75.123295 s : norm dag ref 4.12801829
Grid : Message : 75.125890 s : norm dag diff 3.42093991e-14 Line 377
Grid : Message : 75.188462 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 75.605683 s : src_e0.500004005
Grid : Message : 75.617824 s : src_o0.499996067
Grid : Message : 75.620089 s : *********************************************************
Grid : Message : 75.620091 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 75.620093 s : * Vectorising space-time by 8
Grid : Message : 75.620094 s : * Using Overlapped Comms/Compute
Grid : Message : 75.620095 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 75.620096 s : *********************************************************
Grid : Message : 76.732272 s : Deo mflop/s = 48068252.4
Grid : Message : 76.732283 s : Deo mflop/s per rank 3004265.77
Grid : Message : 76.732285 s : Deo mflop/s per node 12017063.1
Grid : Message : 76.749317 s : r_e2.06443136
Grid : Message : 76.749652 s : r_o2.06378451
Grid : Message : 76.749955 s : res4.12821587
Grid : Message : 77.198827 s : norm diff 0
Grid : Message : 77.981760 s : norm diff even 0
Grid : Message : 78.455900 s : norm diff odd 0
Grid : Message : 78.539333 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 78.539337 s : Testing without internode communication
Grid : Message : 78.539338 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 78.539339 s : Grid Layout
Grid : Message : 78.539339 s : Global lattice size : 64 64 64 64
Grid : Message : 78.539347 s : OpenMP threads : 4
Grid : Message : 78.539348 s : MPI tasks : 2 2 2 2
Grid : Message : 78.798501 s : Initialising 4d RNG
Grid : Message : 78.862916 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 78.862925 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 78.879916 s : Initialising 5d RNG
Grid : Message : 79.941271 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 79.941280 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 124.586264 s : Drawing gauge field
Grid : Message : 135.338090 s : Random gauge initialised
Grid : Message : 135.338102 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 135.338103 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 135.341266 s : Setting up Cshift based reference
Grid : Message : 142.604280 s : *****************************************************************
Grid : Message : 142.604450 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 142.604460 s : *****************************************************************
Grid : Message : 142.604470 s : *****************************************************************
Grid : Message : 142.604480 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 142.604480 s : * Vectorising space-time by 8
Grid : Message : 142.604500 s : * VComplex size is 64 B
Grid : Message : 142.604510 s : * Using Overlapped Comms/Compute
Grid : Message : 142.604510 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 142.604520 s : *****************************************************************
Grid : Message : 142.686034 s : Called warmup
Grid : Message : 144.868543 s : Called Dw 300 times in 2182483 us
Grid : Message : 144.868559 s : mflop/s = 48706194.1
Grid : Message : 144.868561 s : mflop/s per rank = 3044137.13
Grid : Message : 144.868562 s : mflop/s per node = 12176548.5
Grid : Message : 144.887595 s : norm diff 5.80156793e-14 Line 306
Grid : Message : 153.622978 s : ----------------------------------------------------------------
Grid : Message : 153.622994 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 153.622995 s : ----------------------------------------------------------------
Grid : Message : 153.622995 s : Called DwDag
Grid : Message : 153.622996 s : norm dag result 4.12801829
Grid : Message : 153.623604 s : norm dag ref 4.12801829
Grid : Message : 153.626098 s : norm dag diff 3.42093991e-14 Line 377
Grid : Message : 153.691426 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 154.148319 s : src_e0.500004005
Grid : Message : 154.151454 s : src_o0.499996067
Grid : Message : 154.153722 s : *********************************************************
Grid : Message : 154.153724 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 154.153725 s : * Vectorising space-time by 8
Grid : Message : 154.153726 s : * Using Overlapped Comms/Compute
Grid : Message : 154.153727 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 154.153728 s : *********************************************************
Grid : Message : 155.200671 s : Deo mflop/s = 51121022.4
Grid : Message : 155.200682 s : Deo mflop/s per rank 3195063.9
Grid : Message : 155.200684 s : Deo mflop/s per node 12780255.6
Grid : Message : 155.217204 s : r_e2.06443136
Grid : Message : 155.217550 s : r_o2.06378451
Grid : Message : 155.217869 s : res4.12821587
Grid : Message : 155.673744 s : norm diff 0
Grid : Message : 156.463329 s : norm diff even 0
Grid : Message : 156.878866 s : norm diff odd 0
Grid : Message : 157.620761 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 157.620764 s : Testing without intranode communication
Grid : Message : 157.620765 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 157.620766 s : Grid Layout
Grid : Message : 157.620766 s : Global lattice size : 64 64 64 64
Grid : Message : 157.620773 s : OpenMP threads : 4
Grid : Message : 157.620774 s : MPI tasks : 2 2 2 2
Grid : Message : 157.671479 s : Initialising 4d RNG
Grid : Message : 157.738691 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 157.738698 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 157.755651 s : Initialising 5d RNG
Grid : Message : 158.848676 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 158.848685 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 202.465158 s : Drawing gauge field
Grid : Message : 213.214546 s : Random gauge initialised
Grid : Message : 213.214561 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 213.214563 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 213.217711 s : Setting up Cshift based reference
Grid : Message : 219.662772 s : *****************************************************************
Grid : Message : 219.662786 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 219.662787 s : *****************************************************************
Grid : Message : 219.662788 s : *****************************************************************
Grid : Message : 219.662788 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 219.662789 s : * Vectorising space-time by 8
Grid : Message : 219.662790 s : * VComplex size is 64 B
Grid : Message : 219.662791 s : * Using Overlapped Comms/Compute
Grid : Message : 219.662791 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 219.662791 s : *****************************************************************
Grid : Message : 220.425592 s : Called warmup
Grid : Message : 222.536249 s : Called Dw 300 times in 2110597 us
Grid : Message : 222.536267 s : mflop/s = 50365105.5
Grid : Message : 222.536269 s : mflop/s per rank = 3147819.09
Grid : Message : 222.536270 s : mflop/s per node = 12591276.4
Grid : Message : 222.541053 s : norm diff 5.80156793e-14 Line 306
Grid : Message : 232.135901 s : ----------------------------------------------------------------
Grid : Message : 232.135915 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 232.135916 s : ----------------------------------------------------------------
Grid : Message : 232.135917 s : Called DwDag
Grid : Message : 232.135918 s : norm dag result 4.12801829
Grid : Message : 232.151938 s : norm dag ref 4.12801829
Grid : Message : 232.154451 s : norm dag diff 3.42093991e-14 Line 377
Grid : Message : 232.216117 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 232.630529 s : src_e0.500004005
Grid : Message : 232.643197 s : src_o0.499996067
Grid : Message : 232.645527 s : *********************************************************
Grid : Message : 232.645529 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 232.645532 s : * Vectorising space-time by 8
Grid : Message : 232.645533 s : * Using Overlapped Comms/Compute
Grid : Message : 232.645534 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 232.645535 s : *********************************************************
Grid : Message : 233.774184 s : Deo mflop/s = 47432091.9
Grid : Message : 233.774194 s : Deo mflop/s per rank 2964505.74
Grid : Message : 233.774196 s : Deo mflop/s per node 11858023
Grid : Message : 233.791552 s : r_e2.06443136
Grid : Message : 233.791899 s : r_o2.06378451
Grid : Message : 233.792204 s : res4.12821587
Grid : Message : 234.230783 s : norm diff 0
Grid : Message : 235.162780 s : norm diff even 0
Grid : Message : 235.291950 s : norm diff odd 0
Grid : Message : 235.765411 s : *******************************************
Grid : Message : 235.765424 s : ******* Grid Finalize ******
Grid : Message : 235.765425 s : *******************************************

View File

@@ -1,57 +0,0 @@
#!/bin/sh
#SBATCH --account=jureap14
#SBATCH --nodes=1
#SBATCH --ntasks=4
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=64
#SBATCH --time=2:00:00
#SBATCH --partition=booster
#SBATCH --gres=gpu:4
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
OPT="--comms-overlap"
source ../sourceme.sh
cat << EOF > bind_gpu
#!/bin/bash
export GPU_MAP=(0 1 2 3)
export NUMA_MAP=(0 1 2 3)
export NIC_MAP=(0 1 2 3)
export GPU=\$SLURM_LOCALID
export NUMA=\$SLURM_LOCALID
export NIC=\$SLURM_LOCALID
export CUDA_VISIBLE_DEVICES=\$GPU
export UCX_NET_DEVICES=mlx5_\${NIC}:1
echo RANK \$SLURM_LOCALID using NUMA \$NUMA GPU \$GPU NIC \$UCX_NET_DEVICES
exec numactl -m \$NUMA -N \$NUMA \$*
EOF
chmod +x ./bind_gpu
srun --cpu-bind=no -N 1 -n $SLURM_NTASKS \
./bind_gpu ./Benchmark_dwf_fp32 \
$OPT \
--mpi 1.1.2.2 \
--accelerator-threads 8 \
--grid 32.32.64.64 \
--shm 2048 > dwf.1node.perf
srun --cpu-bind=no -N 1 -n $SLURM_NTASKS \
./bind_gpu ./Benchmark_comms_host_device \
--mpi 1.1.2.2 \
--accelerator-threads 8 \
--grid 32.32.64.64 \
--shm 2048 > comms.1node.perf

View File

@@ -1,57 +0,0 @@
#!/bin/sh
#SBATCH --account=jureap14
#SBATCH --nodes=4
#SBATCH --ntasks=16
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=64
#SBATCH --time=2:00:00
#SBATCH --partition=booster
#SBATCH --gres=gpu:4
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
OPT="--comms-overlap"
source ../sourceme.sh
cat << EOF > bind_gpu
#!/bin/bash
export GPU_MAP=(0 1 2 3)
export NUMA_MAP=(0 1 2 3)
export NIC_MAP=(0 1 2 3)
export GPU=\$SLURM_LOCALID
export NUMA=\$SLURM_LOCALID
export NIC=\$SLURM_LOCALID
export CUDA_VISIBLE_DEVICES=\$GPU
export UCX_NET_DEVICES=mlx5_\${NIC}:1
echo RANK \$SLURM_LOCALID using NUMA \$NUMA GPU \$GPU NIC \$UCX_NET_DEVICES
exec numactl -m \$NUMA -N \$NUMA \$*
EOF
chmod +x ./bind_gpu
srun --cpu-bind=no -N 4 -n $SLURM_NTASKS \
./bind_gpu ./Benchmark_dwf_fp32 \
$OPT \
--mpi 2.2.2.2 \
--accelerator-threads 8 \
--grid 64.64.64.64 \
--shm 2048 > dwf.4node.perf
srun --cpu-bind=no -N 4 -n $SLURM_NTASKS \
./bind_gpu ./Benchmark_comms_host_device \
--mpi 2.2.2.2 \
--accelerator-threads 8 \
--grid 32.32.64.64 \
--shm 2048 > comms.4node.perf

View File

@@ -1,16 +0,0 @@
export CXX=nvcc
export OPENMPI=/p/software/default/stages/2025/software/OpenMPI/5.0.5-NVHPC-24.9-CUDA-12/
export LDFLAGS="-cudart shared -L${OPENMPI}/lib"
export CXXFLAGS="-ccbin clang++ -gencode arch=compute_90,code=sm_90 -std=c++17 -cudart shared -lcublas -lmpi -I${OPENMPI}/include"
../../configure \
--enable-comms=mpi \
--enable-simd=GPU \
--enable-gen-simd-width=64 \
--enable-shm=nvlink \
--enable-accelerator=cuda \
--with-lime=$CLIME \
--disable-gparity \
--disable-fermion-reps \
--disable-unified

View File

@@ -1,10 +0,0 @@
CLIME=$HOME/install/
module load Clang
module load CUDA
module load FFTW
module load OpenSSL
module load MPFR
module load NVHPC
module load UCX
module load OpenMPI
ulimit -c 0

View File

@@ -7,6 +7,8 @@ CXX=mpicxx ../../configure \
--enable-unified=yes \ --enable-unified=yes \
--prefix /Users/peterboyle/QCD/vtk/Grid/install \ --prefix /Users/peterboyle/QCD/vtk/Grid/install \
--with-lime=$CLIME \ --with-lime=$CLIME \
--with-hdf5=$HDF5 \
--with-fftw=$FFTW \
--with-openssl=$OPENSSL \ --with-openssl=$OPENSSL \
--with-gmp=$GMP \ --with-gmp=$GMP \
--with-mpfr=$MPFR \ --with-mpfr=$MPFR \

View File

@@ -1,12 +1,3 @@
spack load c-lime
spack load fftw
spack load hdf5+cxx
export FFTW=`spack find --paths fftw | grep ^fftw | awk '{print $2}' `
export HDF5=`spack find --paths hdf5+cxx | grep ^hdf5 | awk '{print $2}' `
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
../../configure \ ../../configure \
--enable-comms=mpi-auto \ --enable-comms=mpi-auto \
--enable-unified=yes \ --enable-unified=yes \
@@ -14,16 +5,12 @@ export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
--enable-shm-fast-path=shmopen \ --enable-shm-fast-path=shmopen \
--enable-accelerator=none \ --enable-accelerator=none \
--enable-simd=AVX512 \ --enable-simd=AVX512 \
--with-lime=$CLIME \ --disable-accelerator-cshift \
--with-hdf5=$HDF5 \
--with-fftw=$FFTW \
--disable-fermion-reps \ --disable-fermion-reps \
--disable-gparity \ --disable-gparity \
CXX=clang++ \ CXX=clang++ \
MPICXX=mpicxx \ MPICXX=mpicxx \
LIBS=-llime \ CXXFLAGS="-std=c++17"
LDFLAGS=-L$CLIME/lib/ \
CXXFLAGS="-std=c++17 -fPIE"

View File

@@ -1,5 +1,4 @@
source $HOME/spack/share/spack/setup-env.sh source $HOME/spack/share/spack/setup-env.sh
spack load llvm@17.0.4 spack load llvm@17.0.4
export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-17.0.4-laufdrcip63ivkadmtgoepwmj3dtztdu/lib:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-17.0.4-laufdrcip63ivkadmtgoepwmj3dtztdu/lib:$LD_LIBRARY_PATH
module load openmpi/4.1.8 module load openmpi
spack load c-lime

View File

@@ -31,23 +31,16 @@ directory
using namespace std; using namespace std;
using namespace Grid; using namespace Grid;
;
//typedef WilsonFermionD FermionOp; #if 0
typedef DomainWallFermionD FermionOp; typedef DomainWallFermionD FermionOp;
typedef typename DomainWallFermionD::FermionField FermionField; typedef typename DomainWallFermionD::FermionField FermionField;
#else
template <class T> void writeFile(T& in, std::string const fname){ typedef MobiusFermionD FermionOp;
#ifdef HAVE_LIME typedef typename MobiusFermionD::FermionField FermionField;
// Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl;
Grid::emptyUserRecord record;
Grid::ScidacWriter WR(in.Grid()->IsBoss());
WR.open(fname);
WR.writeScidacFieldRecord(in,record,0);
WR.close();
#endif #endif
// What is the appropriate way to throw error?
}
RealD AllZero(RealD x) { return 0.; } RealD AllZero(RealD x) { return 0.; }
@@ -132,7 +125,7 @@ int main(int argc, char** argv) {
int Ls=16; int Ls=16;
RealD M5=1.8; RealD M5=1.8;
RealD mass = 0.01; RealD mass = -1.0;
mass=LanParams.mass; mass=LanParams.mass;
Ls=LanParams.Ls; Ls=LanParams.Ls;
@@ -170,21 +163,22 @@ int main(int argc, char** argv) {
U[mu] = PeekIndex<LorentzIndex>(Umu, mu); U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
} }
*/ */
int Nk = 20;
int Nstop = Nk;
int Np = 80;
int Nstop = 10;
int Nk = 20;
int Np = 80;
Nstop=LanParams.Nstop; Nstop=LanParams.Nstop;
Nk=LanParams.Nk; Nk=LanParams.Nk;
Np=LanParams.Np; Np=LanParams.Np;
int Nm = Nk + Np; int Nm = Nk + Np;
int MaxIt = 100; int MaxIt = 10000;
RealD resid = 1.0e-4; RealD resid = 1.0e-5;
RealD mob_b=1.5;
//while ( mass > - 5.0){ //while ( mass > - 5.0){
FermionOp Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); // FermionOp Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
FermionOp Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.);
MdagMLinearOperator<FermionOp,FermionField> HermOp(Ddwf); /// <----- MdagMLinearOperator<FermionOp,FermionField> HermOp(Ddwf); /// <-----
// Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <----- // Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
Gamma5R5HermitianLinearOperator<FermionOp, LatticeFermion> G5R5Herm(Ddwf); Gamma5R5HermitianLinearOperator<FermionOp, LatticeFermion> G5R5Herm(Ddwf);
@@ -212,12 +206,8 @@ int main(int argc, char** argv) {
int Nconv; int Nconv;
IRL.calc(eval, evec, src, Nconv); IRL.calc(eval, evec, src, Nconv);
std::cout << mass <<" : " << eval << std::endl; std::cout << mass <<" : " << eval << std::endl;
std::cout << " #evecs " << evec.size() << std::endl;
std::cout << " Nconv " << Nconv << std::endl;
std::cout << " Nm " << Nm << std::endl;
if ( Nconv > evec.size() ) Nconv = evec.size();
#if 0 #if 0
Gamma g5(Gamma::Algebra::Gamma5) ; Gamma g5(Gamma::Algebra::Gamma5) ;
ComplexD dot; ComplexD dot;
@@ -247,7 +237,6 @@ int main(int argc, char** argv) {
vector<LatticeFermion> finalevec(Nconv, FGrid); vector<LatticeFermion> finalevec(Nconv, FGrid);
vector<RealD> eMe(Nconv), eMMe(Nconv); vector<RealD> eMe(Nconv), eMMe(Nconv);
for(int i = 0; i < Nconv; i++){ for(int i = 0; i < Nconv; i++){
cout << "calculate the matrix element["<<i<<"]" << endl;
G5R5Herm.HermOpAndNorm(evec[i], G5R5Mevec[i], eMe[i], eMMe[i]); G5R5Herm.HermOpAndNorm(evec[i], G5R5Mevec[i], eMe[i], eMMe[i]);
} }
cout << "Re<evec, G5R5M(evec)>: " << endl; cout << "Re<evec, G5R5M(evec)>: " << endl;
@@ -314,7 +303,7 @@ int main(int argc, char** argv) {
} }
} }
} }
for(int i = 0; i < Nconv; i++){ for(int i = 0; i < Nconv; i++){
G5R5Herm.HermOpAndNorm(finalevec[i], G5R5Mevec[i], eMe[i], eMMe[i]); G5R5Herm.HermOpAndNorm(finalevec[i], G5R5Mevec[i], eMe[i], eMMe[i]);
} }
cout << "Re<evec, G5R5M(evec)>: " << endl; cout << "Re<evec, G5R5M(evec)>: " << endl;
@@ -322,7 +311,6 @@ int main(int argc, char** argv) {
cout << "<G5R5M(evec), G5R5M(evec)>" << endl; cout << "<G5R5M(evec), G5R5M(evec)>" << endl;
cout << eMMe << endl; cout << eMMe << endl;
// vector<LatticeFermion> finalevec(Nconv, FGrid); // vector<LatticeFermion> finalevec(Nconv, FGrid);
// temporary, until doing rotation // temporary, until doing rotation
@@ -343,41 +331,13 @@ int main(int argc, char** argv) {
axpby_ssp(G5evec[i], -1., finalevec[i], 0., G5evec[i], j, j); axpby_ssp(G5evec[i], -1., finalevec[i], 0., G5evec[i], j, j);
} }
} }
for(int i = 0; i < Nconv; i++){
Ddwf.M(finalevec[i], G5R5Mevec[i]);
for(int j = 0; j < Nconv; j++){
std::cout << "<"<<j<<"|Ddwf|"<<i<<"> = "<<innerProduct(finalevec[j],G5R5Mevec[i])<<std::endl;
}
}
for(int i = 0; i < Nconv; i++){
RealD t1,t2;
G5R5Herm.HermOpAndNorm(finalevec[i], G5R5Mevec[i], t1, t2);
for(int j = 0; j < Nconv; j++){
std::cout << "<"<<j<<"|G5R5 M|"<<i<<"> = "<<innerProduct(finalevec[j],G5R5Mevec[i])<<std::endl;
}
}
for(int i = 0; i < Nconv; i++){ for(int i = 0; i < Nconv; i++){
chiral_matrix_real[i].resize(Nconv); chiral_matrix_real[i].resize(Nconv);
chiral_matrix[i].resize(Nconv); chiral_matrix[i].resize(Nconv);
std::string evfile("./evec_density");
evfile = evfile+"_"+std::to_string(i);
auto evdensity = localInnerProduct(finalevec[i],finalevec[i] );
writeFile(evdensity,evfile);
for(int j = 0; j < Nconv; j++){ for(int j = 0; j < Nconv; j++){
chiral_matrix[i][j] = innerProduct(finalevec[i], G5evec[j]); chiral_matrix[i][j] = innerProduct(finalevec[i], G5evec[j]);
std::cout <<" chiral_matrix_real signed "<<i<<" "<<j<<" "<< chiral_matrix_real[i][j] << std::endl;
chiral_matrix_real[i][j] = abs(chiral_matrix[i][j]); chiral_matrix_real[i][j] = abs(chiral_matrix[i][j]);
std::cout <<" chiral_matrix_real "<<i<<" "<<j<<" "<< chiral_matrix_real[i][j] << std::endl; std::cout <<" chiral_matrix_real "<<i<<" "<<j<<" "<< chiral_matrix_real[i][j] << std::endl;
if ( chiral_matrix_real[i][j] > 0.8 ) {
auto g5density = localInnerProduct(finalevec[i], G5evec[j]);
std::string chfile("./chiral_density_");
chfile = chfile +std::to_string(i)+"_"+std::to_string(j);
writeFile(g5density,chfile);
}
} }
} }
for(int i = 0; i < Nconv; i++){ for(int i = 0; i < Nconv; i++){
@@ -386,43 +346,6 @@ int main(int argc, char** argv) {
} }
} }
FILE *fp = fopen("lego-plot.py","w"); assert(fp!=NULL);
#define PYTHON_LINE(A) fprintf(fp,A"\n");
PYTHON_LINE("import matplotlib.pyplot as plt");
PYTHON_LINE("import numpy as np");
PYTHON_LINE("");
PYTHON_LINE("fig = plt.figure()");
PYTHON_LINE("ax = fig.add_subplot(projection='3d')");
PYTHON_LINE("");
PYTHON_LINE("x, y = np.random.rand(2, 100) * 4");
fprintf(fp,"hist, xedges, yedges = np.histogram2d(x, y, bins=%d, range=[[0, %d], [0, %d]])\n",Nconv,Nconv-1,Nconv-1);
PYTHON_LINE("");
PYTHON_LINE("# Construct arrays for the anchor positions of the 16 bars");
PYTHON_LINE("xpos, ypos = np.meshgrid(xedges[:-1] + 0.25, yedges[:-1] + 0.25, indexing=\"ij\")");
PYTHON_LINE("xpos = xpos.ravel()");
PYTHON_LINE("ypos = ypos.ravel()");
PYTHON_LINE("zpos = 0");
PYTHON_LINE("");
PYTHON_LINE("# Construct arrays with the dimensions for the 16 bars.");
PYTHON_LINE("dx = dy = 0.5 * np.ones_like(zpos)");
PYTHON_LINE("dz = np.array([");
for(int i = 0; i < Nconv; i++){
fprintf(fp,"\t[ ");
for(int j = 0; j < Nconv; j++){
fprintf(fp,"%lf ",chiral_matrix_real[i][j]);
if(j<Nconv-1) fprintf(fp,",");
else fprintf(fp," ");
}
fprintf(fp,"]");
if(i<Nconv-1) fprintf(fp,",\n");
else fprintf(fp,"\n");
}
PYTHON_LINE("\t])");
PYTHON_LINE("dz = dz.ravel()");
PYTHON_LINE("ax.bar3d(xpos, ypos, zpos, dx, dy, dz, zsort='average')");
PYTHON_LINE("plt.show()");
fclose(fp);
Grid_finalize(); Grid_finalize();
} }

View File

@@ -113,6 +113,9 @@ struct LanczosParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters, GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
RealD, mass , RealD, mass ,
RealD, resid, RealD, resid,
Integer, Nstop,
Integer, Nk,
Integer, Np,
RealD, ChebyLow, RealD, ChebyLow,
RealD, ChebyHigh, RealD, ChebyHigh,
Integer, ChebyOrder) Integer, ChebyOrder)
@@ -204,7 +207,6 @@ int main(int argc, char** argv) {
int Nstop = 5; int Nstop = 5;
int Nk = 10; int Nk = 10;
int Np = 90; int Np = 90;
int Nm = Nk + Np;
int MaxIt = 10000; int MaxIt = 10000;
RealD resid = 1.0e-5; RealD resid = 1.0e-5;
@@ -226,10 +228,14 @@ int main(int argc, char** argv) {
XmlWriter HMCwr("LanParams.xml.out"); XmlWriter HMCwr("LanParams.xml.out");
write(HMCwr,"LanczosParameters",LanParams); write(HMCwr,"LanczosParameters",LanParams);
} }
Nstop=LanParams.Nstop;
Nk=LanParams.Nk;
Np=LanParams.Np;
mass=LanParams.mass; mass=LanParams.mass;
resid=LanParams.resid; resid=LanParams.resid;
int Nm = Nk + Np;
while ( mass > - 5.0){ while ( mass > - 5.0){
FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,2.+mass); FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,2.+mass);

View File

@@ -27,6 +27,7 @@ directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#include <Grid/Grid.h> #include <Grid/Grid.h>
#include <Grid/parallelIO/IldgIOtypes.h>
using namespace std; using namespace std;
using namespace Grid; using namespace Grid;
@@ -38,11 +39,28 @@ typedef typename WilsonFermionD::FermionField FermionField;
RealD AllZero(RealD x) { return 0.; } RealD AllZero(RealD x) { return 0.; }
template <class T> void writeFile(T& in, std::string const fname){
#if 1
// Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl;
Grid::emptyUserRecord record;
Grid::ScidacWriter WR(in.Grid()->IsBoss());
WR.open(fname);
WR.writeScidacFieldRecord(in,record,0);
WR.close();
#endif
// What is the appropriate way to throw error?
}
namespace Grid { namespace Grid {
struct LanczosParameters: Serializable { struct LanczosParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters, GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
RealD, mass , RealD, mass ,
Integer, Nstop,
Integer, Nk,
Integer, Np,
RealD, ChebyLow, RealD, ChebyLow,
RealD, ChebyHigh, RealD, ChebyHigh,
Integer, ChebyOrder) Integer, ChebyOrder)
@@ -158,6 +176,10 @@ int main(int argc, char** argv) {
} }
mass=LanParams.mass; mass=LanParams.mass;
Nstop=LanParams.Nstop;
Nk=LanParams.Nk;
Np=LanParams.Np;
Nm = Nk + Np;
while ( mass > - 5.0){ while ( mass > - 5.0){
@@ -202,6 +224,12 @@ while ( mass > - 5.0){
tmp = g5*evec[i]; tmp = g5*evec[i];
dot = innerProduct(tmp,evec[i]); dot = innerProduct(tmp,evec[i]);
std::cout << mass << " : " << eval[i] << " " << real(dot) << " " << imag(dot) << std::endl ; std::cout << mass << " : " << eval[i] << " " << real(dot) << " " << imag(dot) << std::endl ;
// if ( i<1)
{
std::string evfile ("./evec_"+std::to_string(mass)+"_"+std::to_string(i));
auto evdensity = localInnerProduct(evec[i],evec[i] );
writeFile(evdensity,evfile);
}
} }
src = evec[0]+evec[1]+evec[2]; src = evec[0]+evec[1]+evec[2];
mass += -0.1; mass += -0.1;

View File

@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.12 FATAL_ERROR)
project(GridViewer) project(GridViewer)
list(APPEND CMAKE_PREFIX_PATH "/home/paboyle/Visualisation/install/") list(APPEND CMAKE_PREFIX_PATH "/Users/peterboyle/QCD/vtk/VTK-9.4.2-install/")
find_package(VTK COMPONENTS find_package(VTK COMPONENTS
CommonColor CommonColor

View File

@@ -48,14 +48,15 @@ typedef vtkMarchingCubes isosurface;
int mpeg = 0 ; int mpeg = 0 ;
int xlate = 0 ; int xlate = 0 ;
int framerate = 10;
template <class T> void readFile(T& out, std::string const fname){ template <class T> void readFile(T& out, std::string const fname){
#ifdef HAVE_LIME
Grid::emptyUserRecord record; Grid::emptyUserRecord record;
Grid::ScidacReader RD; Grid::ScidacReader RD;
RD.open(fname); RD.open(fname);
RD.readScidacFieldRecord(out,record); RD.readScidacFieldRecord(out,record);
RD.close(); RD.close();
#endif
} }
using namespace Grid; using namespace Grid;
@@ -207,10 +208,6 @@ int main(int argc, char* argv[])
xlate = 1; xlate = 1;
} }
if( GridCmdOptionExists(argv,argv+argc,"--fps") ){
arg=GridCmdOptionPayload(argv,argv+argc,"--fps");
GridCmdOptionInt(arg,framerate);
}
if( GridCmdOptionExists(argv,argv+argc,"--isosurface") ){ if( GridCmdOptionExists(argv,argv+argc,"--isosurface") ){
arg=GridCmdOptionPayload(argv,argv+argc,"--isosurface"); arg=GridCmdOptionPayload(argv,argv+argc,"--isosurface");
GridCmdOptionFloat(arg,default_contour); GridCmdOptionFloat(arg,default_contour);
@@ -423,7 +420,7 @@ int main(int argc, char* argv[])
vtkFFMPEGWriter *writer = vtkFFMPEGWriter::New(); vtkFFMPEGWriter *writer = vtkFFMPEGWriter::New();
writer->SetFileName("movie.avi"); writer->SetFileName("movie.avi");
writer->SetRate(framerate); writer->SetRate(1);
writer->SetInputConnection(imageFilter->GetOutputPort()); writer->SetInputConnection(imageFilter->GetOutputPort());
writer->Start(); writer->Start();
@@ -480,7 +477,7 @@ int main(int argc, char* argv[])
slidercallback->fu_list = fu_list; slidercallback->fu_list = fu_list;
sliderWidget->AddObserver(vtkCommand::InteractionEvent, slidercallback); sliderWidget->AddObserver(vtkCommand::InteractionEvent, slidercallback);
int timerId = iren->CreateRepeatingTimer(1000/framerate); int timerId = iren->CreateRepeatingTimer(300);
std::cout << "timerId: " << timerId << std::endl; std::cout << "timerId: " << timerId << std::endl;
// Start the interaction and timer // Start the interaction and timer

View File

@@ -73,21 +73,6 @@ each to:
VTK really should make it easier to pick up the flags required for FFMPEG linkage, especially as they are very quirky on MacOS. VTK really should make it easier to pick up the flags required for FFMPEG linkage, especially as they are very quirky on MacOS.
========================================
Aurora compilation:
========================================
module load ffmpeg
download & untar: VTK-7.0.2
mkdir build
cd build
ccmake ../
"t"  (toggle advanced mode)
Enable: VTK_MODULE_ENABLE_VTK_IOFFMPEG YES
"configure" ; it should discover the installed ffmpeg module
An "X" connection is still needed to make the MPEG files.
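For reference, a non-interactive sketch of the same steps, assuming the ccmake entry above can be set as a cmake cache variable on the command line; the tarball name, source layout and make parallelism are illustrative only:

module load ffmpeg
tar xf VTK-7.0.2.tar.gz && cd VTK-7.0.2    # untar the downloaded source
mkdir build && cd build
# enable the FFMPEG I/O module without the interactive "t"/"configure" steps
cmake ../ -DVTK_MODULE_ENABLE_VTK_IOFFMPEG=YES
make -j 8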
======================================== ========================================
Grid: Grid:
@@ -125,29 +110,4 @@ Extensions
8) Example python code: FieldDensity.py . This is not interfaced to Grid. 8) Example python code: FieldDensity.py . This is not interfaced to Grid.
================
Windowless generation of AVI files: must enable offscreen rendering. From Shuhei Yamamoto:
================
Hi Peter,
To make visualization work on Frontier, I did the following.
For headless off-screen rendering, the ccmake entries shown below (visible in advanced mode) are set as indicated:
VTK_OPENGL_HAS_* off
VTK_USE_X off
VTK_DEFAULT_RENDER_WINDOW_OFFSCREEN on
VTK_DEFAULT_RENDER_WINDOW_HEADLESS on
The list may include more settings than are strictly necessary.
VTK can fall back to EGL or OSMesa at runtime, so I installed mesa via spack (as well as nasm and yasm). Either the mesa or the meson package requires llvm-config, which is included in ROCm releases after 6.1; on Frontier I used /opt/rocm-6.2.4. The only problem is that llvm-config is located in /opt/rocm-6.2.4/llvm/bin instead of /opt/rocm-6.2.4/bin, so I edited spack's packages.yaml so that the prefix for the rocm compiler is /opt/rocm-6.2.4/llvm. Just in case, I also changed c and cxx to /opt/rocm-6.2.4/llvm/bin/amdclang and amdclang++ respectively, but this change might not be necessary.
After installation I added the path to libOSMesa.so to LD_LIBRARY_PATH; there may be a better way, such as specifying an -rpath for the OSMesa library by editing the cmake files.
In addition, I edited VTK's CMakeLists.txt to force VTK to find the OSMesa package via find_package(OSMesa REQUIRED) after list(INSERT CMAKE_MODULE_PATH 0 "${vtk_cmake_dir}"), since a Find module for it is provided in vtk/CMake. There will be a more elegant method, but I was not able to find a ccmake entry to switch on OSMesa.
When I compiled VTK and linked it against the Grid visualization code with the ffmpeg option enabled, it produced an AVI file.
Best,
Shuhei
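A possible non-interactive rendering of the settings above, assuming the listed ccmake entries accept the usual -D cache syntax and that mesa, nasm and yasm are the spack package names used in the message; the source and library paths are placeholders, and OSMesa discovery itself still needs the CMakeLists.txt edit described above:

spack install mesa nasm yasm               # OSMesa runtime fallback plus assembler dependencies
mkdir build && cd build
cmake ../                                  \
  -DVTK_USE_X=OFF                          \
  -DVTK_DEFAULT_RENDER_WINDOW_OFFSCREEN=ON \
  -DVTK_DEFAULT_RENDER_WINDOW_HEADLESS=ON  \
  -DVTK_MODULE_ENABLE_VTK_IOFFMPEG=YES
# the VTK_OPENGL_HAS_* entries are likewise set to OFF, as in the list above
export LD_LIBRARY_PATH=/path/to/spack/mesa/lib:$LD_LIBRARY_PATH   # so libOSMesa.so is found at runtime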

View File

@@ -1,17 +1,9 @@
export grid_config=/home/paboyle/GPT/install/bin/grid-config libs=`grid-config --libs`
libs=`$grid_config --libs` ldflags=`grid-config --ldflags`
ldflags=`$grid_config --ldflags` cxxflags=`grid-config --cxxflags`
cxxflags=`$grid_config --cxxflags` cxx=`grid-config --cxx`
cxx=`$grid_config --cxx`
cc=icx
mkdir build mkdir build
cd build cd build
echo CC $cc LDFLAGS="$ldflags $libs " cmake .. -DCMAKE_CXX_COMPILER=$cxx -DCMAKE_CXX_FLAGS=$cxxflags
echo CXX $cxx
echo CXXFLAGS $cxxflags
echo LDFLAGS $ldflags
echo LIBS $libs
LDFLAGS="$ldflags $libs " cmake .. -DCMAKE_C_COMPILER=$cc -DCMAKE_CXX_COMPILER="$cxx" -DCMAKE_CXX_FLAGS="$cxxflags "