Large reg file for double

ifdef guard omitted
Accelerator-aware MPI guard on the Unix domain sockets
2025-06-20 08:46:55 +01:00 · 2024-03-26 17:03:20 +00:00 · 2024-03-26 14:50:32 +00:00 · 2024-03-26 14:41:25 +00:00 · 2024-03-26 14:14:06 +00:00 · 2024-03-26 14:11:40 +00:00
48 changed files with 1497 additions and 383 deletions
--- a/Grid/Grid_Eigen_Dense.h
+++ b/Grid/Grid_Eigen_Dense.h
@ -34,7 +34,7 @@
 #pragma push_macro("__SYCL_DEVICE_ONLY__")
 #undef __SYCL_DEVICE_ONLY__
 #define EIGEN_DONT_VECTORIZE
-//#undef EIGEN_USE_SYCL
+#undef EIGEN_USE_SYCL
 #define __SYCL__REDEFINE__
 #endif
--- a/Grid/algorithms/FFT.h
+++ b/Grid/algorithms/FFT.h
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #define _GRID_FFT_H_
 #ifdef HAVE_FFTW
-#ifdef USE_MKL
+#if defined(USE_MKL) || defined(GRID_SYCL)
 #include <fftw/fftw3.h>
 #else
 #include <fftw3.h>
--- a/Grid/algorithms/approx/Zolotarev.cc
+++ b/Grid/algorithms/approx/Zolotarev.cc
@ -293,7 +293,7 @@ static void sncndnFK(INTERNAL_PRECISION u, INTERNAL_PRECISION k,
 * Set type = 0 for the Zolotarev approximation, which is zero at x = 0, and
 * type = 1 for the approximation which is infinite at x = 0. */
-zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
+zolotarev_data* zolotarev(ZOLO_PRECISION epsilon, int n, int type) {
  INTERNAL_PRECISION A, c, cp, kp, ksq, sn, cn, dn, Kp, Kj, z, z0, t, M, F,
    l, invlambda, xi, xisq, *tv, s, opl;
  int m, czero, ts;
@ -375,12 +375,12 @@ zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
  construct_partfrac(d);
  construct_contfrac(d);
-  /* Converting everything to PRECISION for external use only */
+  /* Converting everything to ZOLO_PRECISION for external use only */
  zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
-  zd -> A = (PRECISION) d -> A;
+  zd -> A = (ZOLO_PRECISION) d -> A;
-  zd -> Delta = (PRECISION) d -> Delta;
+  zd -> Delta = (ZOLO_PRECISION) d -> Delta;
-  zd -> epsilon = (PRECISION) d -> epsilon;
+  zd -> epsilon = (ZOLO_PRECISION) d -> epsilon;
  zd -> n = d -> n;
  zd -> type = d -> type;
  zd -> dn = d -> dn;
@ -390,24 +390,24 @@ zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
  zd -> deg_num = d -> deg_num;
  zd -> deg_denom = d -> deg_denom;
-  zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
+  zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION));
-  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
+  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m];
  free(d -> a);
-  zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
+  zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION));
-  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
+  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m];
  free(d -> ap);
-  zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
+  zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION));
-  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
+  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m];
  free(d -> alpha);
-  zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
+  zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION));
-  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
+  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m];
  free(d -> beta);
-  zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
+  zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION));
-  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
+  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m];
  free(d -> gamma);
  free(d);
@ -426,7 +426,7 @@ void zolotarev_free(zolotarev_data *zdata)
 }
-zolotarev_data* higham(PRECISION epsilon, int n) {
+zolotarev_data* higham(ZOLO_PRECISION epsilon, int n) {
  INTERNAL_PRECISION A, M, c, cp, z, z0, t, epssq;
  int m, czero;
  zolotarev_data *zd;
@ -481,9 +481,9 @@ zolotarev_data* higham(PRECISION epsilon, int n) {
  /* Converting everything to PRECISION for external use only */
  zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
-  zd -> A = (PRECISION) d -> A;
+  zd -> A = (ZOLO_PRECISION) d -> A;
-  zd -> Delta = (PRECISION) d -> Delta;
+  zd -> Delta = (ZOLO_PRECISION) d -> Delta;
-  zd -> epsilon = (PRECISION) d -> epsilon;
+  zd -> epsilon = (ZOLO_PRECISION) d -> epsilon;
  zd -> n = d -> n;
  zd -> type = d -> type;
  zd -> dn = d -> dn;
@ -493,24 +493,24 @@ zolotarev_data* higham(PRECISION epsilon, int n) {
  zd -> deg_num = d -> deg_num;
  zd -> deg_denom = d -> deg_denom;
-  zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
+  zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION));
-  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
+  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m];
  free(d -> a);
-  zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
+  zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION));
-  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
+  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m];
  free(d -> ap);
-  zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
+  zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION));
-  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
+  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m];
  free(d -> alpha);
-  zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
+  zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION));
-  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
+  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m];
  free(d -> beta);
-  zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
+  zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION));
-  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
+  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m];
  free(d -> gamma);
  free(d);
@ -523,17 +523,17 @@ NAMESPACE_END(Grid);
 #ifdef TEST
 #undef ZERO
-#define ZERO ((PRECISION) 0)
+#define ZERO ((ZOLO_PRECISION) 0)
 #undef ONE
-#define ONE ((PRECISION) 1)
+#define ONE ((ZOLO_PRECISION) 1)
 #undef TWO
-#define TWO ((PRECISION) 2)
+#define TWO ((ZOLO_PRECISION) 2)
 /* Evaluate the rational approximation R(x) using the factored form */
-static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) {
+static ZOLO_PRECISION zolotarev_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
  int m;
-  PRECISION R;
+  ZOLO_PRECISION R;
  if (rdata -> type == 0) {
    R = rdata -> A * x;
@ -551,9 +551,9 @@ static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) {
 /* Evaluate the rational approximation R(x) using the partial fraction form */
-static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) {
+static ZOLO_PRECISION zolotarev_partfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
  int m;
-  PRECISION R = rdata -> alpha[rdata -> da - 1];
+  ZOLO_PRECISION R = rdata -> alpha[rdata -> da - 1];
  for (m = 0; m < rdata -> dd; m++)
    R += rdata -> alpha[m] / (x * x - rdata -> ap[m]);
  if (rdata -> type == 1) R += rdata -> alpha[rdata -> dd] / (x * x);
@ -568,18 +568,18 @@ static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) {
 * non-signalling overflow this will work correctly since 1/(1/0) = 1/INF = 0,
 * but with signalling overflow you will get an error message. */
-static PRECISION zolotarev_contfrac_eval(PRECISION x, zolotarev_data* rdata) {
+static ZOLO_PRECISION zolotarev_contfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
  int m;
-  PRECISION R = rdata -> beta[0] * x;
+  ZOLO_PRECISION R = rdata -> beta[0] * x;
  for (m = 1; m < rdata -> db; m++) R = rdata -> beta[m] * x + ONE / R;
  return R;
 }    
 /* Evaluate the rational approximation R(x) using Cayley form */
-static PRECISION zolotarev_cayley_eval(PRECISION x, zolotarev_data* rdata) {
+static ZOLO_PRECISION zolotarev_cayley_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
  int m;
-  PRECISION T;
+  ZOLO_PRECISION T;
  T = rdata -> type == 0 ? ONE : -ONE;
  for (m = 0; m < rdata -> n; m++)
@ -607,7 +607,7 @@ int main(int argc, char** argv) {
  int m, n, plotpts = 5000, type = 0;
  float eps, x, ypferr, ycferr, ycaylerr, maxypferr, maxycferr, maxycaylerr;
  zolotarev_data *rdata;
-  PRECISION y;
+  ZOLO_PRECISION y;
  FILE *plot_function, *plot_error, 
    *plot_partfrac, *plot_contfrac, *plot_cayley;
@ -626,13 +626,13 @@ int main(int argc, char** argv) {
  }
  rdata = type == 2 
-    ? higham((PRECISION) eps, n) 
+    ? higham((ZOLO_PRECISION) eps, n) 
-    : zolotarev((PRECISION) eps, n, type);
+    : zolotarev((ZOLO_PRECISION) eps, n, type);
  printf("Zolotarev Test: R(epsilon = %g, n = %d, type = %d)\n\t" 
 	 STRINGIFY(VERSION) "\n\t" STRINGIFY(HVERSION)
 	 "\n\tINTERNAL_PRECISION = " STRINGIFY(INTERNAL_PRECISION)
-	 "\tPRECISION = " STRINGIFY(PRECISION)
+	 "\tZOLO_PRECISION = " STRINGIFY(ZOLO_PRECISION)
 	 "\n\n\tRational approximation of degree (%d,%d), %s at x = 0\n"
 	 "\tDelta = %g (maximum error)\n\n"
 	 "\tA = %g (overall factor)\n",
@ -681,15 +681,15 @@ int main(int argc, char** argv) {
    x = 2.4 * (float) m / plotpts - 1.2;
    if (rdata -> type == 0 || fabs(x) * (float) plotpts > 1.0) {
      /* skip x = 0 for type 1, as R(0) is singular */
-      y = zolotarev_eval((PRECISION) x, rdata);
+      y = zolotarev_eval((ZOLO_PRECISION) x, rdata);
      fprintf(plot_function, "%g %g\n", x, (float) y);
      fprintf(plot_error, "%g %g\n",
 	      x, (float)((y - ((x > 0.0 ? ONE : -ONE))) / rdata -> Delta));
-      ypferr = (float)((zolotarev_partfrac_eval((PRECISION) x, rdata) - y)
+      ypferr = (float)((zolotarev_partfrac_eval((ZOLO_PRECISION) x, rdata) - y)
 		       / rdata -> Delta);
-      ycferr = (float)((zolotarev_contfrac_eval((PRECISION) x, rdata) - y)
+      ycferr = (float)((zolotarev_contfrac_eval((ZOLO_PRECISION) x, rdata) - y)
 		       / rdata -> Delta);
-      ycaylerr = (float)((zolotarev_cayley_eval((PRECISION) x, rdata) - y)
+      ycaylerr = (float)((zolotarev_cayley_eval((ZOLO_PRECISION) x, rdata) - y)
 		       / rdata -> Delta);
      if (fabs(x) < 1.0 && fabs(x) > rdata -> epsilon) {
 	maxypferr = MAX(maxypferr, fabs(ypferr));
--- a/Grid/algorithms/approx/Zolotarev.h
+++ b/Grid/algorithms/approx/Zolotarev.h
@ -9,10 +9,10 @@ NAMESPACE_BEGIN(Approx);
 #define HVERSION Header Time-stamp: <14-OCT-2004 09:26:51.00 adk@MISSCONTRARY>
 #ifndef ZOLOTAREV_INTERNAL
-#ifndef PRECISION
+#ifndef ZOLO_PRECISION
-#define PRECISION double
+#define ZOLO_PRECISION double
 #endif
-#define ZPRECISION PRECISION
+#define ZPRECISION ZOLO_PRECISION
 #define ZOLOTAREV_DATA zolotarev_data
 #endif
@ -77,8 +77,8 @@ typedef struct {
 * zolotarev_data structure. The arguments must satisfy the constraints that
 * epsilon > 0, n > 0, and type = 0 or 1. */
-ZOLOTAREV_DATA* higham(PRECISION epsilon, int n) ;
+ZOLOTAREV_DATA* higham(ZOLO_PRECISION epsilon, int n) ;
-ZOLOTAREV_DATA* zolotarev(PRECISION epsilon, int n, int type);
+ZOLOTAREV_DATA* zolotarev(ZOLO_PRECISION epsilon, int n, int type);
 void zolotarev_free(zolotarev_data *zdata);
 #endif
@ -86,3 +86,4 @@ void zolotarev_free(zolotarev_data *zdata);
 NAMESPACE_END(Approx);
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/algorithms/blas/BatchedBlas.cc
+++ b/Grid/algorithms/blas/BatchedBlas.cc
@ -0,0 +1,34 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: BatchedBlas.h
    Copyright (C) 2023
 Author: Peter Boyle <pboyle@bnl.gov>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/GridCore.h>
 #include <Grid/algorithms/blas/BatchedBlas.h>
 NAMESPACE_BEGIN(Grid);
 gridblasHandle_t GridBLAS::gridblasHandle;
 int              GridBLAS::gridblasInit;
 NAMESPACE_END(Grid);
--- a/Grid/algorithms/blas/BatchedBlas.h
+++ b/Grid/algorithms/blas/BatchedBlas.h
@ -31,12 +31,17 @@ Author: Peter Boyle <pboyle@bnl.gov>
 #include <hipblas/hipblas.h>
 #endif
 #ifdef GRID_CUDA
-#include <hipblas/hipblas.h>
+#include <cublas_v2.h>
 #endif
 #ifdef GRID_SYCL
-#error // need oneMKL version
+#include <oneapi/mkl.hpp>
 #endif
 #if 0
 #define GRID_ONE_MKL
 #endif
 #ifdef GRID_ONE_MKL
 #include <oneapi/mkl.hpp>
 #endif
 ///////////////////////////////////////////////////////////////////////	  
 // Need to rearrange lattice data to be in the right format for a
 // batched multiply. Might as well make these static, dense packed
@ -46,12 +51,15 @@ NAMESPACE_BEGIN(Grid);
  typedef hipblasHandle_t gridblasHandle_t;
 #endif
 #ifdef GRID_CUDA
-  typedef cudablasHandle_t gridblasHandle_t;
+  typedef cublasHandle_t gridblasHandle_t;
 #endif
 #ifdef GRID_SYCL
-  typedef int32_t gridblasHandle_t;
+  typedef cl::sycl::queue *gridblasHandle_t;
 #endif
-#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+#ifdef GRID_ONE_MKL
  typedef cl::sycl::queue *gridblasHandle_t;
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
  typedef int32_t gridblasHandle_t;
 #endif
@ -70,12 +78,19 @@ public:
 #ifdef GRID_CUDA
      std::cout << "cublasCreate"<<std::endl;
      cublasCreate(&gridblasHandle);
      cublasSetPointerMode(gridblasHandle, CUBLAS_POINTER_MODE_DEVICE);
 #endif
 #ifdef GRID_HIP
      std::cout << "hipblasCreate"<<std::endl;
      hipblasCreate(&gridblasHandle);
 #endif
 #ifdef GRID_SYCL
      gridblasHandle = theGridAccelerator;
 #endif
 #ifdef GRID_ONE_MKL
      cl::sycl::cpu_selector selector;
      cl::sycl::device selectedDevice { selector };
      gridblasHandle =new sycl::queue (selectedDevice);
 #endif
      gridblasInit=1;
    }
@ -110,6 +125,9 @@ public:
 #endif
 #ifdef GRID_SYCL
    accelerator_barrier();
 #endif
 #ifdef GRID_ONE_MKL
    gridblasHandle->wait();
 #endif
  }
@ -615,9 +633,10 @@ public:
    deviceVector<ComplexD> beta_p(1);
    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
-    std::cout << "blasZgemmStridedBatched mnk  "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
+
-    std::cout << "blasZgemmStridedBatched ld   "<<lda<<","<<ldb<<","<<ldc<<std::endl;
+    //    std::cout << "blasZgemmStridedBatched mnk  "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
-    std::cout << "blasZgemmStridedBatched sd   "<<sda<<","<<sdb<<","<<sdc<<std::endl;
+    //    std::cout << "blasZgemmStridedBatched ld   "<<lda<<","<<ldb<<","<<ldc<<std::endl;
    //    std::cout << "blasZgemmStridedBatched sd   "<<sda<<","<<sdb<<","<<sdc<<std::endl;
 #ifdef GRID_HIP
    auto err = hipblasZgemmStridedBatched(gridblasHandle,
 					  HIPBLAS_OP_N,
@ -643,10 +662,19 @@ public:
 			      (cuDoubleComplex *) Cmn, ldc, sdc,
 			      batchCount);
 #endif
-#ifdef GRID_SYCL
+#if defined(GRID_SYCL) || defined(GRID_ONE_MKL)
-     #warning "oneMKL implementation not made "
+    oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
 						oneapi::mkl::transpose::N,
 						oneapi::mkl::transpose::N,
 						m,n,k,
 						alpha,
 						(const ComplexD *)Amk,lda,sda,
 						(const ComplexD *)Bkn,ldb,sdb,
 						beta,
 						(ComplexD *)Cmn,ldc,sdc,
 						batchCount);
 #endif
-#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
     // Need a default/reference implementation
     for (int p = 0; p < batchCount; ++p) {
       for (int mm = 0; mm < m; ++mm) {
@ -672,21 +700,23 @@ public:
    ComplexD alpha(1.0);
    ComplexD beta (1.0);
    RealD flops = 8.0*M*N*K*BATCH;
-    for(int i=0;i<10;i++){
+    int ncall=10;
-      RealD t0 = usecond();
+    RealD t0 = usecond();
-	gemmStridedBatched(M,N,K,
+    for(int i=0;i<ncall;i++){
-			   alpha,
+      gemmStridedBatched(M,N,K,
-			   &A[0], // m x k 
+			 alpha,
-			   &B[0], // k x n
+			 &A[0], // m x k 
-			   beta, 
+			 &B[0], // k x n
-			   &C[0], // m x n
+			 beta, 
-			   BATCH);
+			 &C[0], // m x n
-      synchronise();
+			 BATCH);
      RealD t1 = usecond();
      RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K)*BATCH;
      flops = flops/(t1-t0)/1.e3;
    }
-    return flops;
+    synchronise();
    RealD t1 = usecond();
    RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K)*BATCH;
    flops = 8.0*M*N*K*BATCH*ncall;
    flops = flops/(t1-t0)/1.e3;
    return flops; // Returns gigaflops
  }
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@ -348,6 +348,7 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
  return offbytes;
 }
 #undef NVLINK_GET // Define to use get instead of put DMA
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
 							 int dest,int dox,
@ -380,9 +381,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
      list.push_back(rrq);
      off_node_bytes+=rbytes;
    }
 #ifdef NVLINK_GET
      void *shm = (void *) this->ShmBufferTranslate(from,xmit);
      assert(shm!=NULL);
      acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
 #endif
  }
  if (dox) {
    //  rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
      tag= dir+_processor*32;
      ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
@ -390,9 +397,12 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
      list.push_back(xrq);
      off_node_bytes+=xbytes;
    } else {
 #ifndef NVLINK_GET
      void *shm = (void *) this->ShmBufferTranslate(dest,recv);
      assert(shm!=NULL);
      acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
 #endif
    }
  }
@ -402,6 +412,8 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
 {
  int nreq=list.size();
  acceleratorCopySynchronise();
  if (nreq==0) return;
  std::vector<MPI_Status> status(nreq);
--- a/Grid/communicator/SharedMemory.cc
+++ b/Grid/communicator/SharedMemory.cc
@ -40,6 +40,9 @@ int                 GlobalSharedMemory::_ShmAlloc;
 uint64_t            GlobalSharedMemory::_ShmAllocBytes;
 std::vector<void *> GlobalSharedMemory::WorldShmCommBufs;
 #ifndef ACCELERATOR_AWARE_MPI
 void * GlobalSharedMemory::HostCommBuf;
 #endif
 Grid_MPI_Comm       GlobalSharedMemory::WorldShmComm;
 int                 GlobalSharedMemory::WorldShmRank;
@ -66,6 +69,26 @@ void GlobalSharedMemory::SharedMemoryFree(void)
 /////////////////////////////////
 // Alloc, free shmem region
 /////////////////////////////////
 #ifndef ACCELERATOR_AWARE_MPI
 void *SharedMemory::HostBufferMalloc(size_t bytes){
  void *ptr = (void *)host_heap_top;
  host_heap_top  += bytes;
  host_heap_bytes+= bytes;
  if (host_heap_bytes >= host_heap_size) {
    std::cout<< " HostBufferMalloc exceeded heap size -- try increasing with --shm <MB> flag" <<std::endl;
    std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
    std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl;
    std::cout<< " Current bytes is " << (host_heap_bytes/(1024*1024)) <<"MB"<<std::endl;
    std::cout<< " Current heap  is " << (host_heap_size/(1024*1024)) <<"MB"<<std::endl;
    assert(host_heap_bytes<host_heap_size);
  }
  return ptr;
 }
 void SharedMemory::HostBufferFreeAll(void) { 
  host_heap_top  =(size_t)HostCommBuf;
  host_heap_bytes=0;
 }
 #endif
 void *SharedMemory::ShmBufferMalloc(size_t bytes){
  //  bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes
  void *ptr = (void *)heap_top;
--- a/Grid/communicator/SharedMemory.h
+++ b/Grid/communicator/SharedMemory.h
@ -75,7 +75,9 @@ public:
  static int           Hugepages;
  static std::vector<void *> WorldShmCommBufs;
-
+#ifndef ACCELERATOR_AWARE_MPI
  static void *HostCommBuf;
 #endif
  static Grid_MPI_Comm WorldComm;
  static int           WorldRank;
  static int           WorldSize;
@ -120,6 +122,13 @@ private:
  size_t heap_bytes;
  size_t heap_size;
 #ifndef ACCELERATOR_AWARE_MPI
  size_t host_heap_top;  // set in free all
  size_t host_heap_bytes;// set in free all
  void *HostCommBuf;     // set in SetCommunicator
  size_t host_heap_size; // set in SetCommunicator
 #endif
 protected:
  Grid_MPI_Comm    ShmComm; // for barriers
@ -151,7 +160,10 @@ public:
  void *ShmBufferTranslate(int rank,void * local_p);
  void *ShmBufferMalloc(size_t bytes);
  void  ShmBufferFreeAll(void) ;
-  
+#ifndef ACCELERATOR_AWARE_MPI
  void *HostBufferMalloc(size_t bytes);
  void HostBufferFreeAll(void);
 #endif  
  //////////////////////////////////////////////////////////////////////////
  // Make info on Nodes & ranks and Shared memory available
  //////////////////////////////////////////////////////////////////////////
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@ -39,9 +39,11 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #include <hip/hip_runtime_api.h>
 #endif
 #ifdef GRID_SYCL
 #ifdef ACCELERATOR_AWARE_MPI
 #define GRID_SYCL_LEVEL_ZERO_IPC
 #define SHM_SOCKETS
 #endif 
 #include <syscall.h>
 #define SHM_SOCKETS 
 #endif
 #include <sys/socket.h>
@ -512,46 +514,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 // Hugetlbfs mapping intended
 ////////////////////////////////////////////////////////////////////////////////////////////
 #if defined(GRID_CUDA) ||defined(GRID_HIP)  || defined(GRID_SYCL)
 //if defined(GRID_SYCL)
 #if 0
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
  void * ShmCommBuf ; 
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  // allocate the pointer array for shared windows for our group
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  MPI_Barrier(WorldShmComm);
  WorldShmCommBufs.resize(WorldShmSize);
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Each MPI rank should allocate our own buffer
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  ShmCommBuf = acceleratorAllocDevice(bytes);
  if (ShmCommBuf == (void *)NULL ) {
    std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
    exit(EXIT_FAILURE);  
  }
  std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes 
 	    << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
  SharedMemoryZero(ShmCommBuf,bytes);
  assert(WorldShmSize == 1);
  for(int r=0;r<WorldShmSize;r++){
    WorldShmCommBufs[r] = ShmCommBuf;
  }
  _ShmAllocBytes=bytes;
  _ShmAlloc=1;
 }
 #endif
 #if defined(GRID_CUDA) ||defined(GRID_HIP) ||defined(GRID_SYCL)  
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
  void * ShmCommBuf ; 
@ -574,6 +536,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Each MPI rank should allocate our own buffer
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 #ifndef ACCELERATOR_AWARE_MPI
  HostCommBuf= malloc(bytes);
 #endif  
  ShmCommBuf = acceleratorAllocDevice(bytes);
  if (ShmCommBuf == (void *)NULL ) {
    std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
@ -738,7 +703,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
  _ShmAllocBytes=bytes;
  _ShmAlloc=1;
 }
 #endif
 #else 
 #ifdef GRID_MPI3_SHMMMAP
@ -962,6 +926,12 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
  }
  ShmBufferFreeAll();
 #ifndef ACCELERATOR_AWARE_MPI
  host_heap_size = heap_size;
  HostCommBuf= GlobalSharedMemory::HostCommBuf;
  HostBufferFreeAll();
 #endif  
  /////////////////////////////////////////////////////////////////////
  // find comm ranks in our SHM group (i.e. which ranks are on our node)
  /////////////////////////////////////////////////////////////////////
--- a/Grid/lattice/Lattice.h
+++ b/Grid/lattice/Lattice.h
@ -35,6 +35,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_transpose.h>
 #include <Grid/lattice/Lattice_local.h>
 #include <Grid/lattice/Lattice_reduction.h>
 #include <Grid/lattice/Lattice_crc.h>
 #include <Grid/lattice/Lattice_peekpoke.h>
 #include <Grid/lattice/Lattice_reality.h>
 #include <Grid/lattice/Lattice_real_imag.h>
@ -46,5 +47,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_unary.h>
 #include <Grid/lattice/Lattice_transfer.h>
 #include <Grid/lattice/Lattice_basis.h>
 #include <Grid/lattice/Lattice_crc.h>
 #include <Grid/lattice/PaddedCell.h>
--- a/Grid/lattice/Lattice_crc.h
+++ b/Grid/lattice/Lattice_crc.h
@ -42,13 +42,13 @@ template<class vobj> void DumpSliceNorm(std::string s,Lattice<vobj> &f,int mu=-1
  }
 }
-template<class vobj> uint32_t crc(Lattice<vobj> & buf)
+template<class vobj> uint32_t crc(const Lattice<vobj> & buf)
 {
  autoView( buf_v , buf, CpuRead);
  return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites());
 }
-#define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
+#define CRC(U) std::cerr << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
 NAMESPACE_END(Grid);
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@ -281,11 +281,29 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
  return nrm;
 }
 template<class vobj>
 inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
  GridBase *grid = left.Grid();
 #ifdef GRID_SYCL
  uint64_t csum=0;
  if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
  {
    // Hack
    // Fast integer xor checksum. Can also be used in comms now.
    autoView(l_v,left,AcceleratorRead);
    Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
    uint64_t *base= (uint64_t *)&l_v[0];
    csum=svm_xor(base,words);
  }
  FlightRecorder::CsumLog(csum);
 #endif
  ComplexD nrm = rankInnerProduct(left,right);
  RealD local = real(nrm);
  FlightRecorder::NormLog(real(nrm)); 
  grid->GlobalSum(nrm);
  FlightRecorder::ReductionLog(local,real(nrm)); 
  return nrm;
 }
--- a/Grid/lattice/Lattice_reduction_sycl.h
+++ b/Grid/lattice/Lattice_reduction_sycl.h
@ -69,29 +69,30 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
  return result;
 }
 NAMESPACE_END(Grid);
-/*
+template<class Word> Word svm_xor(Word *vec,uint64_t L)
 template<class Double> Double svm_reduce(Double *vec,uint64_t L)
 {
-  Double sumResult; zeroit(sumResult);
+  Word xorResult; xorResult = 0;
-  Double *d_sum =(Double *)cl::sycl::malloc_shared(sizeof(Double),*theGridAccelerator);
+  Word *d_sum =(Word *)cl::sycl::malloc_shared(sizeof(Word),*theGridAccelerator);
-  Double identity;  zeroit(identity);
+  Word identity;  identity=0;
  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
-     auto Reduction = cl::sycl::reduction(d_sum,identity,std::plus<>());
+     auto Reduction = cl::sycl::reduction(d_sum,identity,std::bit_xor<>());
     cgh.parallel_for(cl::sycl::range<1>{L},
 		      Reduction,
 		      [=] (cl::sycl::id<1> index, auto &sum) {
-	 sum +=vec[index];
+	 sum ^=vec[index];
     });
   });
  theGridAccelerator->wait();
-  Double ret = d_sum[0];
+  Word ret = d_sum[0];
  free(d_sum,*theGridAccelerator);
  std::cout << " svm_reduce finished "<<L<<" sites sum = " << ret <<std::endl;
  return ret;
 }
 NAMESPACE_END(Grid);
 /*
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
 {
--- a/Grid/lattice/Lattice_rng.h
+++ b/Grid/lattice/Lattice_rng.h
@ -411,7 +411,7 @@ public:
      std::cout << GridLogMessage << "Seed SHA256: " << GridChecksum::sha256_string(seeds) << std::endl;
      SeedFixedIntegers(seeds);
    }
-  void SeedFixedIntegers(const std::vector<int> &seeds){
+  void SeedFixedIntegers(const std::vector<int> &seeds, int britney=0){
    // Everyone generates the same seed_seq based on input seeds
    CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
@ -428,7 +428,6 @@ public:
    // MT implementation does not implement fast discard even though
    // in principle this is possible
    ////////////////////////////////////////////////
 #if 1
    thread_for( lidx, _grid->lSites(), {
 	int gidx;
@ -449,29 +448,12 @@ public:
 	int l_idx=generator_idx(o_idx,i_idx);
 	_generators[l_idx] = master_engine;
-	Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
+	if ( britney ) { 
-    });
+	  Skip(_generators[l_idx],l_idx); // Skip to next RNG sequence
-#else
+	} else { 	
    // Everybody loops over global volume.
    thread_for( gidx, _grid->_gsites, {
 	// Where is it?
 	int rank;
 	int o_idx;
 	int i_idx;
 	Coordinate gcoor;
 	_grid->GlobalIndexToGlobalCoor(gidx,gcoor);
 	_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
 	// If this is one of mine we take it
 	if( rank == _grid->ThisRank() ){
 	  int l_idx=generator_idx(o_idx,i_idx);
 	  _generators[l_idx] = master_engine;
 	  Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
 	}
    });
 #endif
 #else 
    ////////////////////////////////////////////////////////////////
    // Machine and thread decomposition dependent seeding is efficient
--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
@ -280,20 +280,16 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
  if( interior && exterior ) { 
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGeneric,1); return;}
 #ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1);    return;}
 #ifndef GRID_CUDA
    if (Opt == OptInlineAsm  ) {  ASM_CALL(DhopSiteAsm);     return;}
 #endif
  } else if( interior ) {
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericInt,1); return;}
 #ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,1);    return;}
 #endif
  } else if( exterior ) { 
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericExt,1); return;}
 #ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,1);    return;}
 #endif
  }
  assert(0 && " Kernel optimisation case not covered ");
 }
@ -322,19 +318,13 @@ void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
  if( interior && exterior ) { 
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGeneric,0); return;}
 #ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,0);    return;}
 #endif
  } else if( interior ) {
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericInt,0); return;}
 #ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,0);    return;}
 #endif
  } else if( exterior ) { 
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericExt,0); return;}
 #ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,0);    return;}
 #endif
  }
 }
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@ -462,6 +462,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
    autoView(st_v , st,AcceleratorRead);
   if( interior && exterior ) {
     acceleratorFenceComputeStream();
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;}
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;}
 #ifndef GRID_CUDA
@ -495,6 +496,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
    autoView(st_v ,st,AcceleratorRead);
   if( interior && exterior ) {
     acceleratorFenceComputeStream();
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDag); return;}
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag);    return;}
 #ifndef GRID_CUDA
--- a/Grid/simd/Grid_vector_types.h
+++ b/Grid/simd/Grid_vector_types.h
@ -1133,4 +1133,13 @@ static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths inc
 NAMESPACE_END(Grid);
 #ifdef GRID_SYCL
 // Declare each Grid SIMD vector type as device copyable for SYCL by
 // specialising sycl::is_device_copyable to derive from std::true_type.
 template<> struct sycl::is_device_copyable<Grid::vComplexF> : std::true_type {};
 template<> struct sycl::is_device_copyable<Grid::vComplexD> : std::true_type {};
 template<> struct sycl::is_device_copyable<Grid::vRealF>    : std::true_type {};
 template<> struct sycl::is_device_copyable<Grid::vRealD>    : std::true_type {};
 template<> struct sycl::is_device_copyable<Grid::vInteger>  : std::true_type {};
 #endif
 #endif
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@ -70,57 +70,6 @@ struct DefaultImplParams {
 void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
 				 int off,std::vector<std::pair<int,int> > & table);
 /*
 template<class vobj,class cobj,class compressor>
 void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so)   __attribute__((noinline));
 template<class vobj,class cobj,class compressor>
 void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so)
 {
  int num=table.size();
  std::pair<int,int> *table_v = & table[0];
  auto rhs_v = rhs.View(AcceleratorRead);
  accelerator_forNB( i,num, vobj::Nsimd(), {
    compress.Compress(buffer[off+table_v[i].first],rhs_v[so+table_v[i].second]);
  });
  rhs_v.ViewClose();
 }
 ///////////////////////////////////////////////////////////////////
 // Gather for when there *is* need to SIMD split with compression
 ///////////////////////////////////////////////////////////////////
 template<class cobj,class vobj,class compressor>
 void Gather_plane_exchange_table(const Lattice<vobj> &rhs,
 				 commVector<cobj *> pointers,
 				 int dimension,int plane,
 				 int cbmask,compressor &compress,int type) __attribute__((noinline));
 template<class cobj,class vobj,class compressor>
 void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,
 				 const Lattice<vobj> &rhs,
 				 std::vector<cobj *> &pointers,int dimension,int plane,int cbmask,
 				 compressor &compress,int type)
 {
  assert( (table.size()&0x1)==0);
  int num=table.size()/2;
  int so  = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
  auto rhs_v = rhs.View(AcceleratorRead);
  auto rhs_p = &rhs_v[0];
  auto p0=&pointers[0][0];
  auto p1=&pointers[1][0];
  auto tp=&table[0];
  accelerator_forNB(j, num, vobj::Nsimd(), {
      compress.CompressExchange(p0,p1, rhs_p, j,
 				so+tp[2*j  ].second,
 				so+tp[2*j+1].second,
 				type);
  });
  rhs_v.ViewClose();
 }
 */
 void DslashResetCounts(void);
 void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full);
 void DslashLogFull(void);
@ -258,6 +207,10 @@ public:
  struct Packet {
    void * send_buf;
    void * recv_buf;
 #ifndef ACCELERATOR_AWARE_MPI
    void * host_send_buf; // Allocate this if not MPI_CUDA_AWARE
    void * host_recv_buf; // Allocate this if not MPI_CUDA_AWARE
 #endif
    Integer to_rank;
    Integer from_rank;
    Integer do_send;
@ -324,7 +277,7 @@ public:
  Vector<int> surface_list;
  stencilVector<StencilEntry>  _entries; // Resident in managed memory
-  commVector<StencilEntry>     _entries_device; // Resident in managed memory
+  commVector<StencilEntry>     _entries_device; // Resident in device memory
  std::vector<Packet> Packets;
  std::vector<Merge> Mergers;
  std::vector<Merge> MergersSHM;
@ -408,33 +361,16 @@ public:
  // Use OpenMP Tasks for cleaner ???
  // must be called *inside* parallel region
  //////////////////////////////////////////
  /*
  void CommunicateThreaded()
  {
 #ifdef GRID_OMP
    int mythread = omp_get_thread_num();
    int nthreads = CartesianCommunicator::nCommThreads;
 #else
    int mythread = 0;
    int nthreads = 1;
 #endif
    if (nthreads == -1) nthreads = 1;
    if (mythread < nthreads) {
      for (int i = mythread; i < Packets.size(); i += nthreads) {
 	uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf,
 						      Packets[i].to_rank,
 						      Packets[i].recv_buf,
 						      Packets[i].from_rank,
 						      Packets[i].bytes,i);
      }
    }
  }
  */
  ////////////////////////////////////////////////////////////////////////
  // Non blocking send and receive. Necessarily parallel.
  ////////////////////////////////////////////////////////////////////////
  void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
    // All GPU kernel tasks must complete
    //    accelerator_barrier();     // All kernels should ALREADY be complete
    //    _grid->StencilBarrier();   // Everyone is here, so noone running slow and still using receive buffer
                               // But the HaloGather had a barrier too.
 #ifdef ACCELERATOR_AWARE_MPI
    for(int i=0;i<Packets.size();i++){
      _grid->StencilSendToRecvFromBegin(MpiReqs,
 					Packets[i].send_buf,
@ -443,16 +379,54 @@ public:
 					Packets[i].from_rank,Packets[i].do_recv,
 					Packets[i].xbytes,Packets[i].rbytes,i);
    }
 #else
 #warning "Using COPY VIA HOST BUFFERS IN STENCIL"
    for(int i=0;i<Packets.size();i++){
      // Introduce a host buffer with a cheap slab allocator and zero cost wipe all
      Packets[i].host_send_buf = _grid->HostBufferMalloc(Packets[i].xbytes);
      Packets[i].host_recv_buf = _grid->HostBufferMalloc(Packets[i].rbytes);
      if ( Packets[i].do_send ) {
 	acceleratorCopyFromDevice(Packets[i].send_buf, Packets[i].host_send_buf,Packets[i].xbytes);
      }
      _grid->StencilSendToRecvFromBegin(MpiReqs,
 					Packets[i].host_send_buf,
 					Packets[i].to_rank,Packets[i].do_send,
 					Packets[i].host_recv_buf,
 					Packets[i].from_rank,Packets[i].do_recv,
 					Packets[i].xbytes,Packets[i].rbytes,i);
    }
 #endif
    // Get comms started then run checksums
    // Having this PRIOR to the dslash seems to make Sunspot work... (!)
    for(int i=0;i<Packets.size();i++){
      if ( Packets[i].do_send )
 	FlightRecorder::xmitLog(Packets[i].send_buf,Packets[i].xbytes);
    }
  }
  void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
-    _grid->StencilSendToRecvFromComplete(MpiReqs,0);
+    _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
    if   ( this->partialDirichlet ) DslashLogPartial();
    else if ( this->fullDirichlet ) DslashLogDirichlet();
    else DslashLogFull();
-    acceleratorCopySynchronise();
+    // acceleratorCopySynchronise() is in the StencilSendToRecvFromComplete
    //    accelerator_barrier(); 
    _grid->StencilBarrier(); 
 #ifndef ACCELERATOR_AWARE_MPI
 #warning "Using COPY VIA HOST BUFFERS IN STENCIL"
    for(int i=0;i<Packets.size();i++){
      if ( Packets[i].do_recv ) {
 	acceleratorCopyToDevice(Packets[i].host_recv_buf, Packets[i].recv_buf,Packets[i].rbytes);
      }
    }
    _grid->HostBufferFreeAll();
 #endif
    // run any checksums
    for(int i=0;i<Packets.size();i++){
      if ( Packets[i].do_recv )
 	FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
    }
  }
  ////////////////////////////////////////////////////////////////////////
  // Blocking send and receive. Either sequential or parallel.
@ -528,6 +502,7 @@ public:
  template<class compressor>
  void HaloGather(const Lattice<vobj> &source,compressor &compress)
  {
    //    accelerator_barrier();
    _grid->StencilBarrier();// Synch shared memory on a single nodes
    assert(source.Grid()==_grid);
@ -540,10 +515,9 @@ public:
      compress.Point(point);
      HaloGatherDir(source,compress,point,face_idx);
    }
-    accelerator_barrier();
+    accelerator_barrier(); // All my local gathers are complete
    face_table_computed=1;
    assert(u_comm_offset==_unified_buffer_size);
  }
  /////////////////////////
@ -579,6 +553,7 @@ public:
      accelerator_forNB(j, words, cobj::Nsimd(), {
 	  coalescedWrite(to[j] ,coalescedRead(from [j]));
      });
      acceleratorFenceComputeStream();
    }
  }
@ -669,6 +644,7 @@ public:
    for(int i=0;i<dd.size();i++){
      decompressor::DecompressFace(decompress,dd[i]);
    }
    acceleratorFenceComputeStream(); // dependent kernels
  }
  ////////////////////////////////////////
  // Set up routines
@ -1224,7 +1200,6 @@ public:
 	  ///////////////////////////////////////////////////////////
 	  int do_send = (comms_send|comms_partial_send) && (!shm_send );
 	  int do_recv = (comms_send|comms_partial_send) && (!shm_recv );
 	  AddPacket((void *)&send_buf[comm_off],
 		    (void *)&recv_buf[comm_off],
 		    xmit_to_rank, do_send,
--- a/Grid/tensors/Tensor_traits.h
+++ b/Grid/tensors/Tensor_traits.h
@ -404,3 +404,12 @@ NAMESPACE_BEGIN(Grid);
  };
 NAMESPACE_END(Grid);
 #ifdef GRID_SYCL
 // Any Grid tensor type that is not already trivially copyable is declared
 // device copyable for SYCL. Trivially copyable tensors are excluded by the
 // enable_if condition and so do not match this partial specialisation.
 // NOTE(review): relies on sycl::is_device_copyable accepting a second
 // (SFINAE) template parameter -- confirm against the SYCL headers in use.
 template<typename T>
 struct sycl::is_device_copyable<
   T,
   typename std::enable_if<
     Grid::isGridTensor<T>::value && (!std::is_trivially_copyable<T>::value),
     void>::type>
   : std::true_type {};
 #endif
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@ -255,17 +255,13 @@ inline int  acceleratorIsCommunicable(void *ptr)
 #define GRID_SYCL_LEVEL_ZERO_IPC
 NAMESPACE_END(Grid);
-#if 0
+
-#include <CL/sycl.hpp>
+// Force deterministic reductions
-#include <CL/sycl/usm.hpp>
+#define SYCL_REDUCTION_DETERMINISTIC
 #include <level_zero/ze_api.h>
 #include <CL/sycl/backend/level_zero.hpp>
 #else
 #include <sycl/CL/sycl.hpp>
 #include <sycl/usm.hpp>
 #include <level_zero/ze_api.h>
 #include <sycl/ext/oneapi/backend/level_zero.hpp>
 #endif
 NAMESPACE_BEGIN(Grid);
--- a/Grid/util/FlightRecorder.cc
+++ b/Grid/util/FlightRecorder.cc
@ -0,0 +1,339 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./Grid/util/FlightRecorder.cc
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@MacBook-Pro.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <cstring>
 NAMESPACE_BEGIN(Grid);
 ///////////////////////////////////////////////////////
 // Grid Norm logging for repro testing
 ///////////////////////////////////////////////////////
 // Behaviour switches. All start at zero, so the initial LoggingMode is
 // LoggingModeNone (the first enumerator).
 int FlightRecorder::LoggingMode       = 0;
 int FlightRecorder::PrintEntireLog    = 0;
 int FlightRecorder::ContinueOnFail    = 0;
 int FlightRecorder::ChecksumComms     = 0;
 int FlightRecorder::ChecksumCommsSend = 0;
 // Index of the next entry in each log stream
 int32_t  FlightRecorder::XmitLoggingCounter      = 0;
 int32_t  FlightRecorder::RecvLoggingCounter      = 0;
 int32_t  FlightRecorder::CsumLoggingCounter      = 0;
 int32_t  FlightRecorder::NormLoggingCounter      = 0;
 int32_t  FlightRecorder::ReductionLoggingCounter = 0;
 // Number of mismatches seen while verifying
 uint64_t FlightRecorder::ErrorCounter = 0;
 // Recorded reference sequences, one per log stream
 std::vector<uint64_t> FlightRecorder::XmitLogVector;
 std::vector<uint64_t> FlightRecorder::RecvLogVector;
 std::vector<uint64_t> FlightRecorder::CsumLogVector;
 std::vector<double>   FlightRecorder::NormLogVector;
 std::vector<double>   FlightRecorder::ReductionLogVector;
 // Rewind every log stream to its first entry.
 // The recorded vectors and ErrorCounter are left untouched.
 void FlightRecorder::ResetCounters(void)
 {
   XmitLoggingCounter = RecvLoggingCounter = 0;
   CsumLoggingCounter = NormLoggingCounter = 0;
   ReductionLoggingCounter = 0;
 }
 // Discard everything recorded so far: rewind the counters and empty
 // all five log vectors.
 void FlightRecorder::Truncate(void)
 {
   ResetCounters();
   XmitLogVector.clear();
   RecvLogVector.clear();
   NormLogVector.clear();
   CsumLogVector.clear();
   ReductionLogVector.clear();
 }
 // Switch the recorder into the requested mode by dispatching to the
 // matching SetLoggingMode* helper. LoggingModeNone has no helper: it
 // stores the mode and truncates the logs. Any other value asserts.
 void FlightRecorder::SetLoggingMode(FlightRecorder::LoggingMode_t mode)
 {
   if ( mode == LoggingModePrint  ) { SetLoggingModePrint();  return; }
   if ( mode == LoggingModeRecord ) { SetLoggingModeRecord(); return; }
   if ( mode == LoggingModeVerify ) { SetLoggingModeVerify(); return; }
   if ( mode == LoggingModeNone   ) {
     LoggingMode = mode;
     Truncate();
     return;
   }
   assert(0);
 }
 // Enter print mode: announce it, drop any recorded logs and rewind.
 void FlightRecorder::SetLoggingModePrint(void)
 {
   std::cout << " FlightRecorder: set to print output " << std::endl;
   LoggingMode = LoggingModePrint;
   Truncate();
 }
 // Enter record mode: announce it and start from empty logs so that the
 // values logged from now on form the new reference sequence.
 void FlightRecorder::SetLoggingModeRecord(void)
 {
   std::cout << " FlightRecorder: set to RECORD " << std::endl;
   LoggingMode = LoggingModeRecord;
   Truncate();
 }
 // Enter verify mode: keep the recorded logs (they are the reference),
 // report how many norm entries are held, and rewind the counters so
 // that comparison starts from the first entry.
 void FlightRecorder::SetLoggingModeVerify(void)
 {
   const size_t recorded = NormLogVector.size();
   std::cout << " FlightRecorder: set to VERIFY " << recorded << " log entries " << std::endl;
   LoggingMode = LoggingModeVerify;
   ResetCounters();
 }
 // Number of mismatches counted so far by the verify-mode checks.
 uint64_t FlightRecorder::ErrorCount(void)
 {
   const uint64_t errors = ErrorCounter;
   return errors;
 }
 // Log one norm value.
 //   Print  : write the value's bit pattern to std::cerr.
 //   Record : as Print, and append the value to NormLogVector.
 //   Verify : compare against the recorded entry at the current position;
 //            a difference (or a NaN) is reported, counted in ErrorCounter
 //            and, unless ContinueOnFail is set, asserts.
 // In every active mode NormLoggingCounter advances by one per call.
 void FlightRecorder::NormLog(double value)
 {
   // Bit pattern of the value for exact hexadecimal display. memcpy is the
   // well-defined way to reinterpret the bytes; reading a double through a
   // uint64_t pointer violates strict aliasing.
   static_assert(sizeof(uint64_t) == sizeof(double), "NormLog displays a double as one 64-bit word");
   uint64_t hex;
   std::memcpy(&hex, &value, sizeof(hex));

   if(LoggingMode == LoggingModePrint) {
     std::cerr<<"FlightRecorder::NormLog : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
     NormLoggingCounter++;
   }
   if(LoggingMode == LoggingModeRecord) {
     std::cerr<<"FlightRecorder::NormLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
     NormLogVector.push_back(value);
     NormLoggingCounter++;
   }
   if(LoggingMode == LoggingModeVerify) {
     const size_t slot = (size_t) NormLoggingCounter;
     if(slot < NormLogVector.size()){
       const double expect = NormLogVector[slot];
       uint64_t hexref;
       std::memcpy(&hexref, &expect, sizeof(hexref));
       // std::hexfloat is sticky; remember the stream format so that later
       // floating point output on std::cerr is not silently changed.
       const std::ios_base::fmtflags saved = std::cerr.flags();
       if ( (value != expect) || std::isnan(value) ) {
         std::cerr<<"FlightRecorder::NormLog Oops, I did it again "<< NormLoggingCounter
                  <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<" "
                  <<std::hexfloat<<value<<" "<< expect<<std::endl;
         std::cerr << " Oops got norm "<< std::hexfloat<<value<<" expect "<<expect <<std::endl;
         std::cerr.flags(saved);
         fprintf(stderr,"%s:%d Oops, I did it again! Reproduce failure for norm %d/%zu %.16e expect %.16e\n",
                 GridHostname(),
                 GlobalSharedMemory::WorldShmRank,
                 NormLoggingCounter,NormLogVector.size(),
                 value, expect);
         fflush(stderr);
         if(!ContinueOnFail)assert(0); // Force takedown of job
         ErrorCounter++;
       } else {
         if ( PrintEntireLog ) {
           std::cerr<<"FlightRecorder::NormLog VALID "<< NormLoggingCounter << std::hex
                    <<" "<<hex<<" "<<hexref
                    <<" "<<std::hexfloat<<value<<" "<< expect<<std::dec<<std::endl;
           std::cerr.flags(saved);
         }
       }
     }
     // Reported on the first call made after the last recorded entry
     if ( NormLogVector.size()==slot ) {
       std::cout << "FlightRecorder:: Verified entire sequence of "<<NormLoggingCounter<<" norms "<<std::endl;
     }
     NormLoggingCounter++;
   }
 }
 // Log one 64-bit checksum.
 //   Print  : write it to std::cerr in hexadecimal.
 //   Record : as Print, and append it to CsumLogVector.
 //   Verify : compare against the recorded entry at the current position;
 //            a difference is reported, counted in ErrorCounter and,
 //            unless ContinueOnFail is set, asserts.
 // In every active mode CsumLoggingCounter advances by one per call.
 void FlightRecorder::CsumLog(uint64_t hex)
 {
   if(LoggingMode == LoggingModePrint) {
     std::cerr<<"FlightRecorder::CsumLog : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
     CsumLoggingCounter++;
   }
   if(LoggingMode == LoggingModeRecord) {
     // Index with the checksum counter (this line previously printed the norm counter)
     std::cerr<<"FlightRecorder::CsumLog RECORDING : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
     CsumLogVector.push_back(hex);
     CsumLoggingCounter++;
   }
   if(LoggingMode == LoggingModeVerify) {
     const size_t slot = (size_t) CsumLoggingCounter;
     if(slot < CsumLogVector.size()) {
       const uint64_t hexref = CsumLogVector[slot];
       if ( hex != hexref ) {
         std::cerr<<"FlightRecorder::CsumLog Oops, I did it again "<< CsumLoggingCounter
                  <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<std::endl;
         // uint64_t is not 'unsigned long' on every platform: print via unsigned long long
         fprintf(stderr,"%s:%d Oops, I did it again! Reproduce failure for csum %d %llx expect %llx\n",
                 GridHostname(),
                 GlobalSharedMemory::WorldShmRank,
                 CsumLoggingCounter,
                 (unsigned long long)hex, (unsigned long long)hexref);
         fflush(stderr);
         if(!ContinueOnFail) assert(0); // Force takedown of job
         ErrorCounter++;
       } else {
         if ( PrintEntireLog ) {
           std::cerr<<"FlightRecorder::CsumLog VALID "<< CsumLoggingCounter << std::hex
                    <<" "<<hex<<" "<<hexref<<std::dec<<std::endl;
         }
       }
     }
     // Reported on the first call made after the last recorded entry
     if ( CsumLogVector.size()==slot ) {
       std::cout << "FlightRecorder:: Verified entire sequence of "<<CsumLoggingCounter<<" checksums "<<std::endl;
     }
     CsumLoggingCounter++;
   }
 }
 // Log the result of a reduction: 'local' is this rank's contribution and
 // 'global' the reduced value. Only 'global' is recorded and verified;
 // 'local' appears in the diagnostics.
 //   Print  : write both bit patterns to std::cerr.
 //   Record : as Print, and append 'global' to ReductionLogVector.
 //   Verify : compare 'global' against the recorded entry at the current
 //            position; a difference is reported, counted in ErrorCounter
 //            and, unless ContinueOnFail is set, asserts.
 // In every active mode ReductionLoggingCounter advances by one per call.
 void FlightRecorder::ReductionLog(double local,double global)
 {
   // Bit patterns for exact hexadecimal display. memcpy avoids the strict
   // aliasing violation of reading a double through a uint64_t pointer.
   static_assert(sizeof(uint64_t) == sizeof(double), "ReductionLog displays a double as one 64-bit word");
   uint64_t hex_l;
   uint64_t hex_g;
   std::memcpy(&hex_l, &local , sizeof(hex_l));
   std::memcpy(&hex_g, &global, sizeof(hex_g));

   if(LoggingMode == LoggingModePrint) {
     std::cerr<<"FlightRecorder::ReductionLog : "<< ReductionLoggingCounter <<" "<< std::hex << hex_l << " -> " <<hex_g<<std::dec <<std::endl;
     ReductionLoggingCounter++;
   }
   if(LoggingMode == LoggingModeRecord) {
     std::cerr<<"FlightRecorder::ReductionLog RECORDING : "<< ReductionLoggingCounter <<" "<< std::hex << hex_l << " -> " <<hex_g<<std::dec <<std::endl;
     ReductionLogVector.push_back(global);
     ReductionLoggingCounter++;
   }
   if(LoggingMode == LoggingModeVerify) {
     const size_t slot = (size_t) ReductionLoggingCounter;
     if(slot < ReductionLogVector.size()){
       const double expect = ReductionLogVector[slot];
       if ( global != expect ) {
         fprintf(stderr,"%s:%d Oops, MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e expect glb %.16e\n",
                 GridHostname(),
                 GlobalSharedMemory::WorldShmRank,
                 ReductionLoggingCounter,ReductionLogVector.size(),
                 global, local, expect);
         fflush(stderr);
         if ( !ContinueOnFail ) assert(0);
         ErrorCounter++;
       } else {
         if ( PrintEntireLog ) {
           // std::hexfloat is sticky: restore the stream format afterwards
           const std::ios_base::fmtflags saved = std::cerr.flags();
           std::cerr<<"FlightRecorder::ReductionLog : VALID "<< ReductionLoggingCounter <<" "<< std::hexfloat << local << "-> "<< global <<std::endl;
           std::cerr.flags(saved);
         }
       }
     }
     // Reported on the first call made after the last recorded entry
     if ( ReductionLogVector.size()==slot ) {
       std::cout << "FlightRecorder::ReductionLog : Verified entire sequence of "<<ReductionLoggingCounter<<" norms "<<std::endl;
     }
     ReductionLoggingCounter++;
   }
 }
 // Checksum a send buffer and print / record / verify the result.
 //   buf   : start of the send buffer
 //   bytes : buffer length; only whole 64-bit words (bytes/8) are summed
 // The checksum path is active only when ChecksumCommsSend is set, the
 // logging mode is not LoggingModeNone, and the build defines GRID_SYCL
 // (the checksum comes from svm_xor). When ChecksumCommsSend is clear a
 // small device copy followed by a world barrier is issued instead.
 void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
 {
   if ( ChecksumCommsSend ){
     if(LoggingMode == LoggingModeNone) return;
 #ifdef GRID_SYCL
     uint64_t *ubuf = (uint64_t *)buf;
     const uint64_t csum = svm_xor(ubuf,bytes/sizeof(uint64_t));
     if(LoggingMode == LoggingModePrint) {
       std::cerr<<"FlightRecorder::xmitLog : "<< XmitLoggingCounter <<" "<< std::hex << csum <<std::dec <<std::endl;
       XmitLoggingCounter++;
     }
     if(LoggingMode == LoggingModeRecord) {
       std::cerr<<"FlightRecorder::xmitLog RECORD : "<< XmitLoggingCounter <<" "<< std::hex << csum <<std::dec <<std::endl;
       XmitLogVector.push_back(csum);
       XmitLoggingCounter++;
     }
     if(LoggingMode == LoggingModeVerify) {
       const size_t slot = (size_t) XmitLoggingCounter;
       if(slot < XmitLogVector.size()){
         const uint64_t expect = XmitLogVector[slot];
         if ( csum != expect ) {
           // uint64_t is not 'unsigned long' on every platform: print via unsigned long long
           fprintf(stderr,"%s:%d Oops, send buf difference! Reproduce failure for xmit %d/%zu  %llx expect glb %llx\n",
                   GridHostname(),
                   GlobalSharedMemory::WorldShmRank,
                   XmitLoggingCounter,XmitLogVector.size(),
                   (unsigned long long)csum, (unsigned long long)expect);
           fflush(stderr);
           if ( !ContinueOnFail ) assert(0);
           ErrorCounter++;
         } else {
           if ( PrintEntireLog ) {
             // Integer checksums: std::hex (std::hexfloat only affects floating
             // point output and would stay set on the stream).
             std::cerr<<"FlightRecorder::XmitLog : VALID "<< XmitLoggingCounter <<" "<< std::hex << csum << " "<<  expect <<std::dec<<std::endl;
           }
         }
       }
       // Reported on the first call made after the last recorded entry
       if ( XmitLogVector.size()==slot ) {
         std::cout << "FlightRecorder::xmitLog : Verified entire sequence of "<<XmitLoggingCounter<<" sends "<<std::endl;
       }
       XmitLoggingCounter++;
     }
 #endif
   } else {
     // NOTE(review): this branch runs for every logged send packet even with
     // logging disabled. It is a collective MPI_Barrier on MPI_COMM_WORLD, so
     // it presumably needs every rank to log the same number of send packets
     // -- confirm this is intended outside of debugging runs.
     uint64_t word = 1;
     deviceVector<uint64_t> dev(1);
     acceleratorCopyToDevice(&word,&dev[0],sizeof(uint64_t));
     acceleratorCopySynchronise();
     MPI_Barrier(MPI_COMM_WORLD);
   }
 }
 // Checksum a receive buffer and print / record / verify the result.
 //   buf   : start of the receive buffer
 //   bytes : buffer length; only whole 64-bit words (bytes/8) are summed
 //   rank  : sending MPI rank, reported when verification fails
 // Active only when ChecksumComms is set, the logging mode is not
 // LoggingModeNone, and the build defines GRID_SYCL (the checksum comes
 // from svm_xor); otherwise the call does nothing.
 void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
 {
   if ( !ChecksumComms ) return;
   if(LoggingMode == LoggingModeNone) return;
 #ifdef GRID_SYCL
   uint64_t *ubuf = (uint64_t *)buf;
   const uint64_t csum = svm_xor(ubuf,bytes/sizeof(uint64_t));
   if(LoggingMode == LoggingModePrint) {
     std::cerr<<"FlightRecorder::recvLog : "<< RecvLoggingCounter <<" "<< std::hex << csum <<std::dec <<std::endl;
     RecvLoggingCounter++;
   }
   if(LoggingMode == LoggingModeRecord) {
     std::cerr<<"FlightRecorder::recvLog RECORD : "<< RecvLoggingCounter <<" "<< std::hex << csum <<std::dec <<std::endl;
     RecvLogVector.push_back(csum);
     RecvLoggingCounter++;
   }
   if(LoggingMode == LoggingModeVerify) {
     const size_t slot = (size_t) RecvLoggingCounter;
     if(slot < RecvLogVector.size()){
       const uint64_t expect = RecvLogVector[slot];
       if ( csum != expect ) {
         // uint64_t is not 'unsigned long' on every platform: print via unsigned long long
         fprintf(stderr,"%s:%d Oops, recv buf difference! Reproduce failure for recv %d/%zu  %llx expect glb %llx from MPI rank %d\n",
                 GridHostname(),
                 GlobalSharedMemory::WorldShmRank,
                 RecvLoggingCounter,RecvLogVector.size(),
                 (unsigned long long)csum, (unsigned long long)expect,rank);
         fflush(stderr);
         if ( !ContinueOnFail ) assert(0);
         ErrorCounter++;
       } else {
         if ( PrintEntireLog ) {
           // Integer checksums: std::hex (std::hexfloat only affects floating
           // point output and would stay set on the stream).
           std::cerr<<"FlightRecorder::RecvLog : VALID "<< RecvLoggingCounter <<" "<< std::hex << csum << " "<<  expect <<std::dec<<std::endl;
         }
       }
     }
     // Reported on the first call made after the last recorded entry
     if ( RecvLogVector.size()==slot ) {
       std::cout << "FlightRecorder::recvLog : Verified entire sequence of "<<RecvLoggingCounter<<" receives "<<std::endl;
     }
     RecvLoggingCounter++;
   }
 #endif
 }
 NAMESPACE_END(Grid);
--- a/Grid/util/FlightRecorder.h
+++ b/Grid/util/FlightRecorder.h
@ -0,0 +1,43 @@
 #pragma once
 NAMESPACE_BEGIN(Grid);
 // Static-only recorder used for reproducibility testing: sequences of
 // norms, checksums, reduction results and comms-buffer checksums can be
 // printed, recorded, or verified against a previously recorded sequence.
 class FlightRecorder {
 public:
   // What each *Log call does with the value it is given
   enum LoggingMode_t {
     LoggingModeNone,    // no logging
     LoggingModePrint,   // print each value
     LoggingModeRecord,  // print each value and store it
     LoggingModeVerify   // compare each value with the stored one
   };

   // ---- configuration switches (plain ints used as flags) ----
   static int LoggingMode;        // holds a LoggingMode_t value
   static int ContinueOnFail;     // non-zero: count a mismatch instead of asserting
   static int PrintEntireLog;     // non-zero: also print entries that verify correctly
   static int ChecksumComms;      // non-zero: recvLog checksums receive buffers
   static int ChecksumCommsSend;  // non-zero: xmitLog checksums send buffers

   // ---- state ----
   static uint64_t ErrorCounter;             // mismatches seen while verifying
   static int32_t  XmitLoggingCounter;       // next index into XmitLogVector
   static int32_t  RecvLoggingCounter;       // next index into RecvLogVector
   static int32_t  CsumLoggingCounter;       // next index into CsumLogVector
   static int32_t  NormLoggingCounter;       // next index into NormLogVector
   static int32_t  ReductionLoggingCounter;  // next index into ReductionLogVector

   // ---- recorded sequences ----
   static std::vector<uint64_t> XmitLogVector;
   static std::vector<uint64_t> RecvLogVector;
   static std::vector<uint64_t> CsumLogVector;
   static std::vector<double>   NormLogVector;
   static std::vector<double>   ReductionLogVector;

   // ---- mode control ----
   static void SetLoggingMode(LoggingMode_t mode);
   static void SetLoggingModePrint();
   static void SetLoggingModeRecord();
   static void SetLoggingModeVerify();
   static void Truncate();        // rewind counters and empty all logs
   static void ResetCounters();   // rewind counters, keep the logs
   static uint64_t ErrorCount();  // current value of ErrorCounter

   // ---- logging entry points ----
   static void NormLog(double value);
   static void CsumLog(uint64_t csum);
   static void ReductionLog(double lcl, double glbl);
   static void xmitLog(void *buf, uint64_t bytes);
   static void recvLog(void *buf, uint64_t bytes, int rank);
 };
 NAMESPACE_END(Grid);
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@ -77,6 +77,10 @@ feenableexcept (unsigned int excepts)
 }
 #endif
 #ifndef HOST_NAME_MAX
 #define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
 #endif
 NAMESPACE_BEGIN(Grid);
 //////////////////////////////////////////////////////
@ -90,7 +94,12 @@ int GridThread::_threads =1;
 int GridThread::_hyperthreads=1;
 int GridThread::_cores=1;
 // Name of the host this rank runs on; Grid_init fills it via gethostname().
 // Zero-initialised, so it reads as an empty string until then.
 char hostname[HOST_NAME_MAX+1];
 // Accessor for the host name buffer above. The returned pointer refers to
 // static storage and must not be freed.
 char *GridHostname(void)
 {
   return &hostname[0];
 }
 const Coordinate &GridDefaultLatt(void)     {return Grid_default_latt;};
 const Coordinate &GridDefaultMpi(void)      {return Grid_default_mpi;};
 const Coordinate GridDefaultSimd(int dims,int nsimd)
@ -393,6 +402,8 @@ void Grid_init(int *argc,char ***argv)
  std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl;
  std::cout << GridLogMessage << "================================================ "<<std::endl;
  gethostname(hostname, HOST_NAME_MAX+1);
  std::cout << GridLogMessage << "This rank is running on host "<< hostname<<std::endl;
  /////////////////////////////////////////////////////////
  // Reporting
--- a/Grid/util/Init.h
+++ b/Grid/util/Init.h
@ -34,6 +34,8 @@ NAMESPACE_BEGIN(Grid);
 void Grid_init(int *argc,char ***argv);
 void Grid_finalize(void);
 char * GridHostname(void);
 // internal, controlled with --handle
 void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
 void Grid_debug_handler_init(void);
@ -68,5 +70,6 @@ void GridParseLayout(char **argv,int argc,
 void printHash(void);
 NAMESPACE_END(Grid);
--- a/Grid/util/Util.h
+++ b/Grid/util/Util.h
@ -1,6 +1,6 @@
-#ifndef GRID_UTIL_H
+#pragma once
 #define GRID_UTIL_H
 #include <Grid/util/Coordinate.h>
 #include <Grid/util/Lexicographic.h>
 #include <Grid/util/Init.h>
-#endif
+#include <Grid/util/FlightRecorder.h>
--- a/benchmarks/Benchmark_usqcd.cc
+++ b/benchmarks/Benchmark_usqcd.cc
@ -65,7 +65,7 @@ struct time_statistics{
 void comms_header(){
  std::cout <<GridLogMessage << " L  "<<"\t"<<" Ls  "<<"\t"
-            <<"bytes\t MB/s uni (err/min/max) \t\t MB/s bidi (err/min/max)"<<std::endl;
+            <<"bytes\t MB/s uni  \t\t MB/s bidi "<<std::endl;
 };
 struct controls {
@ -180,10 +180,9 @@ public:
 	    std::cout<<GridLogMessage << lat<<"\t"<<Ls<<"\t "
 		     << bytes << " \t "
-		     <<xbytes/timestat.mean<<" \t "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " \t "
+		     <<xbytes/timestat.mean
-		     <<xbytes/timestat.max <<" "<< xbytes/timestat.min  
+		     << "\t\t"
-		     << "\t\t"<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
+		     << bidibytes/timestat.mean<< std::endl;
 		     << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
 	    fprintf(FP,"%ld, %d, %f\n",(long)bytes,dir,bidibytes/timestat.mean/1000.);
 	  }
 	}
@ -220,7 +219,7 @@ public:
    uint64_t NN;
-  uint64_t lmax=32;
+  uint64_t lmax=40;
 #define NLOOP (1000*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
    GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
@ -256,7 +255,7 @@ public:
 	       << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.
 	       << "\t\t"<< bytes/time/NN <<std::endl;
-      fprintf(FP,"%ld, %f\n",(long)bytes,bytes/time/NN/1000.);
+      fprintf(FP,"%ld, %f\n",(long)bytes,bytes/time/NN);
    }
    fprintf(FP,"\n\n");
@ -268,64 +267,61 @@ public:
    //int nbasis, int nrhs, int coarseVol
    int  basis[] = { 16,32,64 };
    int  rhs[]   = { 8,16,32 };
-    int  vols[]  = { 4*4*4*4, 8*8*8*8, 8*8*16*16 };
+    int  vol  = 4*4*4*4;
    GridBLAS blas;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "= batched GEMM (double precision) "<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << "  M  "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / node (coarse mrhs)"<<std::endl;
+    std::cout<<GridLogMessage << "  M  "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (coarse mrhs)"<<std::endl;
    std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
    fprintf(FP,"GEMM\n\n M, N, K, BATCH, GF/s per rank\n");
    for(int b=0;b<3;b++){
    for(int r=0;r<3;r++){
    for(int v=0;v<3;v++){
      int M=basis[b];
      int N=rhs[r];
      int K=basis[b];
-      int BATCH=vols[v];
+      int BATCH=vol;
-      double p=blas.benchmark(M,rhs[r],vols[v],1);
+      double p=blas.benchmark(M,N,K,BATCH);
      fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
      std::cout<<GridLogMessage<<std::setprecision(3) 
 	       << M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<BATCH<<"\t\t"<<p<<std::endl;
-    }}}
+    }}
    std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
-    std::cout<<GridLogMessage << "  M  "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / node (block project)"<<std::endl;
+    std::cout<<GridLogMessage << "  M  "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (block project)"<<std::endl;
    std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
    for(int b=0;b<3;b++){
    for(int r=0;r<3;r++){
    for(int v=0;v<2;v++){
      int M=basis[b];
      int N=rhs[r];
-      int K=vols[2];
+      int K=vol;
-      int BATCH=vols[v];
+      int BATCH=vol;
-      double p=blas.benchmark(M,rhs[r],vols[v],1);
+      double p=blas.benchmark(M,N,K,BATCH);
      fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
      std::cout<<GridLogMessage<<std::setprecision(3) 
 	       << M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<BATCH<<"\t\t"<<p<<std::endl;
-    }}}
+    }}
    std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
-    std::cout<<GridLogMessage << "  M  "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / node (block promote)"<<std::endl;
+    std::cout<<GridLogMessage << "  M  "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (block promote)"<<std::endl;
    std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
    for(int b=0;b<3;b++){
    for(int r=0;r<3;r++){
    for(int v=0;v<2;v++){
      int M=rhs[r];
-      int N=vols[2];
+      int N=vol;
      int K=basis[b];
-      int BATCH=vols[v];
+      int BATCH=vol;
-      double p=blas.benchmark(M,rhs[r],vols[v],1);
+      double p=blas.benchmark(M,N,K,BATCH);
      fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
      std::cout<<GridLogMessage<<std::setprecision(3) 
 	       << M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<BATCH<<"\t\t"<<p<<std::endl;
-    }}}
+    }}
    fprintf(FP,"\n\n\n");
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  };
@ -458,11 +454,17 @@ public:
      pickCheckerboard(Even,src_e,src);
      pickCheckerboard(Odd,src_o,src);
-      const int num_cases = 1;
+#ifdef AVX512
      const int num_cases = 3;
 #else 
      const int num_cases = 2;
 #endif      
      std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
      controls Cases [] = {
-	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent }
+	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent },
 	{  WilsonKernelsStatic::OptHandUnroll,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent },
 	{  WilsonKernelsStatic::OptInlineAsm ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent }
      }; 
      for(int c=0;c<num_cases;c++) {
@ -473,6 +475,10 @@ public:
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using ASM      WilsonKernels" <<std::endl;
 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using UNROLLED WilsonKernels" <<std::endl;
 	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
 	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential Comms/Compute" <<std::endl;
 	std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
@ -618,11 +624,13 @@ public:
      pickCheckerboard(Even,src_e,src);
      pickCheckerboard(Odd,src_o,src);
-      const int num_cases = 1;
+      const int num_cases = 2;
      std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
      controls Cases [] = {
 	{  StaggeredKernelsStatic::OptGeneric   ,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
 	{  StaggeredKernelsStatic::OptHandUnroll,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
 	{  StaggeredKernelsStatic::OptInlineAsm ,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  }
      }; 
      for(int c=0;c<num_cases;c++) {
@ -851,11 +859,8 @@ int main (int argc, char ** argv)
  }
  CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
 #ifdef KNL
  LebesgueOrder::Block = std::vector<int>({8,2,2,2});
 #else
  LebesgueOrder::Block = std::vector<int>({2,2,2,2});
-#endif
+
  Benchmark::Decomposition();
  int do_su4=0;
@ -873,10 +878,10 @@ int main (int argc, char ** argv)
  int Ls=1;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << " Clover dslash 4D vectorised" <<std::endl;
+  std::cout<<GridLogMessage << " Clover dslash 4D vectorised (temporarily Wilson)" <<std::endl;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  for(int l=0;l<L_list.size();l++){
-    clover.push_back(Benchmark::Clover(L_list[l]));
+    clover.push_back(Benchmark::DWF(1,L_list[l]));
  }
  Ls=12;
@ -914,7 +919,7 @@ int main (int argc, char ** argv)
  }
  if ( do_blas ) {
-#if defined(GRID_CUDA) || defined(GRID_HIP)    
+#if defined(GRID_CUDA) || defined(GRID_HIP)     || defined(GRID_SYCL)   
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Batched BLAS benchmark " <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
@ -942,7 +947,7 @@ int main (int argc, char ** argv)
    std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered (GF/s per node)" <<std::endl;
    fprintf(FP,"Per node summary table\n");
    fprintf(FP,"\n");
-    fprintf(FP,"L , Wilson, DWF4, Staggered\n");
+    fprintf(FP,"L , Wilson, DWF4, Staggered, GF/s per node\n");
    fprintf(FP,"\n");
    for(int l=0;l<L_list.size();l++){
      std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<std::endl;
--- a/configure.ac
+++ b/configure.ac
@ -226,23 +226,14 @@ case ${ac_SFW_FP16} in
 esac
 ############### Default to accelerator cshift, but revert to host if UCX is buggy or other reasons
-AC_ARG_ENABLE([accelerator-cshift],
+AC_ARG_ENABLE([accelerator-aware-mpi],
-    [AS_HELP_STRING([--enable-accelerator-cshift=yes|no],[run cshift on the device])],
+    [AS_HELP_STRING([--enable-accelerator-aware-mpi=yes|no],[run mpi transfers from device])],
-    [ac_ACC_CSHIFT=${enable_accelerator_cshift}], [ac_ACC_CSHIFT=yes])
+    [ac_ACCELERATOR_AWARE_MPI=${enable_accelerator_aware_mpi}], [ac_ACCELERATOR_AWARE_MPI=yes])
-AC_ARG_ENABLE([ucx-buggy],
+case ${ac_ACCELERATOR_AWARE_MPI} in
    [AS_HELP_STRING([--enable-ucx-buggy=yes|no],[enable workaround for UCX device buffer bugs])],
    [ac_UCXBUGGY=${enable_ucx_buggy}], [ac_UCXBUGGY=no])
 case ${ac_UCXBUGGY} in
    yes)
-    ac_ACC_CSHIFT=no;;
+      AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on host])
-    *);;
+      AC_DEFINE([ACCELERATOR_AWARE_MPI],[1],[ Stencil can use device pointers]);;
 esac
 case ${ac_ACC_CSHIFT} in
    yes)
      AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ UCX device buffer bugs are not present]);;
    *);;
 esac
--- a/systems/Aurora/benchmarks/bench1024.pbs
+++ b/systems/Aurora/benchmarks/bench1024.pbs
@ -25,12 +25,16 @@ export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
-export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 export MPICH_OFI_NIC_POLICY=GPU
 export FI_CXI_CQ_FILL_PERCENT=10
 export FI_CXI_DEFAULT_CQ_SIZE=262144
 #export FI_CXI_DEFAULT_CQ_SIZE=131072
 #export FI_CXI_CQ_FILL_PERCENT=20
 # 12 ppn, 1024 nodes, 12288 ranks
 #
@ -45,12 +49,12 @@ CMD="mpiexec -np 12288 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 8.8.8.24 --grid 128.128.128.384 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
-$CMD | tee 1024node.dwf.small
+$CMD | tee 1024node.dwf.small.cq
 CMD="mpiexec -np 12288 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 16.8.8.12 --grid 256.256.256.384 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
-$CMD | tee 1024node.dwf
+$CMD | tee 1024node.dwf.cq
--- a/systems/Aurora/benchmarks/bench12.pbs
+++ b/systems/Aurora/benchmarks/bench12.pbs
@ -17,6 +17,7 @@ source ../sourceme.sh
 export OMP_NUM_THREADS=3
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
@ -35,11 +36,25 @@ CMD="mpiexec -np 24 -ppn 12  -envall \
 	     ./Benchmark_comms_host_device --mpi 2.3.2.2 --grid 32.24.32.192 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
-$CMD 
+#$CMD 
 CMD="mpiexec -np 24 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 2.3.2.2 --grid 64.96.64.64 --comms-overlap \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
 #$CMD 
 CMD="mpiexec -np 1 -ppn 1  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
 $CMD 
 CMD="mpiexec -np 1 -ppn 1  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
 $CMD 
--- a/systems/Aurora/config-command
+++ b/systems/Aurora/config-command
@ -1,16 +1,16 @@
-TOOLS=$HOME/tools
+
 ../../configure \
 	--enable-simd=GPU \
 	--enable-gen-simd-width=64 \
 	--enable-comms=mpi-auto \
 	--enable-accelerator-cshift \
 	--disable-gparity \
 	--disable-fermion-reps \
 	--enable-shm=nvlink \
 	--enable-accelerator=sycl \
 	--enable-accelerator-aware-mpi=no\
 	--enable-unified=no \
 	MPICXX=mpicxx \
 	CXX=icpx \
-	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/" \
+	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl" \
-	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include"
+	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel"
--- a/systems/Aurora/sourceme-sunspot-deterministic.sh
+++ b/systems/Aurora/sourceme-sunspot-deterministic.sh
@ -0,0 +1,2 @@
 module load oneapi/eng-compiler/2023.05.15.003
 module load mpich/51.2/icc-all-deterministic-pmix-gpu
--- a/systems/Aurora/sourceme.sh
+++ b/systems/Aurora/sourceme.sh
@ -3,10 +3,24 @@
 module use /soft/modulefiles
 module load intel_compute_runtime/release/agama-devel-682.22
 export FI_CXI_DEFAULT_CQ_SIZE=131072
 export FI_CXI_CQ_FILL_PERCENT=20
 export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
 #export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode"
 #
 # -ftarget-register-alloc-mode=pvc:default 
 # -ftarget-register-alloc-mode=pvc:small
 # -ftarget-register-alloc-mode=pvc:large
 # -ftarget-register-alloc-mode=pvc:auto
 #
 export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
 export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
 export http_proxy=http://proxy.alcf.anl.gov:3128
 export https_proxy=http://proxy.alcf.anl.gov:3128
 #export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
 git config --global http.proxy http://proxy.alcf.anl.gov:3128
- 
+
 export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
--- a/systems/Aurora/tests/repro128.pbs
+++ b/systems/Aurora/tests/repro128.pbs
@ -0,0 +1,41 @@
 #!/bin/bash
 ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
 #PBS -q EarlyAppAccess
 #PBS -l select=128
 #PBS -l walltime=02:00:00
 #PBS -A LatticeQCD_aesp_CNDA
 #export OMP_PROC_BIND=spread
 #unset OMP_PLACES
 cd $PBS_O_WORKDIR
 source ../sourceme.sh
 cat $PBS_NODEFILE
 export OMP_NUM_THREADS=3
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
 export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 export MPICH_OFI_NIC_POLICY=GPU
 # 12 ppn, 16 nodes, 192 ranks
 # 12 ppn, 128 nodes, 1536 ranks
 CMD="mpiexec -np 1536 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Test_dwf_mixedcg_prec --mpi 4.4.4.24 --grid 128.128.128.384 \
 		--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 7000 --comms-overlap "
 $CMD 
--- a/systems/Aurora/tests/repro16.pbs
+++ b/systems/Aurora/tests/repro16.pbs
@ -0,0 +1,61 @@
 #!/bin/bash
 ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
 #PBS -l select=16:system=sunspot,place=scatter
 #PBS -A LatticeQCD_aesp_CNDA
 #PBS -l walltime=01:00:00
 #PBS -N dwf
 #PBS -k doe
 #export OMP_PROC_BIND=spread
 #unset OMP_PLACES
 cd $PBS_O_WORKDIR
 #source ../sourceme.sh
 cat $PBS_NODEFILE
 # Optional MPICH diagnostics, left disabled.
 #export MPICH_COLL_SYNC=1
 #export MPICH_ENV_DISPLAY=1
 # NOTE(review): the next line exports a variable literally named "MPICH_"
 # with no value assigned; it looks like a truncated export line. Confirm
 # the intended variable name/value or remove it.
 export MPICH_
 # OpenMP threads per rank and MPICH GPU pipelining switch.
 export OMP_NUM_THREADS=3
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 module load oneapi/eng-compiler/2023.05.15.003
 module load mpich/51.2/icc-all-deterministic-pmix-gpu
 #export LD_LIBRARY_PATH=/soft/restricted/CNDA/updates/2023.05.15.001/oneapi/compiler/eng-20230512/compiler/linux/lib/:$LD_LIBRARY_PATH
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
 export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
 export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
 export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
 unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
 unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
 unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
 export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 export MPICH_OFI_NIC_POLICY=GPU
 DIR=repro.$PBS_JOBID
 mkdir $DIR
 cd $DIR
 CMD="mpiexec -np 192 -ppn 12  -envall \
 	     ../gpu_tile_compact.sh \
 	     ../Test_dwf_mixedcg_prec --mpi 2.4.4.6 --grid 64.128.128.192 \
 		--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000 --debug-stdout --log Message,Iterative"
 #--comms-overlap
 $CMD 
 grep Oops Grid.stderr.* > failures.$PBS_JOBID
 rm core.*
--- a/systems/Aurora/tests/repro1gpu.pbs
+++ b/systems/Aurora/tests/repro1gpu.pbs
@ -0,0 +1,82 @@
 #!/bin/bash
 #PBS -l select=16:system=sunspot,place=scatter
 #PBS -A LatticeQCD_aesp_CNDA
 #PBS -l walltime=02:00:00
 #PBS -N repro1gpu
 #PBS -k doe
 #export OMP_PROC_BIND=spread
 #unset OMP_PLACES
 module load oneapi/eng-compiler/2023.05.15.003
 module load mpich/51.2/icc-all-deterministic-pmix-gpu
 # 56 cores / 6 threads ~9
 export OMP_NUM_THREADS=6
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
 export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 export MPICH_OFI_NIC_POLICY=GPU
 export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
 export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
 export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
 unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
 unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
 unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
 export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
 cd $PBS_O_WORKDIR
 NN=`cat $PBS_NODEFILE | wc -l`
 echo $PBS_NODEFILE
 cat $PBS_NODEFILE
 echo $NN nodes in node file
 # Launch one single-rank Test_dwf_mixedcg_prec job per (node, GPU tile) pair,
 # all in the background, then block until every job has exited.
 # The outer loop walks the lines of $PBS_NODEFILE (1..$NN); the inner loop
 # walks 12 slots per node (g = 0..11).
 for n in `eval echo {1..$NN}`
 do
 # n-th line of the node file is the host this batch of jobs is pinned to.
 THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
 echo Node $n is $THIS_NODE
 for g in {0..11}
 do
 # Lookup tables indexed by slot g: NUMA domain, tile index and GPU index.
 # They are re-assigned on every iteration with identical contents.
 export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
 export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
 export  GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )
 export numa=${NUMA_MAP[$g]}
 export gpu_id=${GPU_MAP[$g]}
 export tile_id=${TILE_MAP[$g]}
 # Device selector in "<gpu>.<tile>" form, used for the directory name and
 # for ZE_AFFINITY_MASK below.
 export gpu=$gpu_id.$tile_id
 # DIR is relative, so always start from the submission directory.
 cd $PBS_O_WORKDIR
 DIR=repro.1gpu.$PBS_JOBID/node-$n-$THIS_NODE-GPU-$gpu
 mkdir -p $DIR
 cd $DIR
 # Per-job hostfile (single host) and a record of the device selector.
 echo $THIS_NODE > nodefile
 echo $gpu > gpu
 export ZE_AFFINITY_MASK=$gpu
 export ONEAPI_DEVICE_FILTER=gpu,level_zero
 # The binary is addressed as ../../ because the job runs two levels below
 # $PBS_O_WORKDIR; numactl binds CPU and memory to the slot's NUMA domain.
 CMD="mpiexec -np 1 -ppn 1  -envall --hostfile nodefile \
 	     numactl -N $numa -m $numa ../../Test_dwf_mixedcg_prec --mpi 1.1.1.1 --grid 16.16.32.32 \
 		--shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message"
 echo $CMD
 $CMD &
 done
 done
 # Wait for all background jobs started above.
 wait
--- a/systems/Aurora/tests/reproN.pbs
+++ b/systems/Aurora/tests/reproN.pbs
@ -0,0 +1,98 @@
 #!/bin/bash
 #PBS -l select=32:system=sunspot,place=scatter
 #PBS -A LatticeQCD_aesp_CNDA
 #PBS -l walltime=02:00:00
 #PBS -N reproN
 #PBS -k doe
 #export OMP_PROC_BIND=spread
 #unset OMP_PLACES
 module load oneapi/eng-compiler/2023.05.15.003
 module load mpich/51.2/icc-all-deterministic-pmix-gpu
 # 56 cores / 6 threads ~9
 export OMP_NUM_THREADS=6
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 #export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1
 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
 export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
 export GRID_PRINT_ENTIRE_LOG=0
 export GRID_CHECKSUM_RECV_BUF=0
 export GRID_CHECKSUM_SEND_BUF=0
 export MPICH_OFI_NIC_POLICY=GPU
 export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
 export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
 export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
 unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
 unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
 unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
 cd $PBS_O_WORKDIR
 NN=`cat $PBS_NODEFILE | wc -l`
 echo $PBS_NODEFILE
 cat $PBS_NODEFILE
 echo $NN nodes in node file
 for n in `eval echo {1..$NN}`
 do
 cd $PBS_O_WORKDIR
 THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
 echo Node $n is $THIS_NODE
 DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE
 mkdir -p $DIR
 cd $DIR
 echo $THIS_NODE > nodefile
 #CMD="mpiexec -np 12 -ppn 12  -envall --hostfile nodefile \
 #	     ../../gpu_tile_compact.sh \
 #	     ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \
 #		--shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap"
 CMD="mpiexec -np 12 -ppn 12  -envall --hostfile nodefile \
 	     ../../gpu_tile_compact.sh \
 	     ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \
 		--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap"
 echo $CMD > command-line
 env > environment
 $CMD &
 done
 # Suspect that `wait` allows jobs to collide and knock each other out; use a fixed sleep instead
 #wait
 sleep 6500
 # Collect per-node failure reports and remove core files.
 # DIR is relative to $PBS_O_WORKDIR, and the launch loop above leaves the
 # shell inside the last node's directory, so each node directory is entered
 # via an absolute path. If a directory is missing the node is skipped rather
 # than running grep/rm in whatever directory the shell happens to be in.
 for n in ` eval echo {1..$NN} `
 do
 THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
 DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE
 cd "$PBS_O_WORKDIR/$DIR" || continue
 grep Oops Grid.stderr.* > failures.$PBS_JOBID
 # -f: a node that produced no core files is not an error.
 rm -f core.*
 done
--- a/systems/Aurora/tests/solver/stag16.pbs
+++ b/systems/Aurora/tests/solver/stag16.pbs
@ -0,0 +1,40 @@
 #!/bin/bash
 ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
 #PBS -q EarlyAppAccess
 #PBS -l select=16
 #PBS -l walltime=01:00:00
 #PBS -A LatticeQCD_aesp_CNDA
 #export OMP_PROC_BIND=spread
 #unset OMP_PLACES
 cd $PBS_O_WORKDIR
 source ../../sourceme.sh
 cat $PBS_NODEFILE
 export OMP_NUM_THREADS=3
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
 export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 export MPICH_OFI_NIC_POLICY=GPU
 # 12 ppn, 16 nodes, 192 ranks
 CMD="mpiexec -np 192 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Test_staggered_cg_prec --mpi 2.4.4.6 --grid 128.128.128.192 \
 	     --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000 --comms-overlap"
 $CMD 
--- a/systems/Booster/benchmarks/Benchmark_usqcd.csv
+++ b/systems/Booster/benchmarks/Benchmark_usqcd.csv
@ -0,0 +1,70 @@
 Memory Bandwidth
 Bytes, GB/s per node
 3145728, 225.900365
 50331648, 2858.859504
 254803968, 4145.556367
 805306368, 4905.772480
 1966080000, 4978.312557
 GEMM
 M, N, K, BATCH, GF/s per rank
 16, 8, 16, 256, 1.713639
 16, 16, 16, 256, 288.268316
 16, 32, 16, 256, 597.053950
 32, 8, 32, 256, 557.382591
 32, 16, 32, 256, 1100.145311
 32, 32, 32, 256, 1885.080449
 64, 8, 64, 256, 1725.163599
 64, 16, 64, 256, 3389.336566
 64, 32, 64, 256, 4168.252422
 16, 8, 256, 256, 1326.262134
 16, 16, 256, 256, 2318.095475
 16, 32, 256, 256, 3555.436503
 32, 8, 256, 256, 1920.139170
 32, 16, 256, 256, 3486.174753
 32, 32, 256, 256, 5320.821724
 64, 8, 256, 256, 2539.597502
 64, 16, 256, 256, 5003.456775
 64, 32, 256, 256, 7837.531562
 8, 256, 16, 256, 1427.848170
 16, 256, 16, 256, 2222.147815
 32, 256, 16, 256, 2877.121715
 8, 256, 32, 256, 1922.890086
 16, 256, 32, 256, 3199.469082
 32, 256, 32, 256, 4845.405343
 8, 256, 64, 256, 2639.483343
 16, 256, 64, 256, 5012.800299
 32, 256, 64, 256, 7216.006882
 Communications
 Packet bytes, direction, GB/s per node
 4718592, 2, 206.570734
 4718592, 3, 207.501847
 4718592, 6, 189.730277
 4718592, 7, 204.301218
 15925248, 2, 307.882997
 15925248, 3, 287.901076
 15925248, 6, 295.603109
 15925248, 7, 300.682033
 37748736, 2, 331.740364
 37748736, 3, 338.610627
 37748736, 6, 332.580657
 37748736, 7, 336.336579
 Per node summary table
 L , Wilson, DWF4, Staggered, GF/s per node
 8 , 16, 1165, 10
 12 , 473, 4901, 163
 16 , 1436, 8464, 442
 24 , 4133, 10139, 1530
 32 , 5726, 11487, 2518
--- a/systems/Booster/config-command
+++ b/systems/Booster/config-command
@ -5,10 +5,12 @@ LIME=/p/home/jusers/boyle2/juwels/gm2dwf/boyle/
    --enable-gen-simd-width=64 \
    --enable-shm=nvlink \
    --enable-accelerator=cuda \
    --disable-gparity \
    --disable-fermion-reps \
    --with-lime=$LIME \
-    --disable-accelerator-cshift \
+    --enable-accelerator-cshift \
    --disable-unified \
    CXX=nvcc \
    LDFLAGS="-cudart shared " \
-    CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++14 -cudart shared"
+    CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++17 -cudart shared -lcublas"
--- a/systems/Booster/sourceme.sh
+++ b/systems/Booster/sourceme.sh
@ -1,5 +1,5 @@
-module load GCC/9.3.0       
+module load GCC
-module load  GMP/6.2.0   
+module load GMP
-module load MPFR/4.1.0     
+module load MPFR
-module load OpenMPI/4.1.0rc1  
+module load OpenMPI
-module load CUDA/11.3
+module load CUDA
--- a/systems/Frontier/config-command
+++ b/systems/Frontier/config-command
@ -16,7 +16,7 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
 --disable-fermion-reps \
 CXX=hipcc MPICXX=mpicxx \
 CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 -fgpu-sanitize" \
- LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 "
+ LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64  -lhipblas -lrocblas"
--- a/systems/PVC-OEM/setup.sh
+++ b/systems/PVC-OEM/setup.sh
@ -1,3 +1,5 @@
 export https_proxy=http://proxy-chain.intel.com:911
 module load intel-release
 module load intel/mpich
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
--- a/systems/Sunspot/config-command
+++ b/systems/Sunspot/config-command
@ -1,4 +1,4 @@
-TOOLS=$HOME/tools
+
 ../../configure \
 	--enable-simd=GPU \
 	--enable-gen-simd-width=64 \
@ -11,6 +11,6 @@ TOOLS=$HOME/tools
 	--enable-unified=no \
 	MPICXX=mpicxx \
 	CXX=icpx \
-	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/" \
+	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl" \
-	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include"
+	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel"
--- a/systems/Sunspot/sourceme.sh
+++ b/systems/Sunspot/sourceme.sh
@ -0,0 +1,2 @@
 module load oneapi/eng-compiler/2023.05.15.003
 module load mpich/51.2/icc-all-deterministic-pmix-gpu
--- a/systems/Sunspot/tests/repro1gpu.pbs
+++ b/systems/Sunspot/tests/repro1gpu.pbs
@ -0,0 +1,81 @@
 #!/bin/bash
 #PBS -l select=16:system=sunspot,place=scatter
 #PBS -A LatticeQCD_aesp_CNDA
 #PBS -l walltime=02:00:00
 #PBS -N repro1gpu
 #PBS -k doe
 #export OMP_PROC_BIND=spread
 #unset OMP_PLACES
 module load oneapi/eng-compiler/2023.05.15.003
 module load mpich/51.2/icc-all-deterministic-pmix-gpu
 # 56 cores / 6 threads ~9
 export OMP_NUM_THREADS=6
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
 export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 export MPICH_OFI_NIC_POLICY=GPU
 export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
 export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
 export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
 unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
 unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
 unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
 cd $PBS_O_WORKDIR
 NN=`cat $PBS_NODEFILE | wc -l`
 echo $PBS_NODEFILE
 cat $PBS_NODEFILE
 echo $NN nodes in node file
 # Launch one single-rank Test_dwf_mixedcg_prec job per (node, GPU tile) pair,
 # all in the background, then block until every job has exited.
 # The outer loop walks the lines of $PBS_NODEFILE (1..$NN); the inner loop
 # walks 12 slots per node (g = 0..11).
 for n in `eval echo {1..$NN}`
 do
 # n-th line of the node file is the host this batch of jobs is pinned to.
 THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
 echo Node $n is $THIS_NODE
 for g in {0..11}
 do
 # Lookup tables indexed by slot g: NUMA domain, tile index and GPU index.
 # They are re-assigned on every iteration with identical contents.
 export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
 export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
 export  GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )
 export numa=${NUMA_MAP[$g]}
 export gpu_id=${GPU_MAP[$g]}
 export tile_id=${TILE_MAP[$g]}
 # Device selector in "<gpu>.<tile>" form, used for the directory name and
 # for ZE_AFFINITY_MASK below.
 export gpu=$gpu_id.$tile_id
 # DIR is relative, so always start from the submission directory.
 cd $PBS_O_WORKDIR
 DIR=repro.1gpu.$PBS_JOBID/node-$n-$THIS_NODE-GPU-$gpu
 mkdir -p $DIR
 cd $DIR
 # Per-job hostfile (single host) and a record of the device selector.
 echo $THIS_NODE > nodefile
 echo $gpu > gpu
 export ZE_AFFINITY_MASK=$gpu
 export ONEAPI_DEVICE_FILTER=gpu,level_zero
 # The binary is addressed as ../../ because the job runs two levels below
 # $PBS_O_WORKDIR; numactl binds CPU and memory to the slot's NUMA domain.
 CMD="mpiexec -np 1 -ppn 1  -envall --hostfile nodefile \
 	     numactl -N $numa -m $numa ../../Test_dwf_mixedcg_prec --mpi 1.1.1.1 --grid 16.16.32.32 \
 		--shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message"
 echo $CMD
 $CMD &
 done
 done
 # Wait for all background jobs started above.
 wait
--- a/systems/Sunspot/tests/reproN.pbs
+++ b/systems/Sunspot/tests/reproN.pbs
@ -0,0 +1,97 @@
 #!/bin/bash
 #PBS -l select=32:system=sunspot,place=scatter
 #PBS -A LatticeQCD_aesp_CNDA
 #PBS -l walltime=02:00:00
 #PBS -N reproN
 #PBS -k doe
 #export OMP_PROC_BIND=spread
 #unset OMP_PLACES
 module load oneapi/eng-compiler/2023.05.15.003
 module load mpich/51.2/icc-all-deterministic-pmix-gpu
 # 56 cores / 6 threads ~9
 export OMP_NUM_THREADS=6
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 #export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1
 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
 export GRID_PRINT_ENTIRE_LOG=0
 export GRID_CHECKSUM_RECV_BUF=1
 export GRID_CHECKSUM_SEND_BUF=0
 export MPICH_OFI_NIC_POLICY=GPU
 export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
 export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
 export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
 unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
 unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
 unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
 cd $PBS_O_WORKDIR
 NN=`cat $PBS_NODEFILE | wc -l`
 echo $PBS_NODEFILE
 cat $PBS_NODEFILE
 echo $NN nodes in node file
 for n in `eval echo {1..$NN}`
 do
 cd $PBS_O_WORKDIR
 THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
 echo Node $n is $THIS_NODE
 DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE
 mkdir -p $DIR
 cd $DIR
 echo $THIS_NODE > nodefile
 #CMD="mpiexec -np 12 -ppn 12  -envall --hostfile nodefile \
 #	     ../../gpu_tile_compact.sh \
 #	     ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \
 #		--shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap"
 CMD="mpiexec -np 12 -ppn 12  -envall --hostfile nodefile \
 	     ../../gpu_tile_compact.sh \
 	     ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \
 		--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap"
 echo $CMD > command-line
 env > environment
 $CMD &
 done
 # Suspect that `wait` allows jobs to collide and knock each other out; use a fixed sleep instead
 #wait
 sleep 6500
 # Collect per-node failure reports and remove core files.
 # DIR is relative to $PBS_O_WORKDIR, and the launch loop above leaves the
 # shell inside the last node's directory, so each node directory is entered
 # via an absolute path. If a directory is missing the node is skipped rather
 # than running grep/rm in whatever directory the shell happens to be in.
 for n in ` eval echo {1..$NN} `
 do
 THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
 DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE
 cd "$PBS_O_WORKDIR/$DIR" || continue
 grep Oops Grid.stderr.* > failures.$PBS_JOBID
 # -f: a node that produced no core files is not an error.
 rm -f core.*
 done
--- a/tests/Test_dwf_mixedcg_prec.cc
+++ b/tests/Test_dwf_mixedcg_prec.cc
@ -30,27 +30,60 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 using namespace std;
 using namespace Grid;
-template<class d>
+#ifndef HOST_NAME_MAX
-struct scal {
+#define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
-  d internal;
+#endif
 NAMESPACE_BEGIN(Grid);
 // Schur-style operator built on SchurOperatorBase<Field> that applies the
 // wrapped matrix's Meooe / MooeeInv / Mooee pieces one at a time and
 // evaluates norm2() on the intermediate field after each step.
 // The norm2 results are stored in a local (nn) that is never read in this
 // class; the prints that used them are commented out.
 // NOTE(review): presumably the norm2 calls are kept for the reduction they
 // trigger between steps (the "paranoid" part) - confirm before removing.
 template<class Matrix,class Field>
  class SchurDiagMooeeOperatorParanoid :  public SchurOperatorBase<Field> {
 public:
    // Matrix whose even/odd pieces are applied; held by reference, not owned.
    Matrix &_Mat;
    SchurDiagMooeeOperatorParanoid (Matrix &Mat): _Mat(Mat){};
    // Applies, in order: Meooe(in)->tmp, MooeeInv(tmp)->out, Meooe(out)->tmp,
    // Mooee(in)->out, then combines out and tmp via axpy(out,-1.0,tmp,out).
    // NOTE(review): assuming axpy(r,a,x,y) computes r = a*x + y, the result is
    // out = Mooee*in - Meooe*MooeeInv*Meooe*in - confirm against axpy's definition.
    virtual  void Mpc      (const Field &in, Field &out) {
      Field tmp(in.Grid());
      // tmp lives on the opposite checkerboard to the input.
      tmp.Checkerboard() = !in.Checkerboard();
      //      std::cout <<" Mpc starting"<<std::endl;
      RealD nn = norm2(in); // std::cout <<" Mpc Prior to dslash norm is "<<nn<<std::endl;
      _Mat.Meooe(in,tmp);
      nn = norm2(tmp); //std::cout <<" Mpc Prior to Mooeinv "<<nn<<std::endl;
      _Mat.MooeeInv(tmp,out);
      nn = norm2(out); //std::cout <<" Mpc Prior to dslash norm is "<<nn<<std::endl;
      _Mat.Meooe(out,tmp);
      nn = norm2(tmp); //std::cout <<" Mpc Prior to Mooee "<<nn<<std::endl;
      _Mat.Mooee(in,out);
      nn = norm2(out); //std::cout <<" Mpc Prior to axpy "<<nn<<std::endl;
      axpy(out,-1.0,tmp,out);
    }
    // Same sequence as Mpc using the *Dag variants of each matrix piece.
    // NOTE(review): unlike Mpc, tmp.Checkerboard() is not set here and there is
    // no norm2(tmp) after the first MeooeDag - confirm this asymmetry is intended.
    virtual void MpcDag   (const Field &in, Field &out){
      Field tmp(in.Grid());
      //      std::cout <<" MpcDag starting"<<std::endl;
      RealD nn = norm2(in);// std::cout <<" MpcDag Prior to dslash norm is "<<nn<<std::endl;
      _Mat.MeooeDag(in,tmp);
      _Mat.MooeeInvDag(tmp,out);
      nn = norm2(out);// std::cout <<" MpcDag Prior to dslash norm is "<<nn<<std::endl;
      _Mat.MeooeDag(out,tmp);
      nn = norm2(tmp);// std::cout <<" MpcDag Prior to Mooee "<<nn<<std::endl;
      _Mat.MooeeDag(in,out);
      nn = norm2(out);// std::cout <<" MpcDag Prior to axpy "<<nn<<std::endl;
      axpy(out,-1.0,tmp,out);
    }
 };
-  Gamma::Algebra Gmu [] = {
+NAMESPACE_END(Grid);
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaY,
    Gamma::Algebra::GammaZ,
    Gamma::Algebra::GammaT
  };
 int main (int argc, char ** argv)
 {
  char hostname[HOST_NAME_MAX+1];
  gethostname(hostname, HOST_NAME_MAX+1);
  std::string host(hostname);
  Grid_init(&argc,&argv);
  const int Ls=12;
  std::cout << GridLogMessage << "::::: NB: to enable a quick bit reproducibility check use the --checksums flag. " << std::endl;
  { 
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
@ -89,10 +122,17 @@ int main (int argc, char ** argv)
  result_o_2.Checkerboard() = Odd;
  result_o_2 = Zero();
-  SchurDiagMooeeOperator<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
+  SchurDiagMooeeOperatorParanoid<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
-  SchurDiagMooeeOperator<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f);
+  SchurDiagMooeeOperatorParanoid<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f);
  int nsecs=600;
  if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){
    std::string arg = GridCmdOptionPayload(argv,argv+argc,"--seconds");
    GridCmdOptionInt(arg,nsecs);
  }
  std::cout << GridLogMessage << "::::::::::::: Starting mixed CG for "<<nsecs <<" seconds" << std::endl;
  std::cout << GridLogMessage << "::::::::::::: Starting mixed CG" << std::endl;
  MixedPrecisionConjugateGradient<LatticeFermionD,LatticeFermionF> mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO);
  double t1,t2,flops;
  double MdagMsiteflops = 1452; // Mobius (real coeffs)
@ -101,7 +141,26 @@ int main (int argc, char ** argv)
  std:: cout << " MdagM site flops = "<< 4*MdagMsiteflops<<std::endl;
  std:: cout << " CG    site flops = "<< CGsiteflops <<std::endl;
  int iters;
-  for(int i=0;i<10;i++){
+
  time_t start = time(NULL);
  FlightRecorder::ContinueOnFail = 0;
  FlightRecorder::PrintEntireLog = 0;
  FlightRecorder::ChecksumComms  = 1;
  FlightRecorder::ChecksumCommsSend=0;
  if(char *s=getenv("GRID_PRINT_ENTIRE_LOG"))  FlightRecorder::PrintEntireLog     = atoi(s);
  if(char *s=getenv("GRID_CHECKSUM_RECV_BUF")) FlightRecorder::ChecksumComms      = atoi(s);
  if(char *s=getenv("GRID_CHECKSUM_SEND_BUF")) FlightRecorder::ChecksumCommsSend  = atoi(s);
  int iter=0;
  do {
    if ( iter == 0 ) {
      FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
    } else {
      FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeVerify);
    }
    std::cerr << "******************* SINGLE PRECISION SOLVE "<<iter<<std::endl;
    result_o = Zero();
    t1=usecond();
    mCG(src_o,result_o);
@ -111,10 +170,24 @@ int main (int argc, char ** argv)
    flops+= CGsiteflops*FrbGrid->gSites()*iters;
    std::cout << " SinglePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
    std::cout << " SinglePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
-  }
+    std::cout << " SinglePrecision error count "<< FlightRecorder::ErrorCount()<<std::endl;
-  std::cout << GridLogMessage << "::::::::::::: Starting regular CG" << std::endl;
+
    assert(FlightRecorder::ErrorCount()==0);
    std::cout << " FlightRecorder is OK! "<<std::endl;
    iter ++;
  } while (time(NULL) < (start + nsecs/10) );
  std::cout << GridLogMessage << "::::::::::::: Starting double precision CG" << std::endl;
  ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
-  for(int i=0;i<1;i++){
+  int i=0;
  do { 
    if ( i == 0 ) {
      FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
    } else {
      FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeVerify);
    }
    std::cerr << "******************* DOUBLE PRECISION SOLVE "<<i<<std::endl;
    result_o_2 = Zero();
    t1=usecond();
    CG(HermOpEO,src_o,result_o_2);
@ -122,46 +195,20 @@ int main (int argc, char ** argv)
    iters = CG.IterationsToComplete;
    flops = MdagMsiteflops*4*FrbGrid->gSites()*iters; 
    flops+= CGsiteflops*FrbGrid->gSites()*iters;
-    
+
    std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
    std::cout << " DoublePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
-  }
+    std::cout << " DoublePrecision error count "<< FlightRecorder::ErrorCount()<<std::endl;
-  
+    assert(FlightRecorder::ErrorCount()==0);
-  //  MemoryManager::Print();
+    std::cout << " FlightRecorder is OK! "<<std::endl;
    i++;
  } while (time(NULL) < (start + nsecs) );
  LatticeFermionD diff_o(FrbGrid);
  RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2);
  std::cout << GridLogMessage << "::::::::::::: Diff between mixed and regular CG: " << diff << std::endl;
-
+  assert(diff < 1e-4);
  #ifdef HAVE_LIME
  if( GridCmdOptionExists(argv,argv+argc,"--checksums") ){
  std::string file1("./Propagator1");
  emptyUserRecord record;
  uint32_t nersc_csum;
  uint32_t scidac_csuma;
  uint32_t scidac_csumb;
  typedef SpinColourVectorD   FermionD;
  typedef vSpinColourVectorD vFermionD;
  BinarySimpleMunger<FermionD,FermionD> munge;
  std::string format = getFormatString<vFermionD>();
  BinaryIO::writeLatticeObject<vFermionD,FermionD>(result_o,file1,munge, 0, format,
 						   nersc_csum,scidac_csuma,scidac_csumb);
  std::cout << GridLogMessage << " Mixed checksums "<<std::hex << scidac_csuma << " "<<scidac_csumb<<std::endl;
  BinaryIO::writeLatticeObject<vFermionD,FermionD>(result_o_2,file1,munge, 0, format,
 						   nersc_csum,scidac_csuma,scidac_csumb);
  std::cout << GridLogMessage << " CG checksums "<<std::hex << scidac_csuma << " "<<scidac_csumb<<std::endl;
  }
  #endif
  }
  MemoryManager::Print();
  Grid_finalize();
 }
Author	SHA1	Message	Date
Peter Boyle	da59379612	Large reg file for double	2024-03-26 17:03:20 +00:00
Peter Boyle	3ef2a41518	ifdef guard omitted	2024-03-26 14:50:32 +00:00
Peter Boyle	aa96f420c6	Accelerator-aware MPI guard on the Unix domain sockets	2024-03-26 14:41:25 +00:00
Peter Boyle	49e9e4ed0e	Fences	2024-03-26 14:14:06 +00:00
Peter Boyle	f7b8163016	Deterministic MPI reduce options	2024-03-26 14:11:40 +00:00
Peter Boyle	93769eacd3	Updated configure for bounce through host	2024-03-26 14:10:24 +00:00
Peter Boyle	59b0cc11df	Reduce the time in single	2024-03-26 00:42:40 +00:00
Peter Boyle	f32c275376	Updated config options for MPI not being aware of GPU	2024-03-26 00:42:00 +00:00
Peter Boyle	5404fc66ab	Merge needs a fence on SYCL	2024-03-26 00:38:41 +00:00
Peter Boyle	1f53458af8	Options to bounce through a host buffer if --disable-accelerator-aware-mpi	2024-03-26 00:37:19 +00:00
Peter Boyle	434c3e7f1d	We have a choice of GET or PUT across NVlink	2024-03-25 14:32:44 +00:00
Peter Boyle	500b119f3d	Deterministic MPI	2024-03-22 15:55:23 +00:00
Peter Boyle	4b87259c1b	New config command for sunspot	2024-03-22 15:43:49 +00:00
Peter Boyle	503dec34ef	This appears working now on Sunspot	2024-03-22 15:43:30 +00:00
Peter Boyle	d1e9fe50d2	Xor csum for repro testing	2024-03-22 15:42:57 +00:00
Peter Boyle	d01e5fa838	Improved FlightRecorder	2024-03-22 15:42:32 +00:00
Peter Boyle	a477c25e8c	Sunspot repro tests	2024-03-22 15:42:11 +00:00
Peter Boyle	1bd20cd9e8	FlightRecorder	2024-03-22 15:40:01 +00:00
Peter Boyle	e49e95b037	Upgrade of the Britney test with flight recorder and fast xor checksum	2024-03-22 15:39:27 +00:00
Peter Boyle	6f59fed563	Flight recorder, resurrecting the "world famous" Britney test	2024-03-22 15:32:32 +00:00
Peter Boyle	60b7f6c99d	Flight recorder, resurrecting the "world famous" Britney test	2024-03-22 15:32:26 +00:00
Peter Boyle	b92dfcc8d3	Flight recorder, resurrecting the "world famous" Britney test	2024-03-22 15:30:27 +00:00
Peter Boyle	f6fd6dd053	Flight recorder, resurrecting the "world famous" Britney test	2024-03-22 15:30:01 +00:00
Peter Boyle	79ad567dd5	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2024-03-19 15:43:42 +00:00
Peter Boyle	fab1efb48c	More britney logging improvements	2024-03-19 14:36:21 +00:00
Peter Boyle	660eb76d93	FFTW from OneAPI	2024-03-19 14:28:33 +00:00
Peter Boyle	62e7bf024a	Updated flight logging for Britney test	2024-03-12 20:10:04 +00:00
Peter Boyle	95f3d69cf9	Extra hardware test hook	2024-03-12 20:09:37 +00:00
Peter Boyle	89c0519f83	Repro test	2024-03-12 16:11:33 +00:00
Peter Boyle	2704b82084	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2024-03-12 15:16:24 +00:00
Peter Boyle	cf8632bbac	Britney test option	2024-03-12 15:15:35 +00:00
Peter Boyle	d224297972	PBS scripts	2024-03-12 15:15:16 +00:00
Peter Boyle	a4d11a630f	Merge pull request #458 from paboyle/fix/HOST_NAME_MAX fallback to _POSIX_HOST_NAME_MAX if HOST_NAME_MAX is not defined	2024-03-07 07:50:25 -05:00
Antonin Portelli	2b4399f8b1	more HOST_NAME_MAX fix	2024-03-07 15:26:01 +09:00
Antonin Portelli	f17b8de907	fallback to _POSIX_HOST_NAME_MAX if HOST_NAME_MAX is not defined	2024-03-07 15:22:08 +09:00
Peter Boyle	7e5bd46dd3	Booster update	2024-03-06 19:03:45 +01:00
Peter Boyle	228bbb9d81	Benchmark results	2024-03-06 19:03:35 +01:00
Peter Boyle	b812a7b4c6	Staggered launch script	2024-03-06 01:32:40 +00:00
Peter Boyle	891a366f73	Repro CG script	2024-03-06 01:22:55 +00:00
Peter Boyle	10116b3be8	Force device copyable and tell SYCL to shut it.	2024-03-06 01:13:27 +00:00
Peter Boyle	a46a0f0882	force device copyable and don't take crap from SYCL	2024-03-06 01:12:49 +00:00
Peter Boyle	a26a8a38f4	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2024-03-06 00:05:00 +00:00
Peter Boyle	7435315d50	More blasted shell variables	2024-03-06 00:03:59 +00:00
Peter Boyle	9b5f741e85	Reproducing CG can be more useful now	2024-03-06 00:03:16 +00:00
Peter Boyle	517822fdd2	SPR HBM benchmarking right and also PVC batched GEMM	2024-03-06 00:02:27 +00:00
Peter Boyle	1b93a9be88	Print out the hostname	2024-03-06 00:01:58 +00:00
Peter Boyle	783a66b348	Deterministic reduction please	2024-03-06 00:01:37 +00:00
Peter Boyle	976c3e9b59	Hack for flight logging CG inner products. Can be made to work, but could put in some more serious infrastructure for repro testing and blame attribution (Britney test) if necessary	2024-03-05 23:59:57 +00:00
Peter Boyle	f8ca971dae	Use of a bare PRECISION macro is not namespace safe and collides with SYCL	2024-03-05 23:59:13 +00:00
Peter Boyle	21bc8c24df	OneMKL batched blas starting	2024-03-05 23:58:20 +00:00
Peter Boyle	30228214f7	SYCL conflict with Eigen	2024-03-05 23:56:10 +00:00
Peter Boyle	2ae980ae43	Update sourceme.sh	2024-03-05 13:39:18 -05:00
Peter Boyle	6153dec2e4	Update setup.sh	2024-03-05 13:38:32 -05:00
Peter Boyle	c805f86343	USQCD benchmark	2024-03-01 00:05:04 -05:00
		`@ -0,0 +1,2 @@`
							`module load oneapi/eng-compiler/2023.05.15.003`
							`module load mpich/51.2/icc-all-deterministic-pmix-gpu`