From 6153dec2e40dc711ce2eccd49b1f61e8704d1d69 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@users.noreply.github.com>
Date: Tue, 5 Mar 2024 13:38:32 -0500
Subject: [PATCH 01/17] Update setup.sh

---
 systems/PVC-OEM/setup.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/systems/PVC-OEM/setup.sh b/systems/PVC-OEM/setup.sh
index 3b8188f0..0e780ef4 100644
--- a/systems/PVC-OEM/setup.sh
+++ b/systems/PVC-OEM/setup.sh
@@ -1,3 +1,5 @@
 export https_proxy=http://proxy-chain.intel.com:911
 module load intel-release
 module load intel/mpich
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"

From 2ae980ae439b92285341ec5777c9c4b5ec293547 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@users.noreply.github.com>
Date: Tue, 5 Mar 2024 13:39:18 -0500
Subject: [PATCH 02/17] Update sourceme.sh

---
 systems/Aurora/sourceme.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/systems/Aurora/sourceme.sh b/systems/Aurora/sourceme.sh
index 7a2b3815..60abed41 100644
--- a/systems/Aurora/sourceme.sh
+++ b/systems/Aurora/sourceme.sh
@@ -9,4 +9,5 @@ export http_proxy=http://proxy.alcf.anl.gov:3128
 export https_proxy=http://proxy.alcf.anl.gov:3128
 #export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
 git config --global http.proxy http://proxy.alcf.anl.gov:3128
- 
+
+export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"

From 30228214f737c002ca2c5636b73bc23af59a88ae Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ed.ac.uk>
Date: Tue, 5 Mar 2024 23:56:10 +0000
Subject: [PATCH 03/17] SYCL conflict with Eigen

---
 Grid/Grid_Eigen_Dense.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Grid/Grid_Eigen_Dense.h b/Grid/Grid_Eigen_Dense.h
index bdd39a65..8bd1d113 100644
--- a/Grid/Grid_Eigen_Dense.h
+++ b/Grid/Grid_Eigen_Dense.h
@@ -34,7 +34,7 @@
 #pragma push_macro("__SYCL_DEVICE_ONLY__")
 #undef __SYCL_DEVICE_ONLY__
 #define EIGEN_DONT_VECTORIZE
-//#undef EIGEN_USE_SYCL
+#undef EIGEN_USE_SYCL
 #define __SYCL__REDEFINE__
 #endif
 

From 21bc8c24df800a480c3d36c5a135f9512658631f Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ed.ac.uk>
Date: Tue, 5 Mar 2024 23:58:20 +0000
Subject: [PATCH 04/17] OneMKL batched blas starting

---
 Grid/algorithms/blas/BatchedBlas.h | 40 ++++++++++++++++++++++++------
 1 file changed, 33 insertions(+), 7 deletions(-)

diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h
index 2924350d..5ab46333 100644
--- a/Grid/algorithms/blas/BatchedBlas.h
+++ b/Grid/algorithms/blas/BatchedBlas.h
@@ -34,9 +34,14 @@ Author: Peter Boyle <pboyle@bnl.gov>
 #include <hipblas/hipblas.h>
 #endif
 #ifdef GRID_SYCL
-#error // need oneMKL version
+#include <oneapi/mkl.hpp>
+#endif
+#if 0
+#define GRID_ONE_MKL
+#endif
+#ifdef GRID_ONE_MKL
+#include <oneapi/mkl.hpp>
 #endif
-
 ///////////////////////////////////////////////////////////////////////	  
 // Need to rearrange lattice data to be in the right format for a
 // batched multiply. Might as well make these static, dense packed
@@ -49,9 +54,12 @@ NAMESPACE_BEGIN(Grid);
   typedef cudablasHandle_t gridblasHandle_t;
 #endif
 #ifdef GRID_SYCL
-  typedef int32_t gridblasHandle_t;
+  typedef cl::sycl::queue *gridblasHandle_t;
 #endif
-#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+#ifdef GRID_ONE_MKL
+  typedef cl::sycl::queue *gridblasHandle_t;
+#endif
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
   typedef int32_t gridblasHandle_t;
 #endif
 
@@ -76,6 +84,12 @@ public:
       hipblasCreate(&gridblasHandle);
 #endif
 #ifdef GRID_SYCL
+      gridblasHandle = theGridAccelerator;
+#endif
+#ifdef GRID_ONE_MKL
+      cl::sycl::cpu_selector selector;
+      cl::sycl::device selectedDevice { selector };
+      gridblasHandle =new sycl::queue (selectedDevice);
 #endif
       gridblasInit=1;
     }
@@ -110,6 +124,9 @@ public:
 #endif
 #ifdef GRID_SYCL
     accelerator_barrier();
+#endif
+#ifdef GRID_ONE_MKL
+    gridblasHandle->wait();
 #endif
   }
   
@@ -644,10 +661,19 @@ public:
 			      (cuDoubleComplex *) Cmn, ldc, sdc,
 			      batchCount);
 #endif
-#ifdef GRID_SYCL
-     #warning "oneMKL implementation not made "
+#if defined(GRID_SYCL) || defined(GRID_ONE_MKL)
+    oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
+						oneapi::mkl::transpose::N,
+						oneapi::mkl::transpose::N,
+						m,n,k,
+						alpha,
+						(const ComplexD *)Amk,lda,sda,
+						(const ComplexD *)Bkn,ldb,sdb,
+						beta,
+						(ComplexD *)Cmn,ldc,sdc,
+						batchCount);
 #endif
-#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
      // Need a default/reference implementation
      for (int p = 0; p < batchCount; ++p) {
        for (int mm = 0; mm < m; ++mm) {

From f8ca971daedc087fbb1f554705d27d6af44eef6f Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ed.ac.uk>
Date: Tue, 5 Mar 2024 23:59:13 +0000
Subject: [PATCH 05/17] Use of a bare PRECISION macro is not namespace safe and
 collides with SYCL

---
 Grid/algorithms/approx/Zolotarev.cc | 96 ++++++++++++++---------------
 Grid/algorithms/approx/Zolotarev.h  | 11 ++--
 2 files changed, 54 insertions(+), 53 deletions(-)

diff --git a/Grid/algorithms/approx/Zolotarev.cc b/Grid/algorithms/approx/Zolotarev.cc
index c2efd41c..47779eae 100644
--- a/Grid/algorithms/approx/Zolotarev.cc
+++ b/Grid/algorithms/approx/Zolotarev.cc
@@ -293,7 +293,7 @@ static void sncndnFK(INTERNAL_PRECISION u, INTERNAL_PRECISION k,
  * Set type = 0 for the Zolotarev approximation, which is zero at x = 0, and
  * type = 1 for the approximation which is infinite at x = 0. */
 
-zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
+zolotarev_data* zolotarev(ZOLO_PRECISION epsilon, int n, int type) {
   INTERNAL_PRECISION A, c, cp, kp, ksq, sn, cn, dn, Kp, Kj, z, z0, t, M, F,
     l, invlambda, xi, xisq, *tv, s, opl;
   int m, czero, ts;
@@ -375,12 +375,12 @@ zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
   construct_partfrac(d);
   construct_contfrac(d);
 
-  /* Converting everything to PRECISION for external use only */
+  /* Converting everything to ZOLO_PRECISION for external use only */
 
   zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
-  zd -> A = (PRECISION) d -> A;
-  zd -> Delta = (PRECISION) d -> Delta;
-  zd -> epsilon = (PRECISION) d -> epsilon;
+  zd -> A = (ZOLO_PRECISION) d -> A;
+  zd -> Delta = (ZOLO_PRECISION) d -> Delta;
+  zd -> epsilon = (ZOLO_PRECISION) d -> epsilon;
   zd -> n = d -> n;
   zd -> type = d -> type;
   zd -> dn = d -> dn;
@@ -390,24 +390,24 @@ zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
   zd -> deg_num = d -> deg_num;
   zd -> deg_denom = d -> deg_denom;
 
-  zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
-  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
+  zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m];
   free(d -> a);
 
-  zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
-  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
+  zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m];
   free(d -> ap);
 
-  zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
-  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
+  zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m];
   free(d -> alpha);
 
-  zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
-  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
+  zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m];
   free(d -> beta);
 
-  zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
-  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
+  zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m];
   free(d -> gamma);
 
   free(d);
@@ -426,7 +426,7 @@ void zolotarev_free(zolotarev_data *zdata)
 }
 
 
-zolotarev_data* higham(PRECISION epsilon, int n) {
+zolotarev_data* higham(ZOLO_PRECISION epsilon, int n) {
   INTERNAL_PRECISION A, M, c, cp, z, z0, t, epssq;
   int m, czero;
   zolotarev_data *zd;
@@ -481,9 +481,9 @@ zolotarev_data* higham(PRECISION epsilon, int n) {
   /* Converting everything to PRECISION for external use only */
 
   zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
-  zd -> A = (PRECISION) d -> A;
-  zd -> Delta = (PRECISION) d -> Delta;
-  zd -> epsilon = (PRECISION) d -> epsilon;
+  zd -> A = (ZOLO_PRECISION) d -> A;
+  zd -> Delta = (ZOLO_PRECISION) d -> Delta;
+  zd -> epsilon = (ZOLO_PRECISION) d -> epsilon;
   zd -> n = d -> n;
   zd -> type = d -> type;
   zd -> dn = d -> dn;
@@ -493,24 +493,24 @@ zolotarev_data* higham(PRECISION epsilon, int n) {
   zd -> deg_num = d -> deg_num;
   zd -> deg_denom = d -> deg_denom;
 
-  zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
-  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
+  zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m];
   free(d -> a);
 
-  zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
-  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
+  zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m];
   free(d -> ap);
 
-  zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
-  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
+  zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m];
   free(d -> alpha);
 
-  zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
-  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
+  zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m];
   free(d -> beta);
 
-  zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
-  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
+  zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m];
   free(d -> gamma);
 
   free(d);
@@ -523,17 +523,17 @@ NAMESPACE_END(Grid);
 #ifdef TEST
 
 #undef ZERO
-#define ZERO ((PRECISION) 0)
+#define ZERO ((ZOLO_PRECISION) 0)
 #undef ONE
-#define ONE ((PRECISION) 1)
+#define ONE ((ZOLO_PRECISION) 1)
 #undef TWO
-#define TWO ((PRECISION) 2)
+#define TWO ((ZOLO_PRECISION) 2)
 
 /* Evaluate the rational approximation R(x) using the factored form */
 
-static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) {
+static ZOLO_PRECISION zolotarev_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
   int m;
-  PRECISION R;
+  ZOLO_PRECISION R;
 
   if (rdata -> type == 0) {
     R = rdata -> A * x;
@@ -551,9 +551,9 @@ static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) {
 
 /* Evaluate the rational approximation R(x) using the partial fraction form */
 
-static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) {
+static ZOLO_PRECISION zolotarev_partfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
   int m;
-  PRECISION R = rdata -> alpha[rdata -> da - 1];
+  ZOLO_PRECISION R = rdata -> alpha[rdata -> da - 1];
   for (m = 0; m < rdata -> dd; m++)
     R += rdata -> alpha[m] / (x * x - rdata -> ap[m]);
   if (rdata -> type == 1) R += rdata -> alpha[rdata -> dd] / (x * x);
@@ -568,18 +568,18 @@ static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) {
  * non-signalling overflow this will work correctly since 1/(1/0) = 1/INF = 0,
  * but with signalling overflow you will get an error message. */
 
-static PRECISION zolotarev_contfrac_eval(PRECISION x, zolotarev_data* rdata) {
+static ZOLO_PRECISION zolotarev_contfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
   int m;
-  PRECISION R = rdata -> beta[0] * x;
+  ZOLO_PRECISION R = rdata -> beta[0] * x;
   for (m = 1; m < rdata -> db; m++) R = rdata -> beta[m] * x + ONE / R;
   return R;
 }    
 
 /* Evaluate the rational approximation R(x) using Cayley form */
 
-static PRECISION zolotarev_cayley_eval(PRECISION x, zolotarev_data* rdata) {
+static ZOLO_PRECISION zolotarev_cayley_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
   int m;
-  PRECISION T;
+  ZOLO_PRECISION T;
 
   T = rdata -> type == 0 ? ONE : -ONE;
   for (m = 0; m < rdata -> n; m++)
@@ -607,7 +607,7 @@ int main(int argc, char** argv) {
   int m, n, plotpts = 5000, type = 0;
   float eps, x, ypferr, ycferr, ycaylerr, maxypferr, maxycferr, maxycaylerr;
   zolotarev_data *rdata;
-  PRECISION y;
+  ZOLO_PRECISION y;
   FILE *plot_function, *plot_error, 
     *plot_partfrac, *plot_contfrac, *plot_cayley;
 
@@ -626,13 +626,13 @@ int main(int argc, char** argv) {
   }
 
   rdata = type == 2 
-    ? higham((PRECISION) eps, n) 
-    : zolotarev((PRECISION) eps, n, type);
+    ? higham((ZOLO_PRECISION) eps, n) 
+    : zolotarev((ZOLO_PRECISION) eps, n, type);
 
   printf("Zolotarev Test: R(epsilon = %g, n = %d, type = %d)\n\t" 
 	 STRINGIFY(VERSION) "\n\t" STRINGIFY(HVERSION)
 	 "\n\tINTERNAL_PRECISION = " STRINGIFY(INTERNAL_PRECISION)
-	 "\tPRECISION = " STRINGIFY(PRECISION)
+	 "\tZOLO_PRECISION = " STRINGIFY(ZOLO_PRECISION)
 	 "\n\n\tRational approximation of degree (%d,%d), %s at x = 0\n"
 	 "\tDelta = %g (maximum error)\n\n"
 	 "\tA = %g (overall factor)\n",
@@ -681,15 +681,15 @@ int main(int argc, char** argv) {
     x = 2.4 * (float) m / plotpts - 1.2;
     if (rdata -> type == 0 || fabs(x) * (float) plotpts > 1.0) {
       /* skip x = 0 for type 1, as R(0) is singular */
-      y = zolotarev_eval((PRECISION) x, rdata);
+      y = zolotarev_eval((ZOLO_PRECISION) x, rdata);
       fprintf(plot_function, "%g %g\n", x, (float) y);
       fprintf(plot_error, "%g %g\n",
 	      x, (float)((y - ((x > 0.0 ? ONE : -ONE))) / rdata -> Delta));
-      ypferr = (float)((zolotarev_partfrac_eval((PRECISION) x, rdata) - y)
+      ypferr = (float)((zolotarev_partfrac_eval((ZOLO_PRECISION) x, rdata) - y)
 		       / rdata -> Delta);
-      ycferr = (float)((zolotarev_contfrac_eval((PRECISION) x, rdata) - y)
+      ycferr = (float)((zolotarev_contfrac_eval((ZOLO_PRECISION) x, rdata) - y)
 		       / rdata -> Delta);
-      ycaylerr = (float)((zolotarev_cayley_eval((PRECISION) x, rdata) - y)
+      ycaylerr = (float)((zolotarev_cayley_eval((ZOLO_PRECISION) x, rdata) - y)
 		       / rdata -> Delta);
       if (fabs(x) < 1.0 && fabs(x) > rdata -> epsilon) {
 	maxypferr = MAX(maxypferr, fabs(ypferr));
diff --git a/Grid/algorithms/approx/Zolotarev.h b/Grid/algorithms/approx/Zolotarev.h
index 800cf3c7..3c983cd3 100644
--- a/Grid/algorithms/approx/Zolotarev.h
+++ b/Grid/algorithms/approx/Zolotarev.h
@@ -9,10 +9,10 @@ NAMESPACE_BEGIN(Approx);
 #define HVERSION Header Time-stamp: <14-OCT-2004 09:26:51.00 adk@MISSCONTRARY>
 
 #ifndef ZOLOTAREV_INTERNAL
-#ifndef PRECISION
-#define PRECISION double
+#ifndef ZOLO_PRECISION
+#define ZOLO_PRECISION double
 #endif
-#define ZPRECISION PRECISION
+#define ZPRECISION ZOLO_PRECISION
 #define ZOLOTAREV_DATA zolotarev_data
 #endif
 
@@ -77,8 +77,8 @@ typedef struct {
  * zolotarev_data structure. The arguments must satisfy the constraints that
  * epsilon > 0, n > 0, and type = 0 or 1. */
 
-ZOLOTAREV_DATA* higham(PRECISION epsilon, int n) ;
-ZOLOTAREV_DATA* zolotarev(PRECISION epsilon, int n, int type);
+ZOLOTAREV_DATA* higham(ZOLO_PRECISION epsilon, int n) ;
+ZOLOTAREV_DATA* zolotarev(ZOLO_PRECISION epsilon, int n, int type);
 void zolotarev_free(zolotarev_data *zdata);
 #endif
 
@@ -86,3 +86,4 @@ void zolotarev_free(zolotarev_data *zdata);
 NAMESPACE_END(Approx);
 NAMESPACE_END(Grid);
 #endif
+

From 976c3e9b59a10139611886b1cf97e40e3470cbfd Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ed.ac.uk>
Date: Tue, 5 Mar 2024 23:59:57 +0000
Subject: [PATCH 06/17] Hack for flight logging CG inner products. Can be made
 to work, but could put in some more serious infrastructure for repro testing
 and blame attribution (Britney test) if necessary

---
 Grid/lattice/Lattice.h           | 2 +-
 Grid/lattice/Lattice_crc.h       | 4 ++--
 Grid/lattice/Lattice_reduction.h | 1 +
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/Grid/lattice/Lattice.h b/Grid/lattice/Lattice.h
index 6343db99..79572949 100644
--- a/Grid/lattice/Lattice.h
+++ b/Grid/lattice/Lattice.h
@@ -35,6 +35,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_transpose.h>
 #include <Grid/lattice/Lattice_local.h>
 #include <Grid/lattice/Lattice_reduction.h>
+#include <Grid/lattice/Lattice_crc.h>
 #include <Grid/lattice/Lattice_peekpoke.h>
 #include <Grid/lattice/Lattice_reality.h>
 #include <Grid/lattice/Lattice_real_imag.h>
@@ -46,5 +47,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_unary.h>
 #include <Grid/lattice/Lattice_transfer.h>
 #include <Grid/lattice/Lattice_basis.h>
-#include <Grid/lattice/Lattice_crc.h>
 #include <Grid/lattice/PaddedCell.h>
diff --git a/Grid/lattice/Lattice_crc.h b/Grid/lattice/Lattice_crc.h
index 142e2349..e31d8441 100644
--- a/Grid/lattice/Lattice_crc.h
+++ b/Grid/lattice/Lattice_crc.h
@@ -42,13 +42,13 @@ template<class vobj> void DumpSliceNorm(std::string s,Lattice<vobj> &f,int mu=-1
   }
 }
 
-template<class vobj> uint32_t crc(Lattice<vobj> & buf)
+template<class vobj> uint32_t crc(const Lattice<vobj> & buf)
 {
   autoView( buf_v , buf, CpuRead);
   return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites());
 }
 
-#define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
+#define CRC(U) std::cerr << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
 
 NAMESPACE_END(Grid);
 
diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h
index 66788a4c..3d4c4b03 100644
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -285,6 +285,7 @@ template<class vobj>
 inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
   GridBase *grid = left.Grid();
   ComplexD nrm = rankInnerProduct(left,right);
+  //  std::cerr<<"flight log " << std::hexfloat << nrm <<" "<<crc(left)<<std::endl;
   grid->GlobalSum(nrm);
   return nrm;
 }

From 783a66b3485169888000cdb735c34617126f47f3 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ed.ac.uk>
Date: Wed, 6 Mar 2024 00:01:37 +0000
Subject: [PATCH 07/17] Deterministic reduction please

---
 Grid/threads/Accelerator.h | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index f6efdee9..392cba61 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -255,17 +255,13 @@ inline int  acceleratorIsCommunicable(void *ptr)
 #define GRID_SYCL_LEVEL_ZERO_IPC
 
 NAMESPACE_END(Grid);
-#if 0
-#include <CL/sycl.hpp>
-#include <CL/sycl/usm.hpp>
-#include <level_zero/ze_api.h>
-#include <CL/sycl/backend/level_zero.hpp>
-#else
+
+// Force deterministic reductions
+#define SYCL_REDUCTION_DETERMINISTIC
 #include <sycl/CL/sycl.hpp>
 #include <sycl/usm.hpp>
 #include <level_zero/ze_api.h>
 #include <sycl/ext/oneapi/backend/level_zero.hpp>
-#endif
 
 NAMESPACE_BEGIN(Grid);
 

From 1b93a9be88cad5d456a84fafdda4104d4213ff82 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ed.ac.uk>
Date: Wed, 6 Mar 2024 00:01:58 +0000
Subject: [PATCH 08/17] Print out the hostname

---
 Grid/util/Init.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc
index d013763a..9a0b4376 100644
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -393,6 +393,9 @@ void Grid_init(int *argc,char ***argv)
   std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl;
   std::cout << GridLogMessage << "================================================ "<<std::endl;
 
+  char hostname[HOST_NAME_MAX+1];
+  gethostname(hostname, HOST_NAME_MAX+1);
+  std::cout << GridLogMessage << "This rank is running on host "<< hostname<<std::endl;
 
   /////////////////////////////////////////////////////////
   // Reporting

From 517822fdd2c7b12b7c90c2fe9ed557035338c3bb Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ed.ac.uk>
Date: Wed, 6 Mar 2024 00:02:27 +0000
Subject: [PATCH 09/17] SPR HBM benchmarking right and also PVC batched GEMM

---
 benchmarks/Benchmark_usqcd.cc | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/benchmarks/Benchmark_usqcd.cc b/benchmarks/Benchmark_usqcd.cc
index eaa78e40..3b729b9e 100644
--- a/benchmarks/Benchmark_usqcd.cc
+++ b/benchmarks/Benchmark_usqcd.cc
@@ -219,7 +219,7 @@ public:
     uint64_t NN;
 
 
-  uint64_t lmax=32;
+  uint64_t lmax=40;
 #define NLOOP (1000*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
 
     GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
@@ -454,11 +454,17 @@ public:
       pickCheckerboard(Even,src_e,src);
       pickCheckerboard(Odd,src_o,src);
 
-      const int num_cases = 1;
+#ifdef AVX512
+      const int num_cases = 3;
+#else 
+      const int num_cases = 2;
+#endif      
       std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
 
       controls Cases [] = {
-	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent }
+	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent },
+	{  WilsonKernelsStatic::OptHandUnroll,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent },
+	{  WilsonKernelsStatic::OptInlineAsm ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent }
       }; 
 
       for(int c=0;c<num_cases;c++) {
@@ -469,6 +475,10 @@ public:
 
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using ASM      WilsonKernels" <<std::endl;
+	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using UNROLLED WilsonKernels" <<std::endl;
+	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
+	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential Comms/Compute" <<std::endl;
 	std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 
@@ -614,11 +624,13 @@ public:
       pickCheckerboard(Even,src_e,src);
       pickCheckerboard(Odd,src_o,src);
     
-      const int num_cases = 1;
+      const int num_cases = 2;
       std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
       
       controls Cases [] = {
 	{  StaggeredKernelsStatic::OptGeneric   ,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
+	{  StaggeredKernelsStatic::OptHandUnroll,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
+	{  StaggeredKernelsStatic::OptInlineAsm ,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  }
       }; 
 
       for(int c=0;c<num_cases;c++) {
@@ -847,11 +859,8 @@ int main (int argc, char ** argv)
   }
 
   CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
-#ifdef KNL
-  LebesgueOrder::Block = std::vector<int>({8,2,2,2});
-#else
   LebesgueOrder::Block = std::vector<int>({2,2,2,2});
-#endif
+
   Benchmark::Decomposition();
 
   int do_su4=0;
@@ -910,7 +919,7 @@ int main (int argc, char ** argv)
   }
 
   if ( do_blas ) {
-#if defined(GRID_CUDA) || defined(GRID_HIP)    
+#if defined(GRID_CUDA) || defined(GRID_HIP)     || defined(GRID_SYCL)   
     std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
     std::cout<<GridLogMessage << " Batched BLAS benchmark " <<std::endl;
     std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

From 9b5f741e8534b262fe4349f7447c637da91b598f Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ed.ac.uk>
Date: Wed, 6 Mar 2024 00:03:16 +0000
Subject: [PATCH 10/17] Reproducing CG can be more useful now

---
 tests/Test_dwf_mixedcg_prec.cc | 111 +++++++++++++++++----------------
 1 file changed, 58 insertions(+), 53 deletions(-)

diff --git a/tests/Test_dwf_mixedcg_prec.cc b/tests/Test_dwf_mixedcg_prec.cc
index cbc573d1..13cc0bb6 100644
--- a/tests/Test_dwf_mixedcg_prec.cc
+++ b/tests/Test_dwf_mixedcg_prec.cc
@@ -30,27 +30,16 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 using namespace std;
 using namespace Grid;
 
-template<class d>
-struct scal {
-  d internal;
-};
-
-  Gamma::Algebra Gmu [] = {
-    Gamma::Algebra::GammaX,
-    Gamma::Algebra::GammaY,
-    Gamma::Algebra::GammaZ,
-    Gamma::Algebra::GammaT
-  };
-
 int main (int argc, char ** argv)
 {
+  char hostname[HOST_NAME_MAX+1];
+  gethostname(hostname, HOST_NAME_MAX+1);
+  std::string host(hostname);
+  
   Grid_init(&argc,&argv);
 
   const int Ls=12;
 
-  std::cout << GridLogMessage << "::::: NB: to enable a quick bit reproducibility check use the --checksums flag. " << std::endl;
-
-  { 
   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
@@ -92,7 +81,14 @@ int main (int argc, char ** argv)
   SchurDiagMooeeOperator<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
   SchurDiagMooeeOperator<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f);
 
-  std::cout << GridLogMessage << "::::::::::::: Starting mixed CG" << std::endl;
+  int nsecs=600;
+  if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){
+    std::string arg = GridCmdOptionPayload(argv,argv+argc,"--seconds");
+    GridCmdOptionInt(arg,nsecs);
+  }
+  
+  std::cout << GridLogMessage << "::::::::::::: Starting mixed CG for "<<nsecs <<" seconds" << std::endl;
+
   MixedPrecisionConjugateGradient<LatticeFermionD,LatticeFermionF> mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO);
   double t1,t2,flops;
   double MdagMsiteflops = 1452; // Mobius (real coeffs)
@@ -101,7 +97,14 @@ int main (int argc, char ** argv)
   std:: cout << " MdagM site flops = "<< 4*MdagMsiteflops<<std::endl;
   std:: cout << " CG    site flops = "<< CGsiteflops <<std::endl;
   int iters;
-  for(int i=0;i<10;i++){
+
+  time_t start = time(NULL);
+
+  uint32_t csum, csumref;
+  csumref=0;
+  int iter=0;
+  do {
+    std::cerr << "******************* SINGLE PRECISION SOLVE "<<iter<<std::endl;
     result_o = Zero();
     t1=usecond();
     mCG(src_o,result_o);
@@ -111,10 +114,28 @@ int main (int argc, char ** argv)
     flops+= CGsiteflops*FrbGrid->gSites()*iters;
     std::cout << " SinglePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
     std::cout << " SinglePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
-  }
-  std::cout << GridLogMessage << "::::::::::::: Starting regular CG" << std::endl;
+
+    csum = crc(result_o);
+
+    if ( csumref == 0 ) {
+      csumref = csum;
+    } else {
+      if ( csum != csumref ) { 
+	std::cerr << host<<" FAILURE " <<iter <<" csum "<<std::hex<<csum<< " != "<<csumref <<std::dec<<std::endl;
+	assert(0);
+      } else {
+	std::cout << host <<" OK " <<iter <<" csum "<<std::hex<<csum<<std::dec<<" -- OK! "<<std::endl;
+      }
+    }
+    iter ++;
+  } while (time(NULL) < (start + nsecs/2) );
+    
+  std::cout << GridLogMessage << "::::::::::::: Starting double precision CG" << std::endl;
   ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
-  for(int i=0;i<1;i++){
+  csumref=0;
+  int i=0;
+  do { 
+    std::cerr << "******************* DOUBLE PRECISION SOLVE "<<i<<std::endl;
     result_o_2 = Zero();
     t1=usecond();
     CG(HermOpEO,src_o,result_o_2);
@@ -122,46 +143,30 @@ int main (int argc, char ** argv)
     iters = CG.IterationsToComplete;
     flops = MdagMsiteflops*4*FrbGrid->gSites()*iters; 
     flops+= CGsiteflops*FrbGrid->gSites()*iters;
-    
+
     std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
     std::cout << " DoublePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
-  }
-  
-  //  MemoryManager::Print();
+
+    csum = crc(result_o);
+
+    if ( csumref == 0 ) {
+      csumref = csum;
+    } else {
+      if ( csum != csumref ) { 
+	std::cerr << i <<" csum "<<std::hex<<csum<< " != "<<csumref <<std::dec<<std::endl;
+	assert(0);
+      } else {
+	std::cout << i <<" csum "<<std::hex<<csum<<std::dec<<" -- OK! "<<std::endl;
+      }
+    }
+    i++;
+  } while (time(NULL) < (start + nsecs) );
 
   LatticeFermionD diff_o(FrbGrid);
   RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2);
 
   std::cout << GridLogMessage << "::::::::::::: Diff between mixed and regular CG: " << diff << std::endl;
-
-  #ifdef HAVE_LIME
-  if( GridCmdOptionExists(argv,argv+argc,"--checksums") ){
+  assert(diff < 1e-4);
   
-  std::string file1("./Propagator1");
-  emptyUserRecord record;
-  uint32_t nersc_csum;
-  uint32_t scidac_csuma;
-  uint32_t scidac_csumb;
-  typedef SpinColourVectorD   FermionD;
-  typedef vSpinColourVectorD vFermionD;
-
-  BinarySimpleMunger<FermionD,FermionD> munge;
-  std::string format = getFormatString<vFermionD>();
-  
-  BinaryIO::writeLatticeObject<vFermionD,FermionD>(result_o,file1,munge, 0, format,
-						   nersc_csum,scidac_csuma,scidac_csumb);
-
-  std::cout << GridLogMessage << " Mixed checksums "<<std::hex << scidac_csuma << " "<<scidac_csumb<<std::endl;
-
-  BinaryIO::writeLatticeObject<vFermionD,FermionD>(result_o_2,file1,munge, 0, format,
-						   nersc_csum,scidac_csuma,scidac_csumb);
-
-  std::cout << GridLogMessage << " CG checksums "<<std::hex << scidac_csuma << " "<<scidac_csumb<<std::endl;
-  }
-  #endif
-  }
-  
-  MemoryManager::Print();
-
   Grid_finalize();
 }

From 7435315d50163189e9752cdb51fffbf582e26a17 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ed.ac.uk>
Date: Wed, 6 Mar 2024 00:03:59 +0000
Subject: [PATCH 11/17] More blasted shell variables

---
 systems/Aurora/benchmarks/bench1024.pbs | 10 +++++++---
 systems/Aurora/benchmarks/bench12.pbs   | 17 ++++++++++++++++-
 systems/Aurora/config-command           |  4 ++--
 systems/Aurora/sourceme.sh              | 13 +++++++++++++
 4 files changed, 38 insertions(+), 6 deletions(-)

diff --git a/systems/Aurora/benchmarks/bench1024.pbs b/systems/Aurora/benchmarks/bench1024.pbs
index 88f0100a..2e99ae4b 100644
--- a/systems/Aurora/benchmarks/bench1024.pbs
+++ b/systems/Aurora/benchmarks/bench1024.pbs
@@ -25,12 +25,16 @@ export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
-export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 export MPICH_OFI_NIC_POLICY=GPU
+export FI_CXI_CQ_FILL_PERCENT=10
+export FI_CXI_DEFAULT_CQ_SIZE=262144
+#export FI_CXI_DEFAULT_CQ_SIZE=131072
+#export FI_CXI_CQ_FILL_PERCENT=20
 
 # 12 ppn, 32 nodes, 384 ranks
 #
@@ -45,12 +49,12 @@ CMD="mpiexec -np 12288 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 8.8.8.24 --grid 128.128.128.384 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
-$CMD | tee 1024node.dwf.small
+$CMD | tee 1024node.dwf.small.cq
 
 CMD="mpiexec -np 12288 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 16.8.8.12 --grid 256.256.256.384 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
-$CMD | tee 1024node.dwf
+$CMD | tee 1024node.dwf.cq
 
 
diff --git a/systems/Aurora/benchmarks/bench12.pbs b/systems/Aurora/benchmarks/bench12.pbs
index 96f6143f..ee3cb381 100644
--- a/systems/Aurora/benchmarks/bench12.pbs
+++ b/systems/Aurora/benchmarks/bench12.pbs
@@ -17,6 +17,7 @@ source ../sourceme.sh
 export OMP_NUM_THREADS=3
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 
+
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
@@ -35,11 +36,25 @@ CMD="mpiexec -np 24 -ppn 12  -envall \
 	     ./Benchmark_comms_host_device --mpi 2.3.2.2 --grid 32.24.32.192 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
 
-$CMD 
+#$CMD 
 
 CMD="mpiexec -np 24 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 2.3.2.2 --grid 64.96.64.64 --comms-overlap \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
 
+#$CMD 
+
+CMD="mpiexec -np 1 -ppn 1  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Benchmark_dwf --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \
+		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+
+$CMD 
+
+CMD="mpiexec -np 1 -ppn 1  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \
+		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+
 $CMD 
diff --git a/systems/Aurora/config-command b/systems/Aurora/config-command
index e59ef515..689747c9 100644
--- a/systems/Aurora/config-command
+++ b/systems/Aurora/config-command
@@ -11,6 +11,6 @@ TOOLS=$HOME/tools
 	--enable-unified=no \
 	MPICXX=mpicxx \
 	CXX=icpx \
-	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/" \
-	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include"
+	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/ -L${MKLROOT}/lib -qmkl=parallel " \
+	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include -qmkl=parallel"
 
diff --git a/systems/Aurora/sourceme.sh b/systems/Aurora/sourceme.sh
index 7a2b3815..effb2d5d 100644
--- a/systems/Aurora/sourceme.sh
+++ b/systems/Aurora/sourceme.sh
@@ -3,6 +3,19 @@
 module use /soft/modulefiles
 module load intel_compute_runtime/release/agama-devel-682.22
 
+export FI_CXI_DEFAULT_CQ_SIZE=131072
+export FI_CXI_CQ_FILL_PERCENT=20
+
+export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
+#export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode"
+
+#
+# -ftarget-register-alloc-mode=pvc:default 
+# -ftarget-register-alloc-mode=pvc:small
+# -ftarget-register-alloc-mode=pvc:large
+# -ftarget-register-alloc-mode=pvc:auto
+#
+
 export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
 export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
 export http_proxy=http://proxy.alcf.anl.gov:3128

From a46a0f088276de56a68d5020e9eee6875647c688 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ed.ac.uk>
Date: Wed, 6 Mar 2024 01:12:49 +0000
Subject: [PATCH 12/17] force device copyable and don't take crap from SYCL

---
 Grid/simd/Grid_vector_types.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h
index daf41cae..0a3d176f 100644
--- a/Grid/simd/Grid_vector_types.h
+++ b/Grid/simd/Grid_vector_types.h
@@ -1133,4 +1133,13 @@ static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths inc
 
 NAMESPACE_END(Grid);
 
+#ifdef GRID_SYCL
+template<> struct sycl::is_device_copyable<Grid::vComplexF> : public std::true_type {};
+template<> struct sycl::is_device_copyable<Grid::vComplexD> : public std::true_type {};
+template<> struct sycl::is_device_copyable<Grid::vRealF   > : public std::true_type {};
+template<> struct sycl::is_device_copyable<Grid::vRealD   > : public std::true_type {};
+template<> struct sycl::is_device_copyable<Grid::vInteger > : public std::true_type {};
+#endif
+
+
 #endif

From 10116b3be8730507876336b92490a82d39000f50 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ed.ac.uk>
Date: Wed, 6 Mar 2024 01:13:27 +0000
Subject: [PATCH 13/17] Force device copyable and tell SYCL to shut it.

---
 Grid/tensors/Tensor_traits.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/Grid/tensors/Tensor_traits.h b/Grid/tensors/Tensor_traits.h
index 98bc3986..536e17f1 100644
--- a/Grid/tensors/Tensor_traits.h
+++ b/Grid/tensors/Tensor_traits.h
@@ -404,3 +404,12 @@ NAMESPACE_BEGIN(Grid);
   };
 NAMESPACE_END(Grid);
 
+
+#ifdef GRID_SYCL
+template<typename T> struct
+sycl::is_device_copyable<T, typename std::enable_if<
+			      Grid::isGridTensor<T>::value  && (!std::is_trivially_copyable<T>::value),
+			      void>::type>
+  : public std::true_type {};
+#endif
+

From 891a366f73a5dfe9a6822c8d884c9b22c82de971 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ed.ac.uk>
Date: Wed, 6 Mar 2024 01:22:55 +0000
Subject: [PATCH 14/17] Repro CG script

---
 systems/Aurora/tests/repro16.pbs | 40 ++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 systems/Aurora/tests/repro16.pbs

diff --git a/systems/Aurora/tests/repro16.pbs b/systems/Aurora/tests/repro16.pbs
new file mode 100644
index 00000000..28030a3d
--- /dev/null
+++ b/systems/Aurora/tests/repro16.pbs
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
+
+#PBS -q EarlyAppAccess
+#PBS -l select=16
+#PBS -l walltime=01:00:00
+#PBS -A LatticeQCD_aesp_CNDA
+
+#export OMP_PROC_BIND=spread
+#unset OMP_PLACES
+
+cd $PBS_O_WORKDIR
+
+source ../sourceme.sh
+
+cat $PBS_NODEFILE
+
+export OMP_NUM_THREADS=3
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
+#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
+
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+export MPICH_OFI_NIC_POLICY=GPU
+
+# 12 ppn, 16 nodes, 192 ranks
+CMD="mpiexec -np 192 -ppn 12  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Test_dwf_mixedcg_prec --mpi 2.4.4.6 --grid 64.128.128.192 \
+		--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000"
+$CMD 

From b812a7b4c67e712ec1524fa1958c825057d1e27a Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ed.ac.uk>
Date: Wed, 6 Mar 2024 01:32:40 +0000
Subject: [PATCH 15/17] Staggered launch script

---
 systems/Aurora/tests/solver/stag16.pbs | 40 ++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 systems/Aurora/tests/solver/stag16.pbs

diff --git a/systems/Aurora/tests/solver/stag16.pbs b/systems/Aurora/tests/solver/stag16.pbs
new file mode 100644
index 00000000..5bfe04a6
--- /dev/null
+++ b/systems/Aurora/tests/solver/stag16.pbs
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
+
+#PBS -q EarlyAppAccess
+#PBS -l select=16
+#PBS -l walltime=01:00:00
+#PBS -A LatticeQCD_aesp_CNDA
+
+#export OMP_PROC_BIND=spread
+#unset OMP_PLACES
+
+cd $PBS_O_WORKDIR
+
+source ../../sourceme.sh
+
+cat $PBS_NODEFILE
+
+export OMP_NUM_THREADS=3
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
+#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
+
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+export MPICH_OFI_NIC_POLICY=GPU
+
+# 12 ppn, 16 nodes, 192 ranks
+CMD="mpiexec -np 192 -ppn 12  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Test_staggered_cg_prec --mpi 2.4.4.6 --grid 128.128.128.192 \
+	     --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000"
+$CMD 

From 228bbb9d81a45cd08a3c49cfbe4f3a911a15ac5e Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Wed, 6 Mar 2024 19:03:35 +0100
Subject: [PATCH 16/17] Benchmark results

---
 .../Booster/benchmarks/Benchmark_usqcd.csv    | 70 +++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 systems/Booster/benchmarks/Benchmark_usqcd.csv

diff --git a/systems/Booster/benchmarks/Benchmark_usqcd.csv b/systems/Booster/benchmarks/Benchmark_usqcd.csv
new file mode 100644
index 00000000..68689deb
--- /dev/null
+++ b/systems/Booster/benchmarks/Benchmark_usqcd.csv
@@ -0,0 +1,70 @@
+Memory Bandwidth
+
+Bytes, GB/s per node
+3145728, 225.900365
+50331648, 2858.859504
+254803968, 4145.556367
+805306368, 4905.772480
+1966080000, 4978.312557
+
+
+GEMM
+
+ M, N, K, BATCH, GF/s per rank
+16, 8, 16, 256, 1.713639
+16, 16, 16, 256, 288.268316
+16, 32, 16, 256, 597.053950
+32, 8, 32, 256, 557.382591
+32, 16, 32, 256, 1100.145311
+32, 32, 32, 256, 1885.080449
+64, 8, 64, 256, 1725.163599
+64, 16, 64, 256, 3389.336566
+64, 32, 64, 256, 4168.252422
+16, 8, 256, 256, 1326.262134
+16, 16, 256, 256, 2318.095475
+16, 32, 256, 256, 3555.436503
+32, 8, 256, 256, 1920.139170
+32, 16, 256, 256, 3486.174753
+32, 32, 256, 256, 5320.821724
+64, 8, 256, 256, 2539.597502
+64, 16, 256, 256, 5003.456775
+64, 32, 256, 256, 7837.531562
+8, 256, 16, 256, 1427.848170
+16, 256, 16, 256, 2222.147815
+32, 256, 16, 256, 2877.121715
+8, 256, 32, 256, 1922.890086
+16, 256, 32, 256, 3199.469082
+32, 256, 32, 256, 4845.405343
+8, 256, 64, 256, 2639.483343
+16, 256, 64, 256, 5012.800299
+32, 256, 64, 256, 7216.006882
+
+
+
+Communications
+
+Packet bytes, direction, GB/s per node
+4718592, 2, 206.570734
+4718592, 3, 207.501847
+4718592, 6, 189.730277
+4718592, 7, 204.301218
+15925248, 2, 307.882997
+15925248, 3, 287.901076
+15925248, 6, 295.603109
+15925248, 7, 300.682033
+37748736, 2, 331.740364
+37748736, 3, 338.610627
+37748736, 6, 332.580657
+37748736, 7, 336.336579
+
+
+Per node summary table
+
+L , Wilson, DWF4, Staggered, GF/s per node
+
+8 , 16, 1165, 10
+12 , 473, 4901, 163
+16 , 1436, 8464, 442
+24 , 4133, 10139, 1530
+32 , 5726, 11487, 2518
+

From 7e5bd46dd3033aab62599c4cde1d1fc6bb7af8e7 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Wed, 6 Mar 2024 19:03:45 +0100
Subject: [PATCH 17/17] Booster update

---
 Grid/algorithms/blas/BatchedBlas.h                   |  5 +++--
 .../implementation/StaggeredKernelsImplementation.h  | 12 +-----------
 systems/Booster/config-command                       |  6 ++++--
 systems/Booster/sourceme.sh                          | 10 +++++-----
 4 files changed, 13 insertions(+), 20 deletions(-)

diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h
index 5ab46333..f6418b7e 100644
--- a/Grid/algorithms/blas/BatchedBlas.h
+++ b/Grid/algorithms/blas/BatchedBlas.h
@@ -31,7 +31,7 @@ Author: Peter Boyle <pboyle@bnl.gov>
 #include <hipblas/hipblas.h>
 #endif
 #ifdef GRID_CUDA
-#include <hipblas/hipblas.h>
+#include <cublas_v2.h>
 #endif
 #ifdef GRID_SYCL
 #include <oneapi/mkl.hpp>
@@ -51,7 +51,7 @@ NAMESPACE_BEGIN(Grid);
   typedef hipblasHandle_t gridblasHandle_t;
 #endif
 #ifdef GRID_CUDA
-  typedef cudablasHandle_t gridblasHandle_t;
+  typedef cublasHandle_t gridblasHandle_t;
 #endif
 #ifdef GRID_SYCL
   typedef cl::sycl::queue *gridblasHandle_t;
@@ -78,6 +78,7 @@ public:
 #ifdef GRID_CUDA
       std::cout << "cublasCreate"<<std::endl;
       cublasCreate(&gridblasHandle);
+      cublasSetPointerMode(gridblasHandle, CUBLAS_POINTER_MODE_DEVICE);
 #endif
 #ifdef GRID_HIP
       std::cout << "hipblasCreate"<<std::endl;
diff --git a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
index dd62e109..a39b529f 100644
--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
@@ -280,20 +280,16 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
 
   if( interior && exterior ) { 
     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGeneric,1); return;}
-#ifndef GRID_CUDA
     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1);    return;}
+#ifndef GRID_CUDA
     if (Opt == OptInlineAsm  ) {  ASM_CALL(DhopSiteAsm);     return;}
 #endif
   } else if( interior ) {
     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericInt,1); return;}
-#ifndef GRID_CUDA
     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,1);    return;}
-#endif
   } else if( exterior ) { 
     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericExt,1); return;}
-#ifndef GRID_CUDA
     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,1);    return;}
-#endif
   }
   assert(0 && " Kernel optimisation case not covered ");
 }
@@ -322,19 +318,13 @@ void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
   
   if( interior && exterior ) { 
     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGeneric,0); return;}
-#ifndef GRID_CUDA
     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,0);    return;}
-#endif
   } else if( interior ) {
     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericInt,0); return;}
-#ifndef GRID_CUDA
     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,0);    return;}
-#endif
   } else if( exterior ) { 
     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericExt,0); return;}
-#ifndef GRID_CUDA
     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,0);    return;}
-#endif
   }
 }
 
diff --git a/systems/Booster/config-command b/systems/Booster/config-command
index 8530c5f9..1ba2dc7a 100644
--- a/systems/Booster/config-command
+++ b/systems/Booster/config-command
@@ -5,10 +5,12 @@ LIME=/p/home/jusers/boyle2/juwels/gm2dwf/boyle/
     --enable-gen-simd-width=64 \
     --enable-shm=nvlink \
     --enable-accelerator=cuda \
+    --disable-gparity \
+    --disable-fermion-reps \
     --with-lime=$LIME \
-    --disable-accelerator-cshift \
+    --enable-accelerator-cshift \
     --disable-unified \
     CXX=nvcc \
     LDFLAGS="-cudart shared " \
-    CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++14 -cudart shared"
+    CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++17 -cudart shared -lcublas"
 
diff --git a/systems/Booster/sourceme.sh b/systems/Booster/sourceme.sh
index 56499be4..2341267f 100644
--- a/systems/Booster/sourceme.sh
+++ b/systems/Booster/sourceme.sh
@@ -1,5 +1,5 @@
-module load GCC/9.3.0       
-module load  GMP/6.2.0   
-module load MPFR/4.1.0     
-module load OpenMPI/4.1.0rc1  
-module load CUDA/11.3
+module load GCC
+module load GMP
+module load MPFR
+module load OpenMPI
+module load CUDA