Staggered launch script

Repro CG script
Force device copyable and tell SYCL to shut it.
2025-11-01 12:34:32 +00:00 · 2024-03-06 01:32:40 +00:00 · 2024-03-06 01:22:55 +00:00 · 2024-03-06 01:13:27 +00:00 · 2024-03-06 01:12:49 +00:00 · 2024-03-06 00:05:00 +00:00
19 changed files with 310 additions and 139 deletions
--- a/Grid/Grid_Eigen_Dense.h
+++ b/Grid/Grid_Eigen_Dense.h
@@ -34,7 +34,7 @@
 #pragma push_macro("__SYCL_DEVICE_ONLY__")
 #undef __SYCL_DEVICE_ONLY__
 #define EIGEN_DONT_VECTORIZE
-//#undef EIGEN_USE_SYCL
+#undef EIGEN_USE_SYCL
 #define __SYCL__REDEFINE__
 #endif

--- a/Grid/algorithms/approx/Zolotarev.cc
+++ b/Grid/algorithms/approx/Zolotarev.cc
@@ -293,7 +293,7 @@ static void sncndnFK(INTERNAL_PRECISION u, INTERNAL_PRECISION k,
 * Set type = 0 for the Zolotarev approximation, which is zero at x = 0, and
 * type = 1 for the approximation which is infinite at x = 0. */

-zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
+zolotarev_data* zolotarev(ZOLO_PRECISION epsilon, int n, int type) {
  INTERNAL_PRECISION A, c, cp, kp, ksq, sn, cn, dn, Kp, Kj, z, z0, t, M, F,
    l, invlambda, xi, xisq, *tv, s, opl;
  int m, czero, ts;
@@ -375,12 +375,12 @@ zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
  construct_partfrac(d);
  construct_contfrac(d);

-  /* Converting everything to PRECISION for external use only */
+  /* Converting everything to ZOLO_PRECISION for external use only */

  zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
-  zd -> A = (PRECISION) d -> A;
-  zd -> Delta = (PRECISION) d -> Delta;
-  zd -> epsilon = (PRECISION) d -> epsilon;
+  zd -> A = (ZOLO_PRECISION) d -> A;
+  zd -> Delta = (ZOLO_PRECISION) d -> Delta;
+  zd -> epsilon = (ZOLO_PRECISION) d -> epsilon;
  zd -> n = d -> n;
  zd -> type = d -> type;
  zd -> dn = d -> dn;
@@ -390,24 +390,24 @@ zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
  zd -> deg_num = d -> deg_num;
  zd -> deg_denom = d -> deg_denom;

-  zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
-  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
+  zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m];
  free(d -> a);

-  zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
-  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
+  zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m];
  free(d -> ap);

-  zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
-  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
+  zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m];
  free(d -> alpha);

-  zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
-  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
+  zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m];
  free(d -> beta);

-  zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
-  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
+  zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m];
  free(d -> gamma);

  free(d);
@@ -426,7 +426,7 @@ void zolotarev_free(zolotarev_data *zdata)
 }


-zolotarev_data* higham(PRECISION epsilon, int n) {
+zolotarev_data* higham(ZOLO_PRECISION epsilon, int n) {
  INTERNAL_PRECISION A, M, c, cp, z, z0, t, epssq;
  int m, czero;
  zolotarev_data *zd;
@@ -481,9 +481,9 @@ zolotarev_data* higham(PRECISION epsilon, int n) {
  /* Converting everything to PRECISION for external use only */

  zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
-  zd -> A = (PRECISION) d -> A;
-  zd -> Delta = (PRECISION) d -> Delta;
-  zd -> epsilon = (PRECISION) d -> epsilon;
+  zd -> A = (ZOLO_PRECISION) d -> A;
+  zd -> Delta = (ZOLO_PRECISION) d -> Delta;
+  zd -> epsilon = (ZOLO_PRECISION) d -> epsilon;
  zd -> n = d -> n;
  zd -> type = d -> type;
  zd -> dn = d -> dn;
@@ -493,24 +493,24 @@ zolotarev_data* higham(PRECISION epsilon, int n) {
  zd -> deg_num = d -> deg_num;
  zd -> deg_denom = d -> deg_denom;

-  zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
-  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
+  zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m];
  free(d -> a);

-  zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
-  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
+  zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m];
  free(d -> ap);

-  zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
-  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
+  zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m];
  free(d -> alpha);

-  zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
-  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
+  zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m];
  free(d -> beta);

-  zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
-  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
+  zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m];
  free(d -> gamma);

  free(d);
@@ -523,17 +523,17 @@ NAMESPACE_END(Grid);
 #ifdef TEST

 #undef ZERO
-#define ZERO ((PRECISION) 0)
+#define ZERO ((ZOLO_PRECISION) 0)
 #undef ONE
-#define ONE ((PRECISION) 1)
+#define ONE ((ZOLO_PRECISION) 1)
 #undef TWO
-#define TWO ((PRECISION) 2)
+#define TWO ((ZOLO_PRECISION) 2)

 /* Evaluate the rational approximation R(x) using the factored form */

-static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) {
+static ZOLO_PRECISION zolotarev_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
  int m;
-  PRECISION R;
+  ZOLO_PRECISION R;

  if (rdata -> type == 0) {
    R = rdata -> A * x;
@@ -551,9 +551,9 @@ static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) {

 /* Evaluate the rational approximation R(x) using the partial fraction form */

-static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) {
+static ZOLO_PRECISION zolotarev_partfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
  int m;
-  PRECISION R = rdata -> alpha[rdata -> da - 1];
+  ZOLO_PRECISION R = rdata -> alpha[rdata -> da - 1];
  for (m = 0; m < rdata -> dd; m++)
    R += rdata -> alpha[m] / (x * x - rdata -> ap[m]);
  if (rdata -> type == 1) R += rdata -> alpha[rdata -> dd] / (x * x);
@@ -568,18 +568,18 @@ static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) {
 * non-signalling overflow this will work correctly since 1/(1/0) = 1/INF = 0,
 * but with signalling overflow you will get an error message. */

-static PRECISION zolotarev_contfrac_eval(PRECISION x, zolotarev_data* rdata) {
+static ZOLO_PRECISION zolotarev_contfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
  int m;
-  PRECISION R = rdata -> beta[0] * x;
+  ZOLO_PRECISION R = rdata -> beta[0] * x;
  for (m = 1; m < rdata -> db; m++) R = rdata -> beta[m] * x + ONE / R;
  return R;
 }    

 /* Evaluate the rational approximation R(x) using Cayley form */

-static PRECISION zolotarev_cayley_eval(PRECISION x, zolotarev_data* rdata) {
+static ZOLO_PRECISION zolotarev_cayley_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
  int m;
-  PRECISION T;
+  ZOLO_PRECISION T;

  T = rdata -> type == 0 ? ONE : -ONE;
  for (m = 0; m < rdata -> n; m++)
@@ -607,7 +607,7 @@ int main(int argc, char** argv) {
  int m, n, plotpts = 5000, type = 0;
  float eps, x, ypferr, ycferr, ycaylerr, maxypferr, maxycferr, maxycaylerr;
  zolotarev_data *rdata;
-  PRECISION y;
+  ZOLO_PRECISION y;
  FILE *plot_function, *plot_error, 
    *plot_partfrac, *plot_contfrac, *plot_cayley;

@@ -626,13 +626,13 @@ int main(int argc, char** argv) {
  }

  rdata = type == 2 
-    ? higham((PRECISION) eps, n) 
-    : zolotarev((PRECISION) eps, n, type);
+    ? higham((ZOLO_PRECISION) eps, n) 
+    : zolotarev((ZOLO_PRECISION) eps, n, type);

  printf("Zolotarev Test: R(epsilon = %g, n = %d, type = %d)\n\t" 
 	 STRINGIFY(VERSION) "\n\t" STRINGIFY(HVERSION)
 	 "\n\tINTERNAL_PRECISION = " STRINGIFY(INTERNAL_PRECISION)
-	 "\tPRECISION = " STRINGIFY(PRECISION)
+	 "\tZOLO_PRECISION = " STRINGIFY(ZOLO_PRECISION)
 	 "\n\n\tRational approximation of degree (%d,%d), %s at x = 0\n"
 	 "\tDelta = %g (maximum error)\n\n"
 	 "\tA = %g (overall factor)\n",
@@ -681,15 +681,15 @@ int main(int argc, char** argv) {
    x = 2.4 * (float) m / plotpts - 1.2;
    if (rdata -> type == 0 || fabs(x) * (float) plotpts > 1.0) {
      /* skip x = 0 for type 1, as R(0) is singular */
-      y = zolotarev_eval((PRECISION) x, rdata);
+      y = zolotarev_eval((ZOLO_PRECISION) x, rdata);
      fprintf(plot_function, "%g %g\n", x, (float) y);
      fprintf(plot_error, "%g %g\n",
 	      x, (float)((y - ((x > 0.0 ? ONE : -ONE))) / rdata -> Delta));
-      ypferr = (float)((zolotarev_partfrac_eval((PRECISION) x, rdata) - y)
+      ypferr = (float)((zolotarev_partfrac_eval((ZOLO_PRECISION) x, rdata) - y)
 		       / rdata -> Delta);
-      ycferr = (float)((zolotarev_contfrac_eval((PRECISION) x, rdata) - y)
+      ycferr = (float)((zolotarev_contfrac_eval((ZOLO_PRECISION) x, rdata) - y)
 		       / rdata -> Delta);
-      ycaylerr = (float)((zolotarev_cayley_eval((PRECISION) x, rdata) - y)
+      ycaylerr = (float)((zolotarev_cayley_eval((ZOLO_PRECISION) x, rdata) - y)
 		       / rdata -> Delta);
      if (fabs(x) < 1.0 && fabs(x) > rdata -> epsilon) {
 	maxypferr = MAX(maxypferr, fabs(ypferr));
--- a/Grid/algorithms/approx/Zolotarev.h
+++ b/Grid/algorithms/approx/Zolotarev.h
@@ -9,10 +9,10 @@ NAMESPACE_BEGIN(Approx);
 #define HVERSION Header Time-stamp: <14-OCT-2004 09:26:51.00 adk@MISSCONTRARY>

 #ifndef ZOLOTAREV_INTERNAL
-#ifndef PRECISION
-#define PRECISION double
+#ifndef ZOLO_PRECISION
+#define ZOLO_PRECISION double
 #endif
-#define ZPRECISION PRECISION
+#define ZPRECISION ZOLO_PRECISION
 #define ZOLOTAREV_DATA zolotarev_data
 #endif

@@ -77,8 +77,8 @@ typedef struct {
 * zolotarev_data structure. The arguments must satisfy the constraints that
 * epsilon > 0, n > 0, and type = 0 or 1. */

-ZOLOTAREV_DATA* higham(PRECISION epsilon, int n) ;
-ZOLOTAREV_DATA* zolotarev(PRECISION epsilon, int n, int type);
+ZOLOTAREV_DATA* higham(ZOLO_PRECISION epsilon, int n) ;
+ZOLOTAREV_DATA* zolotarev(ZOLO_PRECISION epsilon, int n, int type);
 void zolotarev_free(zolotarev_data *zdata);
 #endif

@@ -86,3 +86,4 @@ void zolotarev_free(zolotarev_data *zdata);
 NAMESPACE_END(Approx);
 NAMESPACE_END(Grid);
 #endif
+
--- a/Grid/algorithms/blas/BatchedBlas.h
+++ b/Grid/algorithms/blas/BatchedBlas.h
@@ -34,9 +34,14 @@ Author: Peter Boyle <pboyle@bnl.gov>
 #include <hipblas/hipblas.h>
 #endif
 #ifdef GRID_SYCL
-#error // need oneMKL version
+#include <oneapi/mkl.hpp>
+#endif
+#if 0
+#define GRID_ONE_MKL
+#endif
+#ifdef GRID_ONE_MKL
+#include <oneapi/mkl.hpp>
 #endif
-
 ///////////////////////////////////////////////////////////////////////	  
 // Need to rearrange lattice data to be in the right format for a
 // batched multiply. Might as well make these static, dense packed
@@ -49,9 +54,12 @@ NAMESPACE_BEGIN(Grid);
  typedef cudablasHandle_t gridblasHandle_t;
 #endif
 #ifdef GRID_SYCL
-  typedef int32_t gridblasHandle_t;
+  typedef cl::sycl::queue *gridblasHandle_t;
 #endif
-#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+#ifdef GRID_ONE_MKL
+  typedef cl::sycl::queue *gridblasHandle_t;
+#endif
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
  typedef int32_t gridblasHandle_t;
 #endif

@@ -76,6 +84,12 @@ public:
      hipblasCreate(&gridblasHandle);
 #endif
 #ifdef GRID_SYCL
+      gridblasHandle = theGridAccelerator;
+#endif
+#ifdef GRID_ONE_MKL
+      cl::sycl::cpu_selector selector;
+      cl::sycl::device selectedDevice { selector };
+      gridblasHandle =new sycl::queue (selectedDevice);
 #endif
      gridblasInit=1;
    }
@@ -110,6 +124,9 @@ public:
 #endif
 #ifdef GRID_SYCL
    accelerator_barrier();
+#endif
+#ifdef GRID_ONE_MKL
+    gridblasHandle->wait();
 #endif
  }
  
@@ -644,10 +661,19 @@ public:
 			      (cuDoubleComplex *) Cmn, ldc, sdc,
 			      batchCount);
 #endif
-#ifdef GRID_SYCL
-     #warning "oneMKL implementation not made "
+#if defined(GRID_SYCL) || defined(GRID_ONE_MKL)
+    oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
+						oneapi::mkl::transpose::N,
+						oneapi::mkl::transpose::N,
+						m,n,k,
+						alpha,
+						(const ComplexD *)Amk,lda,sda,
+						(const ComplexD *)Bkn,ldb,sdb,
+						beta,
+						(ComplexD *)Cmn,ldc,sdc,
+						batchCount);
 #endif
-#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
     // Need a default/reference implementation
     for (int p = 0; p < batchCount; ++p) {
       for (int mm = 0; mm < m; ++mm) {
--- a/Grid/lattice/Lattice.h
+++ b/Grid/lattice/Lattice.h
@@ -35,6 +35,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_transpose.h>
 #include <Grid/lattice/Lattice_local.h>
 #include <Grid/lattice/Lattice_reduction.h>
+#include <Grid/lattice/Lattice_crc.h>
 #include <Grid/lattice/Lattice_peekpoke.h>
 #include <Grid/lattice/Lattice_reality.h>
 #include <Grid/lattice/Lattice_real_imag.h>
@@ -46,5 +47,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_unary.h>
 #include <Grid/lattice/Lattice_transfer.h>
 #include <Grid/lattice/Lattice_basis.h>
-#include <Grid/lattice/Lattice_crc.h>
 #include <Grid/lattice/PaddedCell.h>
--- a/Grid/lattice/Lattice_crc.h
+++ b/Grid/lattice/Lattice_crc.h
@@ -42,13 +42,13 @@ template<class vobj> void DumpSliceNorm(std::string s,Lattice<vobj> &f,int mu=-1
  }
 }

-template<class vobj> uint32_t crc(Lattice<vobj> & buf)
+template<class vobj> uint32_t crc(const Lattice<vobj> & buf)
 {
  autoView( buf_v , buf, CpuRead);
  return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites());
 }

-#define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
+#define CRC(U) std::cerr << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;

 NAMESPACE_END(Grid);

--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -285,6 +285,7 @@ template<class vobj>
 inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
  GridBase *grid = left.Grid();
  ComplexD nrm = rankInnerProduct(left,right);
+  //  std::cerr<<"flight log " << std::hexfloat << nrm <<" "<<crc(left)<<std::endl;
  grid->GlobalSum(nrm);
  return nrm;
 }
--- a/Grid/simd/Grid_vector_types.h
+++ b/Grid/simd/Grid_vector_types.h
@@ -1133,4 +1133,13 @@ static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths inc

 NAMESPACE_END(Grid);

+#ifdef GRID_SYCL
+template<> struct sycl::is_device_copyable<Grid::vComplexF> : public std::true_type {};
+template<> struct sycl::is_device_copyable<Grid::vComplexD> : public std::true_type {};
+template<> struct sycl::is_device_copyable<Grid::vRealF   > : public std::true_type {};
+template<> struct sycl::is_device_copyable<Grid::vRealD   > : public std::true_type {};
+template<> struct sycl::is_device_copyable<Grid::vInteger > : public std::true_type {};
+#endif
+
+
 #endif
--- a/Grid/tensors/Tensor_traits.h
+++ b/Grid/tensors/Tensor_traits.h
@@ -404,3 +404,12 @@ NAMESPACE_BEGIN(Grid);
  };
 NAMESPACE_END(Grid);

+
+#ifdef GRID_SYCL
+template<typename T> struct
+sycl::is_device_copyable<T, typename std::enable_if<
+			      Grid::isGridTensor<T>::value  && (!std::is_trivially_copyable<T>::value),
+			      void>::type>
+  : public std::true_type {};
+#endif
+
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -255,17 +255,13 @@ inline int  acceleratorIsCommunicable(void *ptr)
 #define GRID_SYCL_LEVEL_ZERO_IPC

 NAMESPACE_END(Grid);
-#if 0
-#include <CL/sycl.hpp>
-#include <CL/sycl/usm.hpp>
-#include <level_zero/ze_api.h>
-#include <CL/sycl/backend/level_zero.hpp>
-#else
+
+// Force deterministic reductions
+#define SYCL_REDUCTION_DETERMINISTIC
 #include <sycl/CL/sycl.hpp>
 #include <sycl/usm.hpp>
 #include <level_zero/ze_api.h>
 #include <sycl/ext/oneapi/backend/level_zero.hpp>
-#endif

 NAMESPACE_BEGIN(Grid);

--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -393,6 +393,9 @@ void Grid_init(int *argc,char ***argv)
  std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl;
  std::cout << GridLogMessage << "================================================ "<<std::endl;

+  char hostname[HOST_NAME_MAX+1];
+  gethostname(hostname, HOST_NAME_MAX+1);
+  std::cout << GridLogMessage << "This rank is running on host "<< hostname<<std::endl;

  /////////////////////////////////////////////////////////
  // Reporting
--- a/benchmarks/Benchmark_usqcd.cc
+++ b/benchmarks/Benchmark_usqcd.cc
@@ -219,7 +219,7 @@ public:
    uint64_t NN;


-  uint64_t lmax=32;
+  uint64_t lmax=40;
 #define NLOOP (1000*lmax*lmax*lmax*lmax/lat/lat/lat/lat)

    GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
@@ -454,11 +454,17 @@ public:
      pickCheckerboard(Even,src_e,src);
      pickCheckerboard(Odd,src_o,src);

-      const int num_cases = 1;
+#ifdef AVX512
+      const int num_cases = 3;
+#else 
+      const int num_cases = 2;
+#endif      
      std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");

      controls Cases [] = {
-	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent }
+	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent },
+	{  WilsonKernelsStatic::OptHandUnroll,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent },
+	{  WilsonKernelsStatic::OptInlineAsm ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent }
      }; 

      for(int c=0;c<num_cases;c++) {
@@ -469,6 +475,10 @@ public:

 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using ASM      WilsonKernels" <<std::endl;
+	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using UNROLLED WilsonKernels" <<std::endl;
+	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
+	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential Comms/Compute" <<std::endl;
 	std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

@@ -614,11 +624,13 @@ public:
      pickCheckerboard(Even,src_e,src);
      pickCheckerboard(Odd,src_o,src);
    
-      const int num_cases = 1;
+      const int num_cases = 2;
      std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
      
      controls Cases [] = {
 	{  StaggeredKernelsStatic::OptGeneric   ,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
+	{  StaggeredKernelsStatic::OptHandUnroll,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
+	{  StaggeredKernelsStatic::OptInlineAsm ,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  }
      }; 

      for(int c=0;c<num_cases;c++) {
@@ -847,11 +859,8 @@ int main (int argc, char ** argv)
  }

  CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
-#ifdef KNL
-  LebesgueOrder::Block = std::vector<int>({8,2,2,2});
-#else
  LebesgueOrder::Block = std::vector<int>({2,2,2,2});
-#endif
+
  Benchmark::Decomposition();

  int do_su4=0;
@@ -910,7 +919,7 @@ int main (int argc, char ** argv)
  }

  if ( do_blas ) {
-#if defined(GRID_CUDA) || defined(GRID_HIP)    
+#if defined(GRID_CUDA) || defined(GRID_HIP)     || defined(GRID_SYCL)   
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Batched BLAS benchmark " <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
--- a/systems/Aurora/benchmarks/bench1024.pbs
+++ b/systems/Aurora/benchmarks/bench1024.pbs
@@ -25,12 +25,16 @@ export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1

 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
-export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 export MPICH_OFI_NIC_POLICY=GPU
+export FI_CXI_CQ_FILL_PERCENT=10
+export FI_CXI_DEFAULT_CQ_SIZE=262144
+#export FI_CXI_DEFAULT_CQ_SIZE=131072
+#export FI_CXI_CQ_FILL_PERCENT=20

 # 12 ppn, 32 nodes, 384 ranks
 #
@@ -45,12 +49,12 @@ CMD="mpiexec -np 12288 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 8.8.8.24 --grid 128.128.128.384 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
-$CMD | tee 1024node.dwf.small
+$CMD | tee 1024node.dwf.small.cq

 CMD="mpiexec -np 12288 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 16.8.8.12 --grid 256.256.256.384 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
-$CMD | tee 1024node.dwf
+$CMD | tee 1024node.dwf.cq


--- a/systems/Aurora/benchmarks/bench12.pbs
+++ b/systems/Aurora/benchmarks/bench12.pbs
@@ -17,6 +17,7 @@ source ../sourceme.sh
 export OMP_NUM_THREADS=3
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1

+
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
@@ -35,11 +36,25 @@ CMD="mpiexec -np 24 -ppn 12  -envall \
 	     ./Benchmark_comms_host_device --mpi 2.3.2.2 --grid 32.24.32.192 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"

-$CMD 
+#$CMD 

 CMD="mpiexec -np 24 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 2.3.2.2 --grid 64.96.64.64 --comms-overlap \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"

+#$CMD 
+
+CMD="mpiexec -np 1 -ppn 1  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Benchmark_dwf --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \
+		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+
+$CMD 
+
+CMD="mpiexec -np 1 -ppn 1  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \
+		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+
 $CMD 
--- a/systems/Aurora/config-command
+++ b/systems/Aurora/config-command
@@ -11,6 +11,6 @@ TOOLS=$HOME/tools
 	--enable-unified=no \
 	MPICXX=mpicxx \
 	CXX=icpx \
-	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/" \
-	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include"
+	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/ -L${MKLROOT}/lib -qmkl=parallel " \
+	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include -qmkl=parallel"

--- a/systems/Aurora/sourceme.sh
+++ b/systems/Aurora/sourceme.sh
@@ -3,6 +3,19 @@
 module use /soft/modulefiles
 module load intel_compute_runtime/release/agama-devel-682.22

+export FI_CXI_DEFAULT_CQ_SIZE=131072
+export FI_CXI_CQ_FILL_PERCENT=20
+
+export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
+#export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode"
+
+#
+# -ftarget-register-alloc-mode=pvc:default 
+# -ftarget-register-alloc-mode=pvc:small
+# -ftarget-register-alloc-mode=pvc:large
+# -ftarget-register-alloc-mode=pvc:auto
+#
+
 export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
 export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
 export http_proxy=http://proxy.alcf.anl.gov:3128
--- a/systems/Aurora/tests/repro16.pbs
+++ b/systems/Aurora/tests/repro16.pbs
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
+
+#PBS -q EarlyAppAccess
+#PBS -l select=16
+#PBS -l walltime=01:00:00
+#PBS -A LatticeQCD_aesp_CNDA
+
+#export OMP_PROC_BIND=spread
+#unset OMP_PLACES
+
+cd $PBS_O_WORKDIR
+
+source ../sourceme.sh
+
+cat $PBS_NODEFILE
+
+export OMP_NUM_THREADS=3
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
+#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
+
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+export MPICH_OFI_NIC_POLICY=GPU
+
+# 12 ppn, 16 nodes, 192 ranks
+CMD="mpiexec -np 192 -ppn 12  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Test_dwf_mixedcg_prec --mpi 2.4.4.6 --grid 64.128.128.192 \
+		--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000"
+$CMD 
--- a/systems/Aurora/tests/solver/stag16.pbs
+++ b/systems/Aurora/tests/solver/stag16.pbs
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
+
+#PBS -q EarlyAppAccess
+#PBS -l select=16
+#PBS -l walltime=01:00:00
+#PBS -A LatticeQCD_aesp_CNDA
+
+#export OMP_PROC_BIND=spread
+#unset OMP_PLACES
+
+cd $PBS_O_WORKDIR
+
+source ../../sourceme.sh
+
+cat $PBS_NODEFILE
+
+export OMP_NUM_THREADS=3
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
+#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
+
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+export MPICH_OFI_NIC_POLICY=GPU
+
+# 12 ppn, 16 nodes, 192 ranks
+CMD="mpiexec -np 192 -ppn 12  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Test_staggered_cg_prec --mpi 2.4.4.6 --grid 128.128.128.192 \
+	     --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000"
+$CMD 
--- a/tests/Test_dwf_mixedcg_prec.cc
+++ b/tests/Test_dwf_mixedcg_prec.cc
@@ -30,27 +30,16 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 using namespace std;
 using namespace Grid;

-template<class d>
-struct scal {
-  d internal;
-};
-
-  Gamma::Algebra Gmu [] = {
-    Gamma::Algebra::GammaX,
-    Gamma::Algebra::GammaY,
-    Gamma::Algebra::GammaZ,
-    Gamma::Algebra::GammaT
-  };
-
 int main (int argc, char ** argv)
 {
+  char hostname[HOST_NAME_MAX+1];
+  gethostname(hostname, HOST_NAME_MAX+1);
+  std::string host(hostname);
+  
  Grid_init(&argc,&argv);

  const int Ls=12;

-  std::cout << GridLogMessage << "::::: NB: to enable a quick bit reproducibility check use the --checksums flag. " << std::endl;
-
-  { 
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
@@ -92,7 +81,14 @@ int main (int argc, char ** argv)
  SchurDiagMooeeOperator<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
  SchurDiagMooeeOperator<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f);

-  std::cout << GridLogMessage << "::::::::::::: Starting mixed CG" << std::endl;
+  int nsecs=600;
+  if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){
+    std::string arg = GridCmdOptionPayload(argv,argv+argc,"--seconds");
+    GridCmdOptionInt(arg,nsecs);
+  }
+  
+  std::cout << GridLogMessage << "::::::::::::: Starting mixed CG for "<<nsecs <<" seconds" << std::endl;
+
  MixedPrecisionConjugateGradient<LatticeFermionD,LatticeFermionF> mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO);
  double t1,t2,flops;
  double MdagMsiteflops = 1452; // Mobius (real coeffs)
@@ -101,7 +97,14 @@ int main (int argc, char ** argv)
  std:: cout << " MdagM site flops = "<< 4*MdagMsiteflops<<std::endl;
  std:: cout << " CG    site flops = "<< CGsiteflops <<std::endl;
  int iters;
-  for(int i=0;i<10;i++){
+
+  time_t start = time(NULL);
+
+  uint32_t csum, csumref;
+  csumref=0;
+  int iter=0;
+  do {
+    std::cerr << "******************* SINGLE PRECISION SOLVE "<<iter<<std::endl;
    result_o = Zero();
    t1=usecond();
    mCG(src_o,result_o);
@@ -111,10 +114,28 @@ int main (int argc, char ** argv)
    flops+= CGsiteflops*FrbGrid->gSites()*iters;
    std::cout << " SinglePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
    std::cout << " SinglePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
-  }
-  std::cout << GridLogMessage << "::::::::::::: Starting regular CG" << std::endl;
+
+    csum = crc(result_o);
+
+    if ( csumref == 0 ) {
+      csumref = csum;
+    } else {
+      if ( csum != csumref ) { 
+	std::cerr << host<<" FAILURE " <<iter <<" csum "<<std::hex<<csum<< " != "<<csumref <<std::dec<<std::endl;
+	assert(0);
+      } else {
+	std::cout << host <<" OK " <<iter <<" csum "<<std::hex<<csum<<std::dec<<" -- OK! "<<std::endl;
+      }
+    }
+    iter ++;
+  } while (time(NULL) < (start + nsecs/2) );
+    
+  std::cout << GridLogMessage << "::::::::::::: Starting double precision CG" << std::endl;
  ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
-  for(int i=0;i<1;i++){
+  csumref=0;
+  int i=0;
+  do { 
+    std::cerr << "******************* DOUBLE PRECISION SOLVE "<<i<<std::endl;
    result_o_2 = Zero();
    t1=usecond();
    CG(HermOpEO,src_o,result_o_2);
@@ -122,46 +143,30 @@ int main (int argc, char ** argv)
    iters = CG.IterationsToComplete;
    flops = MdagMsiteflops*4*FrbGrid->gSites()*iters; 
    flops+= CGsiteflops*FrbGrid->gSites()*iters;
-    
+
    std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
    std::cout << " DoublePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
-  }
-  
-  //  MemoryManager::Print();
+
+    csum = crc(result_o);
+
+    if ( csumref == 0 ) {
+      csumref = csum;
+    } else {
+      if ( csum != csumref ) { 
+	std::cerr << i <<" csum "<<std::hex<<csum<< " != "<<csumref <<std::dec<<std::endl;
+	assert(0);
+      } else {
+	std::cout << i <<" csum "<<std::hex<<csum<<std::dec<<" -- OK! "<<std::endl;
+      }
+    }
+    i++;
+  } while (time(NULL) < (start + nsecs) );

  LatticeFermionD diff_o(FrbGrid);
  RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2);

  std::cout << GridLogMessage << "::::::::::::: Diff between mixed and regular CG: " << diff << std::endl;
-
-  #ifdef HAVE_LIME
-  if( GridCmdOptionExists(argv,argv+argc,"--checksums") ){
+  assert(diff < 1e-4);
  
-  std::string file1("./Propagator1");
-  emptyUserRecord record;
-  uint32_t nersc_csum;
-  uint32_t scidac_csuma;
-  uint32_t scidac_csumb;
-  typedef SpinColourVectorD   FermionD;
-  typedef vSpinColourVectorD vFermionD;
-
-  BinarySimpleMunger<FermionD,FermionD> munge;
-  std::string format = getFormatString<vFermionD>();
-  
-  BinaryIO::writeLatticeObject<vFermionD,FermionD>(result_o,file1,munge, 0, format,
-						   nersc_csum,scidac_csuma,scidac_csumb);
-
-  std::cout << GridLogMessage << " Mixed checksums "<<std::hex << scidac_csuma << " "<<scidac_csumb<<std::endl;
-
-  BinaryIO::writeLatticeObject<vFermionD,FermionD>(result_o_2,file1,munge, 0, format,
-						   nersc_csum,scidac_csuma,scidac_csumb);
-
-  std::cout << GridLogMessage << " CG checksums "<<std::hex << scidac_csuma << " "<<scidac_csumb<<std::endl;
-  }
-  #endif
-  }
-  
-  MemoryManager::Print();
-
  Grid_finalize();
 }
Author	SHA1	Message	Date
Peter Boyle	b812a7b4c6	Staggered launch script	2024-03-06 01:32:40 +00:00
Peter Boyle	891a366f73	Repro CG script	2024-03-06 01:22:55 +00:00
Peter Boyle	10116b3be8	Force device copyable and tell SYCL to shut it.	2024-03-06 01:13:27 +00:00
Peter Boyle	a46a0f0882	force device copyable and don't take crap from SYCL	2024-03-06 01:12:49 +00:00
Peter Boyle	a26a8a38f4	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2024-03-06 00:05:00 +00:00
Peter Boyle	7435315d50	More blasted shell variables	2024-03-06 00:03:59 +00:00
Peter Boyle	9b5f741e85	Reproducing CG can be more useful now	2024-03-06 00:03:16 +00:00
Peter Boyle	517822fdd2	SPR HBM benchmarking right and also PVC batched GEMM	2024-03-06 00:02:27 +00:00
Peter Boyle	1b93a9be88	Print out the hostname	2024-03-06 00:01:58 +00:00
Peter Boyle	783a66b348	Deterministic reduction please	2024-03-06 00:01:37 +00:00
Peter Boyle	976c3e9b59	Hack for flight logging CG inner products. Can be made to work, but could put in some more serious infrastructure for repro testing and blame attribution (Britney test) if necessary	2024-03-05 23:59:57 +00:00
Peter Boyle	f8ca971dae	Use of a bare PRECISION macro is not namespace safe and collides with SYCL	2024-03-05 23:59:13 +00:00
Peter Boyle	21bc8c24df	OneMKL batched blas starting	2024-03-05 23:58:20 +00:00
Peter Boyle	30228214f7	SYCL conflict with Eigen	2024-03-05 23:56:10 +00:00