more HOST_NAME_MAX fix

fallback to _POSIX_HOST_NAME_MAX if HOST_NAME_MAX is not defined
Booster update
2025-11-11 17:19:31 +00:00 · 2024-03-07 15:26:01 +09:00 · 2024-03-07 15:22:08 +09:00 · 2024-03-06 19:03:45 +01:00 · 2024-03-06 19:03:35 +01:00 · 2024-03-06 01:32:40 +00:00
42 changed files with 2699 additions and 226 deletions
--- a/Grid/Grid_Eigen_Dense.h
+++ b/Grid/Grid_Eigen_Dense.h
@@ -34,7 +34,7 @@
 #pragma push_macro("__SYCL_DEVICE_ONLY__")
 #undef __SYCL_DEVICE_ONLY__
 #define EIGEN_DONT_VECTORIZE
-//#undef EIGEN_USE_SYCL
+#undef EIGEN_USE_SYCL
 #define __SYCL__REDEFINE__
 #endif

--- a/Grid/algorithms/approx/Zolotarev.cc
+++ b/Grid/algorithms/approx/Zolotarev.cc
@@ -293,7 +293,7 @@ static void sncndnFK(INTERNAL_PRECISION u, INTERNAL_PRECISION k,
 * Set type = 0 for the Zolotarev approximation, which is zero at x = 0, and
 * type = 1 for the approximation which is infinite at x = 0. */

-zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
+zolotarev_data* zolotarev(ZOLO_PRECISION epsilon, int n, int type) {
  INTERNAL_PRECISION A, c, cp, kp, ksq, sn, cn, dn, Kp, Kj, z, z0, t, M, F,
    l, invlambda, xi, xisq, *tv, s, opl;
  int m, czero, ts;
@@ -375,12 +375,12 @@ zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
  construct_partfrac(d);
  construct_contfrac(d);

-  /* Converting everything to PRECISION for external use only */
+  /* Converting everything to ZOLO_PRECISION for external use only */

  zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
-  zd -> A = (PRECISION) d -> A;
-  zd -> Delta = (PRECISION) d -> Delta;
-  zd -> epsilon = (PRECISION) d -> epsilon;
+  zd -> A = (ZOLO_PRECISION) d -> A;
+  zd -> Delta = (ZOLO_PRECISION) d -> Delta;
+  zd -> epsilon = (ZOLO_PRECISION) d -> epsilon;
  zd -> n = d -> n;
  zd -> type = d -> type;
  zd -> dn = d -> dn;
@@ -390,24 +390,24 @@ zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
  zd -> deg_num = d -> deg_num;
  zd -> deg_denom = d -> deg_denom;

-  zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
-  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
+  zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m];
  free(d -> a);

-  zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
-  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
+  zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m];
  free(d -> ap);

-  zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
-  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
+  zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m];
  free(d -> alpha);

-  zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
-  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
+  zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m];
  free(d -> beta);

-  zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
-  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
+  zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m];
  free(d -> gamma);

  free(d);
@@ -426,7 +426,7 @@ void zolotarev_free(zolotarev_data *zdata)
 }


-zolotarev_data* higham(PRECISION epsilon, int n) {
+zolotarev_data* higham(ZOLO_PRECISION epsilon, int n) {
  INTERNAL_PRECISION A, M, c, cp, z, z0, t, epssq;
  int m, czero;
  zolotarev_data *zd;
@@ -481,9 +481,9 @@ zolotarev_data* higham(PRECISION epsilon, int n) {
-  /* Converting everything to PRECISION for external use only */
+  /* Converting everything to ZOLO_PRECISION for external use only */

  zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
-  zd -> A = (PRECISION) d -> A;
-  zd -> Delta = (PRECISION) d -> Delta;
-  zd -> epsilon = (PRECISION) d -> epsilon;
+  zd -> A = (ZOLO_PRECISION) d -> A;
+  zd -> Delta = (ZOLO_PRECISION) d -> Delta;
+  zd -> epsilon = (ZOLO_PRECISION) d -> epsilon;
  zd -> n = d -> n;
  zd -> type = d -> type;
  zd -> dn = d -> dn;
@@ -493,24 +493,24 @@ zolotarev_data* higham(PRECISION epsilon, int n) {
  zd -> deg_num = d -> deg_num;
  zd -> deg_denom = d -> deg_denom;

-  zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
-  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
+  zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m];
  free(d -> a);

-  zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
-  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
+  zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m];
  free(d -> ap);

-  zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
-  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
+  zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m];
  free(d -> alpha);

-  zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
-  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
+  zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m];
  free(d -> beta);

-  zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
-  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
+  zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m];
  free(d -> gamma);

  free(d);
@@ -523,17 +523,17 @@ NAMESPACE_END(Grid);
 #ifdef TEST

 #undef ZERO
-#define ZERO ((PRECISION) 0)
+#define ZERO ((ZOLO_PRECISION) 0)
 #undef ONE
-#define ONE ((PRECISION) 1)
+#define ONE ((ZOLO_PRECISION) 1)
 #undef TWO
-#define TWO ((PRECISION) 2)
+#define TWO ((ZOLO_PRECISION) 2)

 /* Evaluate the rational approximation R(x) using the factored form */

-static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) {
+static ZOLO_PRECISION zolotarev_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
  int m;
-  PRECISION R;
+  ZOLO_PRECISION R;

  if (rdata -> type == 0) {
    R = rdata -> A * x;
@@ -551,9 +551,9 @@ static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) {

 /* Evaluate the rational approximation R(x) using the partial fraction form */

-static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) {
+static ZOLO_PRECISION zolotarev_partfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
  int m;
-  PRECISION R = rdata -> alpha[rdata -> da - 1];
+  ZOLO_PRECISION R = rdata -> alpha[rdata -> da - 1];
  for (m = 0; m < rdata -> dd; m++)
    R += rdata -> alpha[m] / (x * x - rdata -> ap[m]);
  if (rdata -> type == 1) R += rdata -> alpha[rdata -> dd] / (x * x);
@@ -568,18 +568,18 @@ static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) {
 * non-signalling overflow this will work correctly since 1/(1/0) = 1/INF = 0,
 * but with signalling overflow you will get an error message. */

-static PRECISION zolotarev_contfrac_eval(PRECISION x, zolotarev_data* rdata) {
+static ZOLO_PRECISION zolotarev_contfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
  int m;
-  PRECISION R = rdata -> beta[0] * x;
+  ZOLO_PRECISION R = rdata -> beta[0] * x;
  for (m = 1; m < rdata -> db; m++) R = rdata -> beta[m] * x + ONE / R;
  return R;
 }    

 /* Evaluate the rational approximation R(x) using Cayley form */

-static PRECISION zolotarev_cayley_eval(PRECISION x, zolotarev_data* rdata) {
+static ZOLO_PRECISION zolotarev_cayley_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
  int m;
-  PRECISION T;
+  ZOLO_PRECISION T;

  T = rdata -> type == 0 ? ONE : -ONE;
  for (m = 0; m < rdata -> n; m++)
@@ -607,7 +607,7 @@ int main(int argc, char** argv) {
  int m, n, plotpts = 5000, type = 0;
  float eps, x, ypferr, ycferr, ycaylerr, maxypferr, maxycferr, maxycaylerr;
  zolotarev_data *rdata;
-  PRECISION y;
+  ZOLO_PRECISION y;
  FILE *plot_function, *plot_error, 
    *plot_partfrac, *plot_contfrac, *plot_cayley;

@@ -626,13 +626,13 @@ int main(int argc, char** argv) {
  }

  rdata = type == 2 
-    ? higham((PRECISION) eps, n) 
-    : zolotarev((PRECISION) eps, n, type);
+    ? higham((ZOLO_PRECISION) eps, n) 
+    : zolotarev((ZOLO_PRECISION) eps, n, type);

  printf("Zolotarev Test: R(epsilon = %g, n = %d, type = %d)\n\t" 
 	 STRINGIFY(VERSION) "\n\t" STRINGIFY(HVERSION)
 	 "\n\tINTERNAL_PRECISION = " STRINGIFY(INTERNAL_PRECISION)
-	 "\tPRECISION = " STRINGIFY(PRECISION)
+	 "\tZOLO_PRECISION = " STRINGIFY(ZOLO_PRECISION)
 	 "\n\n\tRational approximation of degree (%d,%d), %s at x = 0\n"
 	 "\tDelta = %g (maximum error)\n\n"
 	 "\tA = %g (overall factor)\n",
@@ -681,15 +681,15 @@ int main(int argc, char** argv) {
    x = 2.4 * (float) m / plotpts - 1.2;
    if (rdata -> type == 0 || fabs(x) * (float) plotpts > 1.0) {
      /* skip x = 0 for type 1, as R(0) is singular */
-      y = zolotarev_eval((PRECISION) x, rdata);
+      y = zolotarev_eval((ZOLO_PRECISION) x, rdata);
      fprintf(plot_function, "%g %g\n", x, (float) y);
      fprintf(plot_error, "%g %g\n",
 	      x, (float)((y - ((x > 0.0 ? ONE : -ONE))) / rdata -> Delta));
-      ypferr = (float)((zolotarev_partfrac_eval((PRECISION) x, rdata) - y)
+      ypferr = (float)((zolotarev_partfrac_eval((ZOLO_PRECISION) x, rdata) - y)
 		       / rdata -> Delta);
-      ycferr = (float)((zolotarev_contfrac_eval((PRECISION) x, rdata) - y)
+      ycferr = (float)((zolotarev_contfrac_eval((ZOLO_PRECISION) x, rdata) - y)
 		       / rdata -> Delta);
-      ycaylerr = (float)((zolotarev_cayley_eval((PRECISION) x, rdata) - y)
+      ycaylerr = (float)((zolotarev_cayley_eval((ZOLO_PRECISION) x, rdata) - y)
 		       / rdata -> Delta);
      if (fabs(x) < 1.0 && fabs(x) > rdata -> epsilon) {
 	maxypferr = MAX(maxypferr, fabs(ypferr));
--- a/Grid/algorithms/approx/Zolotarev.h
+++ b/Grid/algorithms/approx/Zolotarev.h
@@ -9,10 +9,10 @@ NAMESPACE_BEGIN(Approx);
 #define HVERSION Header Time-stamp: <14-OCT-2004 09:26:51.00 adk@MISSCONTRARY>

 #ifndef ZOLOTAREV_INTERNAL
-#ifndef PRECISION
-#define PRECISION double
+#ifndef ZOLO_PRECISION
+#define ZOLO_PRECISION double
 #endif
-#define ZPRECISION PRECISION
+#define ZPRECISION ZOLO_PRECISION
 #define ZOLOTAREV_DATA zolotarev_data
 #endif

@@ -77,8 +77,8 @@ typedef struct {
 * zolotarev_data structure. The arguments must satisfy the constraints that
 * epsilon > 0, n > 0, and type = 0 or 1. */

-ZOLOTAREV_DATA* higham(PRECISION epsilon, int n) ;
-ZOLOTAREV_DATA* zolotarev(PRECISION epsilon, int n, int type);
+ZOLOTAREV_DATA* higham(ZOLO_PRECISION epsilon, int n) ;
+ZOLOTAREV_DATA* zolotarev(ZOLO_PRECISION epsilon, int n, int type);
 void zolotarev_free(zolotarev_data *zdata);
 #endif

@@ -86,3 +86,4 @@ void zolotarev_free(zolotarev_data *zdata);
 NAMESPACE_END(Approx);
 NAMESPACE_END(Grid);
 #endif
+
--- a/Grid/algorithms/blas/BatchedBlas.cc
+++ b/Grid/algorithms/blas/BatchedBlas.cc
@@ -0,0 +1,34 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: BatchedBlas.cc
+
+    Copyright (C) 2023
+
+Author: Peter Boyle <pboyle@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/GridCore.h>
+#include <Grid/algorithms/blas/BatchedBlas.h>
+NAMESPACE_BEGIN(Grid);
+gridblasHandle_t GridBLAS::gridblasHandle;
+int              GridBLAS::gridblasInit;
+NAMESPACE_END(Grid);
+
--- a/Grid/algorithms/blas/BatchedBlas.h
+++ b/Grid/algorithms/blas/BatchedBlas.h
@@ -0,0 +1,727 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: BatchedBlas.h
+
+    Copyright (C) 2023
+
+Author: Peter Boyle <pboyle@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+#ifdef GRID_HIP
+#include <hipblas/hipblas.h>
+#endif
+#ifdef GRID_CUDA
+#include <cublas_v2.h>
+#endif
+#ifdef GRID_SYCL
+#include <oneapi/mkl.hpp>
+#endif
+#if 0
+#define GRID_ONE_MKL
+#endif
+#ifdef GRID_ONE_MKL
+#include <oneapi/mkl.hpp>
+#endif
+///////////////////////////////////////////////////////////////////////	  
+// Need to rearrange lattice data to be in the right format for a
+// batched multiply. Might as well make these static, dense packed
+///////////////////////////////////////////////////////////////////////
+NAMESPACE_BEGIN(Grid);
+#ifdef GRID_HIP
+  typedef hipblasHandle_t gridblasHandle_t;
+#endif
+#ifdef GRID_CUDA
+  typedef cublasHandle_t gridblasHandle_t;
+#endif
+#ifdef GRID_SYCL
+  typedef cl::sycl::queue *gridblasHandle_t;
+#endif
+#ifdef GRID_ONE_MKL
+  typedef cl::sycl::queue *gridblasHandle_t;
+#endif
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
+  typedef int32_t gridblasHandle_t;
+#endif
+
+enum GridBLASOperation_t { GridBLAS_OP_N, GridBLAS_OP_T, GridBLAS_OP_C } ;
+
+class GridBLAS {
+public:
+
+  
+  static gridblasHandle_t gridblasHandle;
+  static int            gridblasInit;
+  
+  static void Init(void) // Creates the backend BLAS handle the first time it is called; later calls do nothing
+  {
+    if ( ! gridblasInit ) { // gridblasInit is set to 1 at the end of this branch
+#ifdef GRID_CUDA
+      std::cout << "cublasCreate"<<std::endl;
+      cublasCreate(&gridblasHandle); // NOTE(review): returned status is not checked
+      cublasSetPointerMode(gridblasHandle, CUBLAS_POINTER_MODE_DEVICE); // scalar arguments (alpha/beta) will be given as device pointers
+#endif
+#ifdef GRID_HIP
+      std::cout << "hipblasCreate"<<std::endl;
+      hipblasCreate(&gridblasHandle); // NOTE(review): status unchecked, and no pointer mode is set here unlike the CUDA branch - confirm intended
+#endif
+#ifdef GRID_SYCL
+      gridblasHandle = theGridAccelerator; // reuses a queue pointer that is defined outside this file
+#endif
+#ifdef GRID_ONE_MKL
+      cl::sycl::cpu_selector selector;
+      cl::sycl::device selectedDevice { selector };
+      gridblasHandle =new sycl::queue (selectedDevice); // NOTE(review): no matching delete in the code visible here
+#endif
+      gridblasInit=1;
+    }
+  }
+  
+  // Constructing a GridBLAS calls Init(), so the static handle is created on first construction
+  GridBLAS() { Init(); };
+  ~GridBLAS() { }; // empty: the static handle is not released here
+  
+  /////////////////////////////////////////////////////////////////////////////////////
+  // BLAS GEMM conventions:
+  /////////////////////////////////////////////////////////////////////////////////////
+  // - C = alpha A * B + beta C
+  // Dimensions:
+  // - C_m.n
+  // - A_m.k
+  // - B_k.n
+  // - Flops = 8 M N K
+  // - Bytes = 2*sizeof(word) * (MN+MK+KN)
+  // Example: M=60, K=60, N=12
+  // Flop/Byte = 8 . 60.60.12 / (60.12+60.60+60.12)/16 = 4 so expect about 4 TF/s on a GCD
+  /////////////////////////////////////////////////////////////////////////////////////
+  void synchronise(void) // Waits on the host for the backend selected at compile time; does nothing when no backend macro is defined
+  {
+#ifdef GRID_HIP
+    auto err = hipDeviceSynchronize();
+    assert(err==hipSuccess); // failure is only detected when asserts are enabled
+#endif
+#ifdef GRID_CUDA
+    auto err = cudaDeviceSynchronize();
+    assert(err==cudaSuccess); // failure is only detected when asserts are enabled
+#endif
+#ifdef GRID_SYCL
+    accelerator_barrier(); // helper defined elsewhere in Grid; presumably waits on the SYCL queue - TODO confirm
+#endif
+#ifdef GRID_ONE_MKL
+    gridblasHandle->wait(); // wait on the queue created in Init()
+#endif
+  }
+  
+  void gemmBatched(int m,int n, int k, // Convenience overload (double complex): forwards with OpA = OpB = GridBLAS_OP_N
+		   ComplexD alpha,
+		   deviceVector<ComplexD*> &Amk,  // pointer list to matrices, one entry per batch member
+		   deviceVector<ComplexD*> &Bkn,
+		   ComplexD beta,
+		   deviceVector<ComplexD*> &Cmn)
+  {
+    gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
+		m,n,k,
+		alpha,
+		Amk,
+		Bkn,
+		beta,
+		Cmn);
+  }
+  void gemmBatched(int m,int n, int k, // Convenience overload (single complex): forwards with OpA = OpB = GridBLAS_OP_N
+		   ComplexF alpha,
+		   deviceVector<ComplexF*> &Amk,  // pointer list to matrices, one entry per batch member
+		   deviceVector<ComplexF*> &Bkn,
+		   ComplexF beta,
+		   deviceVector<ComplexF*> &Cmn)
+  {
+    gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
+		m,n,k,
+		alpha,
+		Amk,
+		Bkn,
+		beta,
+		Cmn);
+  }
+  void gemmBatched(int m,int n, int k, // Convenience overload (double real): forwards with OpA = OpB = GridBLAS_OP_N
+		   RealD alpha,
+		   deviceVector<RealD*> &Amk,  // pointer list to matrices, one entry per batch member
+		   deviceVector<RealD*> &Bkn,
+		   RealD beta,
+		   deviceVector<RealD*> &Cmn)
+  {
+    gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
+		m,n,k,
+		alpha,
+		Amk,
+		Bkn,
+		beta,
+		Cmn);
+  }
+  void gemmBatched(int m,int n, int k, // Convenience overload (single real): forwards with OpA = OpB = GridBLAS_OP_N
+		   RealF alpha,
+		   deviceVector<RealF*> &Amk,  // pointer list to matrices, one entry per batch member
+		   deviceVector<RealF*> &Bkn,
+		   RealF beta,
+		   deviceVector<RealF*> &Cmn)
+  {
+    gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
+		m,n,k,
+		alpha,
+		Amk,
+		Bkn,
+		beta,
+		Cmn);
+  }
+
+  void gemmBatched(GridBLASOperation_t OpA, // Batched double-complex GEMM: C[p] = alpha*op(A[p])*op(B[p]) + beta*C[p]
+		   GridBLASOperation_t OpB,
+		   int m,int n, int k,      // op(A) is m x k, op(B) is k x n, C is m x n
+		   ComplexD alpha,
+		   deviceVector<ComplexD*> &Amk,  // pointer list to matrices, one entry per batch member
+		   deviceVector<ComplexD*> &Bkn,
+		   ComplexD beta,
+		   deviceVector<ComplexD*> &Cmn)
+  {
+    RealD t2=usecond();
+    int32_t batchCount = Amk.size();
+    assert(Bkn.size()==batchCount);
+    assert(Cmn.size()==batchCount);
+
+    int lda = m; // m x k column major
+    int ldb = k; // k x n column major
+    int ldc = m; // m x n column major
+    if(OpA!=GridBLAS_OP_N)
+      lda = k; // a transposed A is stored k x m
+    if(OpB!=GridBLAS_OP_N)
+      ldb = n; // a transposed B is stored n x k
+    
+    static deviceVector<ComplexD> alpha_p(1);
+    static deviceVector<ComplexD> beta_p(1);
+    // alpha/beta are staged in deviceVector storage for the vendor calls; can prestore the 1 and the zero on device
+    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
+    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
+    RealD t0=usecond();
+    //    std::cout << "ZgemmBatched mnk  "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
+#ifdef GRID_HIP
+    hipblasOperation_t hOpA;
+    hipblasOperation_t hOpB;
+    if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
+    if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
+    if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
+    if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
+    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
+    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
+    auto err = hipblasZgemmBatched(gridblasHandle,
+				   hOpA,
+				   hOpB,
+				   m,n,k,
+				   (hipblasDoubleComplex *) &alpha_p[0],
+				   (hipblasDoubleComplex **)&Amk[0], lda,
+				   (hipblasDoubleComplex **)&Bkn[0], ldb,
+				   (hipblasDoubleComplex *) &beta_p[0],
+				   (hipblasDoubleComplex **)&Cmn[0], ldc,
+				   batchCount);
+    //	 std::cout << " hipblas return code " <<(int)err<<std::endl;
+    assert(err==HIPBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_CUDA
+    cublasOperation_t hOpA;
+    cublasOperation_t hOpB;
+    if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
+    if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
+    if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
+    if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
+    if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
+    if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
+    auto err = cublasZgemmBatched(gridblasHandle,
+				  hOpA,
+				  hOpB,
+				  m,n,k,
+				  (cuDoubleComplex *) &alpha_p[0],
+				  (cuDoubleComplex **)&Amk[0], lda,
+				  (cuDoubleComplex **)&Bkn[0], ldb,
+				  (cuDoubleComplex *) &beta_p[0],
+				  (cuDoubleComplex **)&Cmn[0], ldc,
+				  batchCount);
+    assert(err==CUBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_SYCL
+    //MKL’s cblas_<T>gemm_batch & OneAPI
+#warning "oneMKL implementation not built "
+#endif
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+    // Default/reference implementation on the host.
+    // opA(p,i,j) / opB(p,i,j) return element (i,j) of op(A[p]) / op(B[p]): N as stored, T transposed, C conjugate transposed.
+    auto opA = [&](int p,int i,int j) { return OpA==GridBLAS_OP_N ? Amk[p][i + j*lda] : OpA==GridBLAS_OP_T ? Amk[p][j + i*lda] : conj(Amk[p][j + i*lda]); };
+    auto opB = [&](int p,int i,int j) { return OpB==GridBLAS_OP_N ? Bkn[p][i + j*ldb] : OpB==GridBLAS_OP_T ? Bkn[p][j + i*ldb] : conj(Bkn[p][j + i*ldb]); };
+    for (int p = 0; p < batchCount; ++p) {
+      for (int mm = 0; mm < m; ++mm) {
+	for (int nn = 0; nn < n; ++nn) {
+	  ComplexD c_mn(0.0);
+	  for (int kk = 0; kk < k; ++kk)
+	    c_mn += opA(p,mm,kk) * opB(p,kk,nn);
+	  Cmn[p][mm + nn*ldc] =  (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
+	}
+      }
+    }
+#endif
+    //    synchronise();
+     RealD t1=usecond();
+     RealD flops = 8.0*m*n*k*batchCount;
+     RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount;
+     //     std::cout <<GridLogMessage<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
+     //     std::cout <<GridLogMessage<< " batched Blas zGemm call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
+     //     std::cout <<GridLogMessage<< " batched Blas zGemm call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
+  }
+
+  void gemmBatched(GridBLASOperation_t OpA, // Batched single-complex GEMM: C[p] = alpha*op(A[p])*op(B[p]) + beta*C[p]
+		   GridBLASOperation_t OpB,
+		   int m,int n, int k,      // op(A) is m x k, op(B) is k x n, C is m x n
+		   ComplexF alpha,
+		   deviceVector<ComplexF*> &Amk,  // pointer list to matrices, one entry per batch member
+		   deviceVector<ComplexF*> &Bkn,
+		   ComplexF beta,
+		   deviceVector<ComplexF*> &Cmn)
+  {
+    RealD t2=usecond();
+    int32_t batchCount = Amk.size();
+
+    int lda = m; // m x k column major
+    int ldb = k; // k x n column major
+    int ldc = m; // m x n column major
+    if(OpA!=GridBLAS_OP_N)
+      lda = k; // a transposed A is stored k x m
+    if(OpB!=GridBLAS_OP_N)
+      ldb = n; // a transposed B is stored n x k
+    static deviceVector<ComplexF> alpha_p(1);
+    static deviceVector<ComplexF> beta_p(1);
+    // alpha/beta are staged in deviceVector storage for the vendor calls; can prestore the 1 and the zero on device
+    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexF));
+    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF));
+    RealD t0=usecond();
+
+    assert(Bkn.size()==batchCount);
+    assert(Cmn.size()==batchCount);
+#ifdef GRID_HIP
+    hipblasOperation_t hOpA;
+    hipblasOperation_t hOpB;
+    if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
+    if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
+    if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
+    if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
+    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
+    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
+    auto err = hipblasCgemmBatched(gridblasHandle,
+				   hOpA,
+				   hOpB,
+				   m,n,k,
+				   (hipblasComplex *) &alpha_p[0],
+				   (hipblasComplex **)&Amk[0], lda,
+				   (hipblasComplex **)&Bkn[0], ldb,
+				   (hipblasComplex *) &beta_p[0],
+				   (hipblasComplex **)&Cmn[0], ldc,
+				   batchCount);
+
+    assert(err==HIPBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_CUDA
+    cublasOperation_t hOpA;
+    cublasOperation_t hOpB;
+    if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
+    if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
+    if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
+    if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
+    if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
+    if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
+    auto err = cublasCgemmBatched(gridblasHandle,
+				  hOpA,
+				  hOpB,
+				  m,n,k,
+				  (cuComplex *) &alpha_p[0],
+				  (cuComplex **)&Amk[0], lda,
+				  (cuComplex **)&Bkn[0], ldb,
+				  (cuComplex *) &beta_p[0],
+				  (cuComplex **)&Cmn[0], ldc,
+				  batchCount);
+    assert(err==CUBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_SYCL
+    //MKL’s cblas_<T>gemm_batch & OneAPI
+#warning "oneMKL implementation not built "
+#endif
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+    // opA(p,i,j) / opB(p,i,j) return element (i,j) of op(A[p]) / op(B[p]): N as stored, T transposed, C conjugate transposed.
+    auto opA = [&](int p,int i,int j) { return OpA==GridBLAS_OP_N ? Amk[p][i + j*lda] : OpA==GridBLAS_OP_T ? Amk[p][j + i*lda] : conj(Amk[p][j + i*lda]); };
+    auto opB = [&](int p,int i,int j) { return OpB==GridBLAS_OP_N ? Bkn[p][i + j*ldb] : OpB==GridBLAS_OP_T ? Bkn[p][j + i*ldb] : conj(Bkn[p][j + i*ldb]); };
+    ComplexF alphaf(real(alpha),imag(alpha));
+    ComplexF betaf(real(beta),imag(beta));
+    // Default/reference implementation on the host
+    for (int p = 0; p < batchCount; ++p) {
+      for (int mm = 0; mm < m; ++mm) {
+	for (int nn = 0; nn < n; ++nn) {
+	  ComplexF c_mn(0.0);
+	  for (int kk = 0; kk < k; ++kk)
+	    c_mn += opA(p,mm,kk) * opB(p,kk,nn);
+	  Cmn[p][mm + nn*ldc] =  (alphaf)*c_mn + (betaf)*Cmn[p][mm + nn*ldc ];
+	}
+      }
+    }
+#endif
+     RealD t1=usecond();
+     RealD flops = 8.0*m*n*k*batchCount;
+     RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n)*batchCount;
+  }
+  
+  ///////////////////////////////////////////////////////////////////////////
+  // Single precision real GEMM
+  ///////////////////////////////////////////////////////////////////////////
+
+  void gemmBatched(GridBLASOperation_t OpA, // Batched single-real GEMM: C[p] = alpha*op(A[p])*op(B[p]) + beta*C[p]
+		   GridBLASOperation_t OpB,
+		   int m,int n, int k,      // op(A) is m x k, op(B) is k x n, C is m x n
+		   RealF alpha,
+		   deviceVector<RealF*> &Amk,  // pointer list to matrices, one entry per batch member
+		   deviceVector<RealF*> &Bkn,
+		   RealF beta,
+		   deviceVector<RealF*> &Cmn)
+  {
+    RealD t2=usecond();
+    int32_t batchCount = Amk.size();
+
+    int lda = m; // m x k column major
+    int ldb = k; // k x n column major
+    int ldc = m; // m x n column major
+    if(OpA!=GridBLAS_OP_N)
+      lda = k; // a transposed A is stored k x m
+    if(OpB!=GridBLAS_OP_N)
+      ldb = n; // a transposed B is stored n x k
+    static deviceVector<RealF> alpha_p(1);
+    static deviceVector<RealF> beta_p(1);
+    // alpha/beta are staged in deviceVector storage for the vendor calls; can prestore the 1 and the zero on device
+    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealF));
+    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealF));
+    RealD t0=usecond();
+
+    assert(Bkn.size()==batchCount);
+    assert(Cmn.size()==batchCount);
+#ifdef GRID_HIP
+    hipblasOperation_t hOpA;
+    hipblasOperation_t hOpB;
+    if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
+    if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
+    if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
+    if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
+    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
+    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
+    auto err = hipblasSgemmBatched(gridblasHandle,
+				   hOpA,
+				   hOpB,
+				   m,n,k,
+				   (float *) &alpha_p[0],
+				   (float **)&Amk[0], lda,
+				   (float **)&Bkn[0], ldb,
+				   (float *) &beta_p[0],
+				   (float **)&Cmn[0], ldc,
+				   batchCount);
+    assert(err==HIPBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_CUDA
+    cublasOperation_t hOpA;
+    cublasOperation_t hOpB;
+    if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
+    if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
+    if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
+    if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
+    if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
+    if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
+    auto err = cublasSgemmBatched(gridblasHandle,
+				  hOpA,
+				  hOpB,
+				  m,n,k,
+				  (float *) &alpha_p[0],
+				  (float **)&Amk[0], lda,
+				  (float **)&Bkn[0], ldb,
+				  (float *) &beta_p[0],
+				  (float **)&Cmn[0], ldc,
+				  batchCount);
+    assert(err==CUBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_SYCL
+    //MKL’s cblas_<T>gemm_batch & OneAPI
+#warning "oneMKL implementation not built "
+#endif
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+    // opA(p,i,j) / opB(p,i,j) return element (i,j) of op(A[p]) / op(B[p]); for real data T and C both mean transpose.
+    auto opA = [&](int p,int i,int j) { return OpA==GridBLAS_OP_N ? Amk[p][i + j*lda] : Amk[p][j + i*lda]; };
+    auto opB = [&](int p,int i,int j) { return OpB==GridBLAS_OP_N ? Bkn[p][i + j*ldb] : Bkn[p][j + i*ldb]; };
+    // Default/reference implementation on the host; the dot product is accumulated in RealD
+    for (int p = 0; p < batchCount; ++p) {
+      for (int mm = 0; mm < m; ++mm) {
+	for (int nn = 0; nn < n; ++nn) {
+	  RealD c_mn(0.0);
+	  for (int kk = 0; kk < k; ++kk)
+	    c_mn += opA(p,mm,kk) * opB(p,kk,nn);
+	  Cmn[p][mm + nn*ldc] =  (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
+	}
+      }
+    }
+#endif
+     RealD t1=usecond();
+     RealD flops = 2.0*m*n*k*batchCount;
+     RealD bytes = 1.0*sizeof(RealF)*(m*k+k*n+m*n)*batchCount;
+  }
+  
+  
+  ///////////////////////////////////////////////////////////////////////////
+  // Double precision real GEMM
+  ///////////////////////////////////////////////////////////////////////////
+
+  void gemmBatched(GridBLASOperation_t OpA, // Batched double-real GEMM: C[p] = alpha*op(A[p])*op(B[p]) + beta*C[p]
+		   GridBLASOperation_t OpB,
+		   int m,int n, int k,      // op(A) is m x k, op(B) is k x n, C is m x n
+		   RealD alpha,
+		   deviceVector<RealD*> &Amk,  // pointer list to matrices, one entry per batch member
+		   deviceVector<RealD*> &Bkn,
+		   RealD beta,
+		   deviceVector<RealD*> &Cmn)
+  {
+    RealD t2=usecond();
+    int32_t batchCount = Amk.size();
+
+    int lda = m; // m x k column major
+    int ldb = k; // k x n column major
+    int ldc = m; // m x n column major
+    if(OpA!=GridBLAS_OP_N)
+      lda = k; // a transposed A is stored k x m
+    if(OpB!=GridBLAS_OP_N)
+      ldb = n; // a transposed B is stored n x k
+    
+    static deviceVector<RealD> alpha_p(1);
+    static deviceVector<RealD> beta_p(1);
+    // alpha/beta are staged in deviceVector storage for the vendor calls; can prestore the 1 and the zero on device
+    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealD));
+    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealD));
+    RealD t0=usecond();
+
+    assert(Bkn.size()==batchCount);
+    assert(Cmn.size()==batchCount);
+#ifdef GRID_HIP
+    hipblasOperation_t hOpA;
+    hipblasOperation_t hOpB;
+    if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
+    if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
+    if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
+    if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
+    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
+    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
+    auto err = hipblasDgemmBatched(gridblasHandle,
+				   hOpA, // pass the requested operations, matching lda/ldb chosen above
+				   hOpB,
+				   m,n,k,
+				   (double *) &alpha_p[0],
+				   (double **)&Amk[0], lda,
+				   (double **)&Bkn[0], ldb,
+				   (double *) &beta_p[0],
+				   (double **)&Cmn[0], ldc,
+				   batchCount);
+    assert(err==HIPBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_CUDA
+    cublasOperation_t hOpA;
+    cublasOperation_t hOpB;
+    if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
+    if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
+    if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
+    if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
+    if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
+    if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
+    auto err = cublasDgemmBatched(gridblasHandle,
+				  hOpA,
+				  hOpB,
+				  m,n,k,
+				  (double *) &alpha_p[0],
+				  (double **)&Amk[0], lda,
+				  (double **)&Bkn[0], ldb,
+				  (double *) &beta_p[0],
+				  (double **)&Cmn[0], ldc,
+				  batchCount);
+    assert(err==CUBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_SYCL
+    /*
+      int64_t m64=m;
+      int64_t n64=n;
+      int64_t k64=k;
+      int64_t batchCount64=batchCount;
+      oneapi::mkl::blas::column_major::gemm_batch(*theGridAccelerator,
+      onemkl::transpose::N,
+      onemkl::transpose::N,
+      &m64,&n64,&k64,
+      (double *) &alpha_p[0],
+      (double **)&Amk[0], lda,
+      (double **)&Bkn[0], ldb,
+      (double *) &beta_p[0],
+      (double **)&Cmn[0], ldc,
+      1,&batchCount64);
+     */
+    //MKL’s cblas_<T>gemm_batch & OneAPI
+#warning "oneMKL implementation not built "
+#endif
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+    // opA(p,i,j) / opB(p,i,j) return element (i,j) of op(A[p]) / op(B[p]); for real data T and C both mean transpose.
+    auto opA = [&](int p,int i,int j) { return OpA==GridBLAS_OP_N ? Amk[p][i + j*lda] : Amk[p][j + i*lda]; };
+    auto opB = [&](int p,int i,int j) { return OpB==GridBLAS_OP_N ? Bkn[p][i + j*ldb] : Bkn[p][j + i*ldb]; };
+    // Default/reference implementation on the host
+    for (int p = 0; p < batchCount; ++p) {
+      for (int mm = 0; mm < m; ++mm) {
+	for (int nn = 0; nn < n; ++nn) {
+	  RealD c_mn(0.0);
+	  for (int kk = 0; kk < k; ++kk)
+	    c_mn += opA(p,mm,kk) * opB(p,kk,nn);
+	  Cmn[p][mm + nn*ldc] =  (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
+	}
+      }
+    }
+#endif
+     RealD t1=usecond();
+     RealD flops = 2.0*m*n*k*batchCount;
+     RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
+  }
+  
+
+  
+  ////////////////////////////////////////////////////////////////////////////////////////////////
+  // Strided case used by benchmark, but generally unused in Grid
+  // Keep a code example in double complex, but don't generate the single and real variants for now
+  ////////////////////////////////////////////////////////////////////////////////////////////////
+  
+  void gemmStridedBatched(int m,int n, int k,
+			  ComplexD alpha,
+			  ComplexD* Amk,  // base of batchCount column-major m x k matrices, batch stride m*k
+			  ComplexD* Bkn,  // base of batchCount column-major k x n matrices, batch stride k*n
+			  ComplexD beta,
+			  ComplexD* Cmn,  // base of batchCount column-major m x n matrices, batch stride m*n; updated in place
+			  int batchCount)
+  {
+    // Computes C[p] = alpha*A[p]*B[p] + beta*C[p] for p in [0,batchCount); no transposes are applied.
+    int lda = m; // m x k column major
+    int ldb = k; // k x n column major
+    int ldc = m; // m x n column major
+    int sda = m*k;
+    int sdb = k*n;
+    int sdc = m*n;
+    deviceVector<ComplexD> alpha_p(1);
+    deviceVector<ComplexD> beta_p(1);
+    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
+    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
+    // alpha_p/beta_p are the copies handed to the hipBLAS and cuBLAS calls;
+    // the oneMKL call and the host fallback use alpha and beta directly.
+    // hipBLAS and cuBLAS both return a status, which is asserted below.
+#ifdef GRID_HIP
+    auto err = hipblasZgemmStridedBatched(gridblasHandle,
+					  HIPBLAS_OP_N,
+					  HIPBLAS_OP_N,
+					  m,n,k,
+					  (hipblasDoubleComplex *) &alpha_p[0],
+					  (hipblasDoubleComplex *) Amk, lda, sda,
+					  (hipblasDoubleComplex *) Bkn, ldb, sdb,
+					  (hipblasDoubleComplex *) &beta_p[0],
+					  (hipblasDoubleComplex *) Cmn, ldc, sdc,
+					  batchCount);
+    assert(err==HIPBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_CUDA
+    auto err = cublasZgemmStridedBatched(gridblasHandle,
+					 CUBLAS_OP_N,
+					 CUBLAS_OP_N,
+					 m,n,k,
+					 (cuDoubleComplex *) &alpha_p[0],
+					 (cuDoubleComplex *) Amk, lda, sda,
+					 (cuDoubleComplex *) Bkn, ldb, sdb,
+					 (cuDoubleComplex *) &beta_p[0],
+					 (cuDoubleComplex *) Cmn, ldc, sdc,
+					 batchCount);
+    assert(err==CUBLAS_STATUS_SUCCESS);
+#endif
+#if defined(GRID_SYCL) || defined(GRID_ONE_MKL)
+    oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
+						oneapi::mkl::transpose::N,
+						oneapi::mkl::transpose::N,
+						m,n,k,
+						alpha,
+						(const ComplexD *)Amk,lda,sda,
+						(const ComplexD *)Bkn,ldb,sdb,
+						beta,
+						(ComplexD *)Cmn,ldc,sdc,
+						batchCount);
+#endif
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
+     // Host reference implementation: plain triple loop per batch entry
+     for (int p = 0; p < batchCount; ++p) {
+       for (int mm = 0; mm < m; ++mm) {
+	 for (int nn = 0; nn < n; ++nn) {
+	   ComplexD c_mn(0.0);
+	   for (int kk = 0; kk < k; ++kk)
+	     c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
+	   Cmn[mm + nn*ldc + p*sdc] =  (alpha)*c_mn + (beta)*Cmn[mm + nn*ldc + p*sdc];
+	 }
+       }
+     }
+#endif
+  }
+
+  double benchmark(int M, int N, int K, int BATCH)
+  {
+    // Times ncall strided batched ZGEMM calls on zero-filled matrices and returns the rate.
+    // Element counts are formed in size_t so that M*K*BATCH cannot overflow 32-bit int.
+    size_t N_A = (size_t)M*K*BATCH;
+    size_t N_B = (size_t)K*N*BATCH;
+    size_t N_C = (size_t)M*N*BATCH;
+    deviceVector<ComplexD> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD));
+    deviceVector<ComplexD> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD));
+    deviceVector<ComplexD> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD));
+    ComplexD alpha(1.0);
+    ComplexD beta (1.0);
+    int ncall=10;
+    RealD t0 = usecond();
+    for(int i=0;i<ncall;i++){
+      gemmStridedBatched(M,N,K,
+			 alpha,
+			 &A[0], // m x k 
+			 &B[0], // k x n
+			 beta, 
+			 &C[0], // m x n
+			 BATCH);
+    }
+    synchronise();
+    RealD t1 = usecond();
+    // 8 real flops are counted per complex multiply-accumulate, over all calls.
+    RealD flops = 8.0*M*N*K*BATCH*ncall;
+    return flops/(t1-t0)/1.e3; // Returns gigaflops (usecond() presumably in microseconds - confirm)
+  }
+
+
+
+
+};
+
+NAMESPACE_END(Grid);
--- a/Grid/allocator/AlignedAllocator.h
+++ b/Grid/allocator/AlignedAllocator.h
@@ -176,6 +176,7 @@ template<class T> using cshiftAllocator = std::allocator<T>;
 template<class T> using Vector        = std::vector<T,uvmAllocator<T> >;           
 template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;           
 template<class T> using commVector = std::vector<T,devAllocator<T> >;
+template<class T> using deviceVector  = std::vector<T,devAllocator<T> >;
 template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;

 NAMESPACE_END(Grid);
--- a/Grid/lattice/Lattice.h
+++ b/Grid/lattice/Lattice.h
@@ -35,6 +35,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_transpose.h>
 #include <Grid/lattice/Lattice_local.h>
 #include <Grid/lattice/Lattice_reduction.h>
+#include <Grid/lattice/Lattice_crc.h>
 #include <Grid/lattice/Lattice_peekpoke.h>
 #include <Grid/lattice/Lattice_reality.h>
 #include <Grid/lattice/Lattice_real_imag.h>
@@ -46,5 +47,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_unary.h>
 #include <Grid/lattice/Lattice_transfer.h>
 #include <Grid/lattice/Lattice_basis.h>
-#include <Grid/lattice/Lattice_crc.h>
 #include <Grid/lattice/PaddedCell.h>
--- a/Grid/lattice/Lattice_basis.h
+++ b/Grid/lattice/Lattice_basis.h
@@ -62,7 +62,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
    basis_v.push_back(basis[k].View(AcceleratorWrite));
  }

-#if ( (!defined(GRID_CUDA)) )
+#if ( !(defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)) )
  int max_threads = thread_max();
  Vector < vobj > Bt(Nm * max_threads);
  thread_region
--- a/Grid/lattice/Lattice_crc.h
+++ b/Grid/lattice/Lattice_crc.h
@@ -42,13 +42,13 @@ template<class vobj> void DumpSliceNorm(std::string s,Lattice<vobj> &f,int mu=-1
  }
 }

-template<class vobj> uint32_t crc(Lattice<vobj> & buf)
+template<class vobj> uint32_t crc(const Lattice<vobj> & buf)
 {
  autoView( buf_v , buf, CpuRead);
  return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites());
 }

-#define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
+#define CRC(U) std::cerr << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;

 NAMESPACE_END(Grid);

--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -31,6 +31,7 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #if defined(GRID_SYCL)
 #include <Grid/lattice/Lattice_reduction_sycl.h>
 #endif
+#include <Grid/lattice/Lattice_slicesum_core.h>

 NAMESPACE_BEGIN(Grid);

@@ -284,6 +285,7 @@ template<class vobj>
 inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
  GridBase *grid = left.Grid();
  ComplexD nrm = rankInnerProduct(left,right);
+  //  std::cerr<<"flight log " << std::hexfloat << nrm <<" "<<crc(left)<<std::endl;
  grid->GlobalSum(nrm);
  return nrm;
 }
@@ -448,19 +450,10 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
  int e1=    grid->_slice_nblock[orthogdim];
  int e2=    grid->_slice_block [orthogdim];
  int stride=grid->_slice_stride[orthogdim];
-
-  // sum over reduced dimension planes, breaking out orthog dir
-  // Parallel over orthog direction
-  autoView( Data_v, Data, CpuRead);
-  thread_for( r,rd, {
-    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
-    for(int n=0;n<e1;n++){
-      for(int b=0;b<e2;b++){
-	int ss= so+n*stride+b;
-	lvSum[r]=lvSum[r]+Data_v[ss];
-      }
-    }
-  });
+  int ostride=grid->_ostride[orthogdim];
+  
+  //Reduce Data down to lvSum
+  sliceSumReduction(Data,lvSum,rd, e1,e2,stride,ostride,Nsimd);

  // Sum across simd lanes in the plane, breaking out orthog dir.
  Coordinate icoor(Nd);
@@ -504,6 +497,7 @@ sliceSum(const Lattice<vobj> &Data,int orthogdim)
  return result;
 }

+
 template<class vobj>
 static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim) 
 {
--- a/Grid/lattice/Lattice_slicesum_core.h
+++ b/Grid/lattice/Lattice_slicesum_core.h
@@ -0,0 +1,213 @@
+#pragma once
+#include <type_traits>
+#if defined(GRID_CUDA)
+
+#include <cub/cub.cuh>
+#define gpucub cub
+#define gpuError_t cudaError_t
+#define gpuSuccess cudaSuccess
+
+#elif defined(GRID_HIP)
+
+#include <hipcub/hipcub.hpp>
+#define gpucub hipcub
+#define gpuError_t hipError_t
+#define gpuSuccess hipSuccess
+
+#endif
+
+
+NAMESPACE_BEGIN(Grid);
+
+
+#if defined(GRID_CUDA) || defined(GRID_HIP)
+template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
+  // Sums Data over each of the rd slices with one segmented reduction and writes the results to lvSum[0..rd).
+  // Slice r holds e1*e2 sites, read from Data at index r*ostride + n*stride + b (n < e1, b < e2).
+  size_t subvol_size = e1*e2;
+  commVector<vobj> reduction_buffer(rd*subvol_size);
+  auto rb_p = &reduction_buffer[0];
+  vobj zero_init;
+  zeroit(zero_init);
+
+  void *temp_storage_array = NULL;
+  size_t temp_storage_bytes = 0;
+  vobj *d_out;
+  int* d_offsets;
+
+  // Segment r occupies [offsets[r], offsets[r+1]) of the packed reduction buffer.
+  std::vector<int> offsets(rd+1,0);
+  for (size_t i = 0; i < offsets.size(); i++) {
+    offsets[i] = i*subvol_size;
+  }
+  
+  //Allocate memory for output and offset arrays on device
+  d_out = static_cast<vobj*>(acceleratorAllocDevice(rd*sizeof(vobj)));
+  
+  d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));
+  
+  //copy offsets to device
+  acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
+  
+  // Setup call: with a NULL temp buffer it only reports the required scratch size in temp_storage_bytes.
+  gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
+  if (gpuErr!=gpuSuccess) {
+    std::cout << GridLogError << "Lattice_slicesum_core.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce (setup)! Error: " << gpuErr <<std::endl;
+    exit(EXIT_FAILURE);
+  }
+
+  //allocate memory for temp_storage_array  
+  temp_storage_array = acceleratorAllocDevice(temp_storage_bytes);
+  
+  //prepare buffer for reduction
+  //use non-blocking accelerator_for to avoid syncs (ok because we submit to same computeStream)
+  //use 2d accelerator_for to avoid launch latencies found when serially looping over rd 
+  accelerator_for2dNB( s,subvol_size, r,rd, Nsimd,{ 
+  
+    int n = s / e2;
+    int b = s % e2;
+    int so=r*ostride; // base offset for start of plane 
+    int ss= so+n*stride+b;
+
+    coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss]));
+
+  });
+  
+  //issue segmented reductions in computeStream
+  gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p, d_out, rd, d_offsets, d_offsets+1,::gpucub::Sum(), zero_init, computeStream);
+  if (gpuErr!=gpuSuccess) {
+    std::cout << GridLogError << "Lattice_slicesum_core.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce! Error: " << gpuErr <<std::endl;
+    exit(EXIT_FAILURE);
+  }
+  
+  acceleratorCopyFromDeviceAsync(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
+  
+  //sync after copy
+  accelerator_barrier();
+ 
+  acceleratorFreeDevice(temp_storage_array);
+  acceleratorFreeDevice(d_out);
+  acceleratorFreeDevice(d_offsets);
+  
+}
+
+template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
+  // Slice sum for vobj too large for a direct cub/hipcub reduction: reduces one `vector` word at a time.
+  typedef typename vobj::vector_type vector;
+  const int words = sizeof(vobj)/sizeof(vector);
+  const int osites = rd*e1*e2;
+  commVector<vector>buffer(osites);
+  vector *dat = (vector *)Data;
+  vector *buf = &buffer[0];
+  Vector<vector> lvSum_small(rd);
+  vector *lvSum_ptr = (vector *)&lvSum[0];
+  // Data and lvSum are reinterpreted as arrays of `vector`, with `words` consecutive entries per vobj.
+  for (int w = 0; w < words; w++) {
+    accelerator_for(ss,osites,1,{
+	    buf[ss] = dat[ss*words+w];
+    });
+    // buf now holds word w of each of the osites objects; reuse the small-object reduction on it.
+    sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
+    // Scatter the rd per-slice sums back into word w of each lvSum entry.
+    for (int r = 0; r < rd; r++) {
+      lvSum_ptr[w+words*r]=lvSum_small[r];
+    }
+
+  }
+
+}
+
+template<class vobj> inline void sliceSumReduction_cub(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
+{
+  // hipcub/cub cannot deal with large vobjs, so pick the small or large path from sizeof(vobj) at compile time.
+  autoView(Data_v, Data, AcceleratorRead);
+  if constexpr (sizeof(vobj) > 256) {
+    sliceSumReduction_cub_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
+  } else {
+    sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
+  }
+}
+#endif
+
+
+#if defined(GRID_SYCL)
+template<class vobj> inline void sliceSumReduction_sycl(const Lattice<vobj> &Data, Vector <vobj> &lvSum, const int  &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
+{
+  // SYCL slice sum: packs each of the rd slices contiguously, then runs one sycl::reduction per slice into lvSum[r].
+  typedef typename vobj::scalar_object sobj; // NOTE(review): sobj appears unused in this function
+  size_t subvol_size = e1*e2;
+  // Single accumulator obtained from malloc_shared; it is reset and reused for every slice below.
+  vobj *mysum = (vobj *) malloc_shared(sizeof(vobj),*theGridAccelerator);
+  vobj vobj_zero;
+  zeroit(vobj_zero);
+    
+  commVector<vobj> reduction_buffer(rd*subvol_size);    
+  auto rb_p = &reduction_buffer[0];
+
+  autoView(Data_v, Data, AcceleratorRead);
+
+  //prepare reduction buffer 
+  accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{ 
+  
+      int n = s / e2;
+      int b = s % e2;
+      int so=r*ostride; // base offset for start of plane 
+      int ss= so+n*stride+b;
+
+      coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data_v[ss]));
+
+  });
+  // One blocking reduction per slice: reset the accumulator, submit, wait, then read the result.
+  for (int r = 0; r < rd; r++) {
+      mysum[0] = vobj_zero; //dirty hack: cannot pass vobj_zero as identity to sycl::reduction as it is not device_copyable
+      theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
+          auto Reduction = cl::sycl::reduction(mysum,std::plus<>());
+          cgh.parallel_for(cl::sycl::range<1>{subvol_size},
+          Reduction,
+          [=](cl::sycl::id<1> item, auto &sum) {
+              auto s = item[0];
+              sum += rb_p[r*subvol_size+s];
+          });
+      });
+      theGridAccelerator->wait();
+      lvSum[r] = mysum[0];
+  }
+  
+  free(mysum,*theGridAccelerator);
+}
+#endif
+
+template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
+{
+  // Host path: add every site of plane `plane` onto lvSum[plane].
+  // The outer loop over the rd planes is threaded; each plane is e1 blocks of e2 sites.
+  autoView( Data_v, Data, CpuRead);
+  thread_for( plane,rd, {
+    const int plane_base = plane*ostride; // first site of this plane
+    for(int blk=0;blk<e1;blk++){
+      const int blk_base = plane_base + blk*stride;
+      for(int off=0;off<e2;off++){
+        lvSum[plane]=lvSum[plane]+Data_v[blk_base+off];
+      }
+    }
+  });
+}
+
+template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) 
+{
+  // Compile-time choice of the per-slice reduction backend:
+  //   GRID_CUDA or GRID_HIP -> sliceSumReduction_cub
+  //   GRID_SYCL             -> sliceSumReduction_sycl
+  //   otherwise             -> sliceSumReduction_cpu
+  // All three take the same argument list.
+#if defined(GRID_CUDA) || defined(GRID_HIP)
+  sliceSumReduction_cub(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
+#elif defined(GRID_SYCL)
+  sliceSumReduction_sycl(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
+#else
+  sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
+#endif
+}
+
+
+NAMESPACE_END(Grid);
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -469,15 +469,13 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
  Coordinate fine_rdimensions = fine->_rdimensions;
  Coordinate coarse_rdimensions = coarse->_rdimensions;

-  vobj zz = Zero();
-  
  accelerator_for(sc,coarse->oSites(),1,{

      // One thread per sub block
      Coordinate coor_c(_ndimension);
      Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions);  // Block coordinate

-      vobj cd = zz;
+      vobj cd = Zero();
      
      for(int sb=0;sb<blockVol;sb++){

--- a/Grid/lattice/Lattice_view.h
+++ b/Grid/lattice/Lattice_view.h
@@ -45,6 +45,7 @@ public:
  };
  // Host only
  GridBase * getGrid(void) const { return _grid; };
+  vobj* getHostPointer(void) const { return _odata; };
 };

 /////////////////////////////////////////////////////////////////////////////////////////
--- a/Grid/perfmon/Tracing.h
+++ b/Grid/perfmon/Tracing.h
@@ -34,7 +34,7 @@ class GridTracer {
 };
 inline void tracePush(const char *name) { roctxRangePushA(name); }
 inline void tracePop(const char *name) { roctxRangePop(); }
-inline int  traceStart(const char *name) { roctxRangeStart(name); }
+inline int  traceStart(const char *name) { return roctxRangeStart(name); }
 inline void traceStop(int ID) { roctxRangeStop(ID); }
 #endif

--- a/Grid/qcd/action/fermion/WilsonTMFermion.h
+++ b/Grid/qcd/action/fermion/WilsonTMFermion.h
@@ -63,7 +63,9 @@ public:
  virtual void MooeeDag(const FermionField &in, FermionField &out) ;
  virtual void MooeeInv(const FermionField &in, FermionField &out) ;
  virtual void MooeeInvDag(const FermionField &in, FermionField &out) ;
-
+  virtual void M(const FermionField &in, FermionField &out) ;
+  virtual void Mdag(const FermionField &in, FermionField &out) ;
+  
 private:
  RealD mu; // TwistedMass parameter

--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
@@ -280,20 +280,16 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,

  if( interior && exterior ) { 
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGeneric,1); return;}
-#ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1);    return;}
+#ifndef GRID_CUDA
    if (Opt == OptInlineAsm  ) {  ASM_CALL(DhopSiteAsm);     return;}
 #endif
  } else if( interior ) {
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericInt,1); return;}
-#ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,1);    return;}
-#endif
  } else if( exterior ) { 
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericExt,1); return;}
-#ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,1);    return;}
-#endif
  }
  assert(0 && " Kernel optimisation case not covered ");
 }
@@ -322,19 +318,13 @@ void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
  
  if( interior && exterior ) { 
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGeneric,0); return;}
-#ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,0);    return;}
-#endif
  } else if( interior ) {
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericInt,0); return;}
-#ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,0);    return;}
-#endif
  } else if( exterior ) { 
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericExt,0); return;}
-#ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,0);    return;}
-#endif
  }
 }

--- a/Grid/qcd/action/fermion/implementation/WilsonTMFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonTMFermionImplementation.h
@@ -93,5 +93,25 @@ void WilsonTMFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &ou
  RealD b    = tm /sq;
  axpibg5x(out,in,a,b);
 }
+template<class Impl>
+void WilsonTMFermion<Impl>::M(const FermionField &in, FermionField &out) {
+  // out = Dhop(in) + axpibg5x(in; 4+mass, mu)  (axpibg5x presumably a*in + i*b*gamma5*in - confirm)
+  out.Checkerboard() = in.Checkerboard();
+  this->Dhop(in, out, DaggerNo);
+  RealD coeff_id = 4.0+this->mass, coeff_g5 = this->mu;
+  FermionField diag_term(out.Grid());
+  axpibg5x(diag_term,in,coeff_id,coeff_g5);
+  axpy(out, 1.0, diag_term, out);
+}
+template<class Impl>
+void WilsonTMFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
+  // Same as M but with DaggerYes for the hop and the sign of mu flipped in the axpibg5x term.
+  out.Checkerboard() = in.Checkerboard();
+  this->Dhop(in, out, DaggerYes);
+  RealD coeff_id = 4.0+this->mass, coeff_g5 = -this->mu;
+  FermionField diag_term(out.Grid());
+  axpibg5x(diag_term,in,coeff_id,coeff_g5);
+  axpy(out, 1.0, diag_term, out);
+}

 NAMESPACE_END(Grid);
--- a/Grid/qcd/hmc/integrators/Integrator.h
+++ b/Grid/qcd/hmc/integrators/Integrator.h
@@ -237,7 +237,7 @@ public:

    for (int level = 0; level < as.size(); ++level) {
      int multiplier = as.at(level).multiplier;
-      ActionLevel<Field> * Level = new ActionLevel<Field>(multiplier);
+      ActionLevel<Field, RepresentationPolicy> * Level = new ActionLevel<Field, RepresentationPolicy>(multiplier);
      Level->push_back(new EmptyAction<Field>); 
      LevelForces.push_back(*Level);
      // does it copy by value or reference??
--- a/Grid/qcd/smearing/HISQSmearing.h
+++ b/Grid/qcd/smearing/HISQSmearing.h
@@ -138,7 +138,7 @@ public:
        for(int nu=0;nu<Nd;nu++) {
            appendShift(shifts,mu);
            appendShift(shifts,nu);
-            appendShift(shifts,NO_SHIFT);
+            appendShift(shifts,shiftSignal::NO_SHIFT);
            appendShift(shifts,mu,Back(nu));
            appendShift(shifts,Back(nu));
            appendShift(shifts,Back(mu));
@@ -173,7 +173,6 @@ public:
            auto gStencil_v = gStencil.View(); 

            accelerator_for(site,Nsites,Simd::Nsimd(),{ // ----------- 3-link constructs
-//            for(int site=0;site<Nsites;site++){ // ----------- 3-link constructs
                stencilElement SE0, SE1, SE2, SE3, SE4, SE5;
                U3matrix U0, U1, U2, U3, U4, U5, W;
                for(int nu=0;nu<Nd;nu++) {
@@ -216,7 +215,6 @@ public:
            })

            accelerator_for(site,Nsites,Simd::Nsimd(),{ // ----------- 5-link 
-//            for(int site=0;site<Nsites;site++){ // ----------- 5-link
                stencilElement SE0, SE1, SE2, SE3, SE4, SE5;
                U3matrix U0, U1, U2, U3, U4, U5, W;
                int sigmaIndex = 0;
@@ -254,7 +252,6 @@ public:
            })

            accelerator_for(site,Nsites,Simd::Nsimd(),{ // ----------- 7-link
-//            for(int site=0;site<Nsites;site++){ // ----------- 7-link
                stencilElement SE0, SE1, SE2, SE3, SE4, SE5;
                U3matrix U0, U1, U2, U3, U4, U5, W;
                int sigmaIndex = 0;
--- a/Grid/simd/Grid_vector_types.h
+++ b/Grid/simd/Grid_vector_types.h
@@ -1133,4 +1133,13 @@ static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths inc

 NAMESPACE_END(Grid);

+#ifdef GRID_SYCL // SYCL builds only: declare the Grid SIMD vector types below as device-copyable.
+template<> struct sycl::is_device_copyable<Grid::vComplexF> : public std::true_type {};
+template<> struct sycl::is_device_copyable<Grid::vComplexD> : public std::true_type {};
+template<> struct sycl::is_device_copyable<Grid::vRealF   > : public std::true_type {};
+template<> struct sycl::is_device_copyable<Grid::vRealD   > : public std::true_type {};
+template<> struct sycl::is_device_copyable<Grid::vInteger > : public std::true_type {};
+#endif
+
+
 #endif
--- a/Grid/stencil/GeneralLocalStencil.h
+++ b/Grid/stencil/GeneralLocalStencil.h
@@ -141,8 +141,14 @@ public:
 ////////////////////////////////////////////////
 // Some machinery to streamline making a stencil 
 ////////////////////////////////////////////////
-#define BACKWARD_CONST 16
-#define NO_SHIFT -1
+
+class shiftSignal { // Scoped home for the special direction codes used by Back() and generalShift().
+public:
+    enum {
+        BACKWARD_CONST = 16, // Back(dir) adds this; generalShift treats dir >= this as a backward step
+        NO_SHIFT       = -1  // direction code for which generalShift leaves the coordinate unchanged
+    };
+};

 // TODO: put a check somewhere that BACKWARD_CONST > Nd!

@@ -150,16 +156,16 @@ public:
 inline int Back(const int dir) {
    // generalShift will use BACKWARD_CONST to determine whether we step forward or 
    // backward. Trick inspired by SIMULATeQCD. 
-    return dir + BACKWARD_CONST;
+    return dir + shiftSignal::BACKWARD_CONST;
 }

 /*!  @brief shift one unit in direction dir */
 template<typename... Args>
 void generalShift(Coordinate& shift, int dir) {
-    if (dir >= BACKWARD_CONST) {
-        dir -= BACKWARD_CONST;
+    if (dir >= shiftSignal::BACKWARD_CONST) {
+        dir -= shiftSignal::BACKWARD_CONST;
        shift[dir]+=-1;
-    } else if (dir == NO_SHIFT) {
+    } else if (dir == shiftSignal::NO_SHIFT) {
        ; // do nothing
    } else {
        shift[dir]+=1;
@@ -169,10 +175,10 @@ void generalShift(Coordinate& shift, int dir) {
 /*!  @brief follow a path of directions, shifting one unit in each direction */
 template<typename... Args>
 void generalShift(Coordinate& shift, int dir, Args... args) {
-    if (dir >= BACKWARD_CONST) {
-        dir -= BACKWARD_CONST;
+    if (dir >= shiftSignal::BACKWARD_CONST) {
+        dir -= shiftSignal::BACKWARD_CONST;
        shift[dir]+=-1;
-    } else if (dir == NO_SHIFT) {
+    } else if (dir == shiftSignal::NO_SHIFT) {
        ; // do nothing
    } else {
        shift[dir]+=1;
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -706,7 +706,7 @@ public:
 	}
      }
    }
-    std::cout << GridLogDebug << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
+    //std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
  }
  /// Introduce a block structure and switch off comms on boundaries
  void DirichletBlock(const Coordinate &dirichlet_block)
@@ -761,7 +761,8 @@ public:
 		   int checkerboard,
 		   const std::vector<int> &directions,
 		   const std::vector<int> &distances,
-		   Parameters p=Parameters())
+		   Parameters p=Parameters(),
+		   bool preserve_shm=false)
  {
    face_table_computed=0;
    _grid    = grid;
@@ -855,7 +856,9 @@ public:
    /////////////////////////////////////////////////////////////////////////////////
    const int Nsimd = grid->Nsimd();

-    _grid->ShmBufferFreeAll();
+    // Allow for multiple stencils to exist simultaneously
+    if (!preserve_shm)
+      _grid->ShmBufferFreeAll();

    int maxl=2;
    u_simd_send_buf.resize(maxl);
--- a/Grid/tensors/Tensor_traits.h
+++ b/Grid/tensors/Tensor_traits.h
@@ -404,3 +404,12 @@ NAMESPACE_BEGIN(Grid);
  };
 NAMESPACE_END(Grid);

+
+#ifdef GRID_SYCL // SYCL builds only: Grid tensors that are not trivially copyable are still declared device-copyable.
+template<typename T> struct
+sycl::is_device_copyable<T, typename std::enable_if<
+			      Grid::isGridTensor<T>::value  && (!std::is_trivially_copyable<T>::value),
+			      void>::type>
+  : public std::true_type {};
+#endif
+
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -225,6 +225,8 @@ inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
+inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
+inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
 {
@@ -253,17 +255,13 @@ inline int  acceleratorIsCommunicable(void *ptr)
 #define GRID_SYCL_LEVEL_ZERO_IPC

 NAMESPACE_END(Grid);
-#if 0
-#include <CL/sycl.hpp>
-#include <CL/sycl/usm.hpp>
-#include <level_zero/ze_api.h>
-#include <CL/sycl/backend/level_zero.hpp>
-#else
+
+// Force deterministic reductions
+#define SYCL_REDUCTION_DETERMINISTIC
 #include <sycl/CL/sycl.hpp>
 #include <sycl/usm.hpp>
 #include <level_zero/ze_api.h>
 #include <sycl/ext/oneapi/backend/level_zero.hpp>
-#endif

 NAMESPACE_BEGIN(Grid);

@@ -287,23 +285,24 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {

 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {		\
-      unsigned long nt=acceleratorThreads();				\
-      unsigned long unum1 = num1;					\
-      unsigned long unum2 = num2;					\
-      if(nt < 8)nt=8;							\
-      cl::sycl::range<3> local {nt,1,nsimd};				\
-      cl::sycl::range<3> global{unum1,unum2,nsimd};			\
-      cgh.parallel_for(					\
-      cl::sycl::nd_range<3>(global,local), \
-      [=] (cl::sycl::nd_item<3> item) /*mutable*/     \
-      [[intel::reqd_sub_group_size(16)]]	      \
-      {						      \
-      auto iter1    = item.get_global_id(0);	      \
-      auto iter2    = item.get_global_id(1);	      \
-      auto lane     = item.get_global_id(2);	      \
-      { __VA_ARGS__ };				      \
-     });	   			              \
-    });
+    unsigned long nt=acceleratorThreads();				\
+    if(nt < 8)nt=8;							\
+    unsigned long unum1 = num1;						\
+    unsigned long unum2 = num2;						\
+    unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt;	\
+    cl::sycl::range<3> local {nt,1,nsimd};				\
+    cl::sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd};	\
+    cgh.parallel_for(							\
+		     cl::sycl::nd_range<3>(global,local),		\
+		     [=] (cl::sycl::nd_item<3> item) /*mutable*/	\
+		     [[intel::reqd_sub_group_size(16)]]			\
+		     {							\
+		       auto iter1    = item.get_global_id(0);		\
+		       auto iter2    = item.get_global_id(1);		\
+		       auto lane     = item.get_global_id(2);		\
+		       { if (iter1 < unum1){ __VA_ARGS__ } };		\
+		     });						\
+  });

 #define accelerator_barrier(dummy) { theGridAccelerator->wait(); }

@@ -442,6 +441,8 @@ inline void acceleratorFreeShared(void *ptr){ auto r=hipFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ auto r=hipFree(ptr);};
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { auto r=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto r=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
+inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream);}
+inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream);}
 //inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
 //inline void acceleratorCopySynchronise(void) {  }
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto r=hipMemset(base,value,bytes);}
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -77,6 +77,10 @@ feenableexcept (unsigned int excepts)
 }
 #endif

+#ifndef HOST_NAME_MAX
+#define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
+#endif
+
 NAMESPACE_BEGIN(Grid);

 //////////////////////////////////////////////////////
@@ -393,6 +397,9 @@ void Grid_init(int *argc,char ***argv)
  std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl;
  std::cout << GridLogMessage << "================================================ "<<std::endl;

+  char hostname[HOST_NAME_MAX+1];
+  gethostname(hostname, HOST_NAME_MAX+1);
+  std::cout << GridLogMessage << "This rank is running on host "<< hostname<<std::endl;

  /////////////////////////////////////////////////////////
  // Reporting
--- a/benchmarks/Benchmark_usqcd.cc
+++ b/benchmarks/Benchmark_usqcd.cc
@@ -0,0 +1,968 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./benchmarks/Benchmark_usqcd.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+#include <Grid/algorithms/blas/BatchedBlas.h>
+
+using namespace Grid;
+
+std::vector<int> L_list;
+std::vector<int> Ls_list;
+std::vector<double> mflop_list;
+
+double mflop_ref;
+double mflop_ref_err;
+
+int NN_global;
+
+FILE * FP;
+
+struct time_statistics{
+  double mean;
+  double err;
+  double min;
+  double max;
+
+  void statistics(std::vector<double> v){
+      double sum = std::accumulate(v.begin(), v.end(), 0.0);
+      mean = sum / v.size();
+
+      std::vector<double> diff(v.size());
+      std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
+      double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
+      err = std::sqrt(sq_sum / (v.size()*(v.size() - 1)));
+
+      auto result = std::minmax_element(v.begin(), v.end());
+      min = *result.first;
+      max = *result.second;
+}
+};
+
+// Print the column headings for the halo-exchange bandwidth table.
+void comms_header()
+{
+  const char *tab = "\t";
+  // Kept as one insertion chain so everything follows GridLogMessage.
+  std::cout << GridLogMessage
+            << " L  "  << tab
+            << " Ls  " << tab
+            << "bytes\t MB/s uni  \t\t MB/s bidi "
+            << std::endl;
+}
+
+// One benchmark configuration, applied to the kernel/communicator static
+// settings before each timed case.
+struct controls {
+  int Opt;           // kernel implementation selector (assigned to ...KernelsStatic::Opt)
+  int CommsOverlap;  // comms/compute mode (assigned to ...KernelsStatic::Comms)
+  Grid::CartesianCommunicator::CommunicatorPolicy_t CommsAsynch; // passed to SetCommunicatorPolicy
+};
+
+class Benchmark {
+public:
+  // Log the thread count, the MPI task geometry and, for each SIMD vector
+  // type, its width in bits together with the default four-dimensional SIMD layout.
+  static void Decomposition (void ) {
+
+    const char *rule = "==================================================================================";
+    const int nthreads = GridThread::GetThreads();
+
+    // One table row: label, width in bits, default SIMD decomposition.
+    auto simd_row = [](const char *label, size_t nbytes, int nsimd) {
+      std::cout<<GridLogMessage<<label<<nbytes*8<<"bits ; "
+	       <<GridCmdVectorIntToString(GridDefaultSimd(4,nsimd))<<std::endl;
+    };
+
+    std::cout<<GridLogMessage << rule <<std::endl;
+    std::cout<<GridLogMessage << "= Grid is setup to use "<<nthreads<<" threads"<<std::endl;
+    std::cout<<GridLogMessage << rule <<std::endl;
+    std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
+    std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
+    std::cout<<GridLogMessage<<"\tMPI tasks      : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
+
+    simd_row("\tvReal          : ", sizeof(vReal    ), vReal::Nsimd());
+    simd_row("\tvRealF         : ", sizeof(vRealF   ), vRealF::Nsimd());
+    simd_row("\tvRealD         : ", sizeof(vRealD   ), vRealD::Nsimd());
+    simd_row("\tvComplex       : ", sizeof(vComplex ), vComplex::Nsimd());
+    simd_row("\tvComplexF      : ", sizeof(vComplexF), vComplexF::Nsimd());
+    simd_row("\tvComplexD      : ", sizeof(vComplexD), vComplexD::Nsimd());
+
+    std::cout<<GridLogMessage << rule <<std::endl;
+
+  }
+
+  // Benchmark point-to-point halo exchange bandwidth.
+  //
+  // For local extents lat = 16,24,32 (with Ls fixed at 12) this allocates eight
+  // send and eight receive device buffers, and for each of the eight directions
+  // whose MPI dimension holds more than one rank it times Nloop calls of
+  // SendToRecvFrom. Results go to the log and to the FP report file.
+  static void Comms(void)
+  {
+    int Nloop=200;
+    int nmu=0;
+    int maxlat=32;
+
+    Coordinate simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
+    Coordinate mpi_layout  = GridDefaultMpi();
+
+    // Count the dimensions that are actually split across ranks (reported in the banner only).
+    for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
+
+    std::vector<double> t_time(Nloop);
+    time_statistics timestat;
+
+    std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
+    std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+    comms_header();
+
+    fprintf(FP,"Communications\n\n");
+    fprintf(FP,"Packet bytes, direction, GB/s per node\n");
+    for(int lat=16;lat<=maxlat;lat+=8){
+      //      for(int Ls=8;Ls<=8;Ls*=2){
+      { int Ls=12;
+
+	Coordinate latt_size  ({lat*mpi_layout[0],
+	      lat*mpi_layout[1],
+	      lat*mpi_layout[2],
+	      lat*mpi_layout[3]});
+
+	GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+	RealD Nrank = Grid._Nprocessors;
+	RealD Nnode = Grid.NodeCount();
+	RealD ppn = Nrank/Nnode; // ranks per node, used to scale the per-rank byte count
+
+	std::vector<HalfSpinColourVectorD *> xbuf(8);
+	std::vector<HalfSpinColourVectorD *> rbuf(8);
+	//Grid.ShmBufferFreeAll();
+	// Packet size: lat^3 * Ls half-spinors.
+	uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+	for(int d=0;d<8;d++){
+	  xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
+	  rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
+	  //	  bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+	  //	  bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+	}
+
+	//	int ncomm;
+	double dbytes;
+
+        // dir 0..3 shift by one rank along mu; dir 4..7 shift by mpi_layout[mu]-1 along the same mu.
+        for(int dir=0;dir<8;dir++) {
+	  int mu =dir % 4;
+	  if (mpi_layout[mu]>1 ) {
+
+	    // NOTE(review): `times` is never read; the samples are stored in t_time.
+	    std::vector<double> times(Nloop);
+	    for(int i=0;i<Nloop;i++){
+
+	      // Reset every iteration, so after the loop dbytes holds one packet's worth of bytes.
+	      dbytes=0;	        
+	      double start=usecond();
+	      int xmit_to_rank;
+	      int recv_from_rank;
+
+	      if ( dir == mu ) { 
+		int comm_proc=1;
+		Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+	      } else { 
+		int comm_proc = mpi_layout[mu]-1;
+		Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+	      }
+	      Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
+				  (void *)&rbuf[dir][0], recv_from_rank,
+				  bytes);
+	      dbytes+=bytes;
+	     
+	      double stop=usecond();
+	      t_time[i] = stop-start; // microseconds
+
+	    }
+	    timestat.statistics(t_time);
+	  
+	    dbytes=dbytes*ppn;
+	    double xbytes    = dbytes*0.5;
+	    double bidibytes = dbytes;
+	  
+	    // bytes per microsecond is MB/s; the extra /1000. in the FP line turns that into GB/s.
+	    std::cout<<GridLogMessage << lat<<"\t"<<Ls<<"\t "
+		     << bytes << " \t "
+		     <<xbytes/timestat.mean
+		     << "\t\t"
+		     << bidibytes/timestat.mean<< std::endl;
+	    fprintf(FP,"%ld, %d, %f\n",(long)bytes,dir,bidibytes/timestat.mean/1000.);
+	  }
+	}
+	for(int d=0;d<8;d++){
+	  acceleratorFreeDevice(xbuf[d]);
+	  acceleratorFreeDevice(rbuf[d]);
+	}
+      }
+    }
+    fprintf(FP,"\n\n");
+    
+    return;
+  }
+
+  
+  // Benchmark streaming memory bandwidth with the lattice update z = a*x - y.
+  //
+  // For each local extent lat = 8,16,...,lmax a lattice with lat^4 sites per
+  // rank of Nvec-component real vectors is built and the update is repeated
+  // NLOOP times. Bandwidth (total and per node) and flop rate are logged, and
+  // the per-node bandwidth is appended to the FP report file.
+  // Units assume usecond() returns microseconds, so `time` below is in
+  // nanoseconds per iteration and bytes/time is GB/s.
+  static void Memory(void)
+  {
+    const int Nvec=8;
+    typedef Lattice< iVector< vReal,Nvec> > LatticeVec;
+    typedef iVector<vReal,Nvec> Vec;
+
+    Coordinate simd_layout = GridDefaultSimd(Nd,vReal::Nsimd());
+    Coordinate mpi_layout  = GridDefaultMpi();
+
+    fprintf(FP,"Memory Bandwidth\n\n");
+    fprintf(FP,"Bytes, GB/s per node\n");
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << "= Benchmarking a*x + y bandwidth"<<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<< "\t\tGB/s / node"<<std::endl;
+    std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
+
+    uint64_t NN;
+
+    // NLOOP expands in terms of the local names `lmax` and `lat`; it scales the
+    // repeat count inversely with the local volume. The macro stays defined
+    // past this function and is expanded again later in the file.
+    uint64_t lmax=40;
+#define NLOOP (1000*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
+
+    GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+    for(int lat=8;lat<=lmax;lat+=8){
+
+      Coordinate latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
+
+      // Accumulate the global volume in 64 bits. Multiplying the four extents
+      // first and widening afterwards overflows once the product exceeds the
+      // range of the extent type (e.g. a 256^4 global lattice with int extents).
+      int64_t vol=1;
+      for(int mu=0;mu<4;mu++) vol*=latt_size[mu];
+
+      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+
+      NN =Grid.NodeCount();
+
+      Vec rn ; random(sRNG,rn);
+
+      LatticeVec z(&Grid); z=Zero();
+      LatticeVec x(&Grid); x=Zero();
+      LatticeVec y(&Grid); y=Zero();
+      double a=2.0;
+
+      uint64_t Nloop=NLOOP;
+
+      double start=usecond();
+      for(uint64_t i=0;i<Nloop;i++){
+	z=a*x-y;
+      }
+      double stop=usecond();
+      double time = (stop-start)/Nloop*1000;
+
+      double flops=vol*Nvec*2;// mul,add
+      double bytes=3.0*vol*Nvec*sizeof(Real); // two lattices read, one written
+      std::cout<<GridLogMessage<<std::setprecision(3) 
+	       << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.
+	       << "\t\t"<< bytes/time/NN <<std::endl;
+
+      fprintf(FP,"%ld, %f\n",(long)bytes,bytes/time/NN);
+
+    }
+    fprintf(FP,"\n\n");
+  };
+
+
+  // Time batched double-precision GEMM through GridBLAS::benchmark for three
+  // families of shapes (labelled coarse mrhs, block project, block promote),
+  // logging each result and appending it to the FP report file.
+  static void BLAS(void)
+  {
+    //int nbasis, int nrhs, int coarseVol
+    int  nbasis_list[] = { 16,32,64 };
+    int  nrhs_list[]   = { 8,16,32 };
+    int  coarse_vol    = 4*4*4*4;
+
+    GridBLAS blas;
+
+    const char *thick = "==================================================================================";
+    const char *thin  = "----------------------------------------------------------";
+
+    // Column headings for one family of shapes.
+    auto heading = [](const char *what) {
+      std::cout<<GridLogMessage << "  M  "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<what<<std::endl;
+    };
+
+    // Run one shape; write the result to FP first, then to the log.
+    auto run_case = [&blas](int M,int N,int K,int BATCH) {
+      double rate=blas.benchmark(M,N,K,BATCH);
+      fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, rate);
+      std::cout<<GridLogMessage<<std::setprecision(3) 
+	       << M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<BATCH<<"\t\t"<<rate<<std::endl;
+    };
+
+    std::cout<<GridLogMessage << thick <<std::endl;
+    std::cout<<GridLogMessage << "= batched GEMM (double precision) "<<std::endl;
+    std::cout<<GridLogMessage << thick <<std::endl;
+    heading("Gflop/s / rank (coarse mrhs)");
+    std::cout<<GridLogMessage << thin <<std::endl;
+
+    fprintf(FP,"GEMM\n\n M, N, K, BATCH, GF/s per rank\n");
+
+    // coarse mrhs: M = K = basis, N = rhs
+    for(int nb : nbasis_list){
+      for(int nr : nrhs_list){
+	run_case(nb,nr,nb,coarse_vol);
+      }
+    }
+
+    std::cout<<GridLogMessage << thin <<std::endl;
+    heading("Gflop/s / rank (block project)");
+    std::cout<<GridLogMessage << thin <<std::endl;
+
+    // block project: M = basis, N = rhs, K = vol
+    for(int nb : nbasis_list){
+      for(int nr : nrhs_list){
+	run_case(nb,nr,coarse_vol,coarse_vol);
+      }
+    }
+
+    std::cout<<GridLogMessage << thin <<std::endl;
+    heading("Gflop/s / rank (block promote)");
+    std::cout<<GridLogMessage << thin <<std::endl;
+
+    // block promote: M = rhs, N = vol, K = basis
+    for(int nb : nbasis_list){
+      for(int nr : nrhs_list){
+	run_case(nr,coarse_vol,nb,coarse_vol);
+      }
+    }
+
+    fprintf(FP,"\n\n\n");
+    std::cout<<GridLogMessage << thick <<std::endl;
+  };
+  
+
+  // Benchmark the site-wise product z = x*y of lattices of 4x4
+  // single-precision complex matrices.
+  //
+  // For each local extent lat = 8,16,...,lmax the product is repeated NLOOP
+  // times and the bandwidth (total and per node) and flop rate are logged.
+  // Units assume usecond() returns microseconds, so `time` below is in
+  // nanoseconds per iteration and bytes/time is GB/s.
+  static void SU4(void)
+  {
+    const int Nc4=4;
+    typedef Lattice< iMatrix< vComplexF,Nc4> > LatticeSU4;
+
+    Coordinate simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd());
+    Coordinate mpi_layout  = GridDefaultMpi();
+    
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << "= Benchmarking z = y*x SU(4) bandwidth"<<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<< "\t\tGB/s / node"<<std::endl;
+    std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
+  
+    uint64_t NN;
+
+    // The NLOOP macro (defined earlier in this file) expands in terms of the
+    // local names `lmax` and `lat`, so both must keep exactly these names.
+    uint64_t lmax=32;
+
+    GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+    for(int lat=8;lat<=lmax;lat+=8){
+
+      Coordinate latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
+
+      // Accumulate the global volume in 64 bits. Multiplying the four extents
+      // first and widening afterwards overflows once the product exceeds the
+      // range of the extent type (e.g. a 256^4 global lattice with int extents).
+      int64_t vol=1;
+      for(int mu=0;mu<4;mu++) vol*=latt_size[mu];
+
+      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+
+      NN =Grid.NodeCount();
+
+      LatticeSU4 z(&Grid); z=Zero();
+      LatticeSU4 x(&Grid); x=Zero();
+      LatticeSU4 y(&Grid); y=Zero();
+
+      uint64_t Nloop=NLOOP;
+
+      double start=usecond();
+      for(uint64_t i=0;i<Nloop;i++){
+	z=x*y;
+      }
+      double stop=usecond();
+      double time = (stop-start)/Nloop*1000;
+     
+      // Per matrix element: one complex multiply (6) plus Nc4-1 complex multiply-adds (8 each).
+      double flops=vol*Nc4*Nc4*(6+(Nc4-1)*8);// mul,add
+      double bytes=3.0*vol*Nc4*Nc4*2*sizeof(RealF); // two matrices read, one written
+      std::cout<<GridLogMessage<<std::setprecision(3) 
+	       << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.
+	       << "\t\t"<< bytes/time/NN <<std::endl;
+
+    }
+  };
+
+
+  // Benchmark the single-precision domain-wall hopping term (DhopEO) on an
+  // L^4 local volume with fifth-dimension extent Ls.
+  //
+  // Each kernel configuration in Cases is warmed up and then timed for ncall
+  // applications; rates are reported in total, per rank and per node.
+  // Returns the best rate seen across the configurations (flops per unit of
+  // usecond(), i.e. Mflop/s if usecond() returns microseconds).
+  // NOTE(review): the grids created with SpaceTimeGrid::make* are not deleted
+  // anywhere in this function.
+  static double DWF(int Ls,int L)
+  {
+    RealD mass=0.1;
+    RealD M5  =1.8;
+
+    double mflops;
+    double mflops_best = 0;
+    double mflops_worst= 0;
+    std::vector<double> mflops_all;
+
+    ///////////////////////////////////////////////////////
+    // Set/Get the layout & grid size
+    ///////////////////////////////////////////////////////
+    int threads = GridThread::GetThreads();
+    Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
+    Coordinate local({L,L,L,L});
+    Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
+
+    // TmpGrid is only queried for the rank and node counts.
+    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, 
+								       GridDefaultSimd(Nd,vComplex::Nsimd()),
+								       GridDefaultMpi());
+    uint64_t NP = TmpGrid->RankCount();
+    uint64_t NN = TmpGrid->NodeCount();
+    NN_global=NN;
+    uint64_t SHM=NP/NN; // ranks per node
+
+
+    ///////// Welcome message ////////////
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << "Benchmark DWF on "<<L<<"^4 local volume "<<std::endl;
+    std::cout<<GridLogMessage << "* Nc             : "<<Nc<<std::endl;
+    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
+    std::cout<<GridLogMessage << "* Ls             : "<<Ls<<std::endl;
+    std::cout<<GridLogMessage << "* ranks          : "<<NP  <<std::endl;
+    std::cout<<GridLogMessage << "* nodes          : "<<NN  <<std::endl;
+    std::cout<<GridLogMessage << "* ranks/node     : "<<SHM <<std::endl;
+    std::cout<<GridLogMessage << "* ranks geom     : "<<GridCmdVectorIntToString(mpi)<<std::endl;
+    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+    ///////// Lattice Init ////////////
+    GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
+    GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+    GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+    
+    ///////// RNG Init ////////////
+    std::vector<int> seeds4({1,2,3,4});
+    std::vector<int> seeds5({5,6,7,8});
+    GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+    GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
+    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
+
+    typedef DomainWallFermionF Action;
+    typedef typename Action::FermionField Fermion;
+    typedef LatticeGaugeFieldF Gauge;
+    
+    ///////// Source preparation ////////////
+    Gauge Umu(UGrid);  SU<Nc>::HotConfiguration(RNG4,Umu); 
+    Fermion src   (FGrid); random(RNG5,src);
+    Fermion src_e (FrbGrid);
+    Fermion src_o (FrbGrid);
+    Fermion r_e   (FrbGrid);
+    Fermion r_o   (FrbGrid);
+    Fermion r_eo  (FGrid);
+    Action Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+
+    {
+
+      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Odd,src_o,src);
+
+      // Cases holds three configurations; only the first num_cases are run
+      // (the third, OptInlineAsm, only when AVX512 is defined).
+#ifdef AVX512
+      const int num_cases = 3;
+#else 
+      const int num_cases = 2;
+#endif      
+      // NOTE(review): fmt names four columns, but the row printed beneath it
+      // has one value per case actually run.
+      std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
+
+      controls Cases [] = {
+	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent },
+	{  WilsonKernelsStatic::OptHandUnroll,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent },
+	{  WilsonKernelsStatic::OptInlineAsm ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent }
+      }; 
+
+      for(int c=0;c<num_cases;c++) {
+	
+	// Select the kernel variant and communication policy for this case.
+	WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
+	WilsonKernelsStatic::Opt   = Cases[c].Opt;
+	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
+
+	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using ASM      WilsonKernels" <<std::endl;
+	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using UNROLLED WilsonKernels" <<std::endl;
+	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
+	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential Comms/Compute" <<std::endl;
+	std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+	// Warm-up: the t0/t1 values taken around it are overwritten in the
+	// timed loop below and never used.
+	int nwarm = 10;
+	double t0=usecond();
+	FGrid->Barrier();
+	for(int i=0;i<nwarm;i++){
+	  Dw.DhopEO(src_o,r_e,DaggerNo);
+	}
+	FGrid->Barrier();
+	double t1=usecond();
+	uint64_t ncall = 500;
+
+	// Every rank takes its call count from rank 0.
+	FGrid->Broadcast(0,&ncall,sizeof(ncall));
+
+	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
+
+	// Timed loop: one sample per DhopEO application.
+	time_statistics timestat;
+	std::vector<double> t_time(ncall);
+	for(uint64_t i=0;i<ncall;i++){
+	  t0=usecond();
+	  Dw.DhopEO(src_o,r_e,DaggerNo);
+	  t1=usecond();
+	  t_time[i] = t1-t0;
+	}
+	FGrid->Barrier();
+	
+	// Global five-dimensional site count.
+	double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+
+	// Nc=3 gives
+	// 1344= 3*(2*8+6)*2*8 + 8*3*2*2 + 3*4*2*8
+	// 1344 = Nc* (6+(Nc-1)*8)*2*Nd + Nd*Nc*2*2  + Nd*Nc*Ns*2
+	//	double flops=(1344.0*volume)/2;
+	double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + 2*Nd*Nc*Ns  + 2*Nd*Nc*Ns*2;
+
+	// Halved, presumably because DhopEO acts on a single checkerboard - TODO confirm.
+	double flops=(fps*volume)/2;
+	double mf_hi, mf_lo, mf_err;
+
+	timestat.statistics(t_time);
+	mf_hi = flops/timestat.min;
+	mf_lo = flops/timestat.max;
+	mf_err= flops/timestat.min * timestat.err/timestat.mean;
+
+	// Track best/worst across cases; 0 marks "not yet set".
+	mflops = flops/timestat.mean;
+	mflops_all.push_back(mflops);
+	if ( mflops_best == 0   ) mflops_best = mflops;
+	if ( mflops_worst== 0   ) mflops_worst= mflops;
+	if ( mflops>mflops_best ) mflops_best = mflops;
+	if ( mflops<mflops_worst) mflops_worst= mflops;
+
+	std::cout<<GridLogMessage<< "Deo FlopsPerSite is "<<fps<<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;
+
+      }
+
+      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
+      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
+      std::cout<<GridLogMessage <<fmt << std::endl;
+      std::cout<<GridLogMessage ;
+
+      // Per-node rate of every case, in the order the cases were run.
+      for(int i=0;i<mflops_all.size();i++){
+	std::cout<<mflops_all[i]/NN<<" ; " ;
+      }
+      std::cout<<std::endl;
+      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+    }
+    return mflops_best;
+  }
+
+
+  // Benchmark the single-precision improved staggered hopping term (DhopEO)
+  // on an L^4 local volume.
+  //
+  // Each kernel configuration that is run is warmed up and then timed for
+  // ncall applications; rates are reported in total, per rank and per node.
+  // Returns the best rate seen across the configurations (flops per unit of
+  // usecond(), i.e. Mflop/s if usecond() returns microseconds).
+  // NOTE(review): the grids created with SpaceTimeGrid::make* are not deleted
+  // anywhere in this function.
+  static double Staggered(int L)
+  {
+    double mflops;
+    double mflops_best = 0;
+    double mflops_worst= 0;
+    std::vector<double> mflops_all;
+
+    ///////////////////////////////////////////////////////
+    // Set/Get the layout & grid size
+    ///////////////////////////////////////////////////////
+    int threads = GridThread::GetThreads();
+    Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
+    Coordinate local({L,L,L,L});
+    Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
+    
+    // TmpGrid is only queried for the rank and node counts.
+    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(latt4,
+								       GridDefaultSimd(Nd,vComplex::Nsimd()),
+								       GridDefaultMpi());
+    uint64_t NP = TmpGrid->RankCount();
+    uint64_t NN = TmpGrid->NodeCount();
+    NN_global=NN;
+    uint64_t SHM=NP/NN; // ranks per node
+
+
+    ///////// Welcome message ////////////
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << "Benchmark ImprovedStaggered on "<<L<<"^4 local volume "<<std::endl;
+    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
+    std::cout<<GridLogMessage << "* ranks          : "<<NP  <<std::endl;
+    std::cout<<GridLogMessage << "* nodes          : "<<NN  <<std::endl;
+    std::cout<<GridLogMessage << "* ranks/node     : "<<SHM <<std::endl;
+    std::cout<<GridLogMessage << "* ranks geom     : "<<GridCmdVectorIntToString(mpi)<<std::endl;
+    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+    ///////// Lattice Init ////////////
+    GridCartesian         * FGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
+    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);
+    
+    ///////// RNG Init ////////////
+    std::vector<int> seeds4({1,2,3,4});
+    GridParallelRNG          RNG4(FGrid);  RNG4.SeedFixedIntegers(seeds4);
+    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
+
+    // Action parameters handed to the ImprovedStaggeredFermionF constructor below.
+    RealD mass=0.1;
+    RealD c1=9.0/8.0;
+    RealD c2=-1.0/24.0;
+    RealD u0=1.0;
+
+    typedef ImprovedStaggeredFermionF Action;
+    typedef typename Action::FermionField Fermion; 
+    typedef LatticeGaugeFieldF Gauge;
+    
+    Gauge Umu(FGrid);  SU<Nc>::HotConfiguration(RNG4,Umu); 
+
+    // The same gauge field Umu is passed for both gauge-field arguments.
+    typename Action::ImplParams params;
+    Action Ds(Umu,Umu,*FGrid,*FrbGrid,mass,c1,c2,u0,params);
+
+    ///////// Source preparation ////////////
+    Fermion src   (FGrid); random(RNG4,src);
+    Fermion src_e (FrbGrid);
+    Fermion src_o (FrbGrid);
+    Fermion r_e   (FrbGrid);
+    Fermion r_o   (FrbGrid);
+    Fermion r_eo  (FGrid);
+  
+    {
+
+      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Odd,src_o,src);
+    
+      // Cases holds three configurations but only the first num_cases (2) are run.
+      const int num_cases = 2;
+      // NOTE(review): fmt names four columns, but the row printed beneath it
+      // has one value per case actually run.
+      std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
+      
+      controls Cases [] = {
+	{  StaggeredKernelsStatic::OptGeneric   ,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
+	{  StaggeredKernelsStatic::OptHandUnroll,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
+	{  StaggeredKernelsStatic::OptInlineAsm ,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  }
+      }; 
+
+      for(int c=0;c<num_cases;c++) {
+	
+	// Select the kernel variant and communication policy for this case.
+	StaggeredKernelsStatic::Comms = Cases[c].CommsOverlap;
+	StaggeredKernelsStatic::Opt   = Cases[c].Opt;
+	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
+      
+	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+	if ( StaggeredKernelsStatic::Opt == StaggeredKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc StaggeredKernels" <<std::endl;
+	std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+	
+	// Warm-up: the t0/t1 values taken around it are overwritten in the
+	// timed loop below and never used.
+	int nwarm = 10;
+	double t0=usecond();
+	FGrid->Barrier();
+	for(int i=0;i<nwarm;i++){
+	  Ds.DhopEO(src_o,r_e,DaggerNo);
+	}
+	FGrid->Barrier();
+	double t1=usecond();
+	uint64_t ncall = 500;
+
+	// Every rank takes its call count from rank 0.
+	FGrid->Broadcast(0,&ncall,sizeof(ncall));
+
+	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
+
+	// Timed loop: one sample per DhopEO application.
+	time_statistics timestat;
+	std::vector<double> t_time(ncall);
+	for(uint64_t i=0;i<ncall;i++){
+	  t0=usecond();
+	  Ds.DhopEO(src_o,r_e,DaggerNo);
+	  t1=usecond();
+	  t_time[i] = t1-t0;
+	}
+	FGrid->Barrier();
+	
+	// Global site count; 1146.0 is the flop count per site used here, halved
+	// presumably because DhopEO acts on a single checkerboard - TODO confirm.
+	double volume=1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+	double flops=(1146.0*volume)/2;
+	double mf_hi, mf_lo, mf_err;
+	
+	timestat.statistics(t_time);
+	mf_hi = flops/timestat.min;
+	mf_lo = flops/timestat.max;
+	mf_err= flops/timestat.min * timestat.err/timestat.mean;
+
+	// Track best/worst across cases; 0 marks "not yet set".
+	mflops = flops/timestat.mean;
+	mflops_all.push_back(mflops);
+	if ( mflops_best == 0   ) mflops_best = mflops;
+	if ( mflops_worst== 0   ) mflops_worst= mflops;
+	if ( mflops>mflops_best ) mflops_best = mflops;
+	if ( mflops<mflops_worst) mflops_worst= mflops;
+	
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;
+      
+      }
+
+      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+      std::cout<<GridLogMessage << L<<"^4  Deo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
+      std::cout<<GridLogMessage << L<<"^4  Deo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
+      std::cout<<GridLogMessage <<fmt << std::endl;
+      std::cout<<GridLogMessage ;
+
+      // Per-node rate of every case, in the order the cases were run.
+      for(int i=0;i<mflops_all.size();i++){
+	std::cout<<mflops_all[i]/NN<<" ; " ;
+      }
+      std::cout<<std::endl;
+    }
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    return mflops_best;
+  }
+
+  static double Clover(int L)
+  {
+    double mflops;
+    double mflops_best = 0;
+    double mflops_worst= 0;
+    std::vector<double> mflops_all;
+
+    ///////////////////////////////////////////////////////
+    // Set/Get the layout & grid size
+    ///////////////////////////////////////////////////////
+    int threads = GridThread::GetThreads();
+    Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
+    Coordinate local({L,L,L,L});
+    Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
+    
+    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(latt4,
+								       GridDefaultSimd(Nd,vComplex::Nsimd()),
+								       GridDefaultMpi());
+    uint64_t NP = TmpGrid->RankCount();
+    uint64_t NN = TmpGrid->NodeCount();
+    NN_global=NN;
+    uint64_t SHM=NP/NN;
+
+
+    ///////// Welcome message ////////////
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << "Benchmark Clover on "<<L<<"^4 local volume "<<std::endl;
+    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
+    std::cout<<GridLogMessage << "* ranks          : "<<NP  <<std::endl;
+    std::cout<<GridLogMessage << "* nodes          : "<<NN  <<std::endl;
+    std::cout<<GridLogMessage << "* ranks/node     : "<<SHM <<std::endl;
+    std::cout<<GridLogMessage << "* ranks geom     : "<<GridCmdVectorIntToString(mpi)<<std::endl;
+    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+    ///////// Lattice Init ////////////
+    GridCartesian         * FGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
+    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);
+    
+    ///////// RNG Init ////////////
+    std::vector<int> seeds4({1,2,3,4});
+    GridParallelRNG          RNG4(FGrid);  RNG4.SeedFixedIntegers(seeds4);
+    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
+
+    RealD mass=0.1;
+    RealD csw=1.0;
+
+    typedef WilsonCloverFermionF Action;
+    typedef typename Action::FermionField Fermion; 
+    typedef LatticeGaugeFieldF Gauge;
+    
+    Gauge Umu(FGrid);  SU<Nc>::HotConfiguration(RNG4,Umu); 
+
+    Action Dc(Umu,*FGrid,*FrbGrid,mass,csw,csw);
+
+    ///////// Source preparation ////////////
+    Fermion src   (FGrid); random(RNG4,src);
+    Fermion r     (FGrid);
+  
+    {
+
+      const int num_cases = 1;
+      std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
+      
+      controls Cases [] = {
+	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
+      }; 
+
+      for(int c=0;c<num_cases;c++) {
+	
+	WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
+	WilsonKernelsStatic::Opt   = Cases[c].Opt;
+	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
+      
+	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+	std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+	
+	int nwarm = 10;
+	double t0=usecond();
+	FGrid->Barrier();
+	for(int i=0;i<nwarm;i++){
+	  Dc.M(src,r);
+	}
+	FGrid->Barrier();
+	double t1=usecond();
+	uint64_t ncall = 500;
+
+	FGrid->Broadcast(0,&ncall,sizeof(ncall));
+
+	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
+
+	time_statistics timestat;
+	std::vector<double> t_time(ncall);
+	for(uint64_t i=0;i<ncall;i++){
+	  t0=usecond();
+	  Dc.M(src,r);
+	  t1=usecond();
+	  t_time[i] = t1-t0;
+	}
+	FGrid->Barrier();
+	
+	double volume=1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+	double flops=(1344+ 24+6*6*8*2)*volume;
+	double mf_hi, mf_lo, mf_err;
+	
+	timestat.statistics(t_time);
+	mf_hi = flops/timestat.min;
+	mf_lo = flops/timestat.max;
+	mf_err= flops/timestat.min * timestat.err/timestat.mean;
+
+	mflops = flops/timestat.mean;
+	mflops_all.push_back(mflops);
+	if ( mflops_best == 0   ) mflops_best = mflops;
+	if ( mflops_worst== 0   ) mflops_worst= mflops;
+	if ( mflops>mflops_best ) mflops_best = mflops;
+	if ( mflops<mflops_worst) mflops_worst= mflops;
+	
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per rank   "<< mflops/NP<<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per node   "<< mflops/NN<<std::endl;
+      
+      }
+
+      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+      std::cout<<GridLogMessage << L<<"^4  Deo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
+      std::cout<<GridLogMessage << L<<"^4  Deo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
+      std::cout<<GridLogMessage <<fmt << std::endl;
+      std::cout<<GridLogMessage ;
+
+      for(int i=0;i<mflops_all.size();i++){
+	std::cout<<mflops_all[i]/NN<<" ; " ;
+      }
+      std::cout<<std::endl;
+    }
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    return mflops_best;
+  }
+};
+
+// Driver for the USQCD benchmark suite. For every L in L_list it calls Benchmark::DWF with Ls=1
+// (reported in the Clover/Wilson column), Benchmark::DWF with Ls=12 and Benchmark::Staggered, then
+// runs the enabled memory/BLAS/SU(4)/comms sections and writes the per-node summary table to FP.
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+  // Only world rank 0 writes the CSV file; every other rank opens /dev/null instead.
+  if (GlobalSharedMemory::WorldRank==0) { 
+    FP = fopen("Benchmark_usqcd.csv","w");
+  } else {
+    FP = fopen("/dev/null","w");
+  }
+  if ( FP==NULL ) { perror("Benchmark_usqcd: cannot open CSV output"); exit(EXIT_FAILURE); } // fprintf on a NULL stream is undefined
+  CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
+  LebesgueOrder::Block = std::vector<int>({2,2,2,2});
+
+  Benchmark::Decomposition();
+  // Hard-coded switches selecting which of the optional sections below are run.
+  int do_su4=0;
+  int do_memory=1;
+  int do_comms =1;
+  int do_blas  =1;
+  // sel/selm1 index the last two entries of L_list (L=32 and L=24); both must stay < L_list.size().
+  int sel=4;
+  std::vector<int> L_list({8,12,16,24,32});
+  int selm1=sel-1;
+
+  std::vector<double> clover;
+  std::vector<double> dwf4;
+  std::vector<double> staggered;
+
+  int Ls=1;
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << " Clover dslash 4D vectorised (temporarily Wilson)" <<std::endl;
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  for(size_t l=0;l<L_list.size();l++){
+    clover.push_back(Benchmark::DWF(1,L_list[l])); // Ls=1 DWF call fills the clover column, as the banner above states
+  }
+
+  Ls=12;
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  for(size_t l=0;l<L_list.size();l++){
+    double result = Benchmark::DWF(Ls,L_list[l]) ;
+    dwf4.push_back(result);
+  }
+
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << " Improved Staggered dslash 4D vectorised" <<std::endl;
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  for(size_t l=0;l<L_list.size();l++){
+    double result = Benchmark::Staggered(L_list[l]) ;
+    staggered.push_back(result);
+  }
+
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "L \t\t Clover \t\t DWF4 \t\t Staggered" <<std::endl;
+  for(size_t l=0;l<L_list.size();l++){
+    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<std::endl;
+  }
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  // NN_global is defined outside this function; presumably the node count used for per-node figures -- confirm.
+  int NN=NN_global;
+  if ( do_memory ) {
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << " Memory benchmark " <<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    Benchmark::Memory();
+  }
+
+  if ( do_blas ) {
+#if defined(GRID_CUDA) || defined(GRID_HIP)     || defined(GRID_SYCL)   
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << " Batched BLAS benchmark " <<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    Benchmark::BLAS();
+#endif
+  }
+
+  if ( do_su4 ) {
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << " SU(4) benchmark " <<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    Benchmark::SU4();
+  }
+  
+  if ( do_comms ) {
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << " Communications benchmark " <<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    Benchmark::Comms();
+  }
+
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered (GF/s per node)" <<std::endl;
+    fprintf(FP,"Per node summary table\n");
+    fprintf(FP,"\n");
+    fprintf(FP,"L , Wilson, DWF4, Staggered, GF/s per node\n");
+    fprintf(FP,"\n");
+    for(size_t l=0;l<L_list.size();l++){
+      std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN/1000.<<" \t "<<dwf4[l]/NN/1000.<< " \t "<<staggered[l]/NN/1000.<<std::endl; // /1000. so the values match the GF/s header, as in the CSV row
+      fprintf(FP,"%d , %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.);
+    }
+    fprintf(FP,"\n");
+
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << " Comparison point     result: "  << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl;
+    std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl;
+    std::cout<<std::setprecision(3);
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+  Grid_finalize();
+  fclose(FP);
+}
--- a/bootstrap.sh
+++ b/bootstrap.sh
@@ -1,12 +1,12 @@
 #!/usr/bin/env bash
 set -e

-EIGEN_URL='https://gitlab.com/libeigen/eigen/-/archive/3.3.7/eigen-3.3.7.tar.bz2'
-EIGEN_SHA256SUM='685adf14bd8e9c015b78097c1dc22f2f01343756f196acdc76a678e1ae352e11'
+EIGEN_URL='https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.tar.bz2'
+EIGEN_SHA256SUM='b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626'


 echo "-- deploying Eigen source..."
-ARC=`basename ${EIGEN_URL}`
+ARC=$(basename ${EIGEN_URL})
 wget ${EIGEN_URL} --no-check-certificate
 if command -v sha256sum; then
   echo "$EIGEN_SHA256SUM  $(basename "$EIGEN_URL")" \
@@ -14,13 +14,8 @@ if command -v sha256sum; then
 else
   echo "WARNING: could not verify checksum, please install sha256sum" >&2
 fi
-./scripts/update_eigen.sh ${ARC}
-rm ${ARC}
-# patch for non-portable includes in Eigen 3.3.5
-# apparently already fixed in Eigen HEAD so it should not be 
-# a problem in the future (A.P.)
-patch Eigen/unsupported/Eigen/CXX11/Tensor scripts/eigen-3.3.5.Tensor.patch
-
+./scripts/update_eigen.sh "${ARC}"
+rm "${ARC}"
 echo '-- generating Make.inc files...'
 ./scripts/filelist
 echo '-- generating configure script...'
--- a/scripts/eigen-3.3.5.Tensor.patch
+++ b/scripts/eigen-3.3.5.Tensor.patch
@@ -1,19 +0,0 @@
--- ./Eigen/unsupported/Eigen/CXX11/Tensor	2018-07-23 10:33:42.000000000 +0100
-+++ Tensor	2018-08-28 16:15:56.000000000 +0100
-@@ -25,7 +25,7 @@
- #include <utility>
- #endif
- 
-#include <Eigen/src/Core/util/DisableStupidWarnings.h>
-+#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
- 
- #include "../SpecialFunctions"
- #include "src/util/CXX11Meta.h"
-@@ -147,6 +147,6 @@
- 
- #include "src/Tensor/TensorIO.h"
- 
-#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
-+#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
- 
- //#endif // EIGEN_CXX11_TENSOR_MODULE
--- a/systems/Aurora/benchmarks/bench1024.pbs
+++ b/systems/Aurora/benchmarks/bench1024.pbs
@@ -25,12 +25,16 @@ export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1

 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
-export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 export MPICH_OFI_NIC_POLICY=GPU
+export FI_CXI_CQ_FILL_PERCENT=10
+export FI_CXI_DEFAULT_CQ_SIZE=262144
+#export FI_CXI_DEFAULT_CQ_SIZE=131072
+#export FI_CXI_CQ_FILL_PERCENT=20

 # 12 ppn, 32 nodes, 384 ranks
 #
@@ -45,12 +49,12 @@ CMD="mpiexec -np 12288 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 8.8.8.24 --grid 128.128.128.384 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
-$CMD | tee 1024node.dwf.small
+$CMD | tee 1024node.dwf.small.cq

 CMD="mpiexec -np 12288 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 16.8.8.12 --grid 256.256.256.384 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
-$CMD | tee 1024node.dwf
+$CMD | tee 1024node.dwf.cq


--- a/systems/Aurora/benchmarks/bench12.pbs
+++ b/systems/Aurora/benchmarks/bench12.pbs
@@ -17,6 +17,7 @@ source ../sourceme.sh
 export OMP_NUM_THREADS=3
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1

+
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
@@ -35,11 +36,25 @@ CMD="mpiexec -np 24 -ppn 12  -envall \
 	     ./Benchmark_comms_host_device --mpi 2.3.2.2 --grid 32.24.32.192 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"

-$CMD 
+#$CMD 

 CMD="mpiexec -np 24 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 2.3.2.2 --grid 64.96.64.64 --comms-overlap \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"

+#$CMD 
+
+CMD="mpiexec -np 1 -ppn 1  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Benchmark_dwf --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \
+		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+
+$CMD 
+
+CMD="mpiexec -np 1 -ppn 1  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \
+		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+
 $CMD 
--- a/systems/Aurora/config-command
+++ b/systems/Aurora/config-command
@@ -11,6 +11,6 @@ TOOLS=$HOME/tools
 	--enable-unified=no \
 	MPICXX=mpicxx \
 	CXX=icpx \
-	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/" \
-	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include"
+	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/ -L${MKLROOT}/lib -qmkl=parallel " \
+	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include -qmkl=parallel"

--- a/systems/Aurora/sourceme.sh
+++ b/systems/Aurora/sourceme.sh
@@ -3,10 +3,24 @@
 module use /soft/modulefiles
 module load intel_compute_runtime/release/agama-devel-682.22

+export FI_CXI_DEFAULT_CQ_SIZE=131072
+export FI_CXI_CQ_FILL_PERCENT=20
+
+export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
+#export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode"
+
+#
+# -ftarget-register-alloc-mode=pvc:default 
+# -ftarget-register-alloc-mode=pvc:small
+# -ftarget-register-alloc-mode=pvc:large
+# -ftarget-register-alloc-mode=pvc:auto
+#
+
 export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
 export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
 export http_proxy=http://proxy.alcf.anl.gov:3128
 export https_proxy=http://proxy.alcf.anl.gov:3128
 #export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
 git config --global http.proxy http://proxy.alcf.anl.gov:3128
- 
+
+export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
--- a/systems/Aurora/tests/repro16.pbs
+++ b/systems/Aurora/tests/repro16.pbs
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
+
+#PBS -q EarlyAppAccess
+#PBS -l select=16
+#PBS -l walltime=01:00:00
+#PBS -A LatticeQCD_aesp_CNDA
+
+#export OMP_PROC_BIND=spread
+#unset OMP_PLACES
+
+cd $PBS_O_WORKDIR
+
+source ../sourceme.sh
+
+cat $PBS_NODEFILE
+
+export OMP_NUM_THREADS=3
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
+#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
+
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+export MPICH_OFI_NIC_POLICY=GPU
+
+# 12 ppn, 16 nodes, 192 ranks
+CMD="mpiexec -np 192 -ppn 12  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Test_dwf_mixedcg_prec --mpi 2.4.4.6 --grid 64.128.128.192 \
+		--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000"
+$CMD 
--- a/systems/Aurora/tests/solver/stag16.pbs
+++ b/systems/Aurora/tests/solver/stag16.pbs
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
+
+#PBS -q EarlyAppAccess
+#PBS -l select=16
+#PBS -l walltime=01:00:00
+#PBS -A LatticeQCD_aesp_CNDA
+
+#export OMP_PROC_BIND=spread
+#unset OMP_PLACES
+
+cd $PBS_O_WORKDIR
+
+source ../../sourceme.sh
+
+cat $PBS_NODEFILE
+
+export OMP_NUM_THREADS=3
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
+#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
+
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+export MPICH_OFI_NIC_POLICY=GPU
+
+# 12 ppn, 16 nodes, 192 ranks
+CMD="mpiexec -np 192 -ppn 12  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Test_staggered_cg_prec --mpi 2.4.4.6 --grid 128.128.128.192 \
+	     --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000"
+$CMD 
--- a/systems/Booster/benchmarks/Benchmark_usqcd.csv
+++ b/systems/Booster/benchmarks/Benchmark_usqcd.csv
@@ -0,0 +1,70 @@
+Memory Bandwidth
+
+Bytes, GB/s per node
+3145728, 225.900365
+50331648, 2858.859504
+254803968, 4145.556367
+805306368, 4905.772480
+1966080000, 4978.312557
+
+
+GEMM
+
+ M, N, K, BATCH, GF/s per rank
+16, 8, 16, 256, 1.713639
+16, 16, 16, 256, 288.268316
+16, 32, 16, 256, 597.053950
+32, 8, 32, 256, 557.382591
+32, 16, 32, 256, 1100.145311
+32, 32, 32, 256, 1885.080449
+64, 8, 64, 256, 1725.163599
+64, 16, 64, 256, 3389.336566
+64, 32, 64, 256, 4168.252422
+16, 8, 256, 256, 1326.262134
+16, 16, 256, 256, 2318.095475
+16, 32, 256, 256, 3555.436503
+32, 8, 256, 256, 1920.139170
+32, 16, 256, 256, 3486.174753
+32, 32, 256, 256, 5320.821724
+64, 8, 256, 256, 2539.597502
+64, 16, 256, 256, 5003.456775
+64, 32, 256, 256, 7837.531562
+8, 256, 16, 256, 1427.848170
+16, 256, 16, 256, 2222.147815
+32, 256, 16, 256, 2877.121715
+8, 256, 32, 256, 1922.890086
+16, 256, 32, 256, 3199.469082
+32, 256, 32, 256, 4845.405343
+8, 256, 64, 256, 2639.483343
+16, 256, 64, 256, 5012.800299
+32, 256, 64, 256, 7216.006882
+
+
+
+Communications
+
+Packet bytes, direction, GB/s per node
+4718592, 2, 206.570734
+4718592, 3, 207.501847
+4718592, 6, 189.730277
+4718592, 7, 204.301218
+15925248, 2, 307.882997
+15925248, 3, 287.901076
+15925248, 6, 295.603109
+15925248, 7, 300.682033
+37748736, 2, 331.740364
+37748736, 3, 338.610627
+37748736, 6, 332.580657
+37748736, 7, 336.336579
+
+
+Per node summary table
+
+L , Wilson, DWF4, Staggered, GF/s per node
+
+8 , 16, 1165, 10
+12 , 473, 4901, 163
+16 , 1436, 8464, 442
+24 , 4133, 10139, 1530
+32 , 5726, 11487, 2518
+
--- a/systems/Booster/config-command
+++ b/systems/Booster/config-command
@@ -5,10 +5,12 @@ LIME=/p/home/jusers/boyle2/juwels/gm2dwf/boyle/
    --enable-gen-simd-width=64 \
    --enable-shm=nvlink \
    --enable-accelerator=cuda \
+    --disable-gparity \
+    --disable-fermion-reps \
    --with-lime=$LIME \
-    --disable-accelerator-cshift \
+    --enable-accelerator-cshift \
    --disable-unified \
    CXX=nvcc \
    LDFLAGS="-cudart shared " \
-    CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++14 -cudart shared"
+    CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++17 -cudart shared -lcublas"

--- a/systems/Booster/sourceme.sh
+++ b/systems/Booster/sourceme.sh
@@ -1,5 +1,5 @@
-module load GCC/9.3.0       
-module load  GMP/6.2.0   
-module load MPFR/4.1.0     
-module load OpenMPI/4.1.0rc1  
-module load CUDA/11.3
+module load GCC
+module load GMP
+module load MPFR
+module load OpenMPI
+module load CUDA
--- a/systems/Frontier/config-command
+++ b/systems/Frontier/config-command
@@ -16,7 +16,7 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
 --disable-fermion-reps \
 CXX=hipcc MPICXX=mpicxx \
 CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 -fgpu-sanitize" \
- LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 "
+ LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64  -lhipblas -lrocblas"



--- a/systems/PVC-OEM/setup.sh
+++ b/systems/PVC-OEM/setup.sh
@@ -1,3 +1,5 @@
 export https_proxy=http://proxy-chain.intel.com:911
 module load intel-release
 module load intel/mpich
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
--- a/systems/mac-arm/config-command-mpi
+++ b/systems/mac-arm/config-command-mpi
@@ -1,4 +1,3 @@
-BREW=/opt/local/
-MPICXX=mpicxx ../../configure --enable-simd=GEN --enable-comms=mpi-auto --enable-unified=yes --prefix $HOME/QCD/GridInstall --with-lime=/Users/peterboyle/QCD/SciDAC/install/ --with-openssl=$BREW --disable-fermion-reps --disable-gparity --disable-debug
+CXXFLAGS=-I/opt/local/include LDFLAGS=-L/opt/local/lib/ CXX=c++-13 MPICXX=mpicxx ../../configure --enable-simd=GEN --enable-comms=mpi-auto --enable-unified=yes --prefix $HOME/QCD/GridInstall --with-lime=/Users/peterboyle/QCD/SciDAC/install/ --with-openssl=/opt/local/ --disable-fermion-reps --disable-gparity --disable-debug 


--- a/tests/Test_dwf_mixedcg_prec.cc
+++ b/tests/Test_dwf_mixedcg_prec.cc
@@ -30,27 +30,20 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 using namespace std;
 using namespace Grid;

-template<class d>
-struct scal {
-  d internal;
-};
-
-  Gamma::Algebra Gmu [] = {
-    Gamma::Algebra::GammaX,
-    Gamma::Algebra::GammaY,
-    Gamma::Algebra::GammaZ,
-    Gamma::Algebra::GammaT
-  };
+#ifndef HOST_NAME_MAX
+#define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
+#endif

 int main (int argc, char ** argv)
 {
+  char hostname[HOST_NAME_MAX+1] = {0};  // zero-filled: stays terminated if gethostname() truncates the name
+  if ( gethostname(hostname, HOST_NAME_MAX) != 0 ) hostname[0] = '\0';  // on failure fall back to an empty name
+  std::string host(hostname);            // prefixes the per-iteration checksum OK/FAILURE reports
+  
  Grid_init(&argc,&argv);

  const int Ls=12;

-  std::cout << GridLogMessage << "::::: NB: to enable a quick bit reproducibility check use the --checksums flag. " << std::endl;
-
-  { 
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
@@ -92,7 +85,14 @@ int main (int argc, char ** argv)
  SchurDiagMooeeOperator<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
  SchurDiagMooeeOperator<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f);

-  std::cout << GridLogMessage << "::::::::::::: Starting mixed CG" << std::endl;
+  int nsecs=600;
+  if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){
+    std::string arg = GridCmdOptionPayload(argv,argv+argc,"--seconds");
+    GridCmdOptionInt(arg,nsecs);
+  }
+  
+  std::cout << GridLogMessage << "::::::::::::: Starting mixed CG for "<<nsecs <<" seconds" << std::endl;
+
  MixedPrecisionConjugateGradient<LatticeFermionD,LatticeFermionF> mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO);
  double t1,t2,flops;
  double MdagMsiteflops = 1452; // Mobius (real coeffs)
@@ -101,7 +101,14 @@ int main (int argc, char ** argv)
  std:: cout << " MdagM site flops = "<< 4*MdagMsiteflops<<std::endl;
  std:: cout << " CG    site flops = "<< CGsiteflops <<std::endl;
  int iters;
-  for(int i=0;i<10;i++){
+
+  time_t start = time(NULL);
+
+  uint32_t csum, csumref;
+  csumref=0;
+  int iter=0;
+  do {
+    std::cerr << "******************* SINGLE PRECISION SOLVE "<<iter<<std::endl;
    result_o = Zero();
    t1=usecond();
    mCG(src_o,result_o);
@@ -111,10 +118,28 @@ int main (int argc, char ** argv)
    flops+= CGsiteflops*FrbGrid->gSites()*iters;
    std::cout << " SinglePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
    std::cout << " SinglePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
-  }
-  std::cout << GridLogMessage << "::::::::::::: Starting regular CG" << std::endl;
+
+    csum = crc(result_o);
+
+    if ( csumref == 0 ) {
+      csumref = csum;
+    } else {
+      if ( csum != csumref ) { 
+	std::cerr << host<<" FAILURE " <<iter <<" csum "<<std::hex<<csum<< " != "<<csumref <<std::dec<<std::endl;
+	assert(0);
+      } else {
+	std::cout << host <<" OK " <<iter <<" csum "<<std::hex<<csum<<std::dec<<" -- OK! "<<std::endl;
+      }
+    }
+    iter ++;
+  } while (time(NULL) < (start + nsecs/2) );
+    
+  std::cout << GridLogMessage << "::::::::::::: Starting double precision CG" << std::endl;
  ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
-  for(int i=0;i<1;i++){
+  csumref=0;
+  int i=0;
+  do { 
+    std::cerr << "******************* DOUBLE PRECISION SOLVE "<<i<<std::endl;
    result_o_2 = Zero();
    t1=usecond();
    CG(HermOpEO,src_o,result_o_2);
@@ -122,46 +147,30 @@ int main (int argc, char ** argv)
    iters = CG.IterationsToComplete;
    flops = MdagMsiteflops*4*FrbGrid->gSites()*iters; 
    flops+= CGsiteflops*FrbGrid->gSites()*iters;
-    
+
    std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
    std::cout << " DoublePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
-  }
-  
-  //  MemoryManager::Print();
+
+    csum = crc(result_o_2); // checksum the field this loop solves into; result_o is never written here
+    // First pass stores the reference checksum (0 means "not yet set"); later passes must reproduce it.
+    if ( csumref == 0 ) {
+      csumref = csum;
+    } else {
+      if ( csum != csumref ) { 
+	std::cerr << i <<" csum "<<std::hex<<csum<< " != "<<csumref <<std::dec<<std::endl;
+	assert(0);
+      } else {
+	std::cout << i <<" csum "<<std::hex<<csum<<std::dec<<" -- OK! "<<std::endl;
+      }
+    }
+    i++;
+  } while (time(NULL) < (start + nsecs) );

  LatticeFermionD diff_o(FrbGrid);
  RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2);

  std::cout << GridLogMessage << "::::::::::::: Diff between mixed and regular CG: " << diff << std::endl;
-
-  #ifdef HAVE_LIME
-  if( GridCmdOptionExists(argv,argv+argc,"--checksums") ){
+  assert(diff < 1e-4);
  
-  std::string file1("./Propagator1");
-  emptyUserRecord record;
-  uint32_t nersc_csum;
-  uint32_t scidac_csuma;
-  uint32_t scidac_csumb;
-  typedef SpinColourVectorD   FermionD;
-  typedef vSpinColourVectorD vFermionD;
-
-  BinarySimpleMunger<FermionD,FermionD> munge;
-  std::string format = getFormatString<vFermionD>();
-  
-  BinaryIO::writeLatticeObject<vFermionD,FermionD>(result_o,file1,munge, 0, format,
-						   nersc_csum,scidac_csuma,scidac_csumb);
-
-  std::cout << GridLogMessage << " Mixed checksums "<<std::hex << scidac_csuma << " "<<scidac_csumb<<std::endl;
-
-  BinaryIO::writeLatticeObject<vFermionD,FermionD>(result_o_2,file1,munge, 0, format,
-						   nersc_csum,scidac_csuma,scidac_csumb);
-
-  std::cout << GridLogMessage << " CG checksums "<<std::hex << scidac_csuma << " "<<scidac_csumb<<std::endl;
-  }
-  #endif
-  }
-  
-  MemoryManager::Print();
-
  Grid_finalize();
 }
--- a/tests/core/Test_sliceSum.cc
+++ b/tests/core/Test_sliceSum.cc
@@ -0,0 +1,321 @@
+#include <Grid/Grid.h>
+
+template<class vobj> inline void sliceSumCPU(const Grid::Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim) // CPU reference path: main() compares its output in `result` against sliceSum(Data,orthogdim)
+{
+  using namespace Grid;
+  ///////////////////////////////////////////////////////
+  // FIXME precision promoted summation
+  // may be important for correlation functions
+  // But easily avoided by using double precision fields
+  ///////////////////////////////////////////////////////
+  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_object::scalar_type scalar_type;
+  GridBase  *grid = Data.Grid();
+  assert(grid!=NULL);
+
+  const int    Nd = grid->_ndimension;
+  const int Nsimd = grid->Nsimd();
+
+  assert(orthogdim >= 0);
+  assert(orthogdim < Nd);
+
+  int fd=grid->_fdimensions[orthogdim]; // extent read from _fdimensions; result is resized to this length
+  int ld=grid->_ldimensions[orthogdim]; // extent read from _ldimensions; length of lsSum
+  int rd=grid->_rdimensions[orthogdim]; // extent read from _rdimensions; length of lvSum
+
+  Vector<vobj> lvSum(rd); // will locally sum vectors first
+  Vector<sobj> lsSum(ld,Zero());                    // sum across these down to scalars
+  ExtractBuffer<sobj> extracted(Nsimd);                  // splitting the SIMD
+
+  result.resize(fd); // And then global sum to return the same vector to every node 
+  for(int r=0;r<rd;r++){
+    lvSum[r]=Zero();
+  }
+
+  int e1=    grid->_slice_nblock[orthogdim];
+  int e2=    grid->_slice_block [orthogdim];
+  int stride=grid->_slice_stride[orthogdim];
+  int ostride=grid->_ostride[orthogdim];
+  
+  //Reduce Data down to lvSum
+  sliceSumReduction_cpu(Data,lvSum,rd, e1,e2,stride,ostride,Nsimd);
+
+  // Sum across simd lanes in the plane, breaking out orthog dir.
+  Coordinate icoor(Nd);
+
+  for(int rt=0;rt<rd;rt++){
+
+    extract(lvSum[rt],extracted); // fills `extracted` with Nsimd scalar objects taken from lvSum[rt]
+
+    for(int idx=0;idx<Nsimd;idx++){
+
+      grid->iCoorFromIindex(icoor,idx); // icoor receives the coordinate associated with lane index idx
+
+      int ldx =rt+icoor[orthogdim]*rd; // index into lsSum: rt offset by the lane's orthogdim coordinate times rd
+
+      lsSum[ldx]=lsSum[ldx]+extracted[idx];
+
+    }
+  }
+  
+  // sum over nodes.
+  for(int t=0;t<fd;t++){
+    int pt = t/ld; // processor plane
+    int lt = t%ld; // position of t within a block of ld entries
+    if ( pt == grid->_processor_coor[orthogdim] ) {
+      result[t]=lsSum[lt];
+    } else {
+      result[t]=Zero(); // not this processor's plane: contribute zero (presumably filled in by GlobalSumVector below -- confirm)
+    }
+
+  }
+  scalar_type * ptr = (scalar_type *) &result[0]; // treat result as a flat array of scalar_type
+  int words = fd*sizeof(sobj)/sizeof(scalar_type); // count of scalar_type elements spanned by the fd entries of result
+  grid->GlobalSumVector(ptr, words);
+}
+
+
+int main (int argc, char ** argv) { // Times sliceSumCPU() and sliceSum() on four field types and asserts that their results agree in every direction
+    
+    using namespace Grid;
+
+    Grid_init(&argc,&argv);
+
+
+    Coordinate latt_size({64,64,64,16}); // lattice extents are hard-coded for this test
+    auto simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd());
+    auto mpi_layout = GridDefaultMpi();
+    GridCartesian Grid(latt_size, simd_layout, mpi_layout);
+
+    std::vector<int> seeds({1, 2, 3, 4}); // fixed seeds passed to SeedFixedIntegers below
+
+    GridParallelRNG pRNG(&Grid);
+    pRNG.SeedFixedIntegers(seeds);
+
+    LatticeComplexD test_data(&Grid);
+    gaussian(pRNG,test_data);
+
+    std::vector<TComplexD> reduction_reference; // filled by sliceSumCPU
+    std::vector<TComplexD> reduction_result; // filled by sliceSum
+
+    //warmup
+    for (int sweeps = 0; sweeps < 5; sweeps++) {
+      reduction_result = sliceSum(test_data,0); // untimed calls along direction 0 only
+    }
+
+    int trace_id = traceStart("sliceSum benchmark - ComplexD");
+    std::cout << GridLogMessage << "Testing ComplexD" << std::endl;
+    std::cout << GridLogMessage << "sizeof(ComplexD) = " << sizeof(ComplexD) << std::endl;
+    std::cout << GridLogMessage << "sizeof(vComplexD) = " << sizeof(vComplexD) << std::endl;
+    for (int i = 0; i < Nd; i++) { // i is the orthogonal direction passed to both implementations
+
+      RealD t=-usecond(); // start at -now; adding usecond() afterwards leaves the elapsed value
+
+      tracePush("sliceSum");
+      sliceSumCPU(test_data,reduction_reference,i);
+      tracePop("sliceSum");
+
+      t+=usecond();
+      std::cout << GridLogMessage << "Orthog. dir. = " << i << std::endl;
+      std::cout << GridLogMessage << "CPU sliceSum took "<<t<<" usecs"<<std::endl;
+      
+      
+      RealD tgpu=-usecond();
+
+      tracePush("sliceSumGpu");
+      reduction_result = sliceSum(test_data,i);
+      tracePop("sliceSumGpu");
+
+      tgpu+=usecond();
+
+      std::cout << GridLogMessage <<"GPU sliceSum took "<<tgpu<<" usecs"<<std::endl<<std::endl;; // NOTE(review): label says GPU, but this times whatever sliceSum() dispatches to in this build -- confirm
+
+
+      for(int t=0;t<reduction_reference.size();t++) { // bound is the reference size; reduction_result is indexed with the same t
+
+        auto diff = reduction_reference[t]-reduction_result[t];
+        assert(abs(TensorRemove(diff)) < 1e-8 ); // note: assert is disabled when NDEBUG is defined, so the check then does nothing
+
+      }
+
+    
+    }
+    traceStop(trace_id);
+
+    LatticeSpinVectorD test_data_cv(&Grid); // second case: same sequence as above for SpinVectorD
+    gaussian(pRNG,test_data_cv);
+
+    std::vector<SpinVectorD> reduction_reference_cv;
+    std::vector<SpinVectorD> reduction_result_cv;
+
+    //warmup
+    for (int sweeps = 0; sweeps < 5; sweeps++) {
+      reduction_result_cv = sliceSum(test_data_cv,0);
+    }
+    trace_id = traceStart("sliceSum benchmark - SpinVectorD");
+
+    std::cout << GridLogMessage << "Testing SpinVectorD" << std::endl;
+    std::cout << GridLogMessage << "sizeof(SpinVectorD) = " << sizeof(SpinVectorD) << std::endl;
+    std::cout << GridLogMessage << "sizeof(vSpinVectorD) = " << sizeof(vSpinVectorD) << std::endl;
+    for (int i = 0; i < Nd; i++) {
+
+      RealD t=-usecond();
+
+      tracePush("sliceSum");
+      sliceSumCPU(test_data_cv,reduction_reference_cv,i);
+      tracePop("sliceSum");
+
+      t+=usecond();
+      std::cout << GridLogMessage << "Orthog. dir. = " << i << std::endl;
+      std::cout << GridLogMessage << "CPU sliceSum took "<<t<<" usecs"<<std::endl;
+      
+      
+      RealD tgpu=-usecond();
+
+      tracePush("sliceSumGpu");
+      reduction_result_cv = sliceSum(test_data_cv,i);
+      tracePop("sliceSumGpu");
+
+      tgpu+=usecond();
+
+      std::cout << GridLogMessage <<"GPU sliceSum took "<<tgpu<<" usecs"<<std::endl<<std::endl;;
+
+
+      for(int t=0;t<reduction_reference_cv.size();t++) {
+
+        auto diff = reduction_reference_cv[t]-reduction_result_cv[t];
+        assert(abs(diff()(0)()) < 1e-8 ); // the four components (indices 0..3) are checked one by one
+        assert(abs(diff()(1)()) < 1e-8 );
+        assert(abs(diff()(2)()) < 1e-8 );
+        assert(abs(diff()(3)()) < 1e-8 );
+
+      }
+
+    
+    }
+    traceStop(trace_id);
+
+    LatticeSpinColourVectorD test_data_scv(&Grid); // third case: same sequence for SpinColourVectorD
+    gaussian(pRNG,test_data_scv);
+
+    std::vector<SpinColourVectorD> reduction_reference_scv;
+    std::vector<SpinColourVectorD> reduction_result_scv;
+
+    //warmup
+    for (int sweeps = 0; sweeps < 5; sweeps++) {
+      reduction_result_scv = sliceSum(test_data_scv,0);
+    }
+    trace_id = traceStart("sliceSum benchmark - SpinColourVectorD");
+
+    std::cout << GridLogMessage << "Testing SpinColourVectorD" << std::endl;
+    std::cout << GridLogMessage << "sizeof(SpinColourVectorD) = " << sizeof(SpinColourVectorD) << std::endl;
+    std::cout << GridLogMessage << "sizeof(vSpinColourVectorD) = " << sizeof(vSpinColourVectorD) << std::endl;
+    for (int i = 0; i < Nd; i++) {
+
+      RealD t=-usecond();
+
+      tracePush("sliceSum");
+      sliceSumCPU(test_data_scv,reduction_reference_scv,i);
+      tracePop("sliceSum");
+
+      t+=usecond();
+      std::cout << GridLogMessage << "Orthog. dir. = " << i << std::endl;
+      std::cout << GridLogMessage << "CPU sliceSum took "<<t<<" usecs"<<std::endl;
+      
+      
+      RealD tgpu=-usecond();
+
+      tracePush("sliceSumGpu");
+      reduction_result_scv = sliceSum(test_data_scv,i);
+      tracePop("sliceSumGpu");
+
+      tgpu+=usecond();
+
+      std::cout << GridLogMessage <<"GPU sliceSum took "<<tgpu<<" usecs"<<std::endl<<std::endl;;
+
+
+      for(int t=0;t<reduction_reference_scv.size();t++) {
+
+        auto diff = reduction_reference_scv[t]-reduction_result_scv[t];
+        // std::cout << diff <<std::endl;
+        assert(abs(diff()(0)(0)) < 1e-8 ); // unrolled check over first index 0..3 and second index 0..2
+        assert(abs(diff()(0)(1)) < 1e-8 );
+        assert(abs(diff()(0)(2)) < 1e-8 );
+        assert(abs(diff()(1)(0)) < 1e-8 );
+        assert(abs(diff()(1)(1)) < 1e-8 );
+        assert(abs(diff()(1)(2)) < 1e-8 );    
+        assert(abs(diff()(2)(0)) < 1e-8 );
+        assert(abs(diff()(2)(1)) < 1e-8 );
+        assert(abs(diff()(2)(2)) < 1e-8 );    
+        assert(abs(diff()(3)(0)) < 1e-8 );
+        assert(abs(diff()(3)(1)) < 1e-8 );
+        assert(abs(diff()(3)(2)) < 1e-8 );
+
+      }
+
+    
+    }
+    traceStop(trace_id);
+
+    LatticeSpinColourMatrixD test_data_scm(&Grid); // fourth case: same sequence for SpinColourMatrixD
+    gaussian(pRNG,test_data_scm);
+
+    std::vector<SpinColourMatrixD> reduction_reference_scm;
+    std::vector<SpinColourMatrixD> reduction_result_scm;
+
+    //warmup
+    for (int sweeps = 0; sweeps < 5; sweeps++) {
+      reduction_result_scm = sliceSum(test_data_scm,0);
+    }
+    trace_id = traceStart("sliceSum benchmark - SpinColourMatrixD");
+
+    std::cout << GridLogMessage << "Testing SpinColourMatrixD" << std::endl;
+    std::cout << GridLogMessage << "sizeof(SpinColourMatrixD) = " << sizeof(SpinColourMatrixD) << std::endl;
+    std::cout << GridLogMessage << "sizeof(vSpinColourMatrixD) = " << sizeof(vSpinColourMatrixD) << std::endl;
+    for (int i = 0; i < Nd; i++) {
+
+      RealD t=-usecond();
+
+      tracePush("sliceSum");
+      sliceSumCPU(test_data_scm,reduction_reference_scm,i);
+      tracePop("sliceSum");
+
+      t+=usecond();
+      std::cout << GridLogMessage << "Orthog. dir. = " << i << std::endl;
+      std::cout << GridLogMessage << "CPU sliceSum took "<<t<<" usecs"<<std::endl;
+      
+      
+      RealD tgpu=-usecond();
+
+      tracePush("sliceSumGpu");
+      reduction_result_scm = sliceSum(test_data_scm,i);
+      tracePop("sliceSumGpu");
+
+      tgpu+=usecond();
+
+      std::cout << GridLogMessage <<"GPU sliceSum took "<<tgpu<<" usecs"<<std::endl<<std::endl;;
+
+
+      for(int t=0;t<reduction_reference_scm.size();t++) {
+
+        auto diff = reduction_reference_scm[t]-reduction_result_scm[t];
+        // std::cout << diff <<std::endl;
+        for (int is = 0; is < Ns; is++) { // here the element-wise check is written as loops over Ns x Ns x Nc x Nc
+          for (int js = 0; js < Ns; js++) {
+            for (int ic = 0; ic < Nc; ic++) {
+              for (int jc = 0; jc < Nc; jc++) {
+                assert(abs(diff()(is,js)(ic,jc)) < 1e-8);
+              }
+            }
+          }
+        }
+
+      }
+
+    
+    }
+    traceStop(trace_id);
+
+    Grid_finalize();
+    return 0;
+}
Author	SHA1	Message	Date
Antonin Portelli	2b4399f8b1	more HOST_NAME_MAX fix	2024-03-07 15:26:01 +09:00
Antonin Portelli	f17b8de907	fallback to _POSIX_HOST_NAME_MAX if HOST_NAME_MAX is not defined	2024-03-07 15:22:08 +09:00
Peter Boyle	7e5bd46dd3	Booster update	2024-03-06 19:03:45 +01:00
Peter Boyle	228bbb9d81	Benchmark results	2024-03-06 19:03:35 +01:00
Peter Boyle	b812a7b4c6	Staggered launch script	2024-03-06 01:32:40 +00:00
Peter Boyle	891a366f73	Repro CG script	2024-03-06 01:22:55 +00:00
Peter Boyle	10116b3be8	Force device copyable and tell SYCL to shut it.	2024-03-06 01:13:27 +00:00
Peter Boyle	a46a0f0882	force device copyable and don't take crap from SYCL	2024-03-06 01:12:49 +00:00
Peter Boyle	a26a8a38f4	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2024-03-06 00:05:00 +00:00
Peter Boyle	7435315d50	More blasted shell variables	2024-03-06 00:03:59 +00:00
Peter Boyle	9b5f741e85	Reproducing CG can be more useful now	2024-03-06 00:03:16 +00:00
Peter Boyle	517822fdd2	SPR HBM benchmarking right and also PVC batched GEMM	2024-03-06 00:02:27 +00:00
Peter Boyle	1b93a9be88	Print out the hostname	2024-03-06 00:01:58 +00:00
Peter Boyle	783a66b348	Deterministic reduction please	2024-03-06 00:01:37 +00:00
Peter Boyle	976c3e9b59	Hack for flight logging CG inner products. Can be made to work, but could put in some more serious infrastructure for repro testing and blame attribution (Britney test) if necessary	2024-03-05 23:59:57 +00:00
Peter Boyle	f8ca971dae	Use of a bare PRECISION macro is not namespace safe and collides with SYCL	2024-03-05 23:59:13 +00:00
Peter Boyle	21bc8c24df	OneMKL batched blas starting	2024-03-05 23:58:20 +00:00
Peter Boyle	30228214f7	SYCL conflict with Eigen	2024-03-05 23:56:10 +00:00
Peter Boyle	2ae980ae43	Update sourceme.sh	2024-03-05 13:39:18 -05:00
Peter Boyle	6153dec2e4	Update setup.sh	2024-03-05 13:38:32 -05:00
Peter Boyle	c805f86343	USQCD benchmark	2024-03-01 00:05:04 -05:00
Peter Boyle	04ca065281	Only one rank opens	2024-02-29 20:09:11 -05:00
Peter Boyle	88d8fa43d7	Benchmark development	2024-02-29 20:01:44 -05:00
Peter Boyle	3c49762875	Propagate in the blas routine	2024-02-29 15:33:06 -05:00
Peter Boyle	436bf1d9d3	Merge pull request #455 from clarkedavida/hisq_fat_links Hisq fat links	2024-02-29 15:29:39 -05:00
david clarke	f70df6e195	changed NO_SHIFT and BACKWARD_CONST from define to enum	2024-02-29 12:29:30 -07:00
Peter Boyle	fce3852dff	Merge pull request #451 from paboyle/feature/eigen-3.4.0-update updating Eigen to 3.4.0	2024-02-28 18:03:37 -05:00
Peter Boyle	ee1b8bbdbd	Merge pull request #454 from edbennett/adjoint-broke fix HMC for non-fundamental representations	2024-02-28 14:05:27 -05:00
Peter Boyle	3f1636637d	Merge pull request #453 from dbollweg/feature/sliceSum_gpu Feature/slice sum gpu	2024-02-28 14:04:43 -05:00
Peter Boyle	2e570f5300	Merge pull request #457 from lehner/feature/gpt Import GPT-related updates	2024-02-28 13:59:04 -05:00
Christoph Lehner	9f89486df5	remove unnecessary code path	2024-02-28 19:56:23 +01:00
Christoph Lehner	22b43b86cb	Make GPT test suite work with SYCL	2024-02-28 12:57:17 +01:00
dbollweg	3c9012676a	CUDA cub refuses to reduce vSpinColourMatrix, breaking up into smaller parts like already done for HIP case.	2024-02-27 12:41:45 -05:00
Dennis Bollweg	b507fe209c	Added SpinColourMatrix case to sliceSum Test	2024-02-27 11:28:32 -05:00
Dennis Bollweg	6cd2d8fcd5	Replace cuda/hip memcpy with Grid functions	2024-02-26 09:55:07 -05:00
dbollweg	0a816b5509	Merge branch 'feature/sliceSum_gpu' of https://github.com/dbollweg/Grid into feature/sliceSum_gpu	2024-02-22 21:43:06 -05:00
dbollweg	1c8b807c2e	free malloc'd memory	2024-02-22 21:42:44 -05:00
Christoph Lehner	66391f84f2	Merge branch 'feature/gpt' of ../Grid into develop	2024-02-21 19:05:00 +01:00
Ed Bennett	97f7a9ecb3	fix HMC for non-fundamental representations	2024-02-21 08:27:55 +00:00
Dennis Bollweg	15878f7613	sliceSumReduction_cub_large now also faster than CPU on Frontier	2024-02-16 13:55:21 -05:00
dbollweg	e0d5e3c6c7	Merge branch 'paboyle:develop' into feature/sliceSum_gpu	2024-02-16 13:16:37 -05:00
dbollweg	6f3455900e	Adding sliceSumReduction_cub_small/large since hipcub cannot deal with arb. large vobjs	2024-02-16 13:15:02 -05:00
Antonin Portelli	e4a641b64e	removing old Eigen tensor patch	2024-02-13 10:37:14 +01:00
Antonin Portelli	8849f187f1	updating Eigen to 3.4.0	2024-02-13 10:30:22 +01:00
dbollweg	b5659d106e	more test cases	2024-02-09 13:37:14 -05:00
dbollweg	4b43307402	Undo include path changes for level zero api header	2024-02-09 13:07:56 -05:00
dbollweg	09af8c25a2	Merge branch 'paboyle:develop' into feature/sliceSum_gpu	2024-02-09 13:02:59 -05:00
dbollweg	9514035b87	refactor slicesum: slicesum uses GPU version by default now	2024-02-09 13:02:28 -05:00
dbollweg	1514b4f137	slicesum_sycl passes test	2024-02-06 19:08:44 -05:00
dbollweg	ab2de131bd	work towards sliceSum for sycl backend	2024-02-06 13:24:45 -05:00
Dennis Bollweg	5af8da76d7	Fix cuda compilation of Lattice_slicesum_gpu.h	2024-02-01 18:02:30 -05:00
Dennis Bollweg	b8b9dc952d	Async memcpy's and cleanup	2024-02-01 17:55:35 -05:00
Dennis Bollweg	79a6ed32d8	Use accelerator_for2d and DeviceSegmentedRecude to avoid kernel launch latencies	2024-02-01 16:41:03 -05:00
dbollweg	caa5f97723	Add sliceSum gpu using cub/hipcub	2024-01-31 16:50:06 -05:00
Christoph Lehner	f2648e94b9	getHostPointer added to Lattice	2023-10-23 13:47:41 +02:00
Christoph Lehner	e6ed516052	merged	2023-10-08 09:00:37 +02:00
Christoph Lehner	e2a3dae1f2	Option for multiple simultaneous CartesianStencils	2023-10-08 08:58:44 +02:00
Christoph Lehner	452bf2e907	Accelerator basisRotate also on HIP	2023-06-20 20:36:24 +03:00
Christoph Lehner	e8c29e2fe5	Merge pull request #31 from paboyle/develop Sync	2023-05-28 16:13:12 +02:00
Christoph Lehner	da9cbfc7cc	Suppress BuildSurfaceList verbosity in Stencil.h	2023-05-19 20:22:20 +02:00
Christoph Lehner	6b9f07c1ed	Merge pull request #30 from paboyle/develop Merge upstream	2023-05-19 20:20:58 +02:00
Christoph Lehner	5f75735dab	Add M and Mdag to WilsonTMFermion	2023-04-06 18:25:05 +02:00