Mirror of https://github.com/paboyle/Grid.git (synced 2025-10-30 19:44:32 +00:00)

Compare commits: 9a1ad6a5eb ... feature/ft (66 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | bffd30abec |  |
|  | da919949f9 |  |
|  | b12b4fdaff |  |
|  | 557fa483ff |  |
|  | fc15d55df6 |  |
|  | 53573d7d94 |  |
|  | bb3c177000 |  |
|  | a3322b470f |  |
|  | f8f408e7a9 |  |
|  | baac1127d0 |  |
|  | 6f1328160c |  |
|  | 04cf902791 |  |
|  | 7a5b1c1a19 |  |
|  | 18d2d7da4a |  |
|  | b461184797 |  |
|  | 4563b39305 |  |
|  | c9d5674d5b |  |
|  | 486412635a |  |
|  | 8b23a1546a |  |
|  | a901e4e369 |  |
|  | 804d9367d4 |  |
|  | 41d8adca95 |  |
|  | 059e8e5bb0 |  |
|  | b3ee8ded96 |  |
|  | cf3584ad15 |  |
|  | a66973163f |  |
|  | 4502a8c8a1 |  |
|  | 9c902e4c2d |  |
|  | f3eb36adcf |  |
|  | 7c246606c1 |  |
|  | 172c75029e |  |
|  | 6ae52da571 |  |
|  | 4ee9c68053 |  |
|  | a15b4378a3 |  |
|  | 89fdd7f8dd |  |
|  | c328be24b7 |  |
|  | a73dc6dbf4 |  |
|  | eee2a2657f |  |
|  | 12b8be7cb9 |  |
|  | 63c223ea5d |  |
|  | 2877fb4a2c |  |
|  | d299c86633 |  |
|  | 6ce52092e8 |  |
|  | b5926c1d21 |  |
|  | 9563238e9b |  |
|  | fb9b1d76ca |  |
|  | 1739146599 |  |
|  | ed20b39ab3 |  |
|  | 284fc05f15 |  |
|  | 07a07b6fa3 |  |
|  | dc80b08969 |  |
|  | a49a161f8d |  |
|  | a6479ca50f |  |
|  | 0e607a55e7 |  |
|  | c4b9f71357 |  |
|  | 394e506aea |  |
|  | e19b26341b |  |
|  | cfe1b13225 |  |
|  | 890c5ea1cd |  |
|  | a87378d3b6 |  |
|  | 832fc08809 |  |
|  | 461cd045c6 |  |
|  | fee65d7a75 |  |
|  | 31f9971dbf |  |
|  | d87296f3e8 |  |
|  | be94cf1c6f |  |

BLAS_benchmark/BatchBlasBench.cc | 1052 | Normal file (file diff suppressed because it is too large)

BLAS_benchmark/compile-command | 2 | Normal file
							| @@ -0,0 +1,2 @@ | ||||
|  | ||||
| mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench | ||||
| @@ -30,9 +30,14 @@ directory | ||||
|  | ||||
| #include <type_traits> | ||||
| #include <cassert> | ||||
| #include <exception> | ||||
|  | ||||
| #define NAMESPACE_BEGIN(A) namespace A { | ||||
| #define NAMESPACE_END(A)   } | ||||
| #define GRID_NAMESPACE_BEGIN NAMESPACE_BEGIN(Grid) | ||||
| #define GRID_NAMESPACE_END   NAMESPACE_END(Grid) | ||||
| #define NAMESPACE_CHECK(x) struct namespaceTEST##x {};  static_assert(std::is_same<namespaceTEST##x, ::namespaceTEST##x>::value,"Not in :: at"  );  | ||||
|  | ||||
| #define EXCEPTION_CHECK_BEGIN(A) try { | ||||
| #define EXCEPTION_CHECK_END(A)   } catch ( std::exception &e ) { BACKTRACEFP(stderr); std::cerr << __PRETTY_FUNCTION__ << " : " <<__LINE__<< " Caught exception "<<e.what()<<std::endl; throw; } | ||||
|  | ||||
|   | ||||
| @@ -89,9 +89,10 @@ public: | ||||
|       gridblasHandle = theGridAccelerator; | ||||
| #endif | ||||
| #ifdef GRID_ONE_MKL | ||||
|       cl::sycl::cpu_selector selector; | ||||
|       cl::sycl::gpu_selector selector; | ||||
|       cl::sycl::device selectedDevice { selector }; | ||||
|       gridblasHandle =new sycl::queue (selectedDevice); | ||||
|       cl::sycl::property_list q_prop{cl::sycl::property::queue::in_order()}; | ||||
|       gridblasHandle =new sycl::queue (selectedDevice,q_prop); | ||||
| #endif | ||||
|       gridblasInit=1; | ||||
|     } | ||||
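The GRID_ONE_MKL hunk above switches the fallback selector from CPU to GPU and constructs the queue with an in-order property. For reference, a minimal standalone sketch of that queue construction, using the older cl::sycl spellings the surrounding code assumes (not part of the commits):

```cpp
// Sketch only: mirrors the GRID_ONE_MKL path above.
#include <CL/sycl.hpp>

int main()
{
  cl::sycl::gpu_selector selector;                                   // pick a GPU device
  cl::sycl::device selectedDevice{selector};
  cl::sycl::property_list q_prop{cl::sycl::property::queue::in_order()};
  cl::sycl::queue q(selectedDevice, q_prop);                         // kernels execute in submission order
  // q can now be handed to oneMKL entry points such as gemm_batch.
  return 0;
}
```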
| @@ -207,6 +208,9 @@ public: | ||||
|     assert(Bkn.size()==batchCount); | ||||
|     assert(Cmn.size()==batchCount); | ||||
|  | ||||
|     assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose | ||||
|     assert(OpB!=GridBLAS_OP_T); | ||||
|  | ||||
|     int lda = m; // m x k column major | ||||
|     int ldb = k; // k x n column major | ||||
|     int ldc = m; // m x b column major | ||||
| @@ -266,26 +270,130 @@ public: | ||||
|     assert(err==CUBLAS_STATUS_SUCCESS); | ||||
| #endif | ||||
| #ifdef GRID_SYCL | ||||
|     //MKL’s cblas_<T>gemm_batch & OneAPI | ||||
| #warning "oneMKL implementation not built " | ||||
| #endif | ||||
| #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) | ||||
|     // Need a default/reference implementation | ||||
|     int sda = lda*k; | ||||
|     int sdb = ldb*k; | ||||
|     int sdc = ldc*n; | ||||
|     for (int p = 0; p < batchCount; ++p) { | ||||
|       for (int mm = 0; mm < m; ++mm) { | ||||
| 	for (int nn = 0; nn < n; ++nn) { | ||||
| 	  ComplexD c_mn(0.0); | ||||
| 	  for (int kk = 0; kk < k; ++kk) | ||||
| 	    c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb]; | ||||
| 	  Cmn[p][mm + nn*ldc] =  (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ]; | ||||
|       int64_t m64=m; | ||||
|       int64_t n64=n; | ||||
|       int64_t k64=k; | ||||
|       int64_t lda64=lda; | ||||
|       int64_t ldb64=ldb; | ||||
|       int64_t ldc64=ldc; | ||||
|       int64_t batchCount64=batchCount; | ||||
|  | ||||
|       oneapi::mkl::transpose iOpA; | ||||
|       oneapi::mkl::transpose iOpB; | ||||
|        | ||||
|       if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N; | ||||
|       if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T; | ||||
|       if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C; | ||||
|       if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N; | ||||
|       if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T; | ||||
|       if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C; | ||||
|  | ||||
|       oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle, | ||||
| 						  &iOpA, | ||||
| 						  &iOpB, | ||||
| 						  &m64,&n64,&k64, | ||||
| 						  (ComplexD *) &alpha_p[0], | ||||
| 						  (const ComplexD **)&Amk[0], (const int64_t *)&lda64, | ||||
| 						  (const ComplexD **)&Bkn[0], (const int64_t *)&ldb64, | ||||
| 						  (ComplexD *) &beta_p[0], | ||||
| 						  (ComplexD **)&Cmn[0], (const int64_t *)&ldc64, | ||||
| 						  (int64_t)1,&batchCount64,std::vector<sycl::event>()); | ||||
|       synchronise(); | ||||
| #if 0 | ||||
|       // This code was used to check the mat mul on Sunspot/OneMKL | ||||
|       std::cerr << " Called SYCL batched ZGEMM OpA "<< OpA << " OpB "<<OpB <<std::endl; | ||||
|       std::vector<ComplexD> A(m*k);  // pointer list to matrices | ||||
|       std::vector<ComplexD> B(k*n); | ||||
|       std::vector<ComplexD> C(m*n); | ||||
|       //      int sda = lda*k; | ||||
|       //      int sdb = ldb*k; | ||||
|       //      int sdc = ldc*n; | ||||
|       std::cerr << " Checking the GEMM results "<<std::endl; | ||||
|       for (int p = 0; p < 1; ++p) { | ||||
| 	ComplexD * Amk_p;  // pointer list to matrices | ||||
| 	ComplexD * Bkn_p;  // pointer list to matrices | ||||
| 	ComplexD * Cmn_p;  // pointer list to matrices | ||||
| 	acceleratorCopyFromDevice((void *)&Amk[p],(void *)&Amk_p,sizeof(ComplexD*)); | ||||
| 	acceleratorCopyFromDevice((void *)&Bkn[p],(void *)&Bkn_p,sizeof(ComplexD*)); | ||||
| 	acceleratorCopyFromDevice((void *)&Cmn[p],(void *)&Cmn_p,sizeof(ComplexD*)); | ||||
| 	std::cerr << " p " << p << " copied pointers "<<std::endl; | ||||
| 	acceleratorCopyFromDevice((void *)Amk_p,(void *)&A[0],m*k*sizeof(ComplexD)); | ||||
| 	acceleratorCopyFromDevice((void *)Bkn_p,(void *)&B[0],k*n*sizeof(ComplexD)); | ||||
| 	acceleratorCopyFromDevice((void *)Cmn_p,(void *)&C[0],m*n*sizeof(ComplexD)); | ||||
| 	std::cerr << " p " << p << " copied matrices "<<std::endl; | ||||
| 	std::cerr << " C[0] "<<C[0]<<std::endl; | ||||
| 	std::cerr << " A[0] "<<A[0]<<std::endl; | ||||
| 	std::cerr << " B[0] "<<B[0]<<std::endl; | ||||
| 	std::cerr << " m "<<m<<std::endl; | ||||
| 	std::cerr << " n "<<n<<std::endl; | ||||
| 	std::cerr << " k "<<k<<std::endl; | ||||
| 	for (int mm = 0; mm < m; ++mm) { | ||||
| 	  for (int nn = 0; nn < n; ++nn) { | ||||
| 	    ComplexD c_mn(0.0); | ||||
| 	    for (int kk = 0; kk < k; ++kk) { | ||||
| 	      int idx_a, idx_b; | ||||
| 	      //    int lda = m; // m x k column major | ||||
| 	      //    int ldb = k; // k x n column major | ||||
| 	      //    int ldc = m; // m x b column major | ||||
| 	      if(OpA!=GridBLAS_OP_N) { | ||||
| 		idx_a =kk + mm*lda; | ||||
| 	      } else { | ||||
| 		idx_a =mm + kk*lda; | ||||
| 	      } | ||||
| 	      if(OpB!=GridBLAS_OP_N) { | ||||
| 		idx_b =nn + kk*ldb; | ||||
| 	      } else { | ||||
| 		idx_b =kk + nn*ldb; | ||||
| 	      } | ||||
| 	      //	      std::cerr << " idx_a "<<idx_a<<" idx_b "<<idx_b<<std::endl; | ||||
|  | ||||
| 	      ComplexD Ac = A[idx_a]; | ||||
| 	      ComplexD Bc = B[idx_b]; | ||||
| 	      if(OpA==GridBLAS_OP_C) Ac = conjugate(Ac); | ||||
| 	      if(OpB==GridBLAS_OP_C) Bc = conjugate(Bc); | ||||
| 	       | ||||
| 	      c_mn += Ac*Bc; | ||||
| 	    } | ||||
| 	    std::cerr << " beta "<<beta<<" alpha "<<alpha<<" C_"<<mm<<","<<nn<<" "<<c_mn<<" "<<C[mm + nn*ldc]<<std::endl; | ||||
| 	  } | ||||
| 	} | ||||
|       } | ||||
|     } | ||||
| #endif | ||||
|     //    synchronise(); | ||||
| #endif | ||||
| #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) | ||||
|     // Need a default/reference implementation; use Eigen | ||||
|       if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) { | ||||
| 	thread_for (p, batchCount, { | ||||
| 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k); | ||||
| 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n); | ||||
| 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n); | ||||
| 	  eCmn = beta * eCmn + alpha * eAmk * eBkn ; | ||||
|         }); | ||||
|       } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) { | ||||
| 	thread_for (p, batchCount, { | ||||
| 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m); | ||||
| 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n); | ||||
| 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n); | ||||
| 	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ; | ||||
| 	  }); | ||||
|       } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) { | ||||
| 	thread_for (p, batchCount, { | ||||
| 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k); | ||||
| 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k); | ||||
| 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n); | ||||
| 	  eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ; | ||||
| 	  }); | ||||
|       } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) { | ||||
| 	thread_for (p, batchCount, { | ||||
| 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m); | ||||
| 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k); | ||||
| 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n); | ||||
| 	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ; | ||||
| 	  } ); | ||||
|       } else {  | ||||
| 	assert(0); | ||||
|       } | ||||
| #endif | ||||
|      RealD t1=usecond(); | ||||
|      RealD flops = 8.0*m*n*k*batchCount; | ||||
|      RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount; | ||||
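For reference, the cost model behind the flops and bytes lines above, as I read the code (not part of the commits): one complex multiply-add costs 6 + 2 = 8 real flops, and each batch element touches the three matrices once:

```latex
\mathrm{flops} = 8\, m\, n\, k \times \mathrm{batchCount},
\qquad
\mathrm{bytes} = \operatorname{sizeof}(\texttt{ComplexD}) \times (mk + kn + mn) \times \mathrm{batchCount}.
```

The real-precision variants further down count 2 m n k flops for the same reason.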
| @@ -306,6 +414,9 @@ public: | ||||
|     RealD t2=usecond(); | ||||
|     int32_t batchCount = Amk.size(); | ||||
|  | ||||
|     assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose | ||||
|     assert(OpB!=GridBLAS_OP_T); | ||||
|  | ||||
|     int lda = m; // m x k column major | ||||
|     int ldb = k; // k x n column major | ||||
|     int ldc = m; // m x b column major | ||||
| @@ -366,26 +477,69 @@ public: | ||||
|     assert(err==CUBLAS_STATUS_SUCCESS); | ||||
| #endif | ||||
| #ifdef GRID_SYCL | ||||
|     //MKL’s cblas_<T>gemm_batch & OneAPI | ||||
| #warning "oneMKL implementation not built " | ||||
|       int64_t m64=m; | ||||
|       int64_t n64=n; | ||||
|       int64_t k64=k; | ||||
|       int64_t lda64=lda; | ||||
|       int64_t ldb64=ldb; | ||||
|       int64_t ldc64=ldc; | ||||
|       int64_t batchCount64=batchCount; | ||||
|  | ||||
|       oneapi::mkl::transpose iOpA; | ||||
|       oneapi::mkl::transpose iOpB; | ||||
|        | ||||
|       if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N; | ||||
|       if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T; | ||||
|       if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C; | ||||
|       if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N; | ||||
|       if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T; | ||||
|       if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C; | ||||
|  | ||||
|       oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle, | ||||
| 						  &iOpA, | ||||
| 						  &iOpB, | ||||
| 						  &m64,&n64,&k64, | ||||
| 						  (ComplexF *) &alpha_p[0], | ||||
| 						  (const ComplexF **)&Amk[0], (const int64_t *)&lda64, | ||||
| 						  (const ComplexF **)&Bkn[0], (const int64_t *)&ldb64, | ||||
| 						  (ComplexF *) &beta_p[0], | ||||
| 						  (ComplexF **)&Cmn[0], (const int64_t *)&ldc64, | ||||
| 						  (int64_t)1,&batchCount64,std::vector<sycl::event>()); | ||||
|     synchronise(); | ||||
| #endif | ||||
| #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) | ||||
|     int sda = lda*k; | ||||
|     int sdb = ldb*k; | ||||
|     int sdc = ldc*n; | ||||
|     ComplexF alphaf(real(alpha),imag(alpha)); | ||||
|     ComplexF betaf(real(beta),imag(beta)); | ||||
|     // Need a default/reference implementation | ||||
|     for (int p = 0; p < batchCount; ++p) { | ||||
|       for (int mm = 0; mm < m; ++mm) { | ||||
| 	for (int nn = 0; nn < n; ++nn) { | ||||
| 	  ComplexF c_mn(0.0); | ||||
| 	  for (int kk = 0; kk < k; ++kk) | ||||
| 	    c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb]; | ||||
| 	  Cmn[p][mm + nn*ldc] =  (alphaf)*c_mn + (betaf)*Cmn[p][mm + nn*ldc ]; | ||||
| 	} | ||||
|     // Need a default/reference implementation; use Eigen | ||||
|       if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) { | ||||
| 	thread_for (p, batchCount, { | ||||
| 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k); | ||||
| 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n); | ||||
| 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n); | ||||
| 	  eCmn = beta * eCmn + alpha * eAmk * eBkn ; | ||||
| 	  }); | ||||
|       } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) { | ||||
| 	thread_for (p, batchCount, { | ||||
| 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m); | ||||
| 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n); | ||||
| 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n); | ||||
| 	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ; | ||||
| 	  }); | ||||
|       } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) { | ||||
| 	thread_for (p, batchCount, { | ||||
| 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k); | ||||
| 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k); | ||||
| 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n); | ||||
| 	  eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ; | ||||
| 	  }); | ||||
|       } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) { | ||||
| 	thread_for (p, batchCount, { | ||||
| 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m); | ||||
| 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k); | ||||
| 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n); | ||||
| 	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ; | ||||
| 	  } ); | ||||
|       } else {  | ||||
| 	assert(0); | ||||
|       } | ||||
|     } | ||||
| #endif | ||||
|      RealD t1=usecond(); | ||||
|      RealD flops = 8.0*m*n*k*batchCount; | ||||
| @@ -408,6 +562,9 @@ public: | ||||
|     RealD t2=usecond(); | ||||
|     int32_t batchCount = Amk.size(); | ||||
|  | ||||
|     assert(OpA!=GridBLAS_OP_C); // Real case no conjugate | ||||
|     assert(OpB!=GridBLAS_OP_C); | ||||
|  | ||||
|     int lda = m; // m x k column major | ||||
|     int ldb = k; // k x n column major | ||||
|     int ldc = m; // m x b column major | ||||
| @@ -467,24 +624,69 @@ public: | ||||
|     assert(err==CUBLAS_STATUS_SUCCESS); | ||||
| #endif | ||||
| #ifdef GRID_SYCL | ||||
|     //MKL’s cblas_<T>gemm_batch & OneAPI | ||||
| #warning "oneMKL implementation not built " | ||||
|       int64_t m64=m; | ||||
|       int64_t n64=n; | ||||
|       int64_t k64=k; | ||||
|       int64_t lda64=lda; | ||||
|       int64_t ldb64=ldb; | ||||
|       int64_t ldc64=ldc; | ||||
|       int64_t batchCount64=batchCount; | ||||
|  | ||||
|       oneapi::mkl::transpose iOpA; | ||||
|       oneapi::mkl::transpose iOpB; | ||||
|        | ||||
|       if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N; | ||||
|       if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T; | ||||
|       if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C; | ||||
|       if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N; | ||||
|       if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T; | ||||
|       if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C; | ||||
|  | ||||
|       oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle, | ||||
| 						  &iOpA, | ||||
| 						  &iOpB, | ||||
| 						  &m64,&n64,&k64, | ||||
| 						  (float *) &alpha_p[0], | ||||
| 						  (const float **)&Amk[0], (const int64_t *)&lda64, | ||||
| 						  (const float **)&Bkn[0], (const int64_t *)&ldb64, | ||||
| 						  (float *) &beta_p[0], | ||||
| 						  (float **)&Cmn[0], (const int64_t *)&ldc64, | ||||
| 						  (int64_t)1,&batchCount64,std::vector<sycl::event>()); | ||||
|       synchronise(); | ||||
| #endif | ||||
| #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) | ||||
|     int sda = lda*k; | ||||
|     int sdb = ldb*k; | ||||
|     int sdc = ldc*n; | ||||
|     // Need a default/reference implementation | ||||
|     for (int p = 0; p < batchCount; ++p) { | ||||
|       for (int mm = 0; mm < m; ++mm) { | ||||
| 	for (int nn = 0; nn < n; ++nn) { | ||||
| 	  RealD c_mn(0.0); | ||||
| 	  for (int kk = 0; kk < k; ++kk) | ||||
| 	    c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb]; | ||||
| 	  Cmn[p][mm + nn*ldc] =  (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ]; | ||||
| 	} | ||||
|     // Need a default/reference implementation; use Eigen | ||||
|       if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) { | ||||
| 	thread_for (p, batchCount, { | ||||
| 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k); | ||||
| 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n); | ||||
| 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n); | ||||
| 	  eCmn = beta * eCmn + alpha * eAmk * eBkn ; | ||||
| 	  }); | ||||
|       } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) { | ||||
| 	thread_for (p, batchCount, { | ||||
| 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m); | ||||
| 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n); | ||||
| 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n); | ||||
| 	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ; | ||||
| 	  }); | ||||
|       } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) { | ||||
| 	thread_for (p, batchCount, { | ||||
| 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k); | ||||
| 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k); | ||||
| 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n); | ||||
| 	  eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ; | ||||
| 	  }); | ||||
|       } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) { | ||||
| 	thread_for (p, batchCount, { | ||||
| 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m); | ||||
| 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k); | ||||
| 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n); | ||||
| 	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ; | ||||
| 	  } ); | ||||
|       } else {  | ||||
| 	assert(0); | ||||
|       } | ||||
|     } | ||||
| #endif | ||||
|      RealD t1=usecond(); | ||||
|      RealD flops = 2.0*m*n*k*batchCount; | ||||
| @@ -495,7 +697,6 @@ public: | ||||
|   /////////////////////////////////////////////////////////////////////////// | ||||
|   // Double precision real GEMM | ||||
|   /////////////////////////////////////////////////////////////////////////// | ||||
|  | ||||
|   void gemmBatched(GridBLASOperation_t OpA, | ||||
| 		   GridBLASOperation_t OpB, | ||||
| 		   int m,int n, int k, | ||||
| @@ -508,6 +709,9 @@ public: | ||||
|     RealD t2=usecond(); | ||||
|     int32_t batchCount = Amk.size(); | ||||
|  | ||||
|     assert(OpA!=GridBLAS_OP_C); // Real case no conjugate | ||||
|     assert(OpB!=GridBLAS_OP_C); | ||||
|  | ||||
|     int lda = m; // m x k column major | ||||
|     int ldb = k; // k x n column major | ||||
|     int ldc = m; // m x b column major | ||||
| @@ -568,160 +772,124 @@ public: | ||||
|     assert(err==CUBLAS_STATUS_SUCCESS); | ||||
| #endif | ||||
| #ifdef GRID_SYCL | ||||
|     /* | ||||
|       int64_t m64=m; | ||||
|       int64_t n64=n; | ||||
|       int64_t k64=k; | ||||
|       int64_t lda64=lda; | ||||
|       int64_t ldb64=ldb; | ||||
|       int64_t ldc64=ldc; | ||||
|       int64_t batchCount64=batchCount; | ||||
|       oneapi::mkl::blas::column_major::gemm_batch(*theGridAccelerator, | ||||
|       onemkl::transpose::N, | ||||
|       onemkl::transpose::N, | ||||
|       &m64,&n64,&k64, | ||||
|       (double *) &alpha_p[0], | ||||
|       (double **)&Amk[0], lda, | ||||
|       (double **)&Bkn[0], ldb, | ||||
|       (double *) &beta_p[0], | ||||
|       (double **)&Cmn[0], ldc, | ||||
|       1,&batchCount64); | ||||
|      */ | ||||
|     //MKL’s cblas_<T>gemm_batch & OneAPI | ||||
| #warning "oneMKL implementation not built " | ||||
|  | ||||
|       oneapi::mkl::transpose iOpA; | ||||
|       oneapi::mkl::transpose iOpB; | ||||
|        | ||||
|       if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N; | ||||
|       if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T; | ||||
|       if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C; | ||||
|       if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N; | ||||
|       if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T; | ||||
|       if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C; | ||||
|  | ||||
|       oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle, | ||||
| 						  &iOpA, | ||||
| 						  &iOpB, | ||||
| 						  &m64,&n64,&k64, | ||||
| 						  (double *) &alpha_p[0], | ||||
| 						  (const double **)&Amk[0], (const int64_t *)&lda64, | ||||
| 						  (const double **)&Bkn[0], (const int64_t *)&ldb64, | ||||
| 						  (double *) &beta_p[0], | ||||
| 						  (double **)&Cmn[0], (const int64_t *)&ldc64, | ||||
| 						  (int64_t)1,&batchCount64,std::vector<sycl::event>()); | ||||
|       synchronise(); | ||||
| #endif | ||||
| #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) | ||||
|     int sda = lda*k; | ||||
|     int sdb = ldb*k; | ||||
|     int sdc = ldc*n; | ||||
|     // Need a default/reference implementation | ||||
|     for (int p = 0; p < batchCount; ++p) { | ||||
|       for (int mm = 0; mm < m; ++mm) { | ||||
| 	for (int nn = 0; nn < n; ++nn) { | ||||
| 	  RealD c_mn(0.0); | ||||
| 	  for (int kk = 0; kk < k; ++kk) | ||||
| 	    c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb]; | ||||
| 	  Cmn[p][mm + nn*ldc] =  (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ]; | ||||
| 	} | ||||
|     // Need a default/reference implementation; use Eigen | ||||
|       if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) { | ||||
| 	thread_for (p, batchCount, { | ||||
| 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k); | ||||
| 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n); | ||||
| 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n); | ||||
| 	  eCmn = beta * eCmn + alpha * eAmk * eBkn ; | ||||
| 	  }); | ||||
|       } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) { | ||||
| 	thread_for (p, batchCount, { | ||||
| 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m); | ||||
| 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n); | ||||
| 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n); | ||||
| 	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ; | ||||
| 	  }); | ||||
|       } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) { | ||||
| 	thread_for (p, batchCount, { | ||||
| 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k); | ||||
| 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k); | ||||
| 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n); | ||||
| 	  eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ; | ||||
| 	  }); | ||||
|       } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) { | ||||
| 	thread_for (p, batchCount, { | ||||
| 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m); | ||||
| 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k); | ||||
| 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n); | ||||
| 	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ; | ||||
| 	  }); | ||||
|       } else {  | ||||
| 	assert(0); | ||||
|       } | ||||
|     } | ||||
| #endif | ||||
|      RealD t1=usecond(); | ||||
|      RealD flops = 2.0*m*n*k*batchCount; | ||||
|      RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount; | ||||
|   } | ||||
|    | ||||
|  | ||||
|    | ||||
|   //////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   // Strided case used by benchmark, but generally unused in Grid | ||||
|   // Keep a code example in double complex, but don't generate the single and real variants for now | ||||
|   //////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|    | ||||
|   void gemmStridedBatched(int m,int n, int k, | ||||
| 			  ComplexD alpha, | ||||
| 			  ComplexD* Amk,  // pointer list to matrices | ||||
| 			  ComplexD* Bkn, | ||||
| 			  ComplexD beta, | ||||
| 			  ComplexD* Cmn, | ||||
| 			  int batchCount) | ||||
|   { | ||||
|     // Use C-row major storage, so transpose calls | ||||
|     int lda = m; // m x k column major | ||||
|     int ldb = k; // k x n column major | ||||
|     int ldc = m; // m x b column major | ||||
|     int sda = m*k; | ||||
|     int sdb = k*n; | ||||
|     int sdc = m*n; | ||||
|     deviceVector<ComplexD> alpha_p(1); | ||||
|     deviceVector<ComplexD> beta_p(1); | ||||
|     acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD)); | ||||
|     acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD)); | ||||
|     //    std::cout << "blasZgemmStridedBatched mnk  "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl; | ||||
|     //    std::cout << "blasZgemmStridedBatched ld   "<<lda<<","<<ldb<<","<<ldc<<std::endl; | ||||
|     //    std::cout << "blasZgemmStridedBatched sd   "<<sda<<","<<sdb<<","<<sdc<<std::endl; | ||||
| #ifdef GRID_HIP | ||||
|     auto err = hipblasZgemmStridedBatched(gridblasHandle, | ||||
| 					  HIPBLAS_OP_N, | ||||
| 					  HIPBLAS_OP_N, | ||||
| 					  m,n,k, | ||||
| 					  (hipblasDoubleComplex *) &alpha_p[0], | ||||
| 					  (hipblasDoubleComplex *) Amk, lda, sda, | ||||
| 					  (hipblasDoubleComplex *) Bkn, ldb, sdb, | ||||
| 					  (hipblasDoubleComplex *) &beta_p[0], | ||||
| 					  (hipblasDoubleComplex *) Cmn, ldc, sdc, | ||||
| 					  batchCount); | ||||
|     assert(err==HIPBLAS_STATUS_SUCCESS); | ||||
| #endif | ||||
| #ifdef GRID_CUDA | ||||
|     cublasZgemmStridedBatched(gridblasHandle, | ||||
| 			      CUBLAS_OP_N, | ||||
| 			      CUBLAS_OP_N, | ||||
| 			      m,n,k, | ||||
| 			      (cuDoubleComplex *) &alpha_p[0], | ||||
| 			      (cuDoubleComplex *) Amk, lda, sda, | ||||
| 			      (cuDoubleComplex *) Bkn, ldb, sdb, | ||||
| 			      (cuDoubleComplex *) &beta_p[0], | ||||
| 			      (cuDoubleComplex *) Cmn, ldc, sdc, | ||||
| 			      batchCount); | ||||
| #endif | ||||
| #if defined(GRID_SYCL) || defined(GRID_ONE_MKL) | ||||
|     oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle, | ||||
| 						oneapi::mkl::transpose::N, | ||||
| 						oneapi::mkl::transpose::N, | ||||
| 						m,n,k, | ||||
| 						alpha, | ||||
| 						(const ComplexD *)Amk,lda,sda, | ||||
| 						(const ComplexD *)Bkn,ldb,sdb, | ||||
| 						beta, | ||||
| 						(ComplexD *)Cmn,ldc,sdc, | ||||
| 						batchCount); | ||||
| #endif | ||||
| #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL) | ||||
|      // Need a default/reference implementation | ||||
|      for (int p = 0; p < batchCount; ++p) { | ||||
|        for (int mm = 0; mm < m; ++mm) { | ||||
| 	 for (int nn = 0; nn < n; ++nn) { | ||||
| 	   ComplexD c_mn(0.0); | ||||
| 	   for (int kk = 0; kk < k; ++kk) | ||||
| 	     c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb]; | ||||
| 	   Cmn[mm + nn*ldc + p*sdc] =  (alpha)*c_mn + (beta)*Cmn[mm + nn*ldc + p*sdc]; | ||||
| 	 } | ||||
|        } | ||||
|      } | ||||
| #endif | ||||
|   } | ||||
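A hypothetical host-side use of the strided interface above, for orientation only; the enclosing class name GridBLAS and the helper calls mirror the benchmark() routine below and are assumptions on my part, not part of the commits:

```cpp
// Illustrative sizes only; matrices are zero-initialised for brevity.
int m = 16, n = 16, k = 16, batch = 8;
GridBLAS blas;                                   // assumed class name for the surrounding class
deviceVector<ComplexD> A(m*k*batch); acceleratorMemSet(&A[0], 0, m*k*batch*sizeof(ComplexD));
deviceVector<ComplexD> B(k*n*batch); acceleratorMemSet(&B[0], 0, k*n*batch*sizeof(ComplexD));
deviceVector<ComplexD> C(m*n*batch); acceleratorMemSet(&C[0], 0, m*n*batch*sizeof(ComplexD));

blas.gemmStridedBatched(m, n, k,
                        ComplexD(1.0), &A[0], &B[0],   // A: m x k, B: k x n, column major
                        ComplexD(0.0), &C[0],          // C: m x n
                        batch);
blas.synchronise();                                    // block until the batched GEMM completes
```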
|  | ||||
|   template<class CComplex> | ||||
|   double benchmark(int M, int N, int K, int BATCH) | ||||
|   { | ||||
|     int32_t N_A = M*K*BATCH; | ||||
|     int32_t N_B = K*N*BATCH; | ||||
|     int32_t N_C = M*N*BATCH; | ||||
|     deviceVector<ComplexD> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD)); | ||||
|     deviceVector<ComplexD> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD)); | ||||
|     deviceVector<ComplexD> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD)); | ||||
|     ComplexD alpha(1.0); | ||||
|     ComplexD beta (1.0); | ||||
|     deviceVector<CComplex> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(CComplex)); | ||||
|     deviceVector<CComplex> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(CComplex)); | ||||
|     deviceVector<CComplex> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(CComplex)); | ||||
|     CComplex alpha(1.0); | ||||
|     CComplex beta (1.0); | ||||
|     RealD flops = 8.0*M*N*K*BATCH; | ||||
|     int ncall=10; | ||||
|     int ncall=1000; | ||||
|     deviceVector<CComplex *> As(BATCH); | ||||
|     deviceVector<CComplex *> Bs(BATCH); | ||||
|     deviceVector<CComplex *> Cs(BATCH); | ||||
|     for(int b = 0 ; b < BATCH;b++) { | ||||
|       CComplex *ptr; | ||||
|       ptr = &A[b*M*K];      acceleratorPut(As[b],ptr); | ||||
|       ptr = &B[b*K*N];      acceleratorPut(Bs[b],ptr); | ||||
|       ptr = &C[b*M*N];      acceleratorPut(Cs[b],ptr); | ||||
|     } | ||||
|  | ||||
|     // Warm up call | ||||
|     gemmBatched(M,N,K, | ||||
| 		alpha, | ||||
| 		As, // m x k  | ||||
| 		Bs, // k x n | ||||
| 		beta,  | ||||
| 		Cs); | ||||
|     synchronise(); | ||||
|  | ||||
|     RealD t0 = usecond(); | ||||
|     for(int i=0;i<ncall;i++){ | ||||
|       gemmStridedBatched(M,N,K, | ||||
| 			 alpha, | ||||
| 			 &A[0], // m x k  | ||||
| 			 &B[0], // k x n | ||||
| 			 beta,  | ||||
| 			 &C[0], // m x n | ||||
| 			 BATCH); | ||||
|       gemmBatched(M,N,K, | ||||
| 		  alpha, | ||||
| 		  As, // m x k  | ||||
| 		  Bs, // k x n | ||||
| 		  beta,  | ||||
| 		  Cs); | ||||
|       synchronise(); | ||||
|     } | ||||
|     synchronise(); | ||||
|     RealD t1 = usecond(); | ||||
|     RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K)*BATCH; | ||||
|     RealD bytes = 1.0*sizeof(CComplex)*(M*N*2+N*K+M*K)*BATCH; | ||||
|     flops = 8.0*M*N*K*BATCH*ncall; | ||||
|     flops = flops/(t1-t0)/1.e3; | ||||
|     return flops; // Returns gigaflops | ||||
|   } | ||||
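Since usecond() returns microseconds, flops/(t1-t0) is flops per microsecond, i.e. MFLOP/s, and the final division by 1.e3 converts to GFLOP/s. A hypothetical driver for the templated benchmark above; the class name GridBLAS and the ComplexD/ComplexF types are taken from the surrounding file, and the problem sizes are illustrative only, not part of the commits:

```cpp
GridBLAS blas;
double zgemm_gflops = blas.benchmark<ComplexD>(64, 64, 64, 256);  // double-precision complex
double cgemm_gflops = blas.benchmark<ComplexF>(64, 64, 64, 256);  // single-precision complex
std::cout << "ZGEMM " << zgemm_gflops << " GF/s,  CGEMM " << cgemm_gflops << " GF/s" << std::endl;
```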
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| }; | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
|   | ||||
| @@ -279,11 +279,11 @@ public: | ||||
|       Qt = Eigen::MatrixXcd::Identity(Nm,Nm); | ||||
|       diagonalize(eval2,lmd2,lme2,Nu,Nm,Nm,Qt,grid); | ||||
|       _sort.push(eval2,Nm); | ||||
|       //      Glog << "#Ritz value before shift: "<< std::endl; | ||||
|       Glog << "#Ritz value before shift: "<< std::endl; | ||||
|       for(int i=0; i<Nm; ++i){ | ||||
| 	//        std::cout.precision(13); | ||||
| 	//        std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] "; | ||||
| 	//        std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl; | ||||
| 	std::cout.precision(13); | ||||
| 	std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] "; | ||||
| 	std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl; | ||||
|       } | ||||
|        | ||||
|       //---------------------------------------------------------------------- | ||||
| @@ -297,7 +297,8 @@ public: | ||||
|          | ||||
|         unpackHermitBlockTriDiagMatToEigen(lmd,lme,Nu,Nblock_m,Nm,Nm,BTDM); | ||||
|  | ||||
|         for(int ip=Nk; ip<Nm; ++ip){  | ||||
|         for(int ip=Nk; ip<Nm; ++ip){ | ||||
| 	  Glog << " ip "<<ip<<" / "<<Nm<<std::endl; | ||||
|           shiftedQRDecompEigen(BTDM,Nu,Nm,eval2[ip],Q); | ||||
|         } | ||||
|          | ||||
| @@ -325,7 +326,7 @@ public: | ||||
|         Qt = Eigen::MatrixXcd::Identity(Nm,Nm); | ||||
|         diagonalize(eval2,lmd2,lme2,Nu,Nk,Nm,Qt,grid); | ||||
|         _sort.push(eval2,Nk); | ||||
| 	//        Glog << "#Ritz value after shift: "<< std::endl; | ||||
| 	Glog << "#Ritz value after shift: "<< std::endl; | ||||
|         for(int i=0; i<Nk; ++i){ | ||||
| 	  //          std::cout.precision(13); | ||||
| 	  //          std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] "; | ||||
| @@ -467,10 +468,10 @@ public: | ||||
|    | ||||
|     // set initial vector | ||||
|     for (int i=0; i<Nu; ++i) { | ||||
|       //      Glog << "norm2(src[" << i << "])= "<< norm2(src[i]) << std::endl; | ||||
|       Glog << "norm2(src[" << i << "])= "<< norm2(src[i]) << std::endl; | ||||
|       evec[i] = src[i]; | ||||
|       orthogonalize(evec[i],evec,i); | ||||
|       //      Glog << "norm2(evec[" << i << "])= "<< norm2(evec[i]) << std::endl; | ||||
|       Glog << "norm2(evec[" << i << "])= "<< norm2(evec[i]) << std::endl; | ||||
|     } | ||||
| //    exit(-43); | ||||
|      | ||||
| @@ -506,11 +507,11 @@ public: | ||||
|       Qt = Eigen::MatrixXcd::Identity(Nr,Nr); | ||||
|       diagonalize(eval2,lmd2,lme2,Nu,Nr,Nr,Qt,grid); | ||||
|       _sort.push(eval2,Nr); | ||||
|       //      Glog << "#Ritz value: "<< std::endl; | ||||
|       Glog << "#Ritz value: "<< std::endl; | ||||
|       for(int i=0; i<Nr; ++i){ | ||||
| 	//        std::cout.precision(13); | ||||
| 	//        std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] "; | ||||
| 	//        std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl; | ||||
| 	std::cout.precision(13); | ||||
| 	std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] "; | ||||
| 	std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl; | ||||
|       } | ||||
|        | ||||
|       // Convergence test | ||||
| @@ -570,6 +571,7 @@ public: | ||||
|       Glog << fname + " NOT converged ; Summary :\n"; | ||||
|     } else { | ||||
|       Glog << fname + " CONVERGED ; Summary :\n"; | ||||
|       Nstop = Nconv_guess; // Just take them all | ||||
|       // Sort converged eigenpairs. | ||||
|       std::vector<Field>  Btmp(Nstop,grid); // waste of space replicating | ||||
|  | ||||
| @@ -642,7 +644,7 @@ private: | ||||
|       //      for (int u=0; u<mrhs; ++u) Glog << " out["<<u<<"] = "<<norm2(out[u])<<std::endl; | ||||
|       k_start +=mrhs; | ||||
|     } | ||||
|     //    Glog << "LinAlg "<< std::endl; | ||||
|     Glog << "LinAlg "<< std::endl; | ||||
|      | ||||
|     if (b>0) { | ||||
|       for (int u=0; u<Nu; ++u) { | ||||
| @@ -676,7 +678,7 @@ private: | ||||
|       } | ||||
|       w_copy[u] = w[u]; | ||||
|     } | ||||
|     //    Glog << "LinAlg done"<< std::endl; | ||||
|     Glog << "LinAlg done"<< std::endl; | ||||
|      | ||||
|     // In block version, the steps 6 and 7 in Lanczos construction is | ||||
|     // replaced by the QR decomposition of new basis block. | ||||
| @@ -689,15 +691,15 @@ private: | ||||
|     } | ||||
|  | ||||
|     // re-orthogonalization for numerical stability | ||||
|     //    Glog << "Gram Schmidt"<< std::endl; | ||||
|     Glog << "Gram Schmidt"<< std::endl; | ||||
|     orthogonalize(w,Nu,evec,R); | ||||
|     // QR part | ||||
|     for (int u=1; u<Nu; ++u) { | ||||
|       orthogonalize(w[u],w,u); | ||||
|     } | ||||
|     //    Glog << "Gram Schmidt done "<< std::endl; | ||||
|     Glog << "Gram Schmidt done "<< std::endl; | ||||
|      | ||||
|     //    Glog << "LinAlg "<< std::endl; | ||||
|     Glog << "LinAlg "<< std::endl; | ||||
|     for (int u=0; u<Nu; ++u) { | ||||
|       //for (int v=0; v<Nu; ++v) { | ||||
|       for (int v=u; v<Nu; ++v) { | ||||
| @@ -714,7 +716,7 @@ private: | ||||
| 	//        Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl; | ||||
|       } | ||||
|     } | ||||
|     //    Glog << "LinAlg done "<< std::endl; | ||||
|     Glog << "LinAlg done "<< std::endl; | ||||
|  | ||||
|     if (b < Nm/Nu-1) { | ||||
|       for (int u=0; u<Nu; ++u) { | ||||
| @@ -779,7 +781,7 @@ private: | ||||
|      | ||||
|     for ( int u=0; u<Nu; ++u ) { | ||||
|       for (int k=0; k<Nk; ++k ) { | ||||
| //        Glog << "lmd "<<u<<" "<<k<<" "<<lmd[u][k] -conjugate(lmd[u][k])<<std::endl; | ||||
| 	//	Glog << "lmd "<<u<<" "<<k<<" "<<lmd[u][k] -conjugate(lmd[u][k])<<std::endl; | ||||
|         BlockTriDiag(k,u+(k/Nu)*Nu) = lmd[u][k]; | ||||
|       } | ||||
|     } | ||||
| @@ -933,7 +935,7 @@ if (1){ | ||||
|          int Nu, int Nb, int Nk, int Nm, | ||||
|          Eigen::MatrixXcd& M) | ||||
|   { | ||||
|     //Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n';  | ||||
|     Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n';  | ||||
|     assert( Nk%Nu == 0 && Nm%Nu == 0 ); | ||||
|     assert( Nk <= Nm ); | ||||
|     M = Eigen::MatrixXcd::Zero(Nk,Nk); | ||||
| @@ -951,7 +953,7 @@ if (1){ | ||||
|         M(u+(k/Nu)*Nu,k-Nu) = lme[u][k-Nu]; | ||||
|       } | ||||
|     } | ||||
|     //Glog << "unpackHermitBlockTriDiagMatToEigen() end" << endl;  | ||||
|     Glog << "unpackHermitBlockTriDiagMatToEigen() end" << std::endl;  | ||||
|   } | ||||
|   | ||||
|  | ||||
| @@ -961,7 +963,7 @@ if (1){ | ||||
|          int Nu, int Nb, int Nk, int Nm, | ||||
|          Eigen::MatrixXcd& M) | ||||
|   { | ||||
|     //Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n';  | ||||
|     Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n';  | ||||
|     assert( Nk%Nu == 0 && Nm%Nu == 0 ); | ||||
|     assert( Nk <= Nm ); | ||||
|      | ||||
| @@ -977,7 +979,7 @@ if (1){ | ||||
|         lme[u][k-Nu] = M(u+(k/Nu)*Nu,k-Nu); | ||||
|       } | ||||
|     } | ||||
|     //Glog << "packHermitBlockTriDiagMatfromEigen() end" << endl;  | ||||
|     Glog << "packHermitBlockTriDiagMatfromEigen() end" <<std::endl;  | ||||
|   } | ||||
|  | ||||
|  | ||||
| @@ -986,7 +988,7 @@ if (1){ | ||||
| 		            RealD Dsh, | ||||
| 		            Eigen::MatrixXcd& Qprod) | ||||
|   { | ||||
|     //Glog << "shiftedQRDecompEigen() begin" << '\n';  | ||||
|     Glog << "shiftedQRDecompEigen() begin" << '\n';  | ||||
|     Eigen::MatrixXcd Q = Eigen::MatrixXcd::Zero(Nm,Nm); | ||||
|     Eigen::MatrixXcd R = Eigen::MatrixXcd::Zero(Nm,Nm); | ||||
|     Eigen::MatrixXcd Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm); | ||||
| @@ -1002,6 +1004,7 @@ if (1){ | ||||
|                         // lower triangular part used to represent series | ||||
|                         // of Q sequence. | ||||
|  | ||||
|     Glog << "shiftedQRDecompEigen() Householder & QR" << '\n';  | ||||
|     // equivalent operation of Qprod *= Q | ||||
|     //M = Eigen::MatrixXcd::Zero(Nm,Nm); | ||||
|      | ||||
| @@ -1022,6 +1025,7 @@ if (1){ | ||||
|      | ||||
|     Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm); | ||||
|  | ||||
|     Glog << "shiftedQRDecompEigen() Mtmp create" << '\n';  | ||||
|     for (int i=0; i<Nm; ++i) { | ||||
|       for (int j=0; j<Nm-(Nu+1); ++j) { | ||||
|         for (int k=0; k<Nu+1+j; ++k) { | ||||
| @@ -1029,6 +1033,7 @@ if (1){ | ||||
|         } | ||||
|       } | ||||
|     } | ||||
|     Glog << "shiftedQRDecompEigen() Mtmp loop1" << '\n';  | ||||
|     for (int i=0; i<Nm; ++i) { | ||||
|       for (int j=Nm-(Nu+1); j<Nm; ++j) { | ||||
|         for (int k=0; k<Nm; ++k) { | ||||
| @@ -1036,6 +1041,7 @@ if (1){ | ||||
|         } | ||||
|       } | ||||
|     } | ||||
|     Glog << "shiftedQRDecompEigen() Mtmp loop2" << '\n';  | ||||
|      | ||||
|     //static int ntimes = 2; | ||||
|     //for (int j=0; j<Nm-(ntimes*Nu); ++j) { | ||||
| @@ -1061,11 +1067,13 @@ if (1){ | ||||
|         Mtmp(j,i) = conj(Mtmp(i,j)); | ||||
|       } | ||||
|     } | ||||
|     Glog << "shiftedQRDecompEigen() Mtmp loop3" << '\n';  | ||||
|  | ||||
|     for (int i=0; i<Nm; ++i) { | ||||
|       Mtmp(i,i) = real(Mtmp(i,i)) + Dsh; | ||||
|     } | ||||
|      | ||||
|     Glog << "shiftedQRDecompEigen() Mtmp loop4" << '\n';  | ||||
|     M = Mtmp; | ||||
|  | ||||
|     //M = Q.adjoint()*(M*Q); | ||||
| @@ -1077,7 +1085,7 @@ if (1){ | ||||
|     //  } | ||||
|     //} | ||||
|      | ||||
|     //Glog << "shiftedQRDecompEigen() end" << endl;  | ||||
|     Glog << "shiftedQRDecompEigen() end" <<std::endl;  | ||||
|   } | ||||
|  | ||||
|   void exampleQRDecompEigen(void) | ||||
|   | ||||
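For orientation (not part of the commits): the restart step that shiftedQRDecompEigen performs in the hunks above is the standard shifted QR iteration. With T the block tridiagonal matrix and theta the shift Dsh,

```latex
T - \theta I = Q R, \qquad
T' \;=\; Q^{\dagger} (T - \theta I)\, Q + \theta I \;=\; R\, Q + \theta I \;=\; Q^{\dagger} T\, Q ,
```

so T' is unitarily similar to T while components along the unwanted Ritz value theta are damped out of the restarted Krylov space; Qprod accumulates the product of the successive Q factors.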
| @@ -499,6 +499,87 @@ namespace Grid { | ||||
|       } | ||||
|   }; | ||||
|  | ||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   // Site diagonal is identity, left preconditioned by Mee^inv | ||||
|   // ( 1 - Mee^inv Meo Moo^inv Moe ) phi = Mee_inv ( Mee - Meo Moo^inv Moe Mee^inv  ) phi =  Mee_inv eta | ||||
|   // | ||||
|   // Solve: | ||||
|   // ( 1 - Mee^inv Meo Moo^inv Moe )^dag ( 1 - Mee^inv Meo Moo^inv Moe ) phi = ( 1 - Mee^inv Meo Moo^inv Moe )^dag  Mee_inv eta | ||||
|   // | ||||
|   // Old notation e<->o | ||||
|   // | ||||
|   // Left precon by Moo^-1 | ||||
|   //  b) (Doo^{dag} M_oo^-dag) (Moo^-1 Doo) psi_o =  [ (D_oo)^dag M_oo^-dag ] Moo^-1 L^{-1}  eta_o | ||||
|   //                                   eta_o'     = (D_oo)^dag  M_oo^-dag Moo^-1 (eta_o - Moe Mee^{-1} eta_e) | ||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   template<class Field> class SchurRedBlackDiagOneSolve : public SchurRedBlackBase<Field> { | ||||
|   public: | ||||
|     typedef CheckerBoardedSparseMatrixBase<Field> Matrix; | ||||
|  | ||||
|     ///////////////////////////////////////////////////// | ||||
|     // Wrap the usual normal equations Schur trick | ||||
|     ///////////////////////////////////////////////////// | ||||
|   SchurRedBlackDiagOneSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false, | ||||
|       const bool _solnAsInitGuess = false)   | ||||
|     : SchurRedBlackBase<Field>(HermitianRBSolver,initSubGuess,_solnAsInitGuess) {}; | ||||
|  | ||||
|     virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o) | ||||
|     { | ||||
|       GridBase *grid = _Matrix.RedBlackGrid(); | ||||
|       GridBase *fgrid= _Matrix.Grid(); | ||||
|  | ||||
|       SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix); | ||||
|        | ||||
|       Field   tmp(grid); | ||||
|       Field  Mtmp(grid); | ||||
|  | ||||
|       pickCheckerboard(Even,src_e,src); | ||||
|       pickCheckerboard(Odd ,src_o,src); | ||||
|      | ||||
|       ///////////////////////////////////////////////////// | ||||
|       // src_o = Mpcdag *MooeeInv * (source_o - Moe MeeInv source_e) | ||||
|       ///////////////////////////////////////////////////// | ||||
|       _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.Checkerboard() ==Even); | ||||
|       _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.Checkerboard() ==Odd);      | ||||
|       Mtmp=src_o-Mtmp;                  | ||||
|       _Matrix.MooeeInv(Mtmp,tmp);      assert( tmp.Checkerboard() ==Odd);      | ||||
|        | ||||
|       // get the right MpcDag | ||||
|       _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.Checkerboard() ==Odd);        | ||||
|     } | ||||
|  | ||||
|     virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol) | ||||
|     { | ||||
|       GridBase *grid = _Matrix.RedBlackGrid(); | ||||
|       GridBase *fgrid= _Matrix.Grid(); | ||||
|  | ||||
|       Field   tmp(grid); | ||||
|       Field   sol_e(grid); | ||||
|  | ||||
|  | ||||
|       /////////////////////////////////////////////////// | ||||
|       // sol_e = M_ee^-1 * ( src_e - Meo sol_o )... | ||||
|       /////////////////////////////////////////////////// | ||||
|       _Matrix.Meooe(sol_o,tmp);    assert(  tmp.Checkerboard()   ==Even); | ||||
|       tmp = src_e-tmp;             assert(  src_e.Checkerboard() ==Even); | ||||
|       _Matrix.MooeeInv(tmp,sol_e); assert(  sol_e.Checkerboard() ==Even); | ||||
|       | ||||
|       setCheckerboard(sol,sol_e);  assert(  sol_e.Checkerboard() ==Even); | ||||
|       setCheckerboard(sol,sol_o);  assert(  sol_o.Checkerboard() ==Odd ); | ||||
|     }; | ||||
|  | ||||
|     virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o) | ||||
|     { | ||||
|       SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix); | ||||
|       this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); | ||||
|     }; | ||||
|     virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o) | ||||
|     { | ||||
|       SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix); | ||||
|       this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);  | ||||
|     } | ||||
|   }; | ||||
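Restating the preconditioning that SchurRedBlackDiagOneSolve above implements, as a sketch based on the class comment and the RedBlackSource/RedBlackSolution bodies (the even/odd labelling follows the "old notation e<->o" caveat in the comment):

```latex
D_{oo} = M_{oo} - M_{oe} M_{ee}^{-1} M_{eo}, \qquad
M_{pc} = M_{oo}^{-1} D_{oo},
\\[4pt]
M_{pc}^{\dagger} M_{pc}\, \psi_o
  \;=\; M_{pc}^{\dagger}\, M_{oo}^{-1}\!\left(\eta_o - M_{oe} M_{ee}^{-1} \eta_e\right)
  \;\equiv\; \eta_o' ,
\qquad
\psi_e = M_{ee}^{-1}\!\left(\eta_e - M_{eo}\,\psi_o\right).
```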
|  | ||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   // Site diagonal is identity, right preconditioned by Mee^inv | ||||
|   // ( 1 - Meo Moo^inv Moe Mee^inv ) phi = ( 1 - Meo Moo^inv Moe Mee^inv ) Mee psi = eta | ||||
|   | ||||
| @@ -54,6 +54,9 @@ public: | ||||
|     size_type bytes = __n*sizeof(_Tp); | ||||
|     profilerAllocate(bytes); | ||||
|     _Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes); | ||||
|     if ( (_Tp*)ptr == (_Tp *) NULL ) { | ||||
|       printf("Grid CPU Allocator got NULL for %lu bytes\n",(unsigned long) bytes ); | ||||
|     } | ||||
|     assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); | ||||
|     return ptr; | ||||
|   } | ||||
| @@ -100,6 +103,9 @@ public: | ||||
|     size_type bytes = __n*sizeof(_Tp); | ||||
|     profilerAllocate(bytes); | ||||
|     _Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes); | ||||
|     if ( (_Tp*)ptr == (_Tp *) NULL ) { | ||||
|       printf("Grid Shared Allocator got NULL for %lu bytes\n",(unsigned long) bytes ); | ||||
|     } | ||||
|     assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); | ||||
|     return ptr; | ||||
|   } | ||||
| @@ -145,6 +151,9 @@ public: | ||||
|     size_type bytes = __n*sizeof(_Tp); | ||||
|     profilerAllocate(bytes); | ||||
|     _Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes); | ||||
|     if ( (_Tp*)ptr == (_Tp *) NULL ) { | ||||
|       printf("Grid Device Allocator got NULL for %lu bytes\n",(unsigned long) bytes ); | ||||
|     } | ||||
|     assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); | ||||
|     return ptr; | ||||
|   } | ||||
|   | ||||
| @@ -16,6 +16,44 @@ NAMESPACE_BEGIN(Grid); | ||||
| uint64_t total_shared; | ||||
| uint64_t total_device; | ||||
| uint64_t total_host;; | ||||
|  | ||||
| #if defined(__has_feature) | ||||
| #if __has_feature(leak_sanitizer) | ||||
| #define ASAN_LEAK_CHECK | ||||
| #endif | ||||
| #endif | ||||
|  | ||||
| #ifdef ASAN_LEAK_CHECK | ||||
| #include <sanitizer/asan_interface.h> | ||||
| #include <sanitizer/common_interface_defs.h> | ||||
| #include <sanitizer/lsan_interface.h> | ||||
| #define LEAK_CHECK(A) { __lsan_do_recoverable_leak_check(); } | ||||
| #else | ||||
| #define LEAK_CHECK(A) { } | ||||
| #endif | ||||
|  | ||||
| void MemoryManager::DisplayMallinfo(void) | ||||
| { | ||||
| #ifdef __linux__ | ||||
|   struct mallinfo mi; // really want mallinfo2, but glibc version isn't uniform | ||||
|    | ||||
|   mi = mallinfo(); | ||||
|  | ||||
|   std::cout << "MemoryManager: Total non-mmapped bytes (arena):       "<< (size_t)mi.arena<<std::endl; | ||||
|   std::cout << "MemoryManager: # of free chunks (ordblks):            "<< (size_t)mi.ordblks<<std::endl; | ||||
|   std::cout << "MemoryManager: # of free fastbin blocks (smblks):     "<< (size_t)mi.smblks<<std::endl; | ||||
|   std::cout << "MemoryManager: # of mapped regions (hblks):           "<< (size_t)mi.hblks<<std::endl; | ||||
|   std::cout << "MemoryManager: Bytes in mapped regions (hblkhd):      "<< (size_t)mi.hblkhd<<std::endl; | ||||
|   std::cout << "MemoryManager: Max. total allocated space (usmblks):  "<< (size_t)mi.usmblks<<std::endl; | ||||
|   std::cout << "MemoryManager: Free bytes held in fastbins (fsmblks): "<< (size_t)mi.fsmblks<<std::endl; | ||||
|   std::cout << "MemoryManager: Total allocated space (uordblks):      "<< (size_t)mi.uordblks<<std::endl; | ||||
|   std::cout << "MemoryManager: Total free space (fordblks):           "<< (size_t)mi.fordblks<<std::endl; | ||||
|   std::cout << "MemoryManager: Topmost releasable block (keepcost):   "<< (size_t)mi.keepcost<<std::endl; | ||||
| #endif | ||||
|   LEAK_CHECK(); | ||||
|   | ||||
| } | ||||
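The comment above notes that mallinfo2 would be preferable but glibc versions are not uniform. A hypothetical guarded variant (not in the commits; the GRID_HAVE_MALLINFO2 macro and the function name are mine), assuming a glibc-based Linux system:

```cpp
// Prefer mallinfo2() on glibc >= 2.33, since mallinfo()'s int fields overflow past 2 GB.
#include <malloc.h>
#include <cstdio>

#if defined(__GLIBC__) && defined(__GLIBC_PREREQ)
#if __GLIBC_PREREQ(2,33)
#define GRID_HAVE_MALLINFO2 1
#endif
#endif

void DisplayMallinfoSketch(void)
{
#ifdef GRID_HAVE_MALLINFO2
  struct mallinfo2 mi = mallinfo2();   // size_t fields, no 2 GB overflow
#else
  struct mallinfo  mi = mallinfo();    // legacy int fields
#endif
  std::printf("arena %zu  uordblks %zu  fordblks %zu  hblkhd %zu\n",
              (size_t)mi.arena, (size_t)mi.uordblks,
              (size_t)mi.fordblks, (size_t)mi.hblkhd);
}
```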
|  | ||||
| void MemoryManager::PrintBytes(void) | ||||
| { | ||||
|   std::cout << " MemoryManager : ------------------------------------ "<<std::endl; | ||||
| @@ -35,7 +73,7 @@ void MemoryManager::PrintBytes(void) | ||||
| #ifdef GRID_CUDA | ||||
|   cuda_mem(); | ||||
| #endif | ||||
|    | ||||
|   DisplayMallinfo(); | ||||
| } | ||||
|  | ||||
| uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; } | ||||
|   | ||||
| @@ -211,6 +211,7 @@ private: | ||||
| #endif | ||||
|  | ||||
|  public: | ||||
|   static void DisplayMallinfo(void); | ||||
|   static void NotifyDeletion(void * CpuPtr); | ||||
|   static void Print(void); | ||||
|   static void PrintAll(void); | ||||
|   | ||||
| @@ -91,6 +91,7 @@ public: | ||||
|   //////////////////////////////////////////////////////////////// | ||||
|   virtual int CheckerBoarded(int dim)=0; | ||||
|   virtual int CheckerBoard(const Coordinate &site)=0; | ||||
|   virtual int CheckerDim(void){ return 0; }; | ||||
|   virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0; | ||||
|   virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0; | ||||
|   virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0; | ||||
|   | ||||
| @@ -60,6 +60,7 @@ public: | ||||
|   int              _checker_dim; | ||||
|   std::vector<int> _checker_board; | ||||
|  | ||||
|   virtual int CheckerDim(void){ return _checker_dim; }; | ||||
|   virtual int CheckerBoarded(int dim){ | ||||
|     if( dim==_checker_dim) return 1; | ||||
|     else return 0; | ||||
|   | ||||
| @@ -236,11 +236,18 @@ public: | ||||
|   template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){ | ||||
|     vobj vtmp; | ||||
|     vtmp = r; | ||||
| #if 1 | ||||
|     auto me  = View(CpuWrite); | ||||
|     thread_for(ss,me.size(),{ | ||||
|        me[ss]= r; | ||||
|       }); | ||||
| #else     | ||||
|     auto me  = View(AcceleratorWrite); | ||||
|     accelerator_for(ss,me.size(),vobj::Nsimd(),{ | ||||
| 	auto stmp=coalescedRead(vtmp); | ||||
| 	coalescedWrite(me[ss],stmp); | ||||
|     }); | ||||
| #endif     | ||||
|     me.ViewClose(); | ||||
|     return *this; | ||||
|   } | ||||
|   | ||||
| @@ -264,24 +264,8 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> & | ||||
|   const uint64_t sites = grid->oSites(); | ||||
|    | ||||
|   // Might make all code paths go this way. | ||||
| #if 0 | ||||
|   typedef decltype(innerProductD(vobj(),vobj())) inner_t; | ||||
|   Vector<inner_t> inner_tmp(sites); | ||||
|   auto inner_tmp_v = &inner_tmp[0]; | ||||
|   { | ||||
|     autoView( left_v , left, AcceleratorRead); | ||||
|     autoView( right_v,right, AcceleratorRead); | ||||
|     // This code could read coalesce | ||||
|     // GPU - SIMT lane compliance... | ||||
|     accelerator_for( ss, sites, nsimd,{ | ||||
| 	auto x_l = left_v(ss); | ||||
| 	auto y_l = right_v(ss); | ||||
| 	coalescedWrite(inner_tmp_v[ss],innerProductD(x_l,y_l)); | ||||
|     }); | ||||
|   } | ||||
| #else | ||||
|   typedef decltype(innerProduct(vobj(),vobj())) inner_t; | ||||
|   Vector<inner_t> inner_tmp(sites); | ||||
|   deviceVector<inner_t> inner_tmp(sites); | ||||
|   auto inner_tmp_v = &inner_tmp[0]; | ||||
|      | ||||
|   { | ||||
| @@ -295,7 +279,6 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> & | ||||
| 	coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l)); | ||||
|     }); | ||||
|   } | ||||
| #endif | ||||
|   // This is in single precision and fails some tests | ||||
|   auto anrm = sumD(inner_tmp_v,sites);   | ||||
|   nrm = anrm; | ||||
| @@ -373,7 +356,8 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt | ||||
|   nrm = real(TensorRemove(sum(inner_tmp_v,sites))); | ||||
| #else | ||||
|   typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t; | ||||
|   Vector<inner_t> inner_tmp(sites); | ||||
|   deviceVector<inner_t> inner_tmp; | ||||
|   inner_tmp.resize(sites); | ||||
|   auto inner_tmp_v = &inner_tmp[0]; | ||||
|  | ||||
|   accelerator_for( ss, sites, nsimd,{ | ||||
|   | ||||
| @@ -9,14 +9,18 @@ inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer os | ||||
| { | ||||
|   typedef typename vobj::scalar_object sobj; | ||||
|   typedef typename vobj::scalar_objectD sobjD; | ||||
|   sobj *mysum =(sobj *) malloc_shared(sizeof(sobj),*theGridAccelerator); | ||||
|   static Vector<sobj> mysum; | ||||
|   mysum.resize(1); | ||||
|   sobj *mysum_p = & mysum[0]; | ||||
|   sobj identity; zeroit(identity); | ||||
|   mysum[0] = identity; | ||||
|   sobj ret ;  | ||||
|  | ||||
|   Integer nsimd= vobj::Nsimd(); | ||||
|    | ||||
|  | ||||
|   const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() }); | ||||
|   theGridAccelerator->submit([&](cl::sycl::handler &cgh) { | ||||
|      auto Reduction = cl::sycl::reduction(mysum,identity,std::plus<>()); | ||||
|     auto Reduction = cl::sycl::reduction(mysum_p,identity,std::plus<>(),PropList); | ||||
|      cgh.parallel_for(cl::sycl::range<1>{osites}, | ||||
| 		      Reduction, | ||||
| 		      [=] (cl::sycl::id<1> item, auto &sum) { | ||||
| @@ -26,7 +30,7 @@ inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer os | ||||
|    }); | ||||
|   theGridAccelerator->wait(); | ||||
|   ret = mysum[0]; | ||||
|   free(mysum,*theGridAccelerator); | ||||
|   //  free(mysum,*theGridAccelerator); | ||||
|   sobjD dret; convertType(dret,ret); | ||||
|   return dret; | ||||
| } | ||||
| @@ -73,19 +77,23 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite | ||||
| template<class Word> Word svm_xor(Word *vec,uint64_t L) | ||||
| { | ||||
|   Word xorResult; xorResult = 0; | ||||
|   Word *d_sum =(Word *)cl::sycl::malloc_shared(sizeof(Word),*theGridAccelerator); | ||||
|   static Vector<Word> d_sum; | ||||
|   d_sum.resize(1); | ||||
|   Word *d_sum_p=&d_sum[0]; | ||||
|   Word identity;  identity=0; | ||||
|   d_sum[0] = identity; | ||||
|   const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() }); | ||||
|   theGridAccelerator->submit([&](cl::sycl::handler &cgh) { | ||||
|      auto Reduction = cl::sycl::reduction(d_sum,identity,std::bit_xor<>()); | ||||
|     auto Reduction = cl::sycl::reduction(d_sum_p,identity,std::bit_xor<>(),PropList); | ||||
|      cgh.parallel_for(cl::sycl::range<1>{L}, | ||||
| 		      Reduction, | ||||
| 		      [=] (cl::sycl::id<1> index, auto &sum) { | ||||
| 	 sum ^=vec[index]; | ||||
| 	 sum^=vec[index]; | ||||
|      }); | ||||
|    }); | ||||
|   theGridAccelerator->wait(); | ||||
|   Word ret = d_sum[0]; | ||||
|   free(d_sum,*theGridAccelerator); | ||||
|   //  free(d_sum,*theGridAccelerator); | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
|   | ||||
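Editor's note on the two reduction hunks above: they replace a per-call malloc_shared result with a persistent allocation and pass initialize_to_identity so the reduction slot is zeroed on the device instead of being reset from the host. A minimal, self-contained sketch of that SYCL 2020 pattern (illustrative only: plain sycl:: namespace and a toy integer sum, not Grid code):

#include <sycl/sycl.hpp>
#include <functional>
#include <iostream>
#include <vector>

int main() {
  sycl::queue q;
  const size_t N = 1024;

  std::vector<int> host(N, 1);
  int *data = sycl::malloc_device<int>(N, q);
  int *sum  = sycl::malloc_shared<int>(1, q);   // persistent result slot, reusable across calls
  q.memcpy(data, host.data(), N * sizeof(int)).wait();

  int identity = 0;
  const sycl::property_list props{sycl::property::reduction::initialize_to_identity()};

  q.submit([&](sycl::handler &cgh) {
    // initialize_to_identity makes the runtime zero *sum, so no host-side reset is needed
    auto red = sycl::reduction(sum, identity, std::plus<int>(), props);
    cgh.parallel_for(sycl::range<1>{N}, red,
                     [=](sycl::id<1> i, auto &s) { s += data[i]; });
  });
  q.wait();

  std::cout << "sum = " << *sum << std::endl;   // expect 1024
  sycl::free(data, q);
  sycl::free(sum, q);
  return 0;
}

Keeping the result in a long-lived allocation, as the diff does with a static Vector, also avoids a shared-memory allocate/free on every reduction call.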
| @@ -1,5 +1,5 @@ | ||||
| #pragma once | ||||
| #include <type_traits> | ||||
|  | ||||
| #if defined(GRID_CUDA) | ||||
|  | ||||
| #include <cub/cub.cuh> | ||||
| @@ -90,8 +90,61 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V | ||||
|    | ||||
|  | ||||
| } | ||||
| #endif  | ||||
|  | ||||
| template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { | ||||
|  | ||||
| #if defined(GRID_SYCL) | ||||
| template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data, Vector <vobj> &lvSum, const int  &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) | ||||
| { | ||||
|   size_t subvol_size = e1*e2; | ||||
|  | ||||
|   vobj *mysum = (vobj *) malloc_shared(rd*sizeof(vobj),*theGridAccelerator); | ||||
|   vobj vobj_zero; | ||||
|   zeroit(vobj_zero); | ||||
|   for (int r = 0; r<rd; r++) {  | ||||
|     mysum[r] = vobj_zero;  | ||||
|   } | ||||
|  | ||||
|   commVector<vobj> reduction_buffer(rd*subvol_size);     | ||||
|  | ||||
|   auto rb_p = &reduction_buffer[0]; | ||||
|  | ||||
|   // autoView(Data_v, Data, AcceleratorRead); | ||||
|  | ||||
|   //prepare reduction buffer  | ||||
|   accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{  | ||||
|    | ||||
|       int n = s / e2; | ||||
|       int b = s % e2; | ||||
|       int so=r*ostride; // base offset for start of plane  | ||||
|       int ss= so+n*stride+b; | ||||
|  | ||||
|       coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss])); | ||||
|  | ||||
|   }); | ||||
|  | ||||
|   for (int r = 0; r < rd; r++) { | ||||
|       theGridAccelerator->submit([&](cl::sycl::handler &cgh) { | ||||
|           auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>()); | ||||
|           cgh.parallel_for(cl::sycl::range<1>{subvol_size}, | ||||
|           Reduction, | ||||
|           [=](cl::sycl::id<1> item, auto &sum) { | ||||
|               auto s = item[0]; | ||||
|               sum += rb_p[r*subvol_size+s]; | ||||
|           }); | ||||
|       }); | ||||
|        | ||||
|       | ||||
|   } | ||||
|   theGridAccelerator->wait(); | ||||
|   for (int r = 0; r < rd; r++) { | ||||
|     lvSum[r] = mysum[r]; | ||||
|   } | ||||
|   free(mysum,*theGridAccelerator); | ||||
| } | ||||
| #endif | ||||
|  | ||||
| template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { | ||||
|   typedef typename vobj::vector_type vector; | ||||
|   const int words = sizeof(vobj)/sizeof(vector); | ||||
|   const int osites = rd*e1*e2; | ||||
| @@ -106,8 +159,12 @@ template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, V | ||||
| 	    buf[ss] = dat[ss*words+w]; | ||||
|     }); | ||||
|  | ||||
|     sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd); | ||||
|        | ||||
|     #if defined(GRID_CUDA) || defined(GRID_HIP) | ||||
|       sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd); | ||||
|     #elif defined(GRID_SYCL) | ||||
|       sliceSumReduction_sycl_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd); | ||||
|     #endif | ||||
|  | ||||
|     for (int r = 0; r < rd; r++) { | ||||
|       lvSum_ptr[w+words*r]=lvSum_small[r]; | ||||
|     } | ||||
| @@ -117,66 +174,24 @@ template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, V | ||||
|    | ||||
| } | ||||
|  | ||||
| template<class vobj> inline void sliceSumReduction_cub(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) | ||||
| template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) | ||||
| { | ||||
|   autoView(Data_v, Data, AcceleratorRead); //hipcub/cub cannot deal with large vobjs so we split into small/large case. | ||||
|   autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case. | ||||
|     if constexpr (sizeof(vobj) <= 256) {  | ||||
|       sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||
|  | ||||
|       #if defined(GRID_CUDA) || defined(GRID_HIP) | ||||
|         sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||
|       #elif defined (GRID_SYCL) | ||||
|         sliceSumReduction_sycl_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||
|       #endif | ||||
|  | ||||
|     } | ||||
|     else { | ||||
|       sliceSumReduction_cub_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||
|       sliceSumReduction_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||
|     } | ||||
| } | ||||
| #endif | ||||
|  | ||||
|  | ||||
| #if defined(GRID_SYCL) | ||||
| template<class vobj> inline void sliceSumReduction_sycl(const Lattice<vobj> &Data, Vector <vobj> &lvSum, const int  &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) | ||||
| { | ||||
|   typedef typename vobj::scalar_object sobj; | ||||
|   size_t subvol_size = e1*e2; | ||||
|  | ||||
|   vobj *mysum = (vobj *) malloc_shared(sizeof(vobj),*theGridAccelerator); | ||||
|   vobj vobj_zero; | ||||
|   zeroit(vobj_zero); | ||||
|      | ||||
|   commVector<vobj> reduction_buffer(rd*subvol_size);     | ||||
|  | ||||
|   auto rb_p = &reduction_buffer[0]; | ||||
|  | ||||
|   autoView(Data_v, Data, AcceleratorRead); | ||||
|  | ||||
|   //prepare reduction buffer  | ||||
|   accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{  | ||||
|    | ||||
|       int n = s / e2; | ||||
|       int b = s % e2; | ||||
|       int so=r*ostride; // base offset for start of plane  | ||||
|       int ss= so+n*stride+b; | ||||
|  | ||||
|       coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data_v[ss])); | ||||
|  | ||||
|   }); | ||||
|  | ||||
|   for (int r = 0; r < rd; r++) { | ||||
|       mysum[0] = vobj_zero; //dirty hack: cannot pass vobj_zero as identity to sycl::reduction as its not device_copyable | ||||
|       theGridAccelerator->submit([&](cl::sycl::handler &cgh) { | ||||
|           auto Reduction = cl::sycl::reduction(mysum,std::plus<>()); | ||||
|           cgh.parallel_for(cl::sycl::range<1>{subvol_size}, | ||||
|           Reduction, | ||||
|           [=](cl::sycl::id<1> item, auto &sum) { | ||||
|               auto s = item[0]; | ||||
|               sum += rb_p[r*subvol_size+s]; | ||||
|           }); | ||||
|       }); | ||||
|       theGridAccelerator->wait(); | ||||
|       lvSum[r] = mysum[0]; | ||||
|   } | ||||
|    | ||||
|   free(mysum,*theGridAccelerator); | ||||
| } | ||||
| #endif | ||||
|  | ||||
| template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) | ||||
| { | ||||
|   // sum over reduced dimension planes, breaking out orthog dir | ||||
| @@ -195,13 +210,9 @@ template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data | ||||
|  | ||||
| template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)  | ||||
| { | ||||
|   #if defined(GRID_CUDA) || defined(GRID_HIP) | ||||
|   #if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL) | ||||
|    | ||||
|   sliceSumReduction_cub(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||
|    | ||||
|   #elif defined(GRID_SYCL) | ||||
|    | ||||
|   sliceSumReduction_sycl(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||
|   sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||
|    | ||||
|   #else | ||||
|   sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||
|   | ||||
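Editor's note: the dispatch above is what Grid's public sliceSum interface lands on for each accelerator target. A minimal caller could look like the sketch below (standard Grid initialisation assumed; illustrative, not part of the diff):

#include <Grid/Grid.h>
using namespace Grid;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);

  GridCartesian Grid4(GridDefaultLatt(),
                      GridDefaultSimd(Nd, vComplex::Nsimd()),
                      GridDefaultMpi());

  LatticeComplex field(&Grid4);
  field = ComplexD(1.0, 0.0);            // unit value on every site

  std::vector<TComplex> slices;
  sliceSum(field, slices, Tdir);         // one partial sum per time slice

  std::cout << GridLogMessage << "t=0 slice sum = " << slices[0] << std::endl;

  Grid_finalize();
  return 0;
}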
| @@ -42,50 +42,21 @@ inline void subdivides(GridBase *coarse,GridBase *fine) | ||||
|     assert((fine->_rdimensions[d] / coarse->_rdimensions[d])* coarse->_rdimensions[d]==fine->_rdimensions[d]);  | ||||
|   } | ||||
| } | ||||
|  | ||||
|   | ||||
| //////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // remove and insert a half checkerboard | ||||
| //////////////////////////////////////////////////////////////////////////////////////////// | ||||
|  | ||||
| template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full) | ||||
| { | ||||
|   half.Checkerboard() = cb; | ||||
|  | ||||
|   autoView( half_v, half, CpuWrite); | ||||
|   autoView( full_v, full, CpuRead); | ||||
|   thread_for(ss, full.Grid()->oSites(),{ | ||||
|     int cbos; | ||||
|     Coordinate coor; | ||||
|     full.Grid()->oCoorFromOindex(coor,ss); | ||||
|     cbos=half.Grid()->CheckerBoard(coor); | ||||
|  | ||||
|     if (cbos==cb) { | ||||
|       int ssh=half.Grid()->oIndex(coor); | ||||
|       half_v[ssh] = full_v[ss]; | ||||
|     } | ||||
|   }); | ||||
|   acceleratorPickCheckerboard(cb,half,full); | ||||
| } | ||||
| template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half) | ||||
| { | ||||
|   int cb = half.Checkerboard(); | ||||
|   autoView( half_v , half, CpuRead); | ||||
|   autoView( full_v , full, CpuWrite); | ||||
|   thread_for(ss,full.Grid()->oSites(),{ | ||||
|  | ||||
|     Coordinate coor; | ||||
|     int cbos; | ||||
|  | ||||
|     full.Grid()->oCoorFromOindex(coor,ss); | ||||
|     cbos=half.Grid()->CheckerBoard(coor); | ||||
|        | ||||
|     if (cbos==cb) { | ||||
|       int ssh=half.Grid()->oIndex(coor); | ||||
|       full_v[ss]=half_v[ssh]; | ||||
|     } | ||||
|   }); | ||||
|   acceleratorSetCheckerboard(full,half); | ||||
| } | ||||
|  | ||||
| template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int checker_dim_half=0) | ||||
| template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int dummy=0) | ||||
| { | ||||
|   half.Checkerboard() = cb; | ||||
|   autoView(half_v, half, AcceleratorWrite); | ||||
| @@ -95,6 +66,7 @@ template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj | ||||
|   unsigned long ndim_half          = half.Grid()->_ndimension; | ||||
|   Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask; | ||||
|   Coordinate ostride_half          = half.Grid()->_ostride; | ||||
|   int checker_dim_half             = half.Grid()->CheckerDim(); | ||||
|   accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{ | ||||
|      | ||||
|     Coordinate coor; | ||||
| @@ -119,7 +91,7 @@ template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj | ||||
|     } | ||||
|   }); | ||||
| } | ||||
| template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int checker_dim_half=0) | ||||
| template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int dummy=0) | ||||
| { | ||||
|   int cb = half.Checkerboard(); | ||||
|   autoView(half_v , half, AcceleratorRead); | ||||
| @@ -129,6 +101,7 @@ template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full, | ||||
|   unsigned long ndim_half          = half.Grid()->_ndimension; | ||||
|   Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask; | ||||
|   Coordinate ostride_half          = half.Grid()->_ostride; | ||||
|   int checker_dim_half             = half.Grid()->CheckerDim(); | ||||
|   accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{ | ||||
|  | ||||
|     Coordinate coor; | ||||
|   | ||||
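Editor's note: since pickCheckerboard and setCheckerboard now forward to the accelerator kernels above, a quick round trip is a natural exercise of the new path. A sketch using standard Grid API (illustrative, not part of the diff):

#include <Grid/Grid.h>
using namespace Grid;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);

  GridCartesian         *FGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
                                     GridDefaultSimd(Nd, vComplex::Nsimd()),
                                     GridDefaultMpi());
  GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);

  LatticeComplex full(FGrid), check(FGrid);
  LatticeComplex even(FrbGrid), odd(FrbGrid);

  GridParallelRNG RNG(FGrid);
  RNG.SeedFixedIntegers(std::vector<int>({1, 2, 3, 4}));
  random(RNG, full);

  pickCheckerboard(Even, even, full);    // half-lattice extraction (accelerator path)
  pickCheckerboard(Odd,  odd,  full);
  setCheckerboard(check, even);          // reassemble the full lattice
  setCheckerboard(check, odd);

  check = check - full;
  std::cout << GridLogMessage << "round-trip norm2 = " << norm2(check) << std::endl; // expect 0

  Grid_finalize();
  return 0;
}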
| @@ -90,6 +90,7 @@ public: | ||||
|         exit(1); | ||||
|       } | ||||
|       Parameters.StartingType = arg; | ||||
|       std::cout <<GridLogMessage << " GenericHMCrunner --StartingType "<<arg<<std::endl; | ||||
|     } | ||||
|  | ||||
|     if (GridCmdOptionExists(argv, argv + argc, "--StartingTrajectory")) { | ||||
| @@ -97,6 +98,7 @@ public: | ||||
|       std::vector<int> ivec(0); | ||||
|       GridCmdOptionIntVector(arg, ivec); | ||||
|       Parameters.StartTrajectory = ivec[0]; | ||||
|       std::cout <<GridLogMessage << " GenericHMCrunner --StartingTrajectory "<<ivec[0]<<std::endl; | ||||
|     } | ||||
|  | ||||
|     if (GridCmdOptionExists(argv, argv + argc, "--Trajectories")) { | ||||
| @@ -104,6 +106,7 @@ public: | ||||
|       std::vector<int> ivec(0); | ||||
|       GridCmdOptionIntVector(arg, ivec); | ||||
|       Parameters.Trajectories = ivec[0]; | ||||
|       std::cout << GridLogMessage<<" GenericHMCrunner Command Line --Trajectories "<<ivec[0]<<std::endl; | ||||
|     } | ||||
|  | ||||
|     if (GridCmdOptionExists(argv, argv + argc, "--Thermalizations")) { | ||||
| @@ -111,6 +114,7 @@ public: | ||||
|       std::vector<int> ivec(0); | ||||
|       GridCmdOptionIntVector(arg, ivec); | ||||
|       Parameters.NoMetropolisUntil = ivec[0]; | ||||
|       std::cout << GridLogMessage<<" GenericHMCrunner --Thermalizations "<<ivec[0]<<std::endl; | ||||
|     } | ||||
|     if (GridCmdOptionExists(argv, argv + argc, "--ParameterFile")) { | ||||
|       arg = GridCmdOptionPayload(argv, argv + argc, "--ParameterFile"); | ||||
|   | ||||
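Editor's note: with the extra echoes added above, the effect of a command line such as --StartingType CheckpointStart --StartingTrajectory 3000 --Trajectories 10 --Thermalizations 0 (values illustrative) is now visible directly in the job log rather than only in the parameter dump.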
| @@ -137,9 +137,11 @@ public: | ||||
|  | ||||
|       double start_force = usecond(); | ||||
|  | ||||
|       MemoryManager::Print(); | ||||
|       as[level].actions.at(a)->deriv_timer_start(); | ||||
|       as[level].actions.at(a)->deriv(Smearer, force);  // deriv should NOT include Ta | ||||
|       as[level].actions.at(a)->deriv_timer_stop(); | ||||
|       MemoryManager::Print(); | ||||
|  | ||||
|       auto name = as[level].actions.at(a)->action_name(); | ||||
|  | ||||
| @@ -246,7 +248,11 @@ public: | ||||
|     } | ||||
|   }; | ||||
|  | ||||
|   virtual ~Integrator() {} | ||||
|   virtual ~Integrator() | ||||
|   { | ||||
|     // Pain in the ass to clean up the Level pointers | ||||
|     // Guido's design is at fault as per comment above in constructor | ||||
|   } | ||||
|  | ||||
|   virtual std::string integrator_name() = 0; | ||||
|    | ||||
| @@ -460,6 +466,7 @@ public: | ||||
|     for (int level = 0; level < as.size(); ++level) { | ||||
|       for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) { | ||||
|  | ||||
| 	MemoryManager::Print(); | ||||
|         // get gauge field from the SmearingPolicy and | ||||
|         // based on the boolean is_smeared in actionID | ||||
|         std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl; | ||||
| @@ -468,6 +475,7 @@ public: | ||||
|    	        as[level].actions.at(actionID)->S_timer_stop(); | ||||
|         std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl; | ||||
|         H += Hterm; | ||||
| 	MemoryManager::Print(); | ||||
|  | ||||
|       } | ||||
|       as[level].apply(S_hireps, Representations, level, H); | ||||
|   | ||||
| @@ -32,7 +32,9 @@ private: | ||||
|   //  Smear_Stout<Gimpl> *StoutSmearing; | ||||
|   //  std::vector<GaugeField> SmearedSet; | ||||
|    | ||||
|   GridRedBlackCartesian * UrbGrid; // keep a copy of the redblack grid for life of object | ||||
|   std::vector<LatticeLorentzComplex> masks; | ||||
|   std::vector<int> cbs; | ||||
|  | ||||
|   typedef typename SU3Adjoint::AMatrix AdjMatrix; | ||||
|   typedef typename SU3Adjoint::LatticeAdjMatrix  AdjMatrixField; | ||||
| @@ -147,6 +149,25 @@ private: | ||||
|     } | ||||
|     pokeLorentz(Fdet, Fdet_pol, nu); | ||||
|   } | ||||
|  | ||||
|   void Compute_MpInvJx_dNxxdSy(int cb, | ||||
| 			       const GaugeLinkField &PlaqL, | ||||
| 			       const GaugeLinkField &PlaqR, | ||||
| 			       AdjMatrixField MpInvJx, | ||||
| 			       AdjVectorField &Fdet2 ) | ||||
|   { | ||||
|     GaugeLinkField PlaqLeo(UrbGrid); | ||||
|     GaugeLinkField PlaqReo(UrbGrid); | ||||
|     AdjMatrixField MpInvJxeo(UrbGrid); | ||||
|     AdjVectorField Fdet2eo(UrbGrid); | ||||
|     pickCheckerboard(cb,PlaqLeo,PlaqL); | ||||
|     pickCheckerboard(cb,PlaqReo,PlaqR); | ||||
|     pickCheckerboard(cb,MpInvJxeo,MpInvJx); | ||||
|     Fdet2eo.Checkerboard()=cb; | ||||
|     Compute_MpInvJx_dNxxdSy(PlaqLeo,PlaqReo,MpInvJxeo,Fdet2eo); | ||||
|     setCheckerboard(Fdet2,Fdet2eo); | ||||
|   } | ||||
|    | ||||
|   void Compute_MpInvJx_dNxxdSy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR, AdjMatrixField MpInvJx,AdjVectorField &Fdet2 ) | ||||
|   { | ||||
|     GaugeLinkField UtaU(PlaqL.Grid()); | ||||
| @@ -278,8 +299,9 @@ public: | ||||
|     //////////////////////////////////////////////////////////////////////////////// | ||||
|     // Mask the gauge field | ||||
|     //////////////////////////////////////////////////////////////////////////////// | ||||
|     int cb = cbs[smr]; | ||||
|     auto mask=PeekIndex<LorentzIndex>(masks[smr],mu); // the cb mask | ||||
|  | ||||
|      | ||||
|     Umsk = U; | ||||
|     ApplyMask(Umsk,smr); | ||||
|     Utmp = peekLorentz(Umsk,mu); | ||||
| @@ -442,7 +464,7 @@ public: | ||||
|     AdjMatrixField MpInvJx_nu(grid); | ||||
|     MpInvJx = (-1.0)*MpAdInv * JxAd;// rho is on the plaq factor | ||||
|  | ||||
|     Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx,FdetV); | ||||
|     Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx,FdetV); | ||||
|     Fdet2_mu=FdetV; | ||||
|     Fdet1_mu=Zero(); | ||||
|      | ||||
| @@ -499,7 +521,7 @@ public: | ||||
|  | ||||
| 	time=-usecond(); | ||||
| 	PlaqR=(-1.0)*PlaqR; | ||||
| 	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx,FdetV); | ||||
| 	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx,FdetV); | ||||
| 	Fdet2_nu = FdetV; | ||||
| 	time+=usecond(); | ||||
| 	std::cout << GridLogMessage << "Compute_MpInvJx_dNxxSy (occurs 6x) took "<<time<< " us"<<std::endl; | ||||
| @@ -520,7 +542,7 @@ public: | ||||
| 	 | ||||
|  | ||||
| 	MpInvJx_nu = Cshift(MpInvJx,mu,-1); | ||||
| 	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV); | ||||
| 	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV); | ||||
| 	Fdet2_nu = Fdet2_nu+FdetV; | ||||
| 	 | ||||
| 	///////////////// -ve nu ///////////////// | ||||
| @@ -539,7 +561,7 @@ public: | ||||
| 	Fdet1_nu = Fdet1_nu + transpose(Nxy)*dJdXe_nMpInv_y; | ||||
|  | ||||
| 	MpInvJx_nu = Cshift(MpInvJx,nu,1); | ||||
| 	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV); | ||||
| 	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV); | ||||
| 	Fdet2_nu = Fdet2_nu+FdetV; | ||||
| 	 | ||||
| 	// x== | ||||
| @@ -560,7 +582,7 @@ public: | ||||
|  | ||||
| 	MpInvJx_nu = Cshift(MpInvJx,mu,-1); | ||||
| 	MpInvJx_nu = Cshift(MpInvJx_nu,nu,1); | ||||
| 	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV); | ||||
| 	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV); | ||||
| 	Fdet2_nu = Fdet2_nu+FdetV; | ||||
|  | ||||
| 	///////////////////////////////////////////////////////////////////// | ||||
| @@ -589,7 +611,7 @@ public: | ||||
|  | ||||
| 	MpInvJx_nu = Cshift(MpInvJx,nu,-1); | ||||
|  | ||||
| 	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV); | ||||
| 	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV); | ||||
| 	Fdet2_mu = Fdet2_mu+FdetV; | ||||
|  | ||||
| 	//  __ | ||||
| @@ -609,7 +631,7 @@ public: | ||||
|  | ||||
| 	MpInvJx_nu = Cshift(MpInvJx,nu,1); | ||||
|  | ||||
| 	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV); | ||||
| 	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV); | ||||
| 	Fdet2_mu = Fdet2_mu+FdetV; | ||||
| 	 | ||||
|       } | ||||
| @@ -931,6 +953,10 @@ private: | ||||
| public: | ||||
|  | ||||
|   /* Standard constructor */ | ||||
|   virtual ~SmearedConfigurationMasked() | ||||
|   { | ||||
|     delete UrbGrid; | ||||
|   } | ||||
|   SmearedConfigurationMasked(GridCartesian* _UGrid, unsigned int Nsmear, Smear_Stout<Gimpl>& Stout) | ||||
|     : SmearedConfiguration<Gimpl>(_UGrid, Nsmear,Stout) | ||||
|   { | ||||
| @@ -939,7 +965,6 @@ public: | ||||
|     // was resized in base class | ||||
|     assert(this->SmearedSet.size()==Nsmear); | ||||
|      | ||||
|     GridRedBlackCartesian * UrbGrid; | ||||
|     UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(_UGrid); | ||||
|     LatticeComplex one(_UGrid); one = ComplexD(1.0,0.0); | ||||
|     LatticeComplex tmp(_UGrid); | ||||
| @@ -947,10 +972,11 @@ public: | ||||
|     for (unsigned int i = 0; i < this->smearingLevels; ++i) { | ||||
|  | ||||
|       masks.push_back(*(new LatticeLorentzComplex(_UGrid))); | ||||
|  | ||||
|       int mu= (i/2) %Nd; | ||||
|       int cb= (i%2); | ||||
|       LatticeComplex tmpcb(UrbGrid); | ||||
|  | ||||
|       cbs.push_back(cb); | ||||
| 	 | ||||
|       masks[i]=Zero(); | ||||
|       //////////////////// | ||||
| @@ -962,7 +988,6 @@ public: | ||||
|       PokeIndex<LorentzIndex>(masks[i],tmp, mu); | ||||
| 	 | ||||
|     } | ||||
|     delete UrbGrid; | ||||
|   } | ||||
|    | ||||
|   virtual void smeared_force(GaugeField &SigmaTilde)  | ||||
|   | ||||
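Editor's note, a worked example of the masking schedule set up in the constructor above: with mu = (i/2) % Nd and cb = i % 2, smearing level i = 0 masks direction 0 on checkerboard 0, i = 1 masks direction 0 on checkerboard 1, i = 2 masks direction 1 on checkerboard 0, and so on through all 2*Nd direction/parity combinations before the pattern repeats. The new cbs vector simply records that parity so the force code can pass it to the checkerboarded Compute_MpInvJx_dNxxdSy.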
| @@ -418,32 +418,32 @@ static void LieAlgebraProject(LatticeAlgebraMatrix &out,const LatticeMatrix &in, | ||||
|   int hNNm1= NNm1/2; | ||||
|   RealD sqrt_2 = sqrt(2.0); | ||||
|   Complex ci(0.0,1.0); | ||||
|   for(int su2Index=0;su2Index<hNNm1;su2Index++){ | ||||
|     int i1, i2; | ||||
|     su2SubGroupIndex(i1, i2, su2Index); | ||||
|     int ax = su2Index*2; | ||||
|     int ay = su2Index*2+1; | ||||
|     accelerator_for(ss,grid->oSites(),1,{ | ||||
|  | ||||
|   const int nsimd=  Matrix::Nsimd(); | ||||
|   accelerator_for(ss,grid->oSites(),nsimd,{ | ||||
|       for(int su2Index=0;su2Index<hNNm1;su2Index++){ | ||||
| 	int i1, i2; | ||||
| 	su2SubGroupIndex(i1, i2, su2Index); | ||||
| 	int ax = su2Index*2; | ||||
| 	int ay = su2Index*2+1; | ||||
| 	// in is traceless ANTI-hermitian whereas Grid generators are Hermitian. | ||||
| 	// trace( Ta x Ci in) | ||||
| 	// Bet I need to move to real part with mult by -i | ||||
| 	out_v[ss]()()(ax,b) = 0.5*(real(in_v[ss]()()(i2,i1)) - real(in_v[ss]()()(i1,i2))); | ||||
| 	out_v[ss]()()(ay,b) = 0.5*(imag(in_v[ss]()()(i1,i2)) + imag(in_v[ss]()()(i2,i1))); | ||||
|       }); | ||||
|   } | ||||
|   for(int diagIndex=0;diagIndex<N-1;diagIndex++){ | ||||
|     int k = diagIndex + 1; // diagIndex starts from 0 | ||||
|     int a = NNm1+diagIndex; | ||||
|     RealD scale = 1.0/sqrt(2.0*k*(k+1)); | ||||
|     accelerator_for(ss,grid->oSites(),vComplex::Nsimd(),{ | ||||
| 	auto tmp = in_v[ss]()()(0,0); | ||||
| 	coalescedWrite(out_v[ss]()()(ax,b),0.5*(real(in_v(ss)()()(i2,i1)) - real(in_v(ss)()()(i1,i2)))); | ||||
| 	coalescedWrite(out_v[ss]()()(ay,b),0.5*(imag(in_v(ss)()()(i1,i2)) + imag(in_v(ss)()()(i2,i1)))); | ||||
|       } | ||||
|       for(int diagIndex=0;diagIndex<N-1;diagIndex++){ | ||||
| 	int k = diagIndex + 1; // diagIndex starts from 0 | ||||
| 	int a = NNm1+diagIndex; | ||||
| 	RealD scale = 1.0/sqrt(2.0*k*(k+1)); | ||||
| 	auto tmp = in_v(ss)()()(0,0); | ||||
| 	for(int i=1;i<k;i++){ | ||||
| 	  tmp=tmp+in_v[ss]()()(i,i); | ||||
| 	  tmp=tmp+in_v(ss)()()(i,i); | ||||
| 	} | ||||
| 	tmp = tmp - in_v[ss]()()(k,k)*k; | ||||
| 	out_v[ss]()()(a,b) =imag(tmp) * scale; | ||||
|       }); | ||||
|     } | ||||
| 	tmp = tmp - in_v(ss)()()(k,k)*k; | ||||
| 	coalescedWrite(out_v[ss]()()(a,b),imag(tmp) * scale); | ||||
|       } | ||||
|     }); | ||||
| } | ||||
|  | ||||
|    | ||||
|   | ||||
| @@ -118,7 +118,7 @@ static void generatorDiagonal(int diagIndex, iGroupMatrix<cplx> &ta) { | ||||
| //////////////////////////////////////////////////////////////////////// | ||||
| // Map a su2 subgroup number to the pair of rows that are non zero | ||||
| //////////////////////////////////////////////////////////////////////// | ||||
| static void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) { | ||||
| static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) { | ||||
|   assert((su2_index >= 0) && (su2_index < (ncolour * (ncolour - 1)) / 2)); | ||||
|  | ||||
|   int spare = su2_index; | ||||
|   | ||||
| @@ -99,6 +99,8 @@ using std::log; | ||||
| using std::exp; | ||||
| using std::sin; | ||||
| using std::cos; | ||||
| using std::asin; | ||||
| using std::acos; | ||||
|  | ||||
|  | ||||
| accelerator_inline RealF    conjugate(const RealF  & r){ return r; } | ||||
|   | ||||
| @@ -460,3 +460,9 @@ void vprefetch(const iMatrix<v, N> &vv) { | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
|  | ||||
| #ifdef GRID_SYCL | ||||
| template<class vec> struct sycl::is_device_copyable<Grid::iScalar<vec> > : public std::true_type {}; | ||||
| template<class vec,int N> struct sycl::is_device_copyable<Grid::iVector<vec,N> > : public std::true_type {}; | ||||
| template<class vec,int N> struct sycl::is_device_copyable<Grid::iMatrix<vec,N> > : public std::true_type {}; | ||||
| #endif | ||||
|   | ||||
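Editor's note: the specialisations above tell SYCL that Grid's tensor wrappers may be captured by value in kernels even when the compiler cannot deduce that on its own. A stand-alone sketch of the same mechanism with a toy type (illustrative only, not Grid code):

#include <sycl/sycl.hpp>
#include <type_traits>
#include <iostream>

struct cplx {
  float re, im;
  cplx(float r = 0, float i = 0) : re(r), im(i) {}
  cplx(const cplx &o) : re(o.re), im(o.im) {}   // user-provided copy ctor: not trivially copyable
};

// Declare that copying cplx to the device byte-wise yields a valid object.
template <> struct sycl::is_device_copyable<cplx> : std::true_type {};

int main() {
  sycl::queue q;
  cplx c(1.0f, 2.0f);

  float *out = sycl::malloc_shared<float>(1, q);
  q.single_task([=] { *out = c.re + c.im; }).wait();   // by-value capture requires the trait

  std::cout << *out << std::endl;                      // expect 3
  sycl::free(out, q);
  return 0;
}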
| @@ -539,12 +539,6 @@ inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize | ||||
|  | ||||
| #endif | ||||
|  | ||||
| inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) | ||||
| { | ||||
|   acceleratorCopyDeviceToDeviceAsynch(from,to,bytes); | ||||
|   acceleratorCopySynchronise(); | ||||
| } | ||||
|  | ||||
| ////////////////////////////////////////////// | ||||
| // CPU Target - No accelerator just thread instead | ||||
| ////////////////////////////////////////////// | ||||
| @@ -553,7 +547,6 @@ inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) | ||||
|  | ||||
| #undef GRID_SIMT | ||||
|  | ||||
|  | ||||
| inline void acceleratorMem(void) | ||||
| { | ||||
|   /* | ||||
| @@ -656,6 +649,12 @@ accelerator_inline void acceleratorFence(void) | ||||
|   return; | ||||
| } | ||||
|  | ||||
| inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) | ||||
| { | ||||
|   acceleratorCopyDeviceToDeviceAsynch(from,to,bytes); | ||||
|   acceleratorCopySynchronise(); | ||||
| } | ||||
|  | ||||
| template<class T> void acceleratorPut(T& dev,T&host) | ||||
| { | ||||
|   acceleratorCopyToDevice(&host,&dev,sizeof(T)); | ||||
|   | ||||
							
								
								
									
238 HMC/ComputeWilsonFlow.cc Normal file
							| @@ -0,0 +1,238 @@ | ||||
| /************************************************************************************* | ||||
|  | ||||
| Grid physics library, www.github.com/paboyle/Grid | ||||
|  | ||||
| Source file: HMC/ComputeWilsonFlow.cc | ||||
|  | ||||
| Copyright (C) 2017 | ||||
|  | ||||
| Author: Guido Cossu <guido.cossu@ed.ac.uk> | ||||
| Author: Shuhei Yamamoto <syamamoto@bnl.gov> | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| the Free Software Foundation; either version 2 of the License, or | ||||
| (at your option) any later version. | ||||
|  | ||||
| This program is distributed in the hope that it will be useful, | ||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| GNU General Public License for more details. | ||||
|  | ||||
| You should have received a copy of the GNU General Public License along | ||||
| with this program; if not, write to the Free Software Foundation, Inc., | ||||
| 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
| See the full license in the file "LICENSE" in the top level distribution | ||||
| directory | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #include <Grid/Grid.h> | ||||
| #include <string> | ||||
|  | ||||
| namespace Grid{ | ||||
|   struct WFParameters: Serializable { | ||||
|     GRID_SERIALIZABLE_CLASS_MEMBERS(WFParameters, | ||||
|             int, steps, | ||||
|             double, step_size, | ||||
|             int, meas_interval, | ||||
| 	    double, maxTau, // for the adaptive algorithm | ||||
| 	    int, meas_interval_density, | ||||
| 	    std::string, path);  | ||||
|         | ||||
|  | ||||
|     template <class ReaderClass > | ||||
|     WFParameters(Reader<ReaderClass>& Reader){ | ||||
|       read(Reader, "WilsonFlow", *this); | ||||
|     } | ||||
|  | ||||
|   }; | ||||
|  | ||||
|   struct ConfParameters: Serializable { | ||||
|     GRID_SERIALIZABLE_CLASS_MEMBERS(ConfParameters, | ||||
| 	   std::string, conf_path, | ||||
|            std::string, conf_prefix, | ||||
| 	   std::string, conf_smr_prefix, | ||||
|            std::string, rng_prefix, | ||||
| 	   int, StartConfiguration, | ||||
| 	   int, EndConfiguration, | ||||
|            int, Skip); | ||||
|    | ||||
|     template <class ReaderClass > | ||||
|     ConfParameters(Reader<ReaderClass>& Reader){ | ||||
|       read(Reader, "Configurations", *this); | ||||
|     } | ||||
|  | ||||
|   }; | ||||
| } | ||||
|  | ||||
| template <class T> void writeFile(T& in, std::string const fname){   | ||||
| #ifdef HAVE_LIME | ||||
|   // Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111 | ||||
|   std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl; | ||||
|   Grid::emptyUserRecord record; | ||||
|   Grid::ScidacWriter WR(in.Grid()->IsBoss()); | ||||
|   WR.open(fname); | ||||
|   WR.writeScidacFieldRecord(in,record,0); | ||||
|   WR.close(); | ||||
| #endif | ||||
|   // What is the appropriate way to throw error? | ||||
| } | ||||
|  | ||||
|  | ||||
| int main(int argc, char **argv) { | ||||
|   using namespace Grid; | ||||
|    | ||||
|   Grid_init(&argc, &argv); | ||||
|   GridLogLayout(); | ||||
|  | ||||
|   auto latt_size   = GridDefaultLatt(); | ||||
|   auto simd_layout = GridDefaultSimd(Nd, vComplex::Nsimd()); | ||||
|   auto mpi_layout  = GridDefaultMpi(); | ||||
|   GridCartesian               Grid(latt_size, simd_layout, mpi_layout); | ||||
|    | ||||
|   std::vector<int> seeds({1, 2, 3, 4, 5}); | ||||
|   GridSerialRNG sRNG; | ||||
|   GridParallelRNG pRNG(&Grid); | ||||
|   pRNG.SeedFixedIntegers(seeds); | ||||
|  | ||||
|   LatticeGaugeField Umu(&Grid), Uflow(&Grid); | ||||
|    | ||||
|   typedef Grid::XmlReader       Serialiser; | ||||
|   Serialiser Reader("input.xml", false, "root"); | ||||
|   WFParameters WFPar(Reader); | ||||
|   ConfParameters CPar(Reader); | ||||
|   CheckpointerParameters CPPar(CPar.conf_path+CPar.conf_prefix, CPar.conf_path+CPar.conf_smr_prefix, CPar.conf_path+CPar.rng_prefix); | ||||
|   NerscHmcCheckpointer<PeriodicGimplR> CPNersc(CPPar); | ||||
|  | ||||
|   for (int conf = CPar.StartConfiguration; conf <= CPar.EndConfiguration; conf+= CPar.Skip){ | ||||
|  | ||||
|   CPNersc.CheckpointRestore(conf, Umu, sRNG, pRNG); | ||||
|  | ||||
|   std::cout << std::setprecision(15); | ||||
|   std::cout << GridLogMessage << "Initial plaquette: "<< WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu) << std::endl; | ||||
|    | ||||
|   std::string file_pre  = WFPar.path; | ||||
|   std::string file_post = CPar.conf_prefix + "." + std::to_string(conf); | ||||
|  | ||||
|   WilsonFlow<PeriodicGimplR> WF(WFPar.step_size,WFPar.steps,WFPar.meas_interval); | ||||
|   WF.addMeasurement(WFPar.meas_interval_density, [&file_pre,&file_post,&conf](int step, RealD t, const typename PeriodicGimplR::GaugeField &U){ | ||||
|      | ||||
|     typedef typename PeriodicGimplR::GaugeLinkField GaugeMat; | ||||
|     typedef typename PeriodicGimplR::ComplexField ComplexField; | ||||
|      | ||||
|     assert(Nd == 4); | ||||
|  | ||||
|     // NOTE: | ||||
|     // Ideally, turn the following into methods of the appropriate class | ||||
|     /////////////   Compute Energy Density via Clover Leaf    ///////////////////////////////////////////////// | ||||
|     ///// Taken from qcd/smearing/WilsonFlow.h | ||||
|     //         For plq, use static sitePlaquette from class WilsonLoops in Grid/qcd/utils/WilsonLoops.h and divide it by #faces=(1.0 * Nd * (Nd - 1)) / 2.0, ncol=3 | ||||
|     //E = 1/2 tr( F_munu F_munu ) | ||||
|     //However as  F_numu = -F_munu, only need to sum the trace of the squares of the following 6 field strengths: | ||||
|     //F_01 F_02 F_03   F_12 F_13  F_23 | ||||
|     GaugeMat F(U.Grid()); | ||||
|     //LatticeComplexD R(U.Grid()); | ||||
|     ComplexField R(U.Grid()); | ||||
|     R = Zero(); | ||||
|    | ||||
|     for(int mu=0;mu<3;mu++){ | ||||
|       for(int nu=mu+1;nu<4;nu++){ | ||||
| 	WilsonLoops<PeriodicGimplR>::FieldStrength(F, U, mu, nu); | ||||
| 	R = R + trace(F*F); | ||||
|       } | ||||
|     } | ||||
|     R = (-1.0) * R; | ||||
|      | ||||
|     //// Taken from qcd/utils/WilsonLoops.h | ||||
|      | ||||
|     // Bx = -iF(y,z), By = -iF(z,x), Bz = -iF(x,y) | ||||
|     GaugeMat Bx(U.Grid()), By(U.Grid()), Bz(U.Grid()); | ||||
|     WilsonLoops<PeriodicGimplR>::FieldStrength(Bx, U, Ydir, Zdir); | ||||
|     WilsonLoops<PeriodicGimplR>::FieldStrength(By, U, Zdir, Xdir); | ||||
|     WilsonLoops<PeriodicGimplR>::FieldStrength(Bz, U, Xdir, Ydir); | ||||
|  | ||||
|     // Ex = -iF(t,x), Ey = -iF(t,y), Ez = -iF(t,z) | ||||
|     GaugeMat Ex(U.Grid()), Ey(U.Grid()), Ez(U.Grid()); | ||||
|     WilsonLoops<PeriodicGimplR>::FieldStrength(Ex, U, Tdir, Xdir); | ||||
|     WilsonLoops<PeriodicGimplR>::FieldStrength(Ey, U, Tdir, Ydir); | ||||
|     WilsonLoops<PeriodicGimplR>::FieldStrength(Ez, U, Tdir, Zdir); | ||||
|  | ||||
|     double coeff = 8.0/(32.0*M_PI*M_PI); | ||||
|     ComplexField qfield = coeff*trace(Bx*Ex + By*Ey + Bz*Ez); | ||||
|     //ComplexField qfield Plq(U.Grid()); | ||||
|     //WilsonLoops<PeriodicGimplR>::sitePlaquette(Plq, U); | ||||
|     //double coeff = 2.0 / (1.0 * Nd * (Nd - 1)) / 3.0; | ||||
|     //Plq = coeff * Plq; | ||||
|  | ||||
|     int tau = std::round(t); | ||||
|     std::string efile = file_pre + "E_dnsty_" + std::to_string(tau) + "_" + file_post; | ||||
|     writeFile(R,efile); | ||||
|     std::string tfile = file_pre + "Top_dnsty_" + std::to_string(tau) + "_" + file_post; | ||||
|     writeFile(qfield,tfile); | ||||
|  | ||||
|     RealD E = real(sum(R))/ RealD(U.Grid()->gSites()); | ||||
|     RealD T = real( sum(qfield) ); | ||||
|     Coordinate scoor; for (int mu=0; mu < Nd; mu++) scoor[mu] = 0; | ||||
|     RealD E0 = real(peekSite(R,scoor)); | ||||
|     RealD T0 = real(peekSite(qfield,scoor)); | ||||
|     std::cout << GridLogMessage << "[WilsonFlow] Saved energy density (clover) & topo. charge density: "  << conf << " " << step << "  " << tau << "  " | ||||
| 	      << "(E_avg,T_sum) " << E << " " << T << " (E, T at origin) " << E0 << " " << T0 << std::endl; | ||||
|      | ||||
|   }); | ||||
|    | ||||
|   int t=WFPar.maxTau; | ||||
|   WF.smear(Uflow, Umu); | ||||
|  | ||||
|   RealD WFlow_plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(Uflow); | ||||
|   RealD WFlow_TC   = WilsonLoops<PeriodicGimplR>::TopologicalCharge(Uflow); | ||||
|   RealD WFlow_T0   = WF.energyDensityPlaquette(t,Uflow); // t | ||||
|   RealD WFlow_EC   = WF.energyDensityCloverleaf(t,Uflow); | ||||
|   std::cout << GridLogMessage << "Plaquette          "<< conf << "   " << WFlow_plaq << std::endl; | ||||
|   std::cout << GridLogMessage << "T0                 "<< conf << "   " << WFlow_T0 << std::endl; | ||||
|   std::cout << GridLogMessage << "TC0                 "<< conf << "   " << WFlow_EC << std::endl; | ||||
|   std::cout << GridLogMessage << "TopologicalCharge  "<< conf << "   " << WFlow_TC   << std::endl; | ||||
|  | ||||
|   std::cout<< GridLogMessage << " Admissibility check:\n"; | ||||
|   const double sp_adm = 0.067;                // admissible threshold | ||||
|   const double pl_adm = 1.0-sp_adm/Nc; | ||||
|   std::cout << GridLogMessage << "   (pl_adm =" << pl_adm << ")\n"; | ||||
|  | ||||
|   // Need min and reduce min for this function | ||||
|   //double sp_max = NC_*(1.0-stpl.plaq_min(U,pl_adm)); | ||||
|   double sp_ave = Nc*(1.0-WFlow_plaq); | ||||
|  | ||||
|   //std::cout<< GridLogMessage << "   sp_max = "        << sp_max <<"\n"; | ||||
|   std::cout<< GridLogMessage << "   sp_ave = "        << sp_ave <<"\n"; | ||||
|   std::cout<< GridLogMessage << "   (sp_admissible = "<< sp_adm <<")\n"; | ||||
|   //std::cout<< GridLogMessage << "   sp_admissible - sp_max = "<<sp_adm-sp_max <<"\n"; | ||||
|   std::cout<< GridLogMessage << "   sp_admissible - sp_ave = "<<sp_adm-sp_ave <<"\n"; | ||||
|   } | ||||
|   Grid_finalize(); | ||||
| }  // main | ||||
|  | ||||
|  | ||||
| /* | ||||
| Input file example | ||||
|  | ||||
|  | ||||
| JSON | ||||
|  | ||||
| { | ||||
|     "WilsonFlow":{ | ||||
| 	"steps": 200, | ||||
| 	"step_size": 0.01, | ||||
| 	"meas_interval": 50, | ||||
|   "maxTau": 2.0 | ||||
|     }, | ||||
|     "Configurations":{ | ||||
| 	"conf_prefix": "ckpoint_lat", | ||||
| 	"rng_prefix": "ckpoint_rng", | ||||
| 	"StartConfiguration": 3000, | ||||
| 	"EndConfiguration": 3000, | ||||
| 	"Skip": 5 | ||||
|     } | ||||
| } | ||||
|  | ||||
|  | ||||
| */ | ||||
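Editor's note: in continuum notation, the two densities written out by the measurement callback above are (up to Grid's field-strength normalisation, cf. the -i factors in the comments)

  E(x) = \frac{1}{2}\sum_{\mu,\nu} \mathrm{tr}\,[F_{\mu\nu}(x)F_{\mu\nu}(x)]
       = \sum_{\mu<\nu} \mathrm{tr}\,[F_{\mu\nu}(x)F_{\mu\nu}(x)]   (six clover-leaf terms, since F_{\nu\mu} = -F_{\mu\nu}),

  q(x) = \frac{1}{32\pi^2}\,\epsilon_{\mu\nu\rho\sigma}\,\mathrm{tr}\,[F_{\mu\nu}(x)F_{\rho\sigma}(x)]
       = \frac{8}{32\pi^2}\,\mathrm{tr}\,[F_{tx}F_{yz} + F_{ty}F_{zx} + F_{tz}F_{xy}],

which is why the code multiplies trace(Bx*Ex + By*Ey + Bz*Ez) by coeff = 8/(32*M_PI*M_PI); the reported T is then the lattice sum of q(x).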
| @@ -58,7 +58,7 @@ int main(int argc, char **argv) { | ||||
|   HMCparameters HMCparams; | ||||
|   HMCparams.StartTrajectory  = 0; | ||||
|   HMCparams.Trajectories     = 200; | ||||
|   HMCparams.NoMetropolisUntil=  20; | ||||
|   HMCparams.NoMetropolisUntil=  0; | ||||
|   // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; | ||||
|   HMCparams.StartingType     =std::string("ColdStart"); | ||||
|   HMCparams.MD = MD; | ||||
| @@ -70,7 +70,7 @@ int main(int argc, char **argv) { | ||||
|   CheckpointerParameters CPparams; | ||||
|   CPparams.config_prefix = "ckpoint_EODWF_lat"; | ||||
|   CPparams.rng_prefix    = "ckpoint_EODWF_rng"; | ||||
|   CPparams.saveInterval  = 10; | ||||
|   CPparams.saveInterval  = 1; | ||||
|   CPparams.format        = "IEEE64BIG"; | ||||
|   TheHMC.Resources.LoadNerscCheckpointer(CPparams); | ||||
|  | ||||
| @@ -186,6 +186,8 @@ int main(int argc, char **argv) { | ||||
|  | ||||
|   ///////////////////////////////////////////////////////////// | ||||
|   // HMC parameters are serialisable | ||||
|   TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file                                                                      | ||||
|   TheHMC.initializeGaugeFieldAndRNGs(U); | ||||
|  | ||||
|   std::cout << GridLogMessage << " Running the HMC "<< std::endl; | ||||
|   TheHMC.Run();  // no smearing | ||||
|   | ||||
| @@ -30,11 +30,13 @@ directory | ||||
| #include <string> | ||||
|  | ||||
| template <class T> void readFile(T& out, std::string const fname){ | ||||
| #ifdef HAVE_LIME | ||||
|   Grid::emptyUserRecord record; | ||||
|   Grid::ScidacReader RD; | ||||
|   RD.open(fname); | ||||
|   RD.readScidacFieldRecord(out,record); | ||||
|   RD.close(); | ||||
| #endif | ||||
| } | ||||
|  | ||||
|  | ||||
|   | ||||
| @@ -31,11 +31,13 @@ directory | ||||
|  | ||||
| NAMESPACE_BEGIN(Grid); | ||||
| template <class T> void writeFile(T& out, std::string const fname){ | ||||
| #ifdef HAVE_LIME | ||||
|   emptyUserRecord record; | ||||
|   ScidacWriter WR(out.Grid()->IsBoss()); | ||||
|   WR.open(fname); | ||||
|   WR.writeScidacFieldRecord(out,record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC); | ||||
|   WR.close(); | ||||
| #endif | ||||
| } | ||||
| NAMESPACE_END(Grid); | ||||
| int main(int argc, char **argv) { | ||||
|   | ||||
| @@ -261,23 +261,25 @@ public: | ||||
|     fprintf(FP,"\n\n"); | ||||
|   }; | ||||
|  | ||||
|  | ||||
|   template<class CComplex> | ||||
|   static void BLAS(void) | ||||
|   { | ||||
|     //int nbasis, int nrhs, int coarseVol | ||||
|     int  basis[] = { 16,32,64 }; | ||||
|     int  rhs[]   = { 8,16,32 }; | ||||
|     int  vol  = 4*4*4*4; | ||||
|     int  rhs[]   = { 8,12,16 }; | ||||
|     int  vol  = 8*8*8*8; | ||||
|     int  blk  = 4*4*4*4; | ||||
|  | ||||
|     GridBLAS blas; | ||||
|      | ||||
|  | ||||
|     int fpbits = sizeof(CComplex)*4; | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|     std::cout<<GridLogMessage << "= batched GEMM (double precision) "<<std::endl; | ||||
|     std::cout<<GridLogMessage << "= batched GEMM fp"<<fpbits<<std::endl; | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|     std::cout<<GridLogMessage << "  M  "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (coarse mrhs)"<<std::endl; | ||||
|     std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; | ||||
|    | ||||
|     fprintf(FP,"GEMM\n\n M, N, K, BATCH, GF/s per rank\n"); | ||||
|     fprintf(FP,"GEMM\n\n M, N, K, BATCH, GF/s per rank fp%d\n",fpbits); | ||||
|  | ||||
|     for(int b=0;b<3;b++){ | ||||
|     for(int r=0;r<3;r++){ | ||||
| @@ -285,7 +287,7 @@ public: | ||||
|       int N=rhs[r]; | ||||
|       int K=basis[b]; | ||||
|       int BATCH=vol; | ||||
|       double p=blas.benchmark(M,N,K,BATCH); | ||||
|       double p=blas.benchmark<CComplex>(M,N,K,BATCH); | ||||
|  | ||||
|       fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p); | ||||
|        | ||||
| @@ -299,9 +301,9 @@ public: | ||||
|     for(int r=0;r<3;r++){ | ||||
|       int M=basis[b]; | ||||
|       int N=rhs[r]; | ||||
|       int K=vol; | ||||
|       int K=blk; | ||||
|       int BATCH=vol; | ||||
|       double p=blas.benchmark(M,N,K,BATCH); | ||||
|       double p=blas.benchmark<CComplex>(M,N,K,BATCH); | ||||
|  | ||||
|       fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p); | ||||
|       std::cout<<GridLogMessage<<std::setprecision(3)  | ||||
| @@ -313,10 +315,10 @@ public: | ||||
|     for(int b=0;b<3;b++){ | ||||
|     for(int r=0;r<3;r++){ | ||||
|       int M=rhs[r]; | ||||
|       int N=vol; | ||||
|       int N=blk; | ||||
|       int K=basis[b]; | ||||
|       int BATCH=vol; | ||||
|       double p=blas.benchmark(M,N,K,BATCH); | ||||
|       double p=blas.benchmark<CComplex>(M,N,K,BATCH); | ||||
|  | ||||
|       fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p); | ||||
|       std::cout<<GridLogMessage<<std::setprecision(3)  | ||||
| @@ -867,6 +869,7 @@ int main (int argc, char ** argv) | ||||
|   int do_memory=1; | ||||
|   int do_comms =1; | ||||
|   int do_blas  =1; | ||||
|   int do_dslash=1; | ||||
|  | ||||
|   int sel=4; | ||||
|   std::vector<int> L_list({8,12,16,24,32}); | ||||
| @@ -877,6 +880,7 @@ int main (int argc, char ** argv) | ||||
|   std::vector<double> staggered; | ||||
|  | ||||
|   int Ls=1; | ||||
|   if (do_dslash){ | ||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|   std::cout<<GridLogMessage << " Clover dslash 4D vectorised (temporarily Wilson)" <<std::endl; | ||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
| @@ -901,6 +905,7 @@ int main (int argc, char ** argv) | ||||
|     staggered.push_back(result); | ||||
|   } | ||||
|  | ||||
|  | ||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|   std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl; | ||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
| @@ -909,8 +914,33 @@ int main (int argc, char ** argv) | ||||
|     std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<std::endl; | ||||
|   } | ||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|   } | ||||
|  | ||||
|   int NN=NN_global; | ||||
|   if(do_dslash){ | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|     std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl; | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|     std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered (GF/s per node)" <<std::endl; | ||||
|     fprintf(FP,"Per node summary table\n"); | ||||
|     fprintf(FP,"\n"); | ||||
|     fprintf(FP,"L , Wilson, DWF4, Staggered, GF/s per node\n"); | ||||
|     fprintf(FP,"\n"); | ||||
|     for(int l=0;l<L_list.size();l++){ | ||||
|       std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<std::endl; | ||||
|       fprintf(FP,"%d , %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.); | ||||
|     } | ||||
|     fprintf(FP,"\n"); | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|  | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|     std::cout<<GridLogMessage << " Comparison point     result: "  << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl; | ||||
|     std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl; | ||||
|     std::cout<<std::setprecision(3); | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|   } | ||||
|  | ||||
|    | ||||
|   if ( do_memory ) { | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|     std::cout<<GridLogMessage << " Memory benchmark " <<std::endl; | ||||
| @@ -918,15 +948,6 @@ int main (int argc, char ** argv) | ||||
|     Benchmark::Memory(); | ||||
|   } | ||||
|  | ||||
|   if ( do_blas ) { | ||||
| #if defined(GRID_CUDA) || defined(GRID_HIP)     || defined(GRID_SYCL)    | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|     std::cout<<GridLogMessage << " Batched BLAS benchmark " <<std::endl; | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|     Benchmark::BLAS(); | ||||
| #endif | ||||
|   } | ||||
|  | ||||
|   if ( do_su4 ) { | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|     std::cout<<GridLogMessage << " SU(4) benchmark " <<std::endl; | ||||
| @@ -941,28 +962,14 @@ int main (int argc, char ** argv) | ||||
|     Benchmark::Comms(); | ||||
|   } | ||||
|  | ||||
|   if ( do_blas ) { | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|     std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl; | ||||
|     std::cout<<GridLogMessage << " Batched BLAS benchmark " <<std::endl; | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|     std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered (GF/s per node)" <<std::endl; | ||||
|     fprintf(FP,"Per node summary table\n"); | ||||
|     fprintf(FP,"\n"); | ||||
|     fprintf(FP,"L , Wilson, DWF4, Staggered, GF/s per node\n"); | ||||
|     fprintf(FP,"\n"); | ||||
|     for(int l=0;l<L_list.size();l++){ | ||||
|       std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<std::endl; | ||||
|       fprintf(FP,"%d , %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.); | ||||
|     } | ||||
|     fprintf(FP,"\n"); | ||||
|  | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|  | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|     std::cout<<GridLogMessage << " Comparison point     result: "  << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl; | ||||
|     std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl; | ||||
|     std::cout<<std::setprecision(3); | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|  | ||||
|     Benchmark::BLAS<ComplexD>(); | ||||
|     Benchmark::BLAS<ComplexF>(); | ||||
|   } | ||||
|    | ||||
|   Grid_finalize(); | ||||
|   fclose(FP); | ||||
| } | ||||
|   | ||||
| @@ -1,16 +1,18 @@ | ||||
|  | ||||
| export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl "  | ||||
| export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel  -fsycl -fno-exceptions -fsycl-targets=spir64_gen -Xs -device -Xs pvc " | ||||
| ../../configure \ | ||||
| 	--enable-simd=GPU \ | ||||
| 	--enable-gen-simd-width=64 \ | ||||
| 	--enable-comms=mpi-auto \ | ||||
| 	--enable-debug \ | ||||
| 	--disable-gparity \ | ||||
| 	--disable-fermion-reps \ | ||||
| 	--with-lime=$CLIME \ | ||||
| 	--enable-shm=nvlink \ | ||||
| 	--enable-accelerator=sycl \ | ||||
| 	--enable-accelerator-aware-mpi=yes\ | ||||
| 	--enable-unified=no \ | ||||
| 	MPICXX=mpicxx \ | ||||
| 	CXX=icpx \ | ||||
| 	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl" \ | ||||
| 	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel" | ||||
| 	CXX=icpx  | ||||
|  | ||||
|   | ||||
							
								
								
									
23 systems/Aurora/config-command-leak Normal file
							| @@ -0,0 +1,23 @@ | ||||
| source ~/spack/share/spack/setup-env.sh  | ||||
| spack load c-lime | ||||
| export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' ` | ||||
| export TCMALLOC=`spack find --paths gperftools | grep ^gperftools | awk '{print $2}' ` | ||||
| export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH | ||||
|  | ||||
| ../../configure \ | ||||
| 	--enable-debug \ | ||||
| 	--enable-simd=GPU \ | ||||
| 	--enable-gen-simd-width=64 \ | ||||
| 	--enable-comms=mpi-auto \ | ||||
| 	--disable-gparity \ | ||||
| 	--disable-fermion-reps \ | ||||
| 	--with-lime=$CLIME \ | ||||
| 	--enable-shm=nvlink \ | ||||
| 	--enable-accelerator=sycl \ | ||||
| 	--enable-accelerator-aware-mpi=yes\ | ||||
| 	--enable-unified=no \ | ||||
| 	MPICXX=mpicxx \ | ||||
| 	CXX=icpx \ | ||||
| 	LDFLAGS="-fiopenmp -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl  -lsycl -Xarch_host -fsanitize=leak -fsycl-device-code-split=per_kernel" \ | ||||
| 	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel -Xarch_host  -fsycl -fsanitize=leak " | ||||
|  | ||||
							
								
								
									
22 systems/Aurora/config-command-sanitize Normal file
							| @@ -0,0 +1,22 @@ | ||||
| # -fsycl-targets=spir64_gen -Xs\" -device pvc \" | ||||
| # -fsycl-targets=intel_gpu_pvc_vg,intel_gpu_pvc | ||||
| # -fsycl-targets=intel_gpu_pvc | ||||
|  | ||||
| unset DEVICE | ||||
| export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl -Xarch_host -fsanitize=address"  | ||||
| export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel  -fsycl -fno-exceptions -Xarch_host -fsanitize=address  -fsycl-targets=spir64_gen -Xs -device -Xs pvc " | ||||
| ../../configure \ | ||||
| 	--enable-simd=GPU \ | ||||
| 	--enable-gen-simd-width=64 \ | ||||
| 	--enable-comms=mpi-auto \ | ||||
| 	--enable-debug \ | ||||
| 	--disable-gparity \ | ||||
| 	--disable-fermion-reps \ | ||||
| 	--with-lime=$CLIME \ | ||||
| 	--enable-shm=nvlink \ | ||||
| 	--enable-accelerator=sycl \ | ||||
| 	--enable-accelerator-aware-mpi=yes\ | ||||
| 	--enable-unified=no \ | ||||
| 	MPICXX=mpicxx \ | ||||
| 	CXX=icpx  | ||||
|  | ||||
| @@ -1,14 +1,22 @@ | ||||
| source ~/spack/share/spack/setup-env.sh  | ||||
| spack load c-lime | ||||
| export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' ` | ||||
| #spack load libefence | ||||
| #export EFENCE=`spack find --paths libefence | grep ^libefence | awk '{print $2}' ` | ||||
| #export LD_LIBRARY_PATH=${EFENCE}/lib:$LD_LIBRARY_PATH | ||||
| #spack load gperftools | ||||
| export TCMALLOC=/home/paboyle/gperftools/install | ||||
| export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH | ||||
| export INTELGT_AUTO_ATTACH_DISABLE=1 | ||||
|  | ||||
| #export ONEAPI_DEVICE_SELECTOR=level_zero:0.0 | ||||
|  | ||||
| module load oneapi/release/2023.12.15.001 | ||||
|  | ||||
| #module load oneapi/release/2023.12.15.001 | ||||
| #module use /soft/modulefiles | ||||
| #module load intel_compute_runtime/release/agama-devel-682.22 | ||||
|  | ||||
| export FI_CXI_DEFAULT_CQ_SIZE=131072 | ||||
| export FI_CXI_CQ_FILL_PERCENT=20 | ||||
|  | ||||
| export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" | ||||
| #export FI_CXI_DEFAULT_CQ_SIZE=131072 | ||||
| #export FI_CXI_CQ_FILL_PERCENT=20 | ||||
| #export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" | ||||
| #export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode" | ||||
|  | ||||
| # | ||||
| @@ -16,13 +24,17 @@ export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" | ||||
| # -ftarget-register-alloc-mode=pvc:small | ||||
| # -ftarget-register-alloc-mode=pvc:large | ||||
| # -ftarget-register-alloc-mode=pvc:auto | ||||
| # | ||||
| #export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 | ||||
|  | ||||
| export HTTP_PROXY=http://proxy.alcf.anl.gov:3128 | ||||
| export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128 | ||||
| export http_proxy=http://proxy.alcf.anl.gov:3128 | ||||
| export https_proxy=http://proxy.alcf.anl.gov:3128 | ||||
| #export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 | ||||
| git config --global http.proxy http://proxy.alcf.anl.gov:3128 | ||||
|  | ||||
| #source ~/spack/share/spack/setup-env.sh | ||||
| #spack load gperftools | ||||
| #export TCMALLOC=`spack find --paths gperftools | grep ^gperftools | awk '{print $2}' ` | ||||
| #export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH | ||||
|  | ||||
| export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" | ||||
|   | ||||
							
								
								
									
76 systems/Frontier/benchmarks/Benchmark_usqcd.csv Normal file
							| @@ -0,0 +1,76 @@ | ||||
| Memory Bandwidth | ||||
|  | ||||
| Bytes, GB/s per node | ||||
| 6291456, 379.297050 | ||||
| 100663296, 3754.674992 | ||||
| 509607936, 6521.472413 | ||||
| 1610612736, 8513.456479 | ||||
| 3932160000, 9018.901766 | ||||
|  | ||||
|  | ||||
| GEMM | ||||
|  | ||||
|  M, N, K, BATCH, GF/s per rank | ||||
| 16, 8, 16, 256, 0.564958 | ||||
| 16, 16, 16, 256, 243.148058 | ||||
| 16, 32, 16, 256, 440.346877 | ||||
| 32, 8, 32, 256, 439.194136 | ||||
| 32, 16, 32, 256, 847.334141 | ||||
| 32, 32, 32, 256, 1430.892623 | ||||
| 64, 8, 64, 256, 1242.756741 | ||||
| 64, 16, 64, 256, 2196.689493 | ||||
| 64, 32, 64, 256, 3697.458072 | ||||
| 16, 8, 256, 256, 899.582627 | ||||
| 16, 16, 256, 256, 1673.537756 | ||||
| 16, 32, 256, 256, 2959.597089 | ||||
| 32, 8, 256, 256, 1558.858630 | ||||
| 32, 16, 256, 256, 2864.839445 | ||||
| 32, 32, 256, 256, 4810.671254 | ||||
| 64, 8, 256, 256, 2386.092942 | ||||
| 64, 16, 256, 256, 4451.665937 | ||||
| 64, 32, 256, 256, 5942.124095 | ||||
| 8, 256, 16, 256, 799.867271 | ||||
| 16, 256, 16, 256, 1584.624888 | ||||
| 32, 256, 16, 256, 1949.422338 | ||||
| 8, 256, 32, 256, 1389.417474 | ||||
| 16, 256, 32, 256, 2668.344493 | ||||
| 32, 256, 32, 256, 3234.162120 | ||||
| 8, 256, 64, 256, 2150.925128 | ||||
| 16, 256, 64, 256, 4012.488132 | ||||
| 32, 256, 64, 256, 5154.785521 | ||||
|  | ||||
|  | ||||
|  | ||||
| Communications | ||||
|  | ||||
| Packet bytes, direction, GB/s per node | ||||
| 4718592, 1, 245.026198 | ||||
| 4718592, 2, 251.180996 | ||||
| 4718592, 3, 361.110977 | ||||
| 4718592, 5, 247.898447 | ||||
| 4718592, 6, 249.867523 | ||||
| 4718592, 7, 359.033061 | ||||
| 15925248, 1, 255.030946 | ||||
| 15925248, 2, 264.453890 | ||||
| 15925248, 3, 392.949183 | ||||
| 15925248, 5, 256.040644 | ||||
| 15925248, 6, 264.681896 | ||||
| 15925248, 7, 392.102622 | ||||
| 37748736, 1, 258.823333 | ||||
| 37748736, 2, 268.181577 | ||||
| 37748736, 3, 401.478191 | ||||
| 37748736, 5, 258.995363 | ||||
| 37748736, 6, 268.206586 | ||||
| 37748736, 7, 400.397611 | ||||
|  | ||||
|  | ||||
| Per node summary table | ||||
|  | ||||
| L , Wilson, DWF4, Staggered, GF/s per node | ||||
|  | ||||
| 8 , 155, 1386, 50 | ||||
| 12 , 694, 4208, 230 | ||||
| 16 , 1841, 6675, 609 | ||||
| 24 , 3934, 8573, 1641 | ||||
| 32 , 5083, 9771, 3086 | ||||
|  | ||||
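A couple of consistency checks on the CSV above, offered as an aside rather than as part of the benchmark output itself: the Bytes column of the memory-bandwidth section is exactly 1536*L^4 for the local volumes L = 8, 16, 24, 32, 40 used in the run, and in the full log below the corresponding Gflop/s column is the GB/s figure divided by 12 (a double-precision a*x + y sweep moves 24 bytes and performs 2 flops per element, i.e. 12 bytes per flop). The snippet below is a minimal, hypothetical Python verification of the first relation; the hard-coded numbers are copied from the table and nothing here comes from the Grid sources.

    # Minimal sanity check (illustrative only, not part of Grid): the Bytes
    # column of the "Memory Bandwidth" section equals 1536 * L^4 for the
    # local volumes L = 8, 16, 24, 32, 40 reported in Benchmark_usqcd.log.
    volumes  = [8, 16, 24, 32, 40]
    reported = [6291456, 100663296, 509607936, 1610612736, 3932160000]

    for L, nbytes in zip(volumes, reported):
        assert nbytes == 1536 * L**4
        print(f"L={L:<2d}  bytes={nbytes:>11d}  (= 1536 * {L}^4)")
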
systems/Frontier/benchmarks/Benchmark_usqcd.log    (new file, 702 lines)
							| @@ -0,0 +1,702 @@ | ||||
| RANK 1 using GPU 1 | ||||
| RANK 5 using GPU 6 | ||||
| RANK 0 using GPU 0 | ||||
| RANK 2 using GPU 2 | ||||
| RANK 3 using GPU 3 | ||||
| RANK 6 using GPU 5 | ||||
| RANK 7 using GPU 4 | ||||
| RANK 4 using GPU 7 | ||||
| world_rank 0 has 1 devices | ||||
| AcceleratorHipInit: ======================== | ||||
| AcceleratorHipInit: Device Number    : 0 | ||||
| AcceleratorHipInit: ======================== | ||||
| AcceleratorHipInit: Device identifier: AMD Instinct MI250X | ||||
| AcceleratorHipInit:   totalGlobalMem: 68702699520  | ||||
| AcceleratorHipInit:   isMultiGpuBoard: 0  | ||||
| AcceleratorHipInit:   warpSize: 64  | ||||
| AcceleratorHipInit: using default device  | ||||
| AcceleratorHipInit: assume user or srun sets ROCR_VISIBLE_DEVICES and numa binding  | ||||
| AcceleratorHipInit: Configure options --enable-setdevice=no  | ||||
| local rank 0 device 0 bus id: 0000:c1:00.0 | ||||
| AcceleratorHipInit: ================================================ | ||||
| SharedMemoryMpi:  World communicator of size 8 | ||||
| SharedMemoryMpi:  Node  communicator of size 8 | ||||
| 0SharedMemoryMpi:  SharedMemoryMPI.cc acceleratorAllocDevice 4294967296bytes at 0x7ff651800000 - 7ff7517fffff for comms buffers  | ||||
| Setting up IPC | ||||
|  | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
| __|_ |  |  |  |  |  |  |  |  |  |  |  | _|__ | ||||
| __|_                                    _|__ | ||||
| __|_   GGGG    RRRR    III    DDDD      _|__ | ||||
| __|_  G        R   R    I     D   D     _|__ | ||||
| __|_  G        R   R    I     D    D    _|__ | ||||
| __|_  G  GG    RRRR     I     D    D    _|__ | ||||
| __|_  G   G    R  R     I     D   D     _|__ | ||||
| __|_   GGGG    R   R   III    DDDD      _|__ | ||||
| __|_                                    _|__ | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
|   |  |  |  |  |  |  |  |  |  |  |  |  |  |   | ||||
|  | ||||
|  | ||||
| Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| the Free Software Foundation; either version 2 of the License, or | ||||
| (at your option) any later version. | ||||
|  | ||||
| This program is distributed in the hope that it will be useful, | ||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| GNU General Public License for more details. | ||||
| Current Grid git commit hash=9a1ad6a5eb29a369d74784e7483c60e578323d76: (HEAD -> develop, origin/develop, origin/HEAD) clean | ||||
|  | ||||
| Grid : Message : ================================================  | ||||
| Grid : Message : MPI is initialised and logging filters activated  | ||||
| Grid : Message : ================================================  | ||||
| Grid : Message : This rank is running on host frontier01320 | ||||
| Grid : Message : Requested 4294967296 byte stencil comms buffers  | ||||
| Grid : Message : MemoryManager Cache 54962159616 bytes  | ||||
| Grid : Message : MemoryManager::Init() setting up | ||||
| Grid : Message : MemoryManager::Init() cache pool for recent host   allocations: SMALL 8 LARGE 2 HUGE 0 | ||||
| Grid : Message : MemoryManager::Init() cache pool for recent device allocations: SMALL 16 LARGE 8 Huge 0 | ||||
| Grid : Message : MemoryManager::Init() cache pool for recent shared allocations: SMALL 16 LARGE 8 Huge 0 | ||||
| Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory | ||||
| Grid : Message : MemoryManager::Init() Using hipMalloc | ||||
| Grid : Message : 0.293720 s : ================================================================================== | ||||
| Grid : Message : 0.293790 s : = Grid is setup to use 1 threads | ||||
| Grid : Message : 0.293800 s : ================================================================================== | ||||
| Grid : Message : 0.293810 s : Grid Default Decomposition patterns | ||||
| Grid : Message : 0.293810 s : 	OpenMP threads : 1 | ||||
| Grid : Message : 0.293820 s : 	MPI tasks      : 1 2 2 2  | ||||
| Grid : Message : 0.293870 s : 	vReal          : 512bits ; 1 2 2 2  | ||||
| Grid : Message : 0.293890 s : 	vRealF         : 512bits ; 2 2 2 2  | ||||
| Grid : Message : 0.293910 s : 	vRealD         : 512bits ; 1 2 2 2  | ||||
| Grid : Message : 0.293920 s : 	vComplex       : 512bits ; 1 1 2 2  | ||||
| Grid : Message : 0.293930 s : 	vComplexF      : 512bits ; 1 2 2 2  | ||||
| Grid : Message : 0.293960 s : 	vComplexD      : 512bits ; 1 1 2 2  | ||||
| Grid : Message : 0.293970 s : ================================================================================== | ||||
| Grid : Message : 0.293980 s : ================================================================================== | ||||
| Grid : Message : 0.293990 s :  Clover dslash 4D vectorised (temporarily Wilson) | ||||
| Grid : Message : 0.294000 s : ================================================================================== | ||||
| Grid : Message : 0.301330 s : ================================================================================== | ||||
| Grid : Message : 0.301360 s : Benchmark DWF on 8^4 local volume  | ||||
| Grid : Message : 0.301370 s : * Nc             : 3 | ||||
| Grid : Message : 0.301380 s : * Global volume  : 8 16 16 16  | ||||
| Grid : Message : 0.301410 s : * Ls             : 1 | ||||
| Grid : Message : 0.301420 s : * ranks          : 8 | ||||
| Grid : Message : 0.301430 s : * nodes          : 1 | ||||
| Grid : Message : 0.301440 s : * ranks/node     : 8 | ||||
| Grid : Message : 0.301450 s : * ranks geom     : 1 2 2 2  | ||||
| Grid : Message : 0.301460 s : * Using 1 threads | ||||
| Grid : Message : 0.301470 s : ================================================================================== | ||||
| Grid : Message : 0.345030 s : Initialised RNGs | ||||
| Grid : Message : 0.158302 s : ================================================================================== | ||||
| Grid : Message : 0.158310 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 0.158311 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 0.158312 s : * SINGLE precision  | ||||
| Grid : Message : 0.158313 s : ================================================================================== | ||||
| Grid : Message : 0.240681 s : Deo FlopsPerSite is 1344 | ||||
| Grid : Message : 0.240711 s : Deo mflop/s =   154914.0 (130.8) 139367.7-159565.9 | ||||
| Grid : Message : 0.240715 s : Deo mflop/s per rank   19364.3 | ||||
| Grid : Message : 0.240716 s : Deo mflop/s per node   154914.0 | ||||
| Grid : Message : 0.240718 s : ================================================================================== | ||||
| Grid : Message : 0.240719 s : * Using UNROLLED WilsonKernels | ||||
| Grid : Message : 0.240719 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 0.240719 s : * SINGLE precision  | ||||
| Grid : Message : 0.240719 s : ================================================================================== | ||||
| Grid : Message : 0.315028 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 0.315033 s : Deo mflop/s =   151459.5 (142.0) 131856.9-157286.4 | ||||
| Grid : Message : 0.315036 s : Deo mflop/s per rank   18932.4 | ||||
| Grid : Message : 0.315037 s : Deo mflop/s per node   151459.5 | ||||
| Grid : Message : 0.315038 s : ================================================================================== | ||||
| Grid : Message : 0.315040 s : 8^4 x 1 Deo Best  mflop/s        =   154914.0 ; 154914.0 per node  | ||||
| Grid : Message : 0.315042 s : 8^4 x 1 Deo Worst mflop/s        =   151459.5 ; 151459.5 per node  | ||||
| Grid : Message : 0.315043 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||
| Grid : Message : 0.315043 s : 154914.0 ; 151459.5 ;  | ||||
| Grid : Message : 0.315044 s : ================================================================================== | ||||
| Grid : Message : 0.316507 s : ================================================================================== | ||||
| Grid : Message : 0.316510 s : Benchmark DWF on 12^4 local volume  | ||||
| Grid : Message : 0.316511 s : * Nc             : 3 | ||||
| Grid : Message : 0.316512 s : * Global volume  : 12 24 24 24  | ||||
| Grid : Message : 0.316515 s : * Ls             : 1 | ||||
| Grid : Message : 0.316516 s : * ranks          : 8 | ||||
| Grid : Message : 0.316517 s : * nodes          : 1 | ||||
| Grid : Message : 0.316518 s : * ranks/node     : 8 | ||||
| Grid : Message : 0.316518 s : * ranks geom     : 1 2 2 2  | ||||
| Grid : Message : 0.316519 s : * Using 1 threads | ||||
| Grid : Message : 0.316520 s : ================================================================================== | ||||
| Grid : Message : 0.327883 s : Initialised RNGs | ||||
| Grid : Message : 0.786395 s : ================================================================================== | ||||
| Grid : Message : 0.786404 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 0.786405 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 0.786406 s : * SINGLE precision  | ||||
| Grid : Message : 0.786406 s : ================================================================================== | ||||
| Grid : Message : 0.871646 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 0.871659 s : Deo mflop/s =   684982.2 (632.4) 609162.5-714594.5 | ||||
| Grid : Message : 0.871663 s : Deo mflop/s per rank   85622.8 | ||||
| Grid : Message : 0.871664 s : Deo mflop/s per node   684982.2 | ||||
| Grid : Message : 0.871665 s : ================================================================================== | ||||
| Grid : Message : 0.871665 s : * Using UNROLLED WilsonKernels | ||||
| Grid : Message : 0.871665 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 0.871665 s : * SINGLE precision  | ||||
| Grid : Message : 0.871665 s : ================================================================================== | ||||
| Grid : Message : 0.953697 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 0.953702 s : Deo mflop/s =   693556.6 (576.5) 663552.0-719204.7 | ||||
| Grid : Message : 0.953705 s : Deo mflop/s per rank   86694.6 | ||||
| Grid : Message : 0.953706 s : Deo mflop/s per node   693556.6 | ||||
| Grid : Message : 0.953707 s : ================================================================================== | ||||
| Grid : Message : 0.953708 s : 12^4 x 1 Deo Best  mflop/s        =   693556.6 ; 693556.6 per node  | ||||
| Grid : Message : 0.953710 s : 12^4 x 1 Deo Worst mflop/s        =   684982.2 ; 684982.2 per node  | ||||
| Grid : Message : 0.953712 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||
| Grid : Message : 0.953712 s : 684982.2 ; 693556.6 ;  | ||||
| Grid : Message : 0.953713 s : ================================================================================== | ||||
| Grid : Message : 0.957609 s : ================================================================================== | ||||
| Grid : Message : 0.957613 s : Benchmark DWF on 16^4 local volume  | ||||
| Grid : Message : 0.957614 s : * Nc             : 3 | ||||
| Grid : Message : 0.957615 s : * Global volume  : 16 32 32 32  | ||||
| Grid : Message : 0.957620 s : * Ls             : 1 | ||||
| Grid : Message : 0.957621 s : * ranks          : 8 | ||||
| Grid : Message : 0.957622 s : * nodes          : 1 | ||||
| Grid : Message : 0.957623 s : * ranks/node     : 8 | ||||
| Grid : Message : 0.957623 s : * ranks geom     : 1 2 2 2  | ||||
| Grid : Message : 0.957624 s : * Using 1 threads | ||||
| Grid : Message : 0.957625 s : ================================================================================== | ||||
| Grid : Message : 0.985828 s : Initialised RNGs | ||||
| Grid : Message : 2.379761 s : ================================================================================== | ||||
| Grid : Message : 2.379772 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 2.379773 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 2.379774 s : * SINGLE precision  | ||||
| Grid : Message : 2.379775 s : ================================================================================== | ||||
| Grid : Message : 2.486712 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 2.486725 s : Deo mflop/s =   1803226.1 (1139.4) 1646362.3-1864135.1 | ||||
| Grid : Message : 2.486729 s : Deo mflop/s per rank   225403.3 | ||||
| Grid : Message : 2.486731 s : Deo mflop/s per node   1803226.1 | ||||
| Grid : Message : 2.486732 s : ================================================================================== | ||||
| Grid : Message : 2.486732 s : * Using UNROLLED WilsonKernels | ||||
| Grid : Message : 2.486732 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 2.486732 s : * SINGLE precision  | ||||
| Grid : Message : 2.486732 s : ================================================================================== | ||||
| Grid : Message : 2.584407 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 2.584412 s : Deo mflop/s =   1840587.3 (1119.6) 1779401.7-1914791.0 | ||||
| Grid : Message : 2.584415 s : Deo mflop/s per rank   230073.4 | ||||
| Grid : Message : 2.584416 s : Deo mflop/s per node   1840587.3 | ||||
| Grid : Message : 2.584417 s : ================================================================================== | ||||
| Grid : Message : 2.584418 s : 16^4 x 1 Deo Best  mflop/s        =   1840587.3 ; 1840587.3 per node  | ||||
| Grid : Message : 2.584420 s : 16^4 x 1 Deo Worst mflop/s        =   1803226.1 ; 1803226.1 per node  | ||||
| Grid : Message : 2.584422 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||
| Grid : Message : 2.584422 s : 1803226.1 ; 1840587.3 ;  | ||||
| Grid : Message : 2.584423 s : ================================================================================== | ||||
| Grid : Message : 2.592858 s : ================================================================================== | ||||
| Grid : Message : 2.592862 s : Benchmark DWF on 24^4 local volume  | ||||
| Grid : Message : 2.592863 s : * Nc             : 3 | ||||
| Grid : Message : 2.592864 s : * Global volume  : 24 48 48 48  | ||||
| Grid : Message : 2.592869 s : * Ls             : 1 | ||||
| Grid : Message : 2.592870 s : * ranks          : 8 | ||||
| Grid : Message : 2.592871 s : * nodes          : 1 | ||||
| Grid : Message : 2.592872 s : * ranks/node     : 8 | ||||
| Grid : Message : 2.592872 s : * ranks geom     : 1 2 2 2  | ||||
| Grid : Message : 2.592873 s : * Using 1 threads | ||||
| Grid : Message : 2.592874 s : ================================================================================== | ||||
| Grid : Message : 2.715623 s : Initialised RNGs | ||||
| Grid : Message : 9.608838 s : ================================================================================== | ||||
| Grid : Message : 9.608852 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 9.608853 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 9.608854 s : * SINGLE precision  | ||||
| Grid : Message : 9.608855 s : ================================================================================== | ||||
| Grid : Message : 9.870294 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 9.870309 s : Deo mflop/s =   3861903.3 (1708.9) 3511078.3-3937368.2 | ||||
| Grid : Message : 9.870313 s : Deo mflop/s per rank   482737.9 | ||||
| Grid : Message : 9.870314 s : Deo mflop/s per node   3861903.3 | ||||
| Grid : Message : 9.870315 s : ================================================================================== | ||||
| Grid : Message : 9.870316 s : * Using UNROLLED WilsonKernels | ||||
| Grid : Message : 9.870316 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 9.870317 s : * SINGLE precision  | ||||
| Grid : Message : 9.870317 s : ================================================================================== | ||||
| Grid : Message : 10.101619 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 10.101624 s : Deo mflop/s =   3933599.5 (1412.7) 3835758.7-4008152.3 | ||||
| Grid : Message : 10.101627 s : Deo mflop/s per rank   491699.9 | ||||
| Grid : Message : 10.101628 s : Deo mflop/s per node   3933599.5 | ||||
| Grid : Message : 10.101629 s : ================================================================================== | ||||
| Grid : Message : 10.101629 s : 24^4 x 1 Deo Best  mflop/s        =   3933599.5 ; 3933599.5 per node  | ||||
| Grid : Message : 10.101631 s : 24^4 x 1 Deo Worst mflop/s        =   3861903.3 ; 3861903.3 per node  | ||||
| Grid : Message : 10.101633 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||
| Grid : Message : 10.101633 s : 3861903.3 ; 3933599.5 ;  | ||||
| Grid : Message : 10.101634 s : ================================================================================== | ||||
| Grid : Message : 10.139642 s : ================================================================================== | ||||
| Grid : Message : 10.139652 s : Benchmark DWF on 32^4 local volume  | ||||
| Grid : Message : 10.139653 s : * Nc             : 3 | ||||
| Grid : Message : 10.139654 s : * Global volume  : 32 64 64 64  | ||||
| Grid : Message : 10.139661 s : * Ls             : 1 | ||||
| Grid : Message : 10.139661 s : * ranks          : 8 | ||||
| Grid : Message : 10.139662 s : * nodes          : 1 | ||||
| Grid : Message : 10.139662 s : * ranks/node     : 8 | ||||
| Grid : Message : 10.139662 s : * ranks geom     : 1 2 2 2  | ||||
| Grid : Message : 10.139663 s : * Using 1 threads | ||||
| Grid : Message : 10.139663 s : ================================================================================== | ||||
| Grid : Message : 10.502161 s : Initialised RNGs | ||||
| Grid : Message : 32.211092 s : ================================================================================== | ||||
| Grid : Message : 32.211107 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 32.211108 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 32.211109 s : * SINGLE precision  | ||||
| Grid : Message : 32.211110 s : ================================================================================== | ||||
| Grid : Message : 32.841718 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 32.841732 s : Deo mflop/s =   4988499.9 (2722.5) 4244837.8-5120022.3 | ||||
| Grid : Message : 32.841736 s : Deo mflop/s per rank   623562.5 | ||||
| Grid : Message : 32.841737 s : Deo mflop/s per node   4988499.9 | ||||
| Grid : Message : 32.841738 s : ================================================================================== | ||||
| Grid : Message : 32.841739 s : * Using UNROLLED WilsonKernels | ||||
| Grid : Message : 32.841739 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 32.841740 s : * SINGLE precision  | ||||
| Grid : Message : 32.841740 s : ================================================================================== | ||||
| Grid : Message : 33.407434 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 33.407442 s : Deo mflop/s =   5082758.0 (1883.1) 4971027.0-5205119.6 | ||||
| Grid : Message : 33.407446 s : Deo mflop/s per rank   635344.7 | ||||
| Grid : Message : 33.407447 s : Deo mflop/s per node   5082758.0 | ||||
| Grid : Message : 33.407448 s : ================================================================================== | ||||
| Grid : Message : 33.407448 s : 32^4 x 1 Deo Best  mflop/s        =   5082758.0 ; 5082758.0 per node  | ||||
| Grid : Message : 33.407450 s : 32^4 x 1 Deo Worst mflop/s        =   4988499.9 ; 4988499.9 per node  | ||||
| Grid : Message : 33.407452 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||
| Grid : Message : 33.407452 s : 4988499.9 ; 5082758.0 ;  | ||||
| Grid : Message : 33.407453 s : ================================================================================== | ||||
| Grid : Message : 33.506785 s : ================================================================================== | ||||
| Grid : Message : 33.506798 s :  Domain wall dslash 4D vectorised | ||||
| Grid : Message : 33.506799 s : ================================================================================== | ||||
| Grid : Message : 33.530686 s : ================================================================================== | ||||
| Grid : Message : 33.530689 s : Benchmark DWF on 8^4 local volume  | ||||
| Grid : Message : 33.530690 s : * Nc             : 3 | ||||
| Grid : Message : 33.530691 s : * Global volume  : 8 16 16 16  | ||||
| Grid : Message : 33.530698 s : * Ls             : 12 | ||||
| Grid : Message : 33.530699 s : * ranks          : 8 | ||||
| Grid : Message : 33.530700 s : * nodes          : 1 | ||||
| Grid : Message : 33.530701 s : * ranks/node     : 8 | ||||
| Grid : Message : 33.530702 s : * ranks geom     : 1 2 2 2  | ||||
| Grid : Message : 33.530703 s : * Using 1 threads | ||||
| Grid : Message : 33.530704 s : ================================================================================== | ||||
| Grid : Message : 33.545465 s : Initialised RNGs | ||||
| Grid : Message : 33.752384 s : ================================================================================== | ||||
| Grid : Message : 33.752397 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 33.752398 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 33.752399 s : * SINGLE precision  | ||||
| Grid : Message : 33.752400 s : ================================================================================== | ||||
| Grid : Message : 33.851964 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 33.851977 s : Deo mflop/s =   1383287.7 (849.8) 1321205.8-1420651.4 | ||||
| Grid : Message : 33.851981 s : Deo mflop/s per rank   172911.0 | ||||
| Grid : Message : 33.851983 s : Deo mflop/s per node   1383287.7 | ||||
| Grid : Message : 33.851984 s : ================================================================================== | ||||
| Grid : Message : 33.851984 s : * Using UNROLLED WilsonKernels | ||||
| Grid : Message : 33.851984 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 33.851984 s : * SINGLE precision  | ||||
| Grid : Message : 33.851984 s : ================================================================================== | ||||
| Grid : Message : 33.949235 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 33.949240 s : Deo mflop/s =   1386335.8 (734.6) 1341325.6-1428330.6 | ||||
| Grid : Message : 33.949243 s : Deo mflop/s per rank   173292.0 | ||||
| Grid : Message : 33.949244 s : Deo mflop/s per node   1386335.8 | ||||
| Grid : Message : 33.949245 s : ================================================================================== | ||||
| Grid : Message : 33.949245 s : 8^4 x 12 Deo Best  mflop/s        =   1386335.8 ; 1386335.8 per node  | ||||
| Grid : Message : 33.949247 s : 8^4 x 12 Deo Worst mflop/s        =   1383287.7 ; 1383287.7 per node  | ||||
| Grid : Message : 33.949249 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||
| Grid : Message : 33.949249 s : 1383287.7 ; 1386335.8 ;  | ||||
| Grid : Message : 33.949250 s : ================================================================================== | ||||
| Grid : Message : 33.952789 s : ================================================================================== | ||||
| Grid : Message : 33.952793 s : Benchmark DWF on 12^4 local volume  | ||||
| Grid : Message : 33.952794 s : * Nc             : 3 | ||||
| Grid : Message : 33.952795 s : * Global volume  : 12 24 24 24  | ||||
| Grid : Message : 33.952800 s : * Ls             : 12 | ||||
| Grid : Message : 33.952801 s : * ranks          : 8 | ||||
| Grid : Message : 33.952802 s : * nodes          : 1 | ||||
| Grid : Message : 33.952803 s : * ranks/node     : 8 | ||||
| Grid : Message : 33.952803 s : * ranks geom     : 1 2 2 2  | ||||
| Grid : Message : 33.952804 s : * Using 1 threads | ||||
| Grid : Message : 33.952805 s : ================================================================================== | ||||
| Grid : Message : 34.362200 s : Initialised RNGs | ||||
| Grid : Message : 34.969821 s : ================================================================================== | ||||
| Grid : Message : 34.969832 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 34.969833 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 34.969834 s : * SINGLE precision  | ||||
| Grid : Message : 34.969835 s : ================================================================================== | ||||
| Grid : Message : 35.135545 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 35.135558 s : Deo mflop/s =   4208495.6 (2165.0) 4053699.5-4315228.5 | ||||
| Grid : Message : 35.135562 s : Deo mflop/s per rank   526062.0 | ||||
| Grid : Message : 35.135563 s : Deo mflop/s per node   4208495.6 | ||||
| Grid : Message : 35.135564 s : ================================================================================== | ||||
| Grid : Message : 35.135565 s : * Using UNROLLED WilsonKernels | ||||
| Grid : Message : 35.135565 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 35.135565 s : * SINGLE precision  | ||||
| Grid : Message : 35.135565 s : ================================================================================== | ||||
| Grid : Message : 35.299710 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 35.299715 s : Deo mflop/s =   4156968.7 (1450.2) 4053699.5-4219939.5 | ||||
| Grid : Message : 35.299718 s : Deo mflop/s per rank   519621.1 | ||||
| Grid : Message : 35.299719 s : Deo mflop/s per node   4156968.7 | ||||
| Grid : Message : 35.299721 s : ================================================================================== | ||||
| Grid : Message : 35.299721 s : 12^4 x 12 Deo Best  mflop/s        =   4208495.6 ; 4208495.6 per node  | ||||
| Grid : Message : 35.299723 s : 12^4 x 12 Deo Worst mflop/s        =   4156968.7 ; 4156968.7 per node  | ||||
| Grid : Message : 35.299725 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||
| Grid : Message : 35.299725 s : 4208495.6 ; 4156968.7 ;  | ||||
| Grid : Message : 35.299726 s : ================================================================================== | ||||
| Grid : Message : 35.309687 s : ================================================================================== | ||||
| Grid : Message : 35.309693 s : Benchmark DWF on 16^4 local volume  | ||||
| Grid : Message : 35.309694 s : * Nc             : 3 | ||||
| Grid : Message : 35.309695 s : * Global volume  : 16 32 32 32  | ||||
| Grid : Message : 35.309701 s : * Ls             : 12 | ||||
| Grid : Message : 35.309702 s : * ranks          : 8 | ||||
| Grid : Message : 35.309703 s : * nodes          : 1 | ||||
| Grid : Message : 35.309704 s : * ranks/node     : 8 | ||||
| Grid : Message : 35.309704 s : * ranks geom     : 1 2 2 2  | ||||
| Grid : Message : 35.309705 s : * Using 1 threads | ||||
| Grid : Message : 35.309706 s : ================================================================================== | ||||
| Grid : Message : 35.448780 s : Initialised RNGs | ||||
| Grid : Message : 38.468764 s : ================================================================================== | ||||
| Grid : Message : 38.468777 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 38.468778 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 38.468779 s : * SINGLE precision  | ||||
| Grid : Message : 38.468780 s : ================================================================================== | ||||
| Grid : Message : 38.801024 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 38.801040 s : Deo mflop/s =   6674673.6 (2168.6) 6484445.4-6797200.1 | ||||
| Grid : Message : 38.801044 s : Deo mflop/s per rank   834334.2 | ||||
| Grid : Message : 38.801045 s : Deo mflop/s per node   6674673.6 | ||||
| Grid : Message : 38.801046 s : ================================================================================== | ||||
| Grid : Message : 38.801047 s : * Using UNROLLED WilsonKernels | ||||
| Grid : Message : 38.801048 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 38.801049 s : * SINGLE precision  | ||||
| Grid : Message : 38.801049 s : ================================================================================== | ||||
| Grid : Message : 39.129777 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 39.129783 s : Deo mflop/s =   6560128.4 (2117.4) 6405846.1-6679081.3 | ||||
| Grid : Message : 39.129786 s : Deo mflop/s per rank   820016.1 | ||||
| Grid : Message : 39.129787 s : Deo mflop/s per node   6560128.4 | ||||
| Grid : Message : 39.129788 s : ================================================================================== | ||||
| Grid : Message : 39.129788 s : 16^4 x 12 Deo Best  mflop/s        =   6674673.6 ; 6674673.6 per node  | ||||
| Grid : Message : 39.129790 s : 16^4 x 12 Deo Worst mflop/s        =   6560128.4 ; 6560128.4 per node  | ||||
| Grid : Message : 39.129792 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||
| Grid : Message : 39.129793 s : 6674673.6 ; 6560128.4 ;  | ||||
| Grid : Message : 39.129795 s : ================================================================================== | ||||
| Grid : Message : 39.161251 s : ================================================================================== | ||||
| Grid : Message : 39.161265 s : Benchmark DWF on 24^4 local volume  | ||||
| Grid : Message : 39.161266 s : * Nc             : 3 | ||||
| Grid : Message : 39.161267 s : * Global volume  : 24 48 48 48  | ||||
| Grid : Message : 39.161274 s : * Ls             : 12 | ||||
| Grid : Message : 39.161275 s : * ranks          : 8 | ||||
| Grid : Message : 39.161276 s : * nodes          : 1 | ||||
| Grid : Message : 39.161277 s : * ranks/node     : 8 | ||||
| Grid : Message : 39.161277 s : * ranks geom     : 1 2 2 2  | ||||
| Grid : Message : 39.161278 s : * Using 1 threads | ||||
| Grid : Message : 39.161279 s : ================================================================================== | ||||
| Grid : Message : 39.911996 s : Initialised RNGs | ||||
| Grid : Message : 54.971914 s : ================================================================================== | ||||
| Grid : Message : 54.971928 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 54.971929 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 54.971930 s : * SINGLE precision  | ||||
| Grid : Message : 54.971931 s : ================================================================================== | ||||
| Grid : Message : 56.309445 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 56.309462 s : Deo mflop/s =   8572660.7 (1374.9) 8483366.4-8644399.6 | ||||
| Grid : Message : 56.309467 s : Deo mflop/s per rank   1071582.6 | ||||
| Grid : Message : 56.309468 s : Deo mflop/s per node   8572660.7 | ||||
| Grid : Message : 56.309469 s : ================================================================================== | ||||
| Grid : Message : 56.309471 s : * Using UNROLLED WilsonKernels | ||||
| Grid : Message : 56.309472 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 56.309473 s : * SINGLE precision  | ||||
| Grid : Message : 56.309474 s : ================================================================================== | ||||
| Grid : Message : 57.640707 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 57.640714 s : Deo mflop/s =   8200141.3 (1445.8) 8113545.6-8286307.9 | ||||
| Grid : Message : 57.640717 s : Deo mflop/s per rank   1025017.7 | ||||
| Grid : Message : 57.640718 s : Deo mflop/s per node   8200141.3 | ||||
| Grid : Message : 57.640719 s : ================================================================================== | ||||
| Grid : Message : 57.640720 s : 24^4 x 12 Deo Best  mflop/s        =   8572660.7 ; 8572660.7 per node  | ||||
| Grid : Message : 57.640723 s : 24^4 x 12 Deo Worst mflop/s        =   8200141.3 ; 8200141.3 per node  | ||||
| Grid : Message : 57.640725 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||
| Grid : Message : 57.640725 s : 8572660.7 ; 8200141.3 ;  | ||||
| Grid : Message : 57.640727 s : ================================================================================== | ||||
| Grid : Message : 57.806175 s : ================================================================================== | ||||
| Grid : Message : 57.806190 s : Benchmark DWF on 32^4 local volume  | ||||
| Grid : Message : 57.806191 s : * Nc             : 3 | ||||
| Grid : Message : 57.806192 s : * Global volume  : 32 64 64 64  | ||||
| Grid : Message : 57.806200 s : * Ls             : 12 | ||||
| Grid : Message : 57.806200 s : * ranks          : 8 | ||||
| Grid : Message : 57.806200 s : * nodes          : 1 | ||||
| Grid : Message : 57.806200 s : * ranks/node     : 8 | ||||
| Grid : Message : 57.806200 s : * ranks geom     : 1 2 2 2  | ||||
| Grid : Message : 57.806201 s : * Using 1 threads | ||||
| Grid : Message : 57.806201 s : ================================================================================== | ||||
| Grid : Message : 60.313153 s : Initialised RNGs | ||||
| Grid : Message : 107.830286 s : ================================================================================== | ||||
| Grid : Message : 107.830306 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 107.830307 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 107.830308 s : * SINGLE precision  | ||||
| Grid : Message : 107.830309 s : ================================================================================== | ||||
| Grid : Message : 111.479603 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 111.479625 s : Deo mflop/s =   9771387.8 (1000.8) 9688589.9-9830800.0 | ||||
| Grid : Message : 111.479629 s : Deo mflop/s per rank   1221423.5 | ||||
| Grid : Message : 111.479630 s : Deo mflop/s per node   9771387.8 | ||||
| Grid : Message : 111.479631 s : ================================================================================== | ||||
| Grid : Message : 111.479631 s : * Using UNROLLED WilsonKernels | ||||
| Grid : Message : 111.479631 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 111.479631 s : * SINGLE precision  | ||||
| Grid : Message : 111.479631 s : ================================================================================== | ||||
| Grid : Message : 115.406559 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 115.406573 s : Deo mflop/s =   8785297.3 (1739.6) 8628282.5-8911307.5 | ||||
| Grid : Message : 115.406576 s : Deo mflop/s per rank   1098162.2 | ||||
| Grid : Message : 115.406577 s : Deo mflop/s per node   8785297.3 | ||||
| Grid : Message : 115.406578 s : ================================================================================== | ||||
| Grid : Message : 115.406578 s : 32^4 x 12 Deo Best  mflop/s        =   9771387.8 ; 9771387.8 per node  | ||||
| Grid : Message : 115.406580 s : 32^4 x 12 Deo Worst mflop/s        =   8785297.3 ; 8785297.3 per node  | ||||
| Grid : Message : 115.406581 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||
| Grid : Message : 115.406581 s : 9771387.8 ; 8785297.3 ;  | ||||
| Grid : Message : 115.406582 s : ================================================================================== | ||||
| Grid : Message : 115.918888 s : ================================================================================== | ||||
| Grid : Message : 115.918902 s :  Improved Staggered dslash 4D vectorised | ||||
| Grid : Message : 115.918903 s : ================================================================================== | ||||
| Grid : Message : 115.920344 s : ================================================================================== | ||||
| Grid : Message : 115.920346 s : Benchmark ImprovedStaggered on 8^4 local volume  | ||||
| Grid : Message : 115.920347 s : * Global volume  : 8 16 16 16  | ||||
| Grid : Message : 115.920354 s : * ranks          : 8 | ||||
| Grid : Message : 115.920355 s : * nodes          : 1 | ||||
| Grid : Message : 115.920356 s : * ranks/node     : 8 | ||||
| Grid : Message : 115.920357 s : * ranks geom     : 1 2 2 2  | ||||
| Grid : Message : 115.920376 s : * Using 1 threads | ||||
| Grid : Message : 115.920377 s : ================================================================================== | ||||
| Grid : Message : 115.923522 s : Initialised RNGs | ||||
| Grid : Message : 116.904870 s : ================================================================================== | ||||
| Grid : Message : 116.904950 s : * Using GENERIC Nc StaggeredKernels | ||||
| Grid : Message : 116.904960 s : * SINGLE precision  | ||||
| Grid : Message : 116.904970 s : ================================================================================== | ||||
| Grid : Message : 116.288979 s : Deo mflop/s =   49708.9 (22.9) 44075.3-50609.3 | ||||
| Grid : Message : 116.289000 s : Deo mflop/s per rank   6213.6 | ||||
| Grid : Message : 116.289002 s : Deo mflop/s per node   49708.9 | ||||
| Grid : Message : 116.289003 s : ================================================================================== | ||||
| Grid : Message : 116.289004 s : * SINGLE precision  | ||||
| Grid : Message : 116.289005 s : ================================================================================== | ||||
| Grid : Message : 116.481632 s : Deo mflop/s =   49737.1 (13.5) 48517.0-50338.0 | ||||
| Grid : Message : 116.481639 s : Deo mflop/s per rank   6217.1 | ||||
| Grid : Message : 116.481640 s : Deo mflop/s per node   49737.1 | ||||
| Grid : Message : 116.481641 s : ================================================================================== | ||||
| Grid : Message : 116.481642 s : 8^4  Deo Best  mflop/s        =   49737.1 ; 49737.1 per node  | ||||
| Grid : Message : 116.481644 s : 8^4  Deo Worst mflop/s        =   49708.9 ; 49708.9 per node  | ||||
| Grid : Message : 116.481646 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||
| Grid : Message : 116.481646 s : 49708.9 ; 49737.1 ;  | ||||
| Grid : Message : 116.481647 s : ================================================================================== | ||||
| Grid : Message : 116.483458 s : ================================================================================== | ||||
| Grid : Message : 116.483461 s : Benchmark ImprovedStaggered on 12^4 local volume  | ||||
| Grid : Message : 116.483462 s : * Global volume  : 12 24 24 24  | ||||
| Grid : Message : 116.483465 s : * ranks          : 8 | ||||
| Grid : Message : 116.483466 s : * nodes          : 1 | ||||
| Grid : Message : 116.483466 s : * ranks/node     : 8 | ||||
| Grid : Message : 116.483466 s : * ranks geom     : 1 2 2 2  | ||||
| Grid : Message : 116.483467 s : * Using 1 threads | ||||
| Grid : Message : 116.483468 s : ================================================================================== | ||||
| Grid : Message : 116.489279 s : Initialised RNGs | ||||
| Grid : Message : 116.945016 s : ================================================================================== | ||||
| Grid : Message : 116.945025 s : * Using GENERIC Nc StaggeredKernels | ||||
| Grid : Message : 116.945026 s : * SINGLE precision  | ||||
| Grid : Message : 116.945027 s : ================================================================================== | ||||
| Grid : Message : 117.159821 s : Deo mflop/s =   229778.4 (89.5) 223656.1-233547.5 | ||||
| Grid : Message : 117.159835 s : Deo mflop/s per rank   28722.3 | ||||
| Grid : Message : 117.159837 s : Deo mflop/s per node   229778.4 | ||||
| Grid : Message : 117.159838 s : ================================================================================== | ||||
| Grid : Message : 117.159838 s : * SINGLE precision  | ||||
| Grid : Message : 117.159838 s : ================================================================================== | ||||
| Grid : Message : 117.371102 s : Deo mflop/s =   229516.6 (61.8) 225781.1-233547.5 | ||||
| Grid : Message : 117.371109 s : Deo mflop/s per rank   28689.6 | ||||
| Grid : Message : 117.371110 s : Deo mflop/s per node   229516.6 | ||||
| Grid : Message : 117.371111 s : ================================================================================== | ||||
| Grid : Message : 117.371111 s : 12^4  Deo Best  mflop/s        =   229778.4 ; 229778.4 per node  | ||||
| Grid : Message : 117.371113 s : 12^4  Deo Worst mflop/s        =   229516.6 ; 229516.6 per node  | ||||
| Grid : Message : 117.371115 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||
| Grid : Message : 117.371115 s : 229778.4 ; 229516.6 ;  | ||||
| Grid : Message : 117.371116 s : ================================================================================== | ||||
| Grid : Message : 117.373669 s : ================================================================================== | ||||
| Grid : Message : 117.373673 s : Benchmark ImprovedStaggered on 16^4 local volume  | ||||
| Grid : Message : 117.373674 s : * Global volume  : 16 32 32 32  | ||||
| Grid : Message : 117.373678 s : * ranks          : 8 | ||||
| Grid : Message : 117.373679 s : * nodes          : 1 | ||||
| Grid : Message : 117.373679 s : * ranks/node     : 8 | ||||
| Grid : Message : 117.373679 s : * ranks geom     : 1 2 2 2  | ||||
| Grid : Message : 117.373680 s : * Using 1 threads | ||||
| Grid : Message : 117.373681 s : ================================================================================== | ||||
| Grid : Message : 117.386495 s : Initialised RNGs | ||||
| Grid : Message : 118.755695 s : ================================================================================== | ||||
| Grid : Message : 118.755706 s : * Using GENERIC Nc StaggeredKernels | ||||
| Grid : Message : 118.755707 s : * SINGLE precision  | ||||
| Grid : Message : 118.755708 s : ================================================================================== | ||||
| Grid : Message : 119.178990 s : Deo mflop/s =   608844.0 (126.1) 596065.5-615608.7 | ||||
| Grid : Message : 119.179160 s : Deo mflop/s per rank   76105.5 | ||||
| Grid : Message : 119.179180 s : Deo mflop/s per node   608844.0 | ||||
| Grid : Message : 119.179190 s : ================================================================================== | ||||
| Grid : Message : 119.179200 s : * SINGLE precision  | ||||
| Grid : Message : 119.179200 s : ================================================================================== | ||||
| Grid : Message : 119.271093 s : Deo mflop/s =   605259.7 (188.7) 591372.1-614349.7 | ||||
| Grid : Message : 119.271101 s : Deo mflop/s per rank   75657.5 | ||||
| Grid : Message : 119.271103 s : Deo mflop/s per node   605259.7 | ||||
| Grid : Message : 119.271104 s : ================================================================================== | ||||
| Grid : Message : 119.271105 s : 16^4  Deo Best  mflop/s        =   608844.0 ; 608844.0 per node  | ||||
| Grid : Message : 119.271107 s : 16^4  Deo Worst mflop/s        =   605259.7 ; 605259.7 per node  | ||||
| Grid : Message : 119.271109 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||
| Grid : Message : 119.271109 s : 608844.0 ; 605259.7 ;  | ||||
| Grid : Message : 119.271110 s : ================================================================================== | ||||
| Grid : Message : 119.275303 s : ================================================================================== | ||||
| Grid : Message : 119.275308 s : Benchmark ImprovedStaggered on 24^4 local volume  | ||||
| Grid : Message : 119.275309 s : * Global volume  : 24 48 48 48  | ||||
| Grid : Message : 119.275315 s : * ranks          : 8 | ||||
| Grid : Message : 119.275316 s : * nodes          : 1 | ||||
| Grid : Message : 119.275317 s : * ranks/node     : 8 | ||||
| Grid : Message : 119.275317 s : * ranks geom     : 1 2 2 2  | ||||
| Grid : Message : 119.275318 s : * Using 1 threads | ||||
| Grid : Message : 119.275319 s : ================================================================================== | ||||
| Grid : Message : 119.328765 s : Initialised RNGs | ||||
| Grid : Message : 126.866160 s : ================================================================================== | ||||
| Grid : Message : 126.866270 s : * Using GENERIC Nc StaggeredKernels | ||||
| Grid : Message : 126.866280 s : * SINGLE precision  | ||||
| Grid : Message : 126.866290 s : ================================================================================== | ||||
| Grid : Message : 126.604376 s : Deo mflop/s =   1641161.6 (335.5) 1619660.5-1663961.9 | ||||
| Grid : Message : 126.604392 s : Deo mflop/s per rank   205145.2 | ||||
| Grid : Message : 126.604394 s : Deo mflop/s per node   1641161.6 | ||||
| Grid : Message : 126.604395 s : ================================================================================== | ||||
| Grid : Message : 126.604396 s : * SINGLE precision  | ||||
| Grid : Message : 126.604396 s : ================================================================================== | ||||
| Grid : Message : 127.829420 s : Deo mflop/s =   1620972.4 (344.9) 1602593.4-1644174.3 | ||||
| Grid : Message : 127.829520 s : Deo mflop/s per rank   202621.6 | ||||
| Grid : Message : 127.829530 s : Deo mflop/s per node   1620972.4 | ||||
| Grid : Message : 127.829540 s : ================================================================================== | ||||
| Grid : Message : 127.829550 s : 24^4  Deo Best  mflop/s        =   1641161.6 ; 1641161.6 per node  | ||||
| Grid : Message : 127.829570 s : 24^4  Deo Worst mflop/s        =   1620972.4 ; 1620972.4 per node  | ||||
| Grid : Message : 127.829590 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||
| Grid : Message : 127.829590 s : 1641161.6 ; 1620972.4 ;  | ||||
| Grid : Message : 127.829600 s : ================================================================================== | ||||
| Grid : Message : 127.107891 s : ================================================================================== | ||||
| Grid : Message : 127.107903 s : Benchmark ImprovedStaggered on 32^4 local volume  | ||||
| Grid : Message : 127.107904 s : * Global volume  : 32 64 64 64  | ||||
| Grid : Message : 127.107912 s : * ranks          : 8 | ||||
| Grid : Message : 127.107913 s : * nodes          : 1 | ||||
| Grid : Message : 127.107914 s : * ranks/node     : 8 | ||||
| Grid : Message : 127.107914 s : * ranks geom     : 1 2 2 2  | ||||
| Grid : Message : 127.107915 s : * Using 1 threads | ||||
| Grid : Message : 127.107916 s : ================================================================================== | ||||
| Grid : Message : 127.257116 s : Initialised RNGs | ||||
| Grid : Message : 148.527930 s : ================================================================================== | ||||
| Grid : Message : 148.527941 s : * Using GENERIC Nc StaggeredKernels | ||||
| Grid : Message : 148.527942 s : * SINGLE precision  | ||||
| Grid : Message : 148.527943 s : ================================================================================== | ||||
| Grid : Message : 149.401625 s : Deo mflop/s =   3085543.7 (956.0) 2934476.4-3115147.4 | ||||
| Grid : Message : 149.401643 s : Deo mflop/s per rank   385693.0 | ||||
| Grid : Message : 149.401645 s : Deo mflop/s per node   3085543.7 | ||||
| Grid : Message : 149.401646 s : ================================================================================== | ||||
| Grid : Message : 149.401647 s : * SINGLE precision  | ||||
| Grid : Message : 149.401648 s : ================================================================================== | ||||
| Grid : Message : 150.204533 s : Deo mflop/s =   3053468.5 (343.9) 3030688.8-3077255.0 | ||||
| Grid : Message : 150.204540 s : Deo mflop/s per rank   381683.6 | ||||
| Grid : Message : 150.204541 s : Deo mflop/s per node   3053468.5 | ||||
| Grid : Message : 150.204542 s : ================================================================================== | ||||
| Grid : Message : 150.204543 s : 32^4  Deo Best  mflop/s        =   3085543.7 ; 3085543.7 per node  | ||||
| Grid : Message : 150.204545 s : 32^4  Deo Worst mflop/s        =   3053468.5 ; 3053468.5 per node  | ||||
| Grid : Message : 150.204547 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||
| Grid : Message : 150.204547 s : 3085543.7 ; 3053468.5 ;  | ||||
| Grid : Message : 150.204548 s : ================================================================================== | ||||
| Grid : Message : 150.292848 s : ================================================================================== | ||||
| Grid : Message : 150.292864 s :  Summary table Ls=12 | ||||
| Grid : Message : 150.292866 s : ================================================================================== | ||||
| Grid : Message : 150.292866 s : L 		 Clover 		 DWF4 		 Staggered | ||||
| Grid : Message : 150.292867 s : 8 		 154914.0 		 1386335.8 		 49737.1 | ||||
| Grid : Message : 150.292880 s : 12 		 693556.6 		 4208495.6 		 229778.4 | ||||
| Grid : Message : 150.292882 s : 16 		 1840587.3 		 6674673.6 		 608844.0 | ||||
| Grid : Message : 150.292884 s : 24 		 3933599.5 		 8572660.7 		 1641161.6 | ||||
| Grid : Message : 150.292886 s : 32 		 5082758.0 		 9771387.8 		 3085543.7 | ||||
| Grid : Message : 150.292888 s : ================================================================================== | ||||
| Grid : Message : 150.292888 s : ================================================================================== | ||||
| Grid : Message : 150.292888 s :  Memory benchmark  | ||||
| Grid : Message : 150.292888 s : ================================================================================== | ||||
| Grid : Message : 150.295495 s : ================================================================================== | ||||
| Grid : Message : 150.295497 s : = Benchmarking a*x + y bandwidth | ||||
| Grid : Message : 150.295498 s : ================================================================================== | ||||
| Grid : Message : 150.295499 s :   L  		bytes			GB/s		Gflop/s		 seconds		GB/s / node | ||||
| Grid : Message : 150.295500 s : ---------------------------------------------------------- | ||||
| Grid : Message : 160.682233 s : 8		6291456.000   		379.297		31.608		10.367		379.297 | ||||
| Grid : Message : 161.851979 s : 16		100663296.000   		3754.675		312.890		1.047		3754.675 | ||||
| Grid : Message : 162.458098 s : 24		509607936.000   		6521.472		543.456		0.603		6521.472 | ||||
| Grid : Message : 162.924116 s : 32		1610612736.000   		8513.456		709.455		0.462		8513.456 | ||||
| Grid : Message : 163.363877 s : 40		3932160000.000   		9018.902		751.575		0.436		9018.902 | ||||
| Grid : Message : 163.363976 s : ================================================================================== | ||||
| Grid : Message : 163.363978 s :  Batched BLAS benchmark  | ||||
| Grid : Message : 163.363979 s : ================================================================================== | ||||
| hipblasCreate | ||||
| Grid : Message : 163.364046 s : ================================================================================== | ||||
| Grid : Message : 163.364048 s : = batched GEMM (double precision)  | ||||
| Grid : Message : 163.364048 s : ================================================================================== | ||||
| Grid : Message : 163.364048 s :   M  		N			K		Gflop/s / rank (coarse mrhs) | ||||
| Grid : Message : 163.364049 s : ---------------------------------------------------------- | ||||
| Grid : Message : 163.438476 s : 16		8		16		256		0.565 | ||||
| Grid : Message : 163.438944 s : 16		16		16		256		243.148 | ||||
| Grid : Message : 163.439501 s : 16		32		16		256		440.347 | ||||
| Grid : Message : 163.440003 s : 32		8		32		256		439.194 | ||||
| Grid : Message : 163.440463 s : 32		16		32		256		847.334 | ||||
| Grid : Message : 163.441051 s : 32		32		32		256		1430.893 | ||||
| Grid : Message : 163.441679 s : 64		8		64		256		1242.757 | ||||
| Grid : Message : 163.442354 s : 64		16		64		256		2196.689 | ||||
| Grid : Message : 163.443196 s : 64		32		64		256		3697.458 | ||||
| Grid : Message : 163.443200 s : ---------------------------------------------------------- | ||||
| Grid : Message : 163.443201 s :   M  		N			K		Gflop/s / rank (block project) | ||||
| Grid : Message : 163.443202 s : ---------------------------------------------------------- | ||||
| Grid : Message : 163.444013 s : 16		8		256		256		899.583 | ||||
| Grid : Message : 163.444933 s : 16		16		256		256		1673.538 | ||||
| Grid : Message : 163.446013 s : 16		32		256		256		2959.597 | ||||
| Grid : Message : 163.446951 s : 32		8		256		256		1558.859 | ||||
| Grid : Message : 163.447970 s : 32		16		256		256		2864.839 | ||||
| Grid : Message : 163.449240 s : 32		32		256		256		4810.671 | ||||
| Grid : Message : 163.450524 s : 64		8		256		256		2386.093 | ||||
| Grid : Message : 163.451877 s : 64		16		256		256		4451.666 | ||||
| Grid : Message : 163.453806 s : 64		32		256		256		5942.124 | ||||
| Grid : Message : 163.453809 s : ---------------------------------------------------------- | ||||
| Grid : Message : 163.453810 s :   M  		N			K		Gflop/s / rank (block promote) | ||||
| Grid : Message : 163.453811 s : ---------------------------------------------------------- | ||||
| Grid : Message : 163.454716 s : 8		256		16		256		799.867 | ||||
| Grid : Message : 163.455690 s : 16		256		16		256		1584.625 | ||||
| Grid : Message : 163.457209 s : 32		256		16		256		1949.422 | ||||
| Grid : Message : 163.458254 s : 8		256		32		256		1389.417 | ||||
| Grid : Message : 163.459339 s : 16		256		32		256		2668.344 | ||||
| Grid : Message : 163.461158 s : 32		256		32		256		3234.162 | ||||
| Grid : Message : 163.462566 s : 8		256		64		256		2150.925 | ||||
| Grid : Message : 163.464066 s : 16		256		64		256		4012.488 | ||||
| Grid : Message : 163.466272 s : 32		256		64		256		5154.786 | ||||
| Grid : Message : 163.466276 s : ================================================================================== | ||||
| Grid : Message : 163.466277 s : ================================================================================== | ||||
| Grid : Message : 163.466278 s :  Communications benchmark  | ||||
| Grid : Message : 163.466279 s : ================================================================================== | ||||
| Grid : Message : 163.466280 s : ==================================================================================================== | ||||
| Grid : Message : 163.466280 s : = Benchmarking threaded STENCIL halo exchange in 3 dimensions | ||||
| Grid : Message : 163.466281 s : ==================================================================================================== | ||||
| Grid : Message : 163.466281 s :  L  	 Ls  	bytes	 MB/s uni  		 MB/s bidi  | ||||
| Grid : Message : 163.521339 s : 16	12	 4718592 	 122513.099		245026.198 | ||||
| Grid : Message : 163.551417 s : 16	12	 4718592 	 125590.498		251180.996 | ||||
| Grid : Message : 163.572339 s : 16	12	 4718592 	 180555.489		361110.977 | ||||
| Grid : Message : 163.602810 s : 16	12	 4718592 	 123949.223		247898.447 | ||||
| Grid : Message : 163.633041 s : 16	12	 4718592 	 124933.761		249867.523 | ||||
| Grid : Message : 163.654084 s : 16	12	 4718592 	 179516.530		359033.061 | ||||
| Grid : Message : 163.756280 s : 24	12	 15925248 	 127515.473		255030.946 | ||||
| Grid : Message : 163.852651 s : 24	12	 15925248 	 132226.945		264453.890 | ||||
| Grid : Message : 163.917510 s : 24	12	 15925248 	 196474.591		392949.183 | ||||
| Grid : Message : 164.170390 s : 24	12	 15925248 	 128020.322		256040.644 | ||||
| Grid : Message : 164.113321 s : 24	12	 15925248 	 132340.948		264681.896 | ||||
| Grid : Message : 164.178314 s : 24	12	 15925248 	 196051.311		392102.622 | ||||
| Grid : Message : 164.413983 s : 32	12	 37748736 	 129411.666		258823.333 | ||||
| Grid : Message : 164.639218 s : 32	12	 37748736 	 134090.789		268181.577 | ||||
| Grid : Message : 164.789675 s : 32	12	 37748736 	 200739.096		401478.191 | ||||
| Grid : Message : 165.228910 s : 32	12	 37748736 	 129497.681		258995.363 | ||||
| Grid : Message : 165.248096 s : 32	12	 37748736 	 134103.293		268206.586 | ||||
| Grid : Message : 165.398958 s : 32	12	 37748736 	 200198.805		400397.611 | ||||
| Grid : Message : 165.399411 s : ================================================================================== | ||||
| Grid : Message : 165.399413 s :  Per Node Summary table Ls=12 | ||||
| Grid : Message : 165.399414 s : ================================================================================== | ||||
| Grid : Message : 165.399414 s :  L 		 Clover		 DWF4		 Staggered (GF/s per node) | ||||
| Grid : Message : 165.399417 s : 8 		 154914.003 	 1386335.817 	 49737.127 | ||||
| Grid : Message : 165.399423 s : 12 		 693556.579 	 4208495.611 	 229778.435 | ||||
| Grid : Message : 165.399426 s : 16 		 1840587.280 	 6674673.647 	 608844.000 | ||||
| Grid : Message : 165.399429 s : 24 		 3933599.545 	 8572660.656 	 1641161.613 | ||||
| Grid : Message : 165.399432 s : 32 		 5082757.996 	 9771387.820 	 3085543.742 | ||||
| Grid : Message : 165.399435 s : ================================================================================== | ||||
| Grid : Message : 165.399435 s : ================================================================================== | ||||
| Grid : Message : 165.399435 s :  Comparison point     result: 9172024.238 Mflop/s per node | ||||
| Grid : Message : 165.399436 s :  Comparison point is 0.5*(9771387.820+8572660.656)  | ||||
| Grid : Message : 165.399438 s : ================================================================================== | ||||
| Grid : Message : 165.399438 s : ******************************************* | ||||
| Grid : Message : 165.399438 s : ******* Grid Finalize                ****** | ||||
| Grid : Message : 165.399438 s : ******************************************* | ||||
							
								
								
									
38	systems/Frontier/benchmarks/benchusqcd.slurm	Executable file
							| @@ -0,0 +1,38 @@ | ||||
| #!/bin/bash -l | ||||
| #SBATCH --job-name=bench | ||||
| ##SBATCH --partition=small-g | ||||
| ##SBATCH -q debug | ||||
| #SBATCH --nodes=1 | ||||
| #SBATCH --ntasks-per-node=8 | ||||
| #SBATCH --cpus-per-task=7 | ||||
| #SBATCH --gpus-per-node=8 | ||||
| #SBATCH --time=00:30:00 | ||||
| #SBATCH --account=phy157_dwf | ||||
| #SBATCH --gpu-bind=none | ||||
| #SBATCH --exclusive | ||||
| #SBATCH --mem=0 | ||||
|  | ||||
| cat << EOF > select_gpu | ||||
| #!/bin/bash | ||||
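| # One GCD (GPU) per local MPI rank; memory and cores are pinned to the NUMA domain attached to that GCD | ||||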
| export GPU_MAP=(0 1 2 3 7 6 5 4) | ||||
| export NUMA_MAP=(3 3 1 1 2 2 0 0) | ||||
| export GPU=\${GPU_MAP[\$SLURM_LOCALID]} | ||||
| export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]} | ||||
| export HIP_VISIBLE_DEVICES=\$GPU | ||||
| unset ROCR_VISIBLE_DEVICES | ||||
| echo RANK \$SLURM_LOCALID using GPU \$GPU     | ||||
| exec numactl -m \$NUMA -N \$NUMA \$* | ||||
| EOF | ||||
|  | ||||
| chmod +x ./select_gpu | ||||
|  | ||||
| root=$HOME/Frontier/Grid/systems/Frontier/ | ||||
| source ${root}/sourceme.sh | ||||
|  | ||||
| export OMP_NUM_THREADS=7 | ||||
| export MPICH_GPU_SUPPORT_ENABLED=1 | ||||
| #export MPICH_SMP_SINGLE_COPY_MODE=XPMEM | ||||
|  | ||||
| srun ./select_gpu ./Benchmark_usqcd --grid 32.32.32.32 --mpi 1.2.2.2 --accelerator-threads 8 --comms-overlap --shm 4096 --shm-mpi 0  > Benchmark_usqcd.log | ||||
|  | ||||
|  | ||||
| @@ -3,7 +3,7 @@ spack load c-lime | ||||
| module load emacs  | ||||
| module load PrgEnv-gnu | ||||
| module load rocm | ||||
| module load cray-mpich/8.1.23 | ||||
| module load cray-mpich | ||||
| module load gmp | ||||
| module load cray-fftw | ||||
| module load craype-accel-amd-gfx90a | ||||
|   | ||||
| @@ -2,11 +2,11 @@ | ||||
|     --enable-comms=mpi \ | ||||
|     --enable-simd=GPU \ | ||||
|     --enable-shm=nvlink \ | ||||
|     --enable-gen-simd-width=64 \ | ||||
|     --enable-accelerator=cuda \ | ||||
|     --enable-gen-simd-width=64 \ | ||||
|     --disable-gparity \ | ||||
|     --with-lime=/mnt/lustre/tursafs1/home/tc002/tc002/dc-boyl1/spack/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/c-lime-2-3-9-e6wxqrid6rqmd45z7n32dxkvkykpvyez \ | ||||
|     --enable-accelerator-cshift \ | ||||
|     --disable-unified \ | ||||
|     CXX=nvcc \ | ||||
|     LDFLAGS="-cudart shared " \ | ||||
|     CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++14 -cudart shared" | ||||
|     LDFLAGS="-cudart shared -lcublas " \ | ||||
|     CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++17 -cudart shared --diag-suppress 177,550,611" | ||||
|   | ||||
| @@ -1,6 +1,7 @@ | ||||
| module load cuda/11.4.1  openmpi/4.1.1-cuda11.4.1  ucx/1.12.0-cuda11.4.1   | ||||
| #module load cuda/11.4.1 openmpi/4.1.1 ucx/1.10.1 | ||||
| export PREFIX=/home/tc002/tc002/shared/env/prefix/ | ||||
| export LD_LIBRARY_PATH=$PREFIX/lib/:$LD_LIBRARY_PATH | ||||
| module load cuda/12.3  | ||||
| module load ucx/1.15.0-cuda12.3   | ||||
| module load openmpi/4.1.5-cuda12.3 | ||||
| source /home/dp207/dp207/shared/env/production/env-base.sh  | ||||
| source /home/dp207/dp207/shared/env/production/env-gpu.sh  | ||||
| unset SBATCH_EXPORT | ||||
|  | ||||
|   | ||||
| @@ -142,7 +142,9 @@ int main (int argc, char ** argv) | ||||
|   std:: cout << " CG    site flops = "<< CGsiteflops <<std::endl; | ||||
|   int iters; | ||||
|  | ||||
|   time_t now; | ||||
|   time_t start = time(NULL); | ||||
|   UGrid->Broadcast(0,(void *)&start,sizeof(start)); | ||||
|  | ||||
|   FlightRecorder::ContinueOnFail = 0; | ||||
|   FlightRecorder::PrintEntireLog = 0; | ||||
| @@ -162,9 +164,9 @@ int main (int argc, char ** argv) | ||||
|     } | ||||
|     std::cerr << "******************* SINGLE PRECISION SOLVE "<<iter<<std::endl; | ||||
|     result_o = Zero(); | ||||
|     t1=usecond(); | ||||
|     t1=usecond();  | ||||
|     mCG(src_o,result_o); | ||||
|     t2=usecond(); | ||||
|     t2=usecond();  | ||||
|     iters = mCG.TotalInnerIterations; //Number of inner CG iterations | ||||
|     flops = MdagMsiteflops*4*FrbGrid->gSites()*iters; | ||||
|     flops+= CGsiteflops*FrbGrid->gSites()*iters; | ||||
| @@ -176,7 +178,8 @@ int main (int argc, char ** argv) | ||||
|  | ||||
|     std::cout << " FlightRecorder is OK! "<<std::endl; | ||||
|     iter ++; | ||||
|   } while (time(NULL) < (start + nsecs/10) ); | ||||
|     now = time(NULL); UGrid->Broadcast(0,(void *)&now,sizeof(now)); | ||||
|   } while (now < (start + nsecs/10) ); | ||||
|      | ||||
|   std::cout << GridLogMessage << "::::::::::::: Starting double precision CG" << std::endl; | ||||
|   ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000); | ||||
| @@ -189,7 +192,7 @@ int main (int argc, char ** argv) | ||||
|     } | ||||
|     std::cerr << "******************* DOUBLE PRECISION SOLVE "<<i<<std::endl; | ||||
|     result_o_2 = Zero(); | ||||
|     t1=usecond(); | ||||
|     t1=usecond();  | ||||
|     CG(HermOpEO,src_o,result_o_2); | ||||
|     t2=usecond(); | ||||
|     iters = CG.IterationsToComplete; | ||||
| @@ -201,8 +204,9 @@ int main (int argc, char ** argv) | ||||
|     std::cout << " DoublePrecision error count "<< FlightRecorder::ErrorCount()<<std::endl; | ||||
|     assert(FlightRecorder::ErrorCount()==0); | ||||
|     std::cout << " FlightRecorder is OK! "<<std::endl; | ||||
|     now = time(NULL); UGrid->Broadcast(0,(void *)&now,sizeof(now)); | ||||
|     i++; | ||||
|   } while (time(NULL) < (start + nsecs) ); | ||||
|   } while (now < (start + nsecs) ); | ||||
|  | ||||
|   LatticeFermionD diff_o(FrbGrid); | ||||
|   RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2); | ||||
|   | ||||
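The hunk above changes the timed benchmark loops so that rank 0's wall clock is broadcast to every rank before the `while` test; without this, ranks whose clocks drift could leave the loop on different iterations and desynchronise the collectives inside the solver. A minimal sketch of the same pattern, using only the Grid calls that already appear in the hunk (the 60-second `nsecs` value and the placeholder solve are illustrative):

#include <Grid/Grid.h>
#include <ctime>

using namespace Grid;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);
  GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
                                                        GridDefaultSimd(Nd, vComplex::Nsimd()),
                                                        GridDefaultMpi());
  const int nsecs = 60;                                // illustrative run length in seconds

  time_t start = time(NULL);
  UGrid->Broadcast(0, (void *)&start, sizeof(start));  // rank 0 decides when the clock started

  time_t now;
  do {
    // ... one benchmark solve goes here ...
    now = time(NULL);
    UGrid->Broadcast(0, (void *)&now, sizeof(now));    // every rank sees rank 0's notion of "now"
  } while (now < (start + nsecs));

  Grid_finalize();
  return 0;
}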
							
								
								
									
118	tests/debug/Test_8888.cc	Normal file
							| @@ -0,0 +1,118 @@ | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
|     Source file: ./tests/Test_general_coarse_hdcg.cc | ||||
|  | ||||
|     Copyright (C) 2023 | ||||
|  | ||||
| Author: Peter Boyle <pboyle@bnl.gov> | ||||
|  | ||||
|     This program is free software; you can redistribute it and/or modify | ||||
|     it under the terms of the GNU General Public License as published by | ||||
|     the Free Software Foundation; either version 2 of the License, or | ||||
|     (at your option) any later version. | ||||
|  | ||||
|     This program is distributed in the hope that it will be useful, | ||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|     GNU General Public License for more details. | ||||
|  | ||||
|     You should have received a copy of the GNU General Public License along | ||||
|     with this program; if not, write to the Free Software Foundation, Inc., | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| #include <Grid/Grid.h> | ||||
| #include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h> | ||||
| #include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h> | ||||
| #include <Grid/algorithms/iterative/AdefMrhs.h> | ||||
|  | ||||
| using namespace std; | ||||
| using namespace Grid; | ||||
|  | ||||
| int main (int argc, char ** argv) | ||||
| { | ||||
|   Grid_init(&argc,&argv); | ||||
|  | ||||
|   const int Ls=8; | ||||
|   const int nbasis = 40; | ||||
|   const int cb = 0 ; | ||||
|   RealD mass=0.01; | ||||
|   RealD M5=1.8; | ||||
|   RealD b=1.0; | ||||
|   RealD c=0.0; | ||||
|  | ||||
|   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), | ||||
| 								   GridDefaultSimd(Nd,vComplex::Nsimd()), | ||||
| 								   GridDefaultMpi()); | ||||
|   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); | ||||
|   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); | ||||
|   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); | ||||
|  | ||||
|   ///////////////////////// RNGs ///////////////////////////////// | ||||
|   std::vector<int> seeds4({1,2,3,4}); | ||||
|   std::vector<int> seeds5({5,6,7,8}); | ||||
|   std::vector<int> cseeds({5,6,7,8}); | ||||
|  | ||||
|   GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5); | ||||
|   GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4); | ||||
|  | ||||
|   ///////////////////////// Configuration ///////////////////////////////// | ||||
|   LatticeGaugeField Umu(UGrid); | ||||
|  | ||||
|   FieldMetaData header; | ||||
|   std::string file("ckpoint_EODWF_lat.125"); | ||||
|   NerscIO::readConfiguration(Umu,header,file); | ||||
|  | ||||
|   //////////////////////// Fermion action ////////////////////////////////// | ||||
|   MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c); | ||||
|  | ||||
|   MdagMLinearOperator<MobiusFermionD, LatticeFermion> HermOp(Ddwf); | ||||
|  | ||||
|    | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   std::cout << "         Fine Power method            "<<std::endl; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|  | ||||
|   LatticeFermionD pm_src(FGrid); | ||||
|   pm_src = ComplexD(1.0); | ||||
|   PowerMethod<LatticeFermionD>       fPM; | ||||
|   fPM(HermOp,pm_src); | ||||
|  | ||||
|    | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   std::cout << "         Fine Lanczos  (poly, low)    "<<std::endl; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|    | ||||
|   int Nk=80; | ||||
|   int Nm=Nk*3; | ||||
|   int Nstop=8; | ||||
|   int Nconv_test_interval=1; | ||||
|    | ||||
|   //  Chebyshev<LatticeFermionD>      IRLChebyLo(0.2,64.0,201);  // 1 iter | ||||
|   Chebyshev<LatticeFermionD>      IRLChebyLo(0.0,55.0,101);  // 1 iter | ||||
|   FunctionHermOp<LatticeFermionD>    PolyOp(IRLChebyLo,HermOp); | ||||
|   PlainHermOp<LatticeFermionD>          Op(HermOp); | ||||
|  | ||||
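|   // The Chebyshev-filtered PolyOp drives the Lanczos iteration; the unfiltered Op is kept for the eigenvalue/convergence check | ||||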
|   ImplicitlyRestartedLanczos IRL(PolyOp, | ||||
| 				 Op, | ||||
| 				 Nk, // sought vecs | ||||
| 				 Nk, // sought vecs | ||||
| 				 Nm, // spare vecs | ||||
| 				 1.0e-8, | ||||
| 				 10 // Max iterations | ||||
| 				 ); | ||||
|  | ||||
|   int Nconv; | ||||
|   std::vector<RealD>            eval(Nm); | ||||
|   std::vector<LatticeFermionD>     evec(Nm,FGrid); | ||||
|   LatticeFermionD     irl_src(FGrid); | ||||
|  | ||||
|   IRL.calc(eval,evec,irl_src,Nconv); | ||||
|  | ||||
|   Grid_finalize(); | ||||
|   return 0; | ||||
| } | ||||
| @@ -392,9 +392,27 @@ void  TestCGschur(What & Ddwf, | ||||
| 		   GridParallelRNG *RNG5) | ||||
| { | ||||
|   LatticeFermion src   (FGrid); random(*RNG5,src); | ||||
|   LatticeFermion result(FGrid); result=Zero(); | ||||
|   LatticeFermion result1(FGrid); result1=Zero(); | ||||
|   LatticeFermion result2(FGrid); result2=Zero(); | ||||
|   LatticeFermion result3(FGrid); result3=Zero(); | ||||
|  | ||||
|   ConjugateGradient<LatticeFermion> CG(1.0e-8,10000); | ||||
|   SchurRedBlackDiagMooeeSolve<LatticeFermion> SchurSolver(CG); | ||||
|   SchurSolver(Ddwf,src,result); | ||||
|   SchurSolver(Ddwf,src,result1); | ||||
|  | ||||
|   SchurRedBlackDiagOneSolve<LatticeFermion> SchurSolverSymm1(CG); | ||||
|   SchurSolverSymm1(Ddwf,src,result2); | ||||
|  | ||||
|   SchurRedBlackDiagTwoSolve<LatticeFermion> SchurSolverSymm2(CG); | ||||
|   SchurSolverSymm2(Ddwf,src,result3); | ||||
|  | ||||
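|   // All three Schur factorisations solve the same full system, so the two diff norms below should vanish to the CG tolerance | ||||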
|   std::cout << GridLogMessage << " Standard " <<norm2(result1)<<std::endl; | ||||
|  | ||||
|   std::cout << GridLogMessage << " Symm1    " <<norm2(result2)<<std::endl;  | ||||
|   result2=result2-result1; | ||||
|   std::cout << GridLogMessage << " diff " <<norm2(result2) <<std::endl;  | ||||
|  | ||||
|   std::cout << GridLogMessage << " Symm2    " <<norm2(result3)<<std::endl;  | ||||
|   result3=result3-result1; | ||||
|   std::cout << GridLogMessage << " diff " <<norm2(result3) <<std::endl;  | ||||
| } | ||||
|   | ||||
| @@ -244,7 +244,7 @@ int main (int argc, char ** argv) | ||||
|  | ||||
|   GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi);  | ||||
|  | ||||
|    | ||||
| #if 0   | ||||
|   MultiGeneralCoarsenedMatrix mrhs(LittleDiracOp,CoarseMrhs); | ||||
|   typedef decltype(mrhs) MultiGeneralCoarsenedMatrix_t; | ||||
|    | ||||
| @@ -307,7 +307,8 @@ int main (int argc, char ** argv) | ||||
|     rh_res= Zero(); | ||||
|     mrhsCG(MrhsCoarseOp,rh_src,rh_res); | ||||
|   } | ||||
|    | ||||
|  | ||||
| #endif | ||||
|   std::cout<<GridLogMessage<<std::endl; | ||||
|   std::cout<<GridLogMessage<<std::endl; | ||||
|   std::cout<<GridLogMessage<<"*******************************************"<<std::endl; | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -26,84 +26,13 @@ Author: Peter Boyle <pboyle@bnl.gov> | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| #include <Grid/Grid.h> | ||||
| #include <Grid/lattice/PaddedCell.h> | ||||
| #include <Grid/stencil/GeneralLocalStencil.h> | ||||
| //#include <Grid/algorithms/GeneralCoarsenedMatrix.h> | ||||
| #include <Grid/algorithms/iterative/AdefGeneric.h> | ||||
| #include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h> | ||||
| #include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h> | ||||
| #include <Grid/algorithms/iterative/AdefMrhs.h> | ||||
|  | ||||
| using namespace std; | ||||
| using namespace Grid; | ||||
|  | ||||
| template<class Coarsened> | ||||
| void SaveOperator(Coarsened &Operator,std::string file) | ||||
| { | ||||
| #ifdef HAVE_LIME | ||||
|   emptyUserRecord record; | ||||
|   ScidacWriter WR(Operator.Grid()->IsBoss()); | ||||
|   assert(Operator._A.size()==Operator.geom.npoint); | ||||
|   WR.open(file); | ||||
|   for(int p=0;p<Operator._A.size();p++){ | ||||
|     auto tmp = Operator.Cell.Extract(Operator._A[p]); | ||||
|     WR.writeScidacFieldRecord(tmp,record); | ||||
|   } | ||||
|   WR.close(); | ||||
| #endif | ||||
| } | ||||
| template<class Coarsened> | ||||
| void LoadOperator(Coarsened &Operator,std::string file) | ||||
| { | ||||
| #ifdef HAVE_LIME | ||||
|   emptyUserRecord record; | ||||
|   Grid::ScidacReader RD ; | ||||
|   RD.open(file); | ||||
|   assert(Operator._A.size()==Operator.geom.npoint); | ||||
|   for(int p=0;p<Operator.geom.npoint;p++){ | ||||
|     conformable(Operator._A[p].Grid(),Operator.CoarseGrid()); | ||||
|     RD.readScidacFieldRecord(Operator._A[p],record); | ||||
|   }     | ||||
|   RD.close(); | ||||
|   Operator.ExchangeCoarseLinks(); | ||||
| #endif | ||||
| } | ||||
| template<class aggregation> | ||||
| void SaveBasis(aggregation &Agg,std::string file) | ||||
| { | ||||
| #ifdef HAVE_LIME | ||||
|   emptyUserRecord record; | ||||
|   ScidacWriter WR(Agg.FineGrid->IsBoss()); | ||||
|   WR.open(file); | ||||
|   for(int b=0;b<Agg.subspace.size();b++){ | ||||
|     WR.writeScidacFieldRecord(Agg.subspace[b],record); | ||||
|   } | ||||
|   WR.close(); | ||||
| #endif | ||||
| } | ||||
| template<class aggregation> | ||||
| void LoadBasis(aggregation &Agg, std::string file) | ||||
| { | ||||
| #ifdef HAVE_LIME | ||||
|   emptyUserRecord record; | ||||
|   ScidacReader RD ; | ||||
|   RD.open(file); | ||||
|   for(int b=0;b<Agg.subspace.size();b++){ | ||||
|     RD.readScidacFieldRecord(Agg.subspace[b],record); | ||||
|   }     | ||||
|   RD.close(); | ||||
| #endif | ||||
| } | ||||
|  | ||||
|  | ||||
| template<class Field> class TestSolver : public LinearFunction<Field> { | ||||
| public: | ||||
|   TestSolver() {}; | ||||
|   void operator() (const Field &in, Field &out){    out = Zero();  }      | ||||
| }; | ||||
|  | ||||
|  | ||||
| RealD InverseApproximation(RealD x){ | ||||
|   return 1.0/x; | ||||
| } | ||||
|  | ||||
| // Want Op in CoarsenOp to call MatPcDagMatPc | ||||
| template<class Field> | ||||
| class HermOpAdaptor : public LinearOperatorBase<Field> | ||||
| @@ -119,33 +48,37 @@ public: | ||||
|   void OpDirAll  (const Field &in, std::vector<Field> &out)  {    assert(0);  }; | ||||
|   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  } | ||||
| }; | ||||
| template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field> | ||||
|  | ||||
| template<class Field> class CGSmoother : public LinearFunction<Field> | ||||
| { | ||||
| public: | ||||
|   using LinearFunction<Field>::operator(); | ||||
|   typedef LinearOperatorBase<Field> FineOperator; | ||||
|   FineOperator   & _SmootherOperator; | ||||
|   Chebyshev<Field> Cheby; | ||||
|   ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator) : | ||||
|   int iters; | ||||
|   CGSmoother(int _iters, FineOperator &SmootherOperator) : | ||||
|     _SmootherOperator(SmootherOperator), | ||||
|     Cheby(_lo,_hi,_ord,InverseApproximation) | ||||
|     iters(_iters) | ||||
|   { | ||||
|     std::cout << GridLogMessage<<" Chebyshev smoother order "<<_ord<<" ["<<_lo<<","<<_hi<<"]"<<std::endl; | ||||
|     std::cout << GridLogMessage<<" Mirs smoother order "<<iters<<std::endl; | ||||
|   }; | ||||
|   void operator() (const Field &in, Field &out)  | ||||
|   { | ||||
|     Field tmp(in.Grid()); | ||||
|     tmp = in; | ||||
|     Cheby(_SmootherOperator,tmp,out); | ||||
|     ConjugateGradient<Field>  CG(0.0,iters,false); // non-converge is just fine in a smoother | ||||
|  | ||||
|     out=Zero(); | ||||
|  | ||||
|     CG(_SmootherOperator,in,out); | ||||
|   } | ||||
| }; | ||||
|  | ||||
|  | ||||
| int main (int argc, char ** argv) | ||||
| { | ||||
|   Grid_init(&argc,&argv); | ||||
|  | ||||
|   const int Ls=24; | ||||
|   const int nbasis = 40; | ||||
|   const int nbasis = 60; | ||||
|   const int cb = 0 ; | ||||
|   RealD mass=0.00078; | ||||
|   RealD M5=1.8; | ||||
| @@ -160,10 +93,12 @@ int main (int argc, char ** argv) | ||||
|   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); | ||||
|  | ||||
|   // Construct a coarsened grid with 4^4 cell | ||||
|   Coordinate Block({4,4,4,4}); | ||||
|   Coordinate clatt = GridDefaultLatt(); | ||||
|   for(int d=0;d<clatt.size();d++){ | ||||
|     clatt[d] = clatt[d]/4; | ||||
|     clatt[d] = clatt[d]/Block[d]; | ||||
|   } | ||||
|  | ||||
|   GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, | ||||
| 							    GridDefaultSimd(Nd,vComplex::Nsimd()), | ||||
| 							    GridDefaultMpi());; | ||||
| @@ -182,7 +117,7 @@ int main (int argc, char ** argv) | ||||
|   LatticeGaugeField Umu(UGrid); | ||||
|  | ||||
|   FieldMetaData header; | ||||
|   std::string file("ckpoint_lat.4000"); | ||||
|   std::string file("ckpoint_EODWF_lat.125"); | ||||
|   NerscIO::readConfiguration(Umu,header,file); | ||||
|  | ||||
|   //////////////////////// Fermion action ////////////////////////////////// | ||||
| @@ -192,15 +127,7 @@ int main (int argc, char ** argv) | ||||
|  | ||||
|   typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix; | ||||
|   HermFineMatrix FineHermOp(HermOpEO); | ||||
|    | ||||
|   LatticeFermion result(FrbGrid); result=Zero(); | ||||
|  | ||||
|   LatticeFermion    src(FrbGrid); random(RNG5,src); | ||||
|  | ||||
|   // Run power method on FineHermOp | ||||
|   PowerMethod<LatticeFermion>       PM;   PM(HermOpEO,src); | ||||
|  | ||||
|   | ||||
|   //////////////////////////////////////////////////////////// | ||||
|   ///////////// Coarse basis and Little Dirac Operator /////// | ||||
|   //////////////////////////////////////////////////////////// | ||||
| @@ -208,219 +135,170 @@ int main (int argc, char ** argv) | ||||
|   typedef LittleDiracOperator::CoarseVector CoarseVector; | ||||
|  | ||||
|   NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d); | ||||
|   NearestStencilGeometry5D geom_nn(Coarse5d); | ||||
|    | ||||
|   // Warning: This routine calls PVdagM.Op, not PVdagM.HermOp | ||||
|  | ||||
|   typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace; | ||||
|   Subspace Aggregates(Coarse5d,FrbGrid,cb); | ||||
|  | ||||
|   //////////////////////////////////////////////////////////// | ||||
|   // Need to check about red-black grid coarsening | ||||
|   //////////////////////////////////////////////////////////// | ||||
|   LittleDiracOperator LittleDiracOp(geom,FrbGrid,Coarse5d); | ||||
|  | ||||
|   bool load=false; | ||||
|   if ( load ) { | ||||
|     LoadBasis(Aggregates,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.scidac"); | ||||
|     LoadOperator(LittleDiracOp,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.scidac"); | ||||
|   } else { | ||||
|     Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis, | ||||
| 				       95.0,0.1, | ||||
| 				       //				     400,200,200 -- 48 iters | ||||
| 				       //				     600,200,200 -- 38 iters, 162s | ||||
| 				       //				     600,200,100 -- 38 iters, 169s | ||||
| 				       //				     600,200,50  -- 88 iters. 370s  | ||||
| 				       800, | ||||
| 				       200, | ||||
| 				       100, | ||||
| 				       0.0); | ||||
|     LittleDiracOp.CoarsenOperator(FineHermOp,Aggregates); | ||||
|     SaveBasis(Aggregates,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.scidac"); | ||||
|     SaveOperator(LittleDiracOp,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.scidac"); | ||||
|   } | ||||
|    | ||||
|   // Try projecting to one hop only | ||||
|   LittleDiracOperator LittleDiracOpProj(geom_nn,FrbGrid,Coarse5d); | ||||
|   LittleDiracOpProj.ProjectNearestNeighbour(0.01,LittleDiracOp); // smaller shift 0.02? n | ||||
|   int refine=1; | ||||
|     //    Aggregates.CreateSubspaceMultishift(RNG5,HermOpEO, | ||||
|     //    					0.0003,1.0e-5,2000); // Lo, tol, maxit | ||||
|     //    Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,95.,0.01,1500);// <== last run | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   std::cout << "Create Subspace"<<std::endl; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   Aggregates.CreateSubspaceChebyshevNew(RNG5,HermOpEO,95.);  | ||||
|  | ||||
|   typedef HermitianLinearOperator<LittleDiracOperator,CoarseVector> HermMatrix; | ||||
|   HermMatrix CoarseOp     (LittleDiracOp); | ||||
|   HermMatrix CoarseOpProj (LittleDiracOpProj); | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   std::cout << "Refine Subspace"<<std::endl; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   Aggregates.RefineSubspace(HermOpEO,0.001,1.0e-3,3000); // 172 iters | ||||
|    | ||||
|   ////////////////////////////////////////// | ||||
|   // Build a coarse lanczos | ||||
|   ////////////////////////////////////////// | ||||
|   Chebyshev<CoarseVector>      IRLCheby(0.2,40.0,71);  // 1 iter | ||||
|   FunctionHermOp<CoarseVector> IRLOpCheby(IRLCheby,CoarseOp); | ||||
|   PlainHermOp<CoarseVector>    IRLOp    (CoarseOp); | ||||
|   int Nk=48; | ||||
|   int Nm=64; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   std::cout << "Coarsen after refine"<<std::endl; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   Aggregates.Orthogonalise(); | ||||
|  | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   std::cout << "Building MultiRHS Coarse operator"<<std::endl; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   ConjugateGradient<CoarseVector>  coarseCG(4.0e-2,20000,true); | ||||
|      | ||||
|   const int nrhs=12; | ||||
|      | ||||
|   Coordinate mpi=GridDefaultMpi(); | ||||
|   Coordinate rhMpi ({1,1,mpi[0],mpi[1],mpi[2],mpi[3]}); | ||||
|   Coordinate rhLatt({nrhs,1,clatt[0],clatt[1],clatt[2],clatt[3]}); | ||||
|   Coordinate rhSimd({vComplex::Nsimd(),1, 1,1,1,1}); | ||||
|      | ||||
|   GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi);  | ||||
|   typedef MultiGeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> MultiGeneralCoarsenedMatrix_t; | ||||
|   MultiGeneralCoarsenedMatrix_t mrhs(geom,CoarseMrhs); | ||||
|  | ||||
|   mrhs.CoarsenOperator(FineHermOp,Aggregates,Coarse5d); | ||||
|    | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   std::cout << "         Coarse Lanczos               "<<std::endl; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|  | ||||
|   typedef HermitianLinearOperator<MultiGeneralCoarsenedMatrix_t,CoarseVector> MrhsHermMatrix; | ||||
|   Chebyshev<CoarseVector>      IRLCheby(0.01,42.0,301);  // 1 iter | ||||
|   MrhsHermMatrix MrhsCoarseOp     (mrhs); | ||||
|  | ||||
|   CoarseVector pm_src(CoarseMrhs); | ||||
|   pm_src = ComplexD(1.0); | ||||
|   PowerMethod<CoarseVector>       cPM; | ||||
|   cPM(MrhsCoarseOp,pm_src); | ||||
|  | ||||
|   int Nk=192; | ||||
|   int Nm=384; | ||||
|   int Nstop=Nk; | ||||
|   ImplicitlyRestartedLanczos<CoarseVector> IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-5,20); | ||||
|   int Nconv_test_interval=1; | ||||
|    | ||||
|   ImplicitlyRestartedBlockLanczosCoarse<CoarseVector> IRL(MrhsCoarseOp, | ||||
| 							  Coarse5d, | ||||
| 							  CoarseMrhs, | ||||
| 							  nrhs, | ||||
| 							  IRLCheby, | ||||
| 							  Nstop, | ||||
| 							  Nconv_test_interval, | ||||
| 							  nrhs, | ||||
| 							  Nk, | ||||
| 							  Nm, | ||||
| 							  1e-5,10); | ||||
|  | ||||
|   int Nconv; | ||||
|   std::vector<RealD>            eval(Nm); | ||||
|   std::vector<CoarseVector>     evec(Nm,Coarse5d); | ||||
|   CoarseVector c_src(Coarse5d); | ||||
|   //c_src=1.0; | ||||
|   random(CRNG,c_src); | ||||
|  | ||||
|   CoarseVector c_res(Coarse5d);  | ||||
|   CoarseVector c_ref(Coarse5d);  | ||||
|  | ||||
|   PowerMethod<CoarseVector>       cPM;   cPM(CoarseOp,c_src); | ||||
|  | ||||
|   IRL.calc(eval,evec,c_src,Nconv); | ||||
|   DeflatedGuesser<CoarseVector> DeflCoarseGuesser(evec,eval); | ||||
|   std::vector<CoarseVector>     c_src(nrhs,Coarse5d); | ||||
|  | ||||
|   ////////////////////////////////////////// | ||||
|   // Build a coarse space solver | ||||
|   // Block projector for coarse/fine | ||||
|   ////////////////////////////////////////// | ||||
|   int maxit=20000; | ||||
|   ConjugateGradient<CoarseVector>  CG(1.0e-8,maxit,false); | ||||
|   ConjugateGradient<LatticeFermionD>  CGfine(1.0e-8,10000,false); | ||||
|   ZeroGuesser<CoarseVector> CoarseZeroGuesser; | ||||
|  | ||||
|   //  HPDSolver<CoarseVector> HPDSolve(CoarseOp,CG,CoarseZeroGuesser); | ||||
|   HPDSolver<CoarseVector> HPDSolve(CoarseOp,CG,DeflCoarseGuesser); | ||||
|   c_res=Zero(); | ||||
|   HPDSolve(c_src,c_res); c_ref = c_res; | ||||
|   std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl; | ||||
|   std::cout << GridLogMessage<<"ref norm "<<norm2(c_ref)<<std::endl; | ||||
|   ////////////////////////////////////////////////////////////////////////// | ||||
|   // Deflated (with real op EV's) solve for the projected coarse op | ||||
|   // Work towards ADEF1 in the coarse space | ||||
|   ////////////////////////////////////////////////////////////////////////// | ||||
|   HPDSolver<CoarseVector> HPDSolveProj(CoarseOpProj,CG,DeflCoarseGuesser); | ||||
|   c_res=Zero(); | ||||
|   HPDSolveProj(c_src,c_res); | ||||
|   std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl; | ||||
|   std::cout << GridLogMessage<<"res norm "<<norm2(c_res)<<std::endl; | ||||
|   c_res = c_res - c_ref; | ||||
|   std::cout << "Projected solver error "<<norm2(c_res)<<std::endl; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   std::cout << "Calling mRHS HDCG"<<std::endl; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   MultiRHSBlockProject<LatticeFermionD> MrhsProjector; | ||||
|   MrhsProjector.Allocate(nbasis,FrbGrid,Coarse5d); | ||||
|   MrhsProjector.ImportBasis(Aggregates.subspace); | ||||
|  | ||||
|   ////////////////////////////////////////////////////////////////////// | ||||
|   // Coarse ADEF1 with deflation space | ||||
|   ////////////////////////////////////////////////////////////////////// | ||||
|   ChebyshevSmoother<CoarseVector,HermMatrix > | ||||
|     CoarseSmoother(1.0,37.,8,CoarseOpProj);  // just go to sloppy 0.1 convergence | ||||
|     //  CoarseSmoother(0.1,37.,8,CoarseOpProj);  // | ||||
|   //  CoarseSmoother(0.5,37.,6,CoarseOpProj);  //  8 iter 0.36s | ||||
|   //    CoarseSmoother(0.5,37.,12,CoarseOpProj);  // 8 iter, 0.55s | ||||
|   //    CoarseSmoother(0.5,37.,8,CoarseOpProj);// 7-9 iter | ||||
|   //  CoarseSmoother(1.0,37.,8,CoarseOpProj); // 0.4 - 0.5s solve to 0.04, 7-9 iter | ||||
|   //  ChebyshevSmoother<CoarseVector,HermMatrix > CoarseSmoother(0.5,36.,10,CoarseOpProj);  // 311 | ||||
|  | ||||
|   //////////////////////////////////////////////////////// | ||||
|   // CG, Cheby mode spacing 200,200 | ||||
|   // Unprojected Coarse CG solve to 1e-8 : 190 iters, 4.9s | ||||
|   // Unprojected Coarse CG solve to 4e-2 :  33 iters, 0.8s | ||||
|   // Projected Coarse CG solve to 1e-8 : 100 iters, 0.36s | ||||
|   //////////////////////////////////////////////////////// | ||||
|   // CoarseSmoother(1.0,48.,8,CoarseOpProj); 48 evecs  | ||||
|   //////////////////////////////////////////////////////// | ||||
|   // ADEF1 Coarse solve to 1e-8 : 44 iters, 2.34s  2.1x gain | ||||
|   // ADEF1 Coarse solve to 4e-2 : 7 iters, 0.4s | ||||
|   // HDCG 38 iters 162s | ||||
|   // | ||||
|   // CoarseSmoother(1.0,40.,8,CoarseOpProj); 48 evecs  | ||||
|   // ADEF1 Coarse solve to 1e-8 : 37 iters, 2.0s  2.1x gain | ||||
|   // ADEF1 Coarse solve to 4e-2 : 6 iters, 0.36s | ||||
|   // HDCG 38 iters 169s | ||||
|  | ||||
|   TwoLevelADEF1defl<CoarseVector> | ||||
|     cADEF1(1.0e-8, 500, | ||||
| 	   CoarseOp, | ||||
| 	   CoarseSmoother, | ||||
| 	   evec,eval); | ||||
|  | ||||
|   c_res=Zero(); | ||||
|   cADEF1(c_src,c_res); | ||||
|   std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl; | ||||
|   std::cout << GridLogMessage<<"cADEF1 res norm "<<norm2(c_res)<<std::endl; | ||||
|   c_res = c_res - c_ref; | ||||
|   std::cout << "cADEF1 solver error "<<norm2(c_res)<<std::endl; | ||||
|    | ||||
|   //  cADEF1.Tolerance = 4.0e-2; | ||||
|   //  cADEF1.Tolerance = 1.0e-1; | ||||
|   cADEF1.Tolerance = 5.0e-2; | ||||
|   c_res=Zero(); | ||||
|   cADEF1(c_src,c_res); | ||||
|   std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl; | ||||
|   std::cout << GridLogMessage<<"cADEF1 res norm "<<norm2(c_res)<<std::endl; | ||||
|   c_res = c_res - c_ref; | ||||
|   std::cout << "cADEF1 solver error "<<norm2(c_res)<<std::endl; | ||||
|    | ||||
|   ////////////////////////////////////////// | ||||
|   // Build a smoother | ||||
|   ////////////////////////////////////////// | ||||
|   //  ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(10.0,100.0,10,FineHermOp); //499 | ||||
|   //  ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(3.0,100.0,10,FineHermOp);  //383 | ||||
|   //  ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(1.0,100.0,10,FineHermOp);  //328 | ||||
|   //  std::vector<RealD> los({0.5,1.0,3.0}); // 147/142/146 nbasis 1 | ||||
|   //  std::vector<RealD> los({1.0,2.0}); // Nbasis 24: 88,86 iterations | ||||
|   //  std::vector<RealD> los({2.0,4.0}); // Nbasis 32 == 52, iters | ||||
|   //  std::vector<RealD> los({2.0,4.0}); // Nbasis 40 == 36,36 iters | ||||
|  | ||||
|   // | ||||
|   // Turns approx 2700 iterations into 340 fine multiplies with Nbasis 40 | ||||
|   // Need to measure cost of coarse space. | ||||
|   // | ||||
|   // -- i) Reduce coarse residual   -- 0.04 | ||||
|   // -- ii) Lanczos on coarse space -- done | ||||
|   // -- iii) Possible 1 hop project and/or preconditioning it - easy - PrecCG it and | ||||
|   //         use a limited stencil. Reread BFM code to check on evecs / deflation strategy with prec | ||||
|   // | ||||
|   std::vector<RealD> los({3.0}); // Nbasis 40 == 36,36 iters | ||||
|  | ||||
|   //  std::vector<int> ords({7,8,10}); // Nbasis 40 == 40,38,36 iters (320,342,396 mults) | ||||
|   std::vector<int> ords({7}); // Nbasis 40 == 40 iters (320 mults)   | ||||
|  | ||||
|   for(int l=0;l<los.size();l++){ | ||||
|  | ||||
|     RealD lo = los[l]; | ||||
|  | ||||
|     for(int o=0;o<ords.size();o++){ | ||||
|  | ||||
|       ConjugateGradient<CoarseVector>  CGsloppy(4.0e-2,maxit,false); | ||||
|       HPDSolver<CoarseVector> HPDSolveSloppy(CoarseOp,CGsloppy,DeflCoarseGuesser); | ||||
|        | ||||
|       //    ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(lo,92,10,FineHermOp); // 36 best case | ||||
|       ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(lo,92,ords[o],FineHermOp);  // 311 | ||||
|  | ||||
|       ////////////////////////////////////////// | ||||
|       // Build a HDCG solver | ||||
|       ////////////////////////////////////////// | ||||
|       TwoLevelADEF2<LatticeFermion,CoarseVector,Subspace> | ||||
| 	HDCG(1.0e-8, 100, | ||||
| 	     FineHermOp, | ||||
| 	     Smoother, | ||||
| 	     HPDSolveSloppy, | ||||
| 	     HPDSolve, | ||||
| 	     Aggregates); | ||||
|  | ||||
|       TwoLevelADEF2<LatticeFermion,CoarseVector,Subspace> | ||||
| 	HDCGdefl(1.0e-8, 100, | ||||
| 		 FineHermOp, | ||||
| 		 Smoother, | ||||
| 		 cADEF1, | ||||
| 		 HPDSolve, | ||||
| 		 Aggregates); | ||||
|        | ||||
|       result=Zero(); | ||||
|       HDCGdefl(src,result); | ||||
|  | ||||
|       result=Zero(); | ||||
|       HDCG(src,result); | ||||
|  | ||||
|        | ||||
|     } | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   std::cout << " Recompute coarse evecs  "<<std::endl; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   evec.resize(Nm,Coarse5d); | ||||
|   eval.resize(Nm); | ||||
|   for(int r=0;r<nrhs;r++){ | ||||
|     random(CRNG,c_src[r]); | ||||
|   } | ||||
|  | ||||
|   // Standard CG | ||||
|   result=Zero(); | ||||
|   CGfine(HermOpEO, src, result); | ||||
|   IRL.calc(eval,evec,c_src,Nconv,LanczosType::irbl); | ||||
|  | ||||
|   /////////////////////// | ||||
|   // Deflation guesser object | ||||
|   /////////////////////// | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   std::cout << " Reimport coarse evecs  "<<std::endl; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   MultiRHSDeflation<CoarseVector> MrhsGuesser; | ||||
|   MrhsGuesser.ImportEigenBasis(evec,eval); | ||||
|        | ||||
|   ////////////////////////// | ||||
|   // Extra HDCG parameters | ||||
|   ////////////////////////// | ||||
|   int maxit=3000; | ||||
|   ConjugateGradient<CoarseVector>  CG(5.0e-2,maxit,false); | ||||
|   RealD lo=2.0; | ||||
|   int ord = 7; | ||||
|  | ||||
|   DoNothingGuesser<CoarseVector> DoNothing; | ||||
|   HPDSolver<CoarseVector> HPDSolveMrhs(MrhsCoarseOp,CG,DoNothing); | ||||
|  | ||||
|   ///////////////////////////////////////////////// | ||||
|   // Mirs smoother | ||||
|   ///////////////////////////////////////////////// | ||||
|   RealD MirsShift = lo; | ||||
|   ShiftedHermOpLinearOperator<LatticeFermionD> ShiftedFineHermOp(HermOpEO,MirsShift); | ||||
|   CGSmoother<LatticeFermionD> CGsmooth(ord,ShiftedFineHermOp) ; | ||||
|  | ||||
|   TwoLevelADEF2mrhs<LatticeFermion,CoarseVector> | ||||
|     HDCGmrhs(1.0e-8, 500, | ||||
| 	     FineHermOp, | ||||
| 	     CGsmooth, | ||||
| 	     HPDSolveMrhs,    // Used in M1 | ||||
| 	     HPDSolveMrhs,          // Used in Vstart | ||||
| 	     MrhsProjector, | ||||
| 	     MrhsGuesser, | ||||
| 	     CoarseMrhs); | ||||
|      | ||||
|   std::vector<LatticeFermionD> src_mrhs(nrhs,FrbGrid); | ||||
|   std::vector<LatticeFermionD> res_mrhs(nrhs,FrbGrid); | ||||
|    | ||||
|   for(int r=0;r<nrhs;r++){ | ||||
|     random(RNG5,src_mrhs[r]); | ||||
|     res_mrhs[r]=Zero(); | ||||
|   } | ||||
|    | ||||
|   HDCGmrhs(src_mrhs,res_mrhs); | ||||
|  | ||||
|   // Standard CG | ||||
| #if 1 | ||||
|   { | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   std::cout << "Calling red black CG"<<std::endl; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|        | ||||
|     LatticeFermion result(FrbGrid); result=Zero(); | ||||
|     LatticeFermion    src(FrbGrid); random(RNG5,src); | ||||
|     result=Zero(); | ||||
|  | ||||
|     ConjugateGradient<LatticeFermionD>  CGfine(1.0e-8,30000,false); | ||||
|     CGfine(HermOpEO, src, result); | ||||
|   } | ||||
| #endif   | ||||
|   Grid_finalize(); | ||||
|   return 0; | ||||
| } | ||||
|   | ||||
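The hunk above replaces the Chebyshev fine smoother with a fixed-iteration "Mirs" CG smoother: a set number of CG iterations on the shifted operator (H + MirsShift) with tolerance 0, so the smoother never "converges" and is not meant to. The pieces are scattered across the hunk, so the following is a condensed, self-contained sketch that assembles them. The Mobius action parameters and the ckpoint_EODWF_lat.125 configuration are borrowed from Test_8888.cc above purely to make the sketch buildable; the shift (2.0) and iteration count (7) are the lo/ord values used in the test.

#include <Grid/Grid.h>

using namespace Grid;

// Fixed-iteration CG smoother from the test above: tolerance 0 means it always
// runs the full "iters" iterations, and non-convergence is expected and harmless.
template<class Field> class CGSmoother : public LinearFunction<Field> {
public:
  using LinearFunction<Field>::operator();
  typedef LinearOperatorBase<Field> FineOperator;
  FineOperator &_SmootherOperator;
  int iters;
  CGSmoother(int _iters, FineOperator &SmootherOperator)
    : _SmootherOperator(SmootherOperator), iters(_iters)
  {
    std::cout << GridLogMessage << " Mirs smoother order " << iters << std::endl;
  }
  void operator()(const Field &in, Field &out) {
    ConjugateGradient<Field> CG(0.0, iters, false); // false: do not abort on non-convergence
    out = Zero();
    CG(_SmootherOperator, in, out);
  }
};

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);

  const int Ls = 8;
  RealD mass = 0.01, M5 = 1.8, b = 1.0, c = 0.0;

  GridCartesian         *UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
                                     GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
  GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         *FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
  GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);

  GridParallelRNG RNG5(FGrid);
  RNG5.SeedFixedIntegers(std::vector<int>({5, 6, 7, 8}));

  LatticeGaugeField Umu(UGrid);
  FieldMetaData header;
  std::string file("ckpoint_EODWF_lat.125");
  NerscIO::readConfiguration(Umu, header, file);

  MobiusFermionD Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5, b, c);
  SchurDiagMooeeOperator<MobiusFermionD, LatticeFermion> HermOpEO(Ddwf);

  // Smooth with (H + MirsShift): the shift regularises the low end so a few CG
  // iterations act on the high modes, which is the smoother's job in the two-level cycle.
  RealD MirsShift = 2.0;                                       // "lo" in the test
  ShiftedHermOpLinearOperator<LatticeFermionD> ShiftedFineHermOp(HermOpEO, MirsShift);
  CGSmoother<LatticeFermionD> CGsmooth(7, ShiftedFineHermOp);  // "ord" = 7 in the test

  LatticeFermionD src(FrbGrid);      random(RNG5, src);
  LatticeFermionD smoothed(FrbGrid);
  CGsmooth(src, smoothed);           // one smoother application: 7 CG iterations on (H + shift)

  Grid_finalize();
  return 0;
}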
| @@ -145,7 +145,7 @@ int main (int argc, char ** argv) | ||||
|   Grid_init(&argc,&argv); | ||||
|  | ||||
|   const int Ls=24; | ||||
|   const int nbasis = 60; | ||||
|   const int nbasis = 62; | ||||
|   const int cb = 0 ; | ||||
|   RealD mass=0.00078; | ||||
|   RealD M5=1.8; | ||||
| @@ -160,7 +160,7 @@ int main (int argc, char ** argv) | ||||
|   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); | ||||
|  | ||||
|   // Construct a coarsened grid with 4^4 cell | ||||
|   Coordinate Block({4,4,4,4}); | ||||
|   Coordinate Block({4,4,6,4}); | ||||
|   Coordinate clatt = GridDefaultLatt(); | ||||
|   for(int d=0;d<clatt.size();d++){ | ||||
|     clatt[d] = clatt[d]/Block[d]; | ||||
|   | ||||
							
								
								
									
396	tests/debug/Test_general_coarse_hdcg_phys96_mixed.cc	Normal file
							| @@ -0,0 +1,396 @@ | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
|     Source file: ./tests/Test_general_coarse_hdcg.cc | ||||
|  | ||||
|     Copyright (C) 2023 | ||||
|  | ||||
| Author: Peter Boyle <pboyle@bnl.gov> | ||||
|  | ||||
|     This program is free software; you can redistribute it and/or modify | ||||
|     it under the terms of the GNU General Public License as published by | ||||
|     the Free Software Foundation; either version 2 of the License, or | ||||
|     (at your option) any later version. | ||||
|  | ||||
|     This program is distributed in the hope that it will be useful, | ||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|     GNU General Public License for more details. | ||||
|  | ||||
|     You should have received a copy of the GNU General Public License along | ||||
|     with this program; if not, write to the Free Software Foundation, Inc., | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| #include <Grid/Grid.h> | ||||
| #include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h> | ||||
| #include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h> | ||||
| #include <Grid/algorithms/iterative/AdefMrhs.h> | ||||
|  | ||||
| using namespace std; | ||||
| using namespace Grid; | ||||
|  | ||||
| template<class aggregation> | ||||
| void SaveBasis(aggregation &Agg,std::string file) | ||||
| { | ||||
| #ifdef HAVE_LIME | ||||
|   emptyUserRecord record; | ||||
|   ScidacWriter WR(Agg.FineGrid->IsBoss()); | ||||
|   WR.open(file); | ||||
|   for(int b=0;b<Agg.subspace.size();b++){ | ||||
|     WR.writeScidacFieldRecord(Agg.subspace[b],record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC); | ||||
|     //    WR.writeScidacFieldRecord(Agg.subspace[b],record); | ||||
|   } | ||||
|   WR.close(); | ||||
| #endif | ||||
| } | ||||
| template<class aggregation> | ||||
| void LoadBasis(aggregation &Agg, std::string file) | ||||
| { | ||||
| #ifdef HAVE_LIME | ||||
|   emptyUserRecord record; | ||||
|   ScidacReader RD ; | ||||
|   RD.open(file); | ||||
|   for(int b=0;b<Agg.subspace.size();b++){ | ||||
|     RD.readScidacFieldRecord(Agg.subspace[b],record,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC); | ||||
|     //    RD.readScidacFieldRecord(Agg.subspace[b],record,0); | ||||
|   }     | ||||
|   RD.close(); | ||||
| #endif | ||||
| } | ||||
| template<class CoarseVector> | ||||
| void SaveEigenvectors(std::vector<RealD>            &eval, | ||||
| 		      std::vector<CoarseVector>     &evec, | ||||
| 		      std::string evec_file, | ||||
| 		      std::string eval_file) | ||||
| { | ||||
| #ifdef HAVE_LIME | ||||
|   emptyUserRecord record; | ||||
|   ScidacWriter WR(evec[0].Grid()->IsBoss()); | ||||
|   WR.open(evec_file); | ||||
|   for(int b=0;b<evec.size();b++){ | ||||
|     WR.writeScidacFieldRecord(evec[b],record,0,0); | ||||
|   } | ||||
|   WR.close(); | ||||
|   XmlWriter WRx(eval_file); | ||||
|   write(WRx,"evals",eval); | ||||
| #endif | ||||
| } | ||||
| template<class CoarseVector> | ||||
| void LoadEigenvectors(std::vector<RealD>            &eval, | ||||
| 		      std::vector<CoarseVector>     &evec, | ||||
| 		      std::string evec_file, | ||||
| 		      std::string eval_file) | ||||
| { | ||||
| #ifdef HAVE_LIME | ||||
|     XmlReader RDx(eval_file); | ||||
|     read(RDx,"evals",eval); | ||||
|     emptyUserRecord record; | ||||
|  | ||||
|     Grid::ScidacReader RD ; | ||||
|     RD.open(evec_file); | ||||
|     assert(evec.size()==eval.size()); | ||||
|     for(int k=0;k<eval.size();k++) { | ||||
|       RD.readScidacFieldRecord(evec[k],record); | ||||
|     } | ||||
|     RD.close(); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| // Want Op in CoarsenOp to call MatPcDagMatPc | ||||
| template<class Field> | ||||
| class HermOpAdaptor : public LinearOperatorBase<Field> | ||||
| { | ||||
|   LinearOperatorBase<Field> & wrapped; | ||||
| public: | ||||
|   HermOpAdaptor(LinearOperatorBase<Field> &wrapme) : wrapped(wrapme)  {}; | ||||
|   void Op     (const Field &in, Field &out)   { wrapped.HermOp(in,out);  } | ||||
|   void HermOp(const Field &in, Field &out)    { wrapped.HermOp(in,out); } | ||||
|   void AdjOp     (const Field &in, Field &out){ wrapped.HermOp(in,out);  } | ||||
|   void OpDiag (const Field &in, Field &out)                  {    assert(0);  } | ||||
|   void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  } | ||||
|   void OpDirAll  (const Field &in, std::vector<Field> &out)  {    assert(0);  }; | ||||
|   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  } | ||||
| }; | ||||
|  | ||||
| template<class Field> class CGSmoother : public LinearFunction<Field> | ||||
| { | ||||
| public: | ||||
|   using LinearFunction<Field>::operator(); | ||||
|   typedef LinearOperatorBase<Field> FineOperator; | ||||
|   FineOperator   & _SmootherOperator; | ||||
|   int iters; | ||||
|   CGSmoother(int _iters, FineOperator &SmootherOperator) : | ||||
|     _SmootherOperator(SmootherOperator), | ||||
|     iters(_iters) | ||||
|   { | ||||
|     std::cout << GridLogMessage<<" Mirs smoother order "<<iters<<std::endl; | ||||
|   }; | ||||
|   void operator() (const Field &in, Field &out)  | ||||
|   { | ||||
|     ConjugateGradient<Field>  CG(0.0,iters,false); // non-converge is just fine in a smoother | ||||
|  | ||||
|     out=Zero(); | ||||
|  | ||||
|     CG(_SmootherOperator,in,out); | ||||
|   } | ||||
| }; | ||||
|  | ||||
|  | ||||
| int main (int argc, char ** argv) | ||||
| { | ||||
|   Grid_init(&argc,&argv); | ||||
|  | ||||
|   const int Ls=24; | ||||
|   const int nbasis = 60; | ||||
|   const int cb = 0 ; | ||||
|   RealD mass=0.00078; | ||||
|   RealD M5=1.8; | ||||
|   RealD b=1.5; | ||||
|   RealD c=0.5; | ||||
|  | ||||
|   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), | ||||
| 								   GridDefaultSimd(Nd,vComplex::Nsimd()), | ||||
| 								   GridDefaultMpi()); | ||||
|   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); | ||||
|   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); | ||||
|   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); | ||||
|  | ||||
|   // Construct a coarsened grid with 4^4 cell | ||||
|   //  Coordinate Block({4,4,6,4}); | ||||
|   Coordinate Block({4,4,4,4}); | ||||
|   Coordinate clatt = GridDefaultLatt(); | ||||
|   for(int d=0;d<clatt.size();d++){ | ||||
|     clatt[d] = clatt[d]/Block[d]; | ||||
|   } | ||||
|  | ||||
|   GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, | ||||
| 							    GridDefaultSimd(Nd,vComplex::Nsimd()), | ||||
| 							    GridDefaultMpi());; | ||||
|   GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d); | ||||
|  | ||||
|   ///////////////////////// RNGs ///////////////////////////////// | ||||
|   std::vector<int> seeds4({1,2,3,4}); | ||||
|   std::vector<int> seeds5({5,6,7,8}); | ||||
|   std::vector<int> cseeds({5,6,7,8}); | ||||
|  | ||||
|   GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5); | ||||
|   GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4); | ||||
|   GridParallelRNG          CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds); | ||||
|  | ||||
|   ///////////////////////// Configuration ///////////////////////////////// | ||||
|   LatticeGaugeField Umu(UGrid); | ||||
|  | ||||
|   FieldMetaData header; | ||||
|   std::string file("/lustre/orion/phy157/proj-shared/phy157_dwf/lehner/ensemble-Ha/ckpoint_lat.2250"); | ||||
|   NerscIO::readConfiguration(Umu,header,file); | ||||
|  | ||||
|   //////////////////////// Fermion action ////////////////////////////////// | ||||
|   MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c); | ||||
|  | ||||
|   SchurDiagMooeeOperator<MobiusFermionD, LatticeFermion> HermOpEO(Ddwf); | ||||
|  | ||||
|   typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix; | ||||
|   HermFineMatrix FineHermOp(HermOpEO); | ||||
|  | ||||
|   //////////////////////////////////////////////////////////// | ||||
|   ///////////// Coarse basis and Little Dirac Operator /////// | ||||
|   //////////////////////////////////////////////////////////// | ||||
|   typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator; | ||||
|   typedef LittleDiracOperator::CoarseVector CoarseVector; | ||||
|  | ||||
|   NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d); | ||||
|  | ||||
|   typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace; | ||||
|   Subspace Aggregates(Coarse5d,FrbGrid,cb); | ||||
|  | ||||
|   //////////////////////////////////////////////////////////// | ||||
|   // Need to check about red-black grid coarsening | ||||
|   //////////////////////////////////////////////////////////// | ||||
|  | ||||
|   //  std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys96.mixed.2500.60"); | ||||
|   std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Refine.phys96.mixed.2500.60"); | ||||
|   std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Refine.phys96.mixed.2500.60_v2"); | ||||
|   std::string ldop_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.phys96.mixed.60"); | ||||
|   std::string evec_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/evecs.scidac"); | ||||
|   std::string eval_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/eval.xml"); | ||||
|   bool load_agg=true; | ||||
|   bool load_refine=true; | ||||
|   bool load_mat=false; | ||||
|   bool load_evec=false; | ||||
|  | ||||
|   int refine=1; | ||||
|   if ( load_agg ) { | ||||
|     if ( !(refine) || (!load_refine) ) {  | ||||
|       LoadBasis(Aggregates,subspace_file); | ||||
|     } | ||||
|   } else { | ||||
|     Aggregates.CreateSubspaceChebyshevNew(RNG5,HermOpEO,95.);  | ||||
|     SaveBasis(Aggregates,subspace_file); | ||||
|   } | ||||
|  | ||||
|   if ( load_refine ) { | ||||
|     std::cout << " Load Refine "<< refine_file <<std::endl; | ||||
|     LoadBasis(Aggregates,refine_file); | ||||
|   } else { | ||||
|     Aggregates.RefineSubspace(HermOpEO,0.001,3.0e-4,3000); // 172 iters | ||||
|     //    Aggregates.RefineSubspace(HermOpEO,0.001,3.0e-4,2500); // 172 iters | ||||
|     SaveBasis(Aggregates,refine_file); | ||||
|   } | ||||
|   Aggregates.Orthogonalise(); | ||||
|    | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   std::cout << "Building MultiRHS Coarse operator"<<std::endl; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|      | ||||
|   const int nrhs=12; | ||||
|      | ||||
|   Coordinate mpi=GridDefaultMpi(); | ||||
|   Coordinate rhMpi ({1,1,mpi[0],mpi[1],mpi[2],mpi[3]}); | ||||
|   Coordinate rhLatt({nrhs,1,clatt[0],clatt[1],clatt[2],clatt[3]}); | ||||
|   Coordinate rhSimd({vComplex::Nsimd(),1, 1,1,1,1}); | ||||
|      | ||||
|   GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi);  | ||||
|   typedef MultiGeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> MultiGeneralCoarsenedMatrix_t; | ||||
|   MultiGeneralCoarsenedMatrix_t mrhs(geom,CoarseMrhs); | ||||
|  | ||||
|   /////////////////////// | ||||
|   // Deflation guesser object | ||||
|   /////////////////////// | ||||
|   MultiRHSDeflation<CoarseVector> MrhsGuesser; | ||||
|  | ||||
|   ////////////////////////////////////////// | ||||
|   // Block projector for coarse/fine | ||||
|   ////////////////////////////////////////// | ||||
|   MultiRHSBlockProject<LatticeFermionD> MrhsProjector; | ||||
|  | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   std::cout << "Coarsen after refine"<<std::endl; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   mrhs.CoarsenOperator(FineHermOp,Aggregates,Coarse5d); | ||||
|  | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   std::cout << "         Coarse Lanczos               "<<std::endl; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|  | ||||
|   typedef HermitianLinearOperator<MultiGeneralCoarsenedMatrix_t,CoarseVector> MrhsHermMatrix; | ||||
|   //  Chebyshev<CoarseVector>      IRLCheby(0.0012,42.0,301);  // 4.4.6.4 | ||||
|   //  Chebyshev<CoarseVector>      IRLCheby(0.0012,42.0,501);  // for 4.4.4.4 blocking 350 evs | ||||
|   //  Chebyshev<CoarseVector>      IRLCheby(0.0014,42.0,501);  // for 4.4.4.4 blocking 700 evs | ||||
|   //  Chebyshev<CoarseVector>      IRLCheby(0.002,42.0,501);  // for 4.4.4.4 blocking 1226 evs | ||||
|   //  Chebyshev<CoarseVector>      IRLCheby(0.0025,42.0,501);  // for 4.4.4.4 blocking 1059 evs | ||||
| 							  //							  3e-4,2); | ||||
|   Chebyshev<CoarseVector>      IRLCheby(0.0018,42.0,301);  // for 4.4.4.4 blocking  // 790 evs | ||||
|    | ||||
|   MrhsHermMatrix MrhsCoarseOp     (mrhs); | ||||
|  | ||||
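|   // Power method on a unit source estimates the top of the coarse spectrum, as a check | ||||
|   // that the Chebyshev upper bound of 42.0 chosen above is safe. | ||||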
|   CoarseVector pm_src(CoarseMrhs); | ||||
|   pm_src = ComplexD(1.0); | ||||
|   PowerMethod<CoarseVector>       cPM;   cPM(MrhsCoarseOp,pm_src); | ||||
|  | ||||
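|   // Block Lanczos sizing: Nk vectors kept per restart, Nm the maximum basis size; the run | ||||
|   // stops once Nstop eigenpairs converge, tested every cycle with block width nrhs. | ||||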
|   //  int Nk=nrhs*30; // 4.4.6.4 | ||||
|   //  int Nk=nrhs*80; | ||||
|   int Nk=nrhs*60; // 720 | ||||
|   int Nm=Nk*4;    // 2880 ; generally finishes at 1440 | ||||
|   int Nstop=512; | ||||
|   int Nconv_test_interval=1; | ||||
|    | ||||
|   ImplicitlyRestartedBlockLanczosCoarse<CoarseVector> IRL(MrhsCoarseOp, | ||||
| 							  Coarse5d, | ||||
| 							  CoarseMrhs, | ||||
| 							  nrhs, | ||||
| 							  IRLCheby, | ||||
| 							  Nstop, | ||||
| 							  Nconv_test_interval, | ||||
| 							  nrhs, | ||||
| 							  Nk, | ||||
| 							  Nm, | ||||
| 							  3e-4,2); | ||||
|  | ||||
|   std::vector<RealD>            eval(Nm); | ||||
|   std::vector<CoarseVector>     evec(Nm,Coarse5d); | ||||
|   std::vector<CoarseVector>     c_src(nrhs,Coarse5d); | ||||
|  | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   std::cout << " Recompute coarse evecs  "<<std::endl; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   evec.resize(Nm,Coarse5d); | ||||
|   eval.resize(Nm); | ||||
|   for(int r=0;r<nrhs;r++){ | ||||
|     random(CRNG,c_src[r]); | ||||
|   } | ||||
|   int Nconv; | ||||
|   IRL.calc(eval,evec,c_src,Nconv,LanczosType::rbl); | ||||
|   Nconv = eval.size(); | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   std::cout << " import coarse evecs  "<<std::endl; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
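|   // Hand the converged coarse eigenpairs to the deflation guesser, which will supply | ||||
|   // deflated starting guesses for the coarse-grid solves inside HDCG. | ||||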
|   MrhsGuesser.ImportEigenBasis(evec,eval); | ||||
|  | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   std::cout << "Calling mRHS HDCG"<<std::endl; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
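|   // The block projector holds the nbasis fine-grid near-null vectors and performs the | ||||
|   // coarse<->fine project/promote steps for all nrhs vectors at once. | ||||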
|   MrhsProjector.Allocate(nbasis,FrbGrid,Coarse5d); | ||||
|   MrhsProjector.ImportBasis(Aggregates.subspace); | ||||
|  | ||||
|   ////////////////////////// | ||||
|   // Extra HDCG parameters | ||||
|   ////////////////////////// | ||||
|   int maxit=3000;                                            // cap on inner coarse-grid CG iterations | ||||
|   ConjugateGradient<CoarseVector>  CG(7.5e-2,maxit,false);   // loose-tolerance coarse-level solver | ||||
|   RealD lo=2.0;                                              // shift fed to the Mirs smoother below | ||||
|   int ord = 7;                                               // iteration count of the smoother CG | ||||
|  | ||||
|   DoNothingGuesser<CoarseVector> DoNothing; | ||||
|   HPDSolver<CoarseVector> HPDSolveMrhs(MrhsCoarseOp,CG,DoNothing); | ||||
|   HPDSolver<CoarseVector> HPDSolveMrhsRefine(MrhsCoarseOp,CG,DoNothing); | ||||
|  | ||||
|   ///////////////////////////////////////////////// | ||||
|   // Mirs smoother | ||||
|   ///////////////////////////////////////////////// | ||||
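|   // Mirs smoother: a short, fixed-iteration CG (ord iterations) on the shifted operator | ||||
|   // HermOpEO + MirsShift; the shift regularises the truncated solve. | ||||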
|   RealD MirsShift = lo; | ||||
|   ShiftedHermOpLinearOperator<LatticeFermionD> ShiftedFineHermOp(HermOpEO,MirsShift); | ||||
|   CGSmoother<LatticeFermionD> CGsmooth(ord,ShiftedFineHermOp) ; | ||||
|    | ||||
|    | ||||
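|   // Two-level ADEF-2 multi-RHS solver: outer tolerance 1.0e-8, at most 500 outer iterations; | ||||
|   // the same coarse HPD solve is reused for both the M1 preconditioner and Vstart. | ||||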
|   TwoLevelADEF2mrhs<LatticeFermion,CoarseVector> | ||||
|     HDCGmrhs(1.0e-8, 500, | ||||
| 	     FineHermOp, | ||||
| 	     CGsmooth, | ||||
| 	     HPDSolveMrhs,    // Used in M1 | ||||
| 	     HPDSolveMrhs,          // Used in Vstart | ||||
| 	     MrhsProjector, | ||||
| 	     MrhsGuesser, | ||||
| 	     CoarseMrhs); | ||||
|      | ||||
|   std::vector<LatticeFermionD> src_mrhs(nrhs,FrbGrid); | ||||
|   std::vector<LatticeFermionD> res_mrhs(nrhs,FrbGrid); | ||||
|    | ||||
|   for(int r=0;r<nrhs;r++){ | ||||
|     random(RNG5,src_mrhs[r]); | ||||
|     res_mrhs[r]=Zero(); | ||||
|   } | ||||
|    | ||||
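|   // Solve all nrhs right-hand sides in a single multi-RHS HDCG call. | ||||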
|   HDCGmrhs(src_mrhs,res_mrhs); | ||||
|  | ||||
|   // Standard CG | ||||
| #if 0 | ||||
|   { | ||||
|     std::cout << "**************************************"<<std::endl; | ||||
|     std::cout << "Calling red black CG"<<std::endl; | ||||
|     std::cout << "**************************************"<<std::endl; | ||||
|  | ||||
|     LatticeFermion result(FrbGrid); result=Zero(); | ||||
|     LatticeFermion    src(FrbGrid); random(RNG5,src); | ||||
|     result=Zero(); | ||||
|  | ||||
|     ConjugateGradient<LatticeFermionD>  CGfine(1.0e-8,30000,false); | ||||
|     CGfine(HermOpEO, src, result); | ||||
|   } | ||||
| #endif   | ||||
|   Grid_finalize(); | ||||
|   return 0; | ||||
| } | ||||