Mirror of https://github.com/paboyle/Grid.git (synced 2025-10-31 20:14:32 +00:00)

Compare commits: 66 commits, 9a1ad6a5eb ... feature/ft
| SHA1 |
|---|
| bffd30abec |
| da919949f9 |
| b12b4fdaff |
| 557fa483ff |
| fc15d55df6 |
| 53573d7d94 |
| bb3c177000 |
| a3322b470f |
| f8f408e7a9 |
| baac1127d0 |
| 6f1328160c |
| 04cf902791 |
| 7a5b1c1a19 |
| 18d2d7da4a |
| b461184797 |
| 4563b39305 |
| c9d5674d5b |
| 486412635a |
| 8b23a1546a |
| a901e4e369 |
| 804d9367d4 |
| 41d8adca95 |
| 059e8e5bb0 |
| b3ee8ded96 |
| cf3584ad15 |
| a66973163f |
| 4502a8c8a1 |
| 9c902e4c2d |
| f3eb36adcf |
| 7c246606c1 |
| 172c75029e |
| 6ae52da571 |
| 4ee9c68053 |
| a15b4378a3 |
| 89fdd7f8dd |
| c328be24b7 |
| a73dc6dbf4 |
| eee2a2657f |
| 12b8be7cb9 |
| 63c223ea5d |
| 2877fb4a2c |
| d299c86633 |
| 6ce52092e8 |
| b5926c1d21 |
| 9563238e9b |
| fb9b1d76ca |
| 1739146599 |
| ed20b39ab3 |
| 284fc05f15 |
| 07a07b6fa3 |
| dc80b08969 |
| a49a161f8d |
| a6479ca50f |
| 0e607a55e7 |
| c4b9f71357 |
| 394e506aea |
| e19b26341b |
| cfe1b13225 |
| 890c5ea1cd |
| a87378d3b6 |
| 832fc08809 |
| 461cd045c6 |
| fee65d7a75 |
| 31f9971dbf |
| d87296f3e8 |
| be94cf1c6f |
BLAS_benchmark/BatchBlasBench.cc (new file, 1052 lines added): file diff suppressed because it is too large.
BLAS_benchmark/compile-command (new file, 2 lines added):

```diff
@@ -0,0 +1,2 @@
+
+mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench
```
```diff
@@ -30,9 +30,14 @@ directory
 
 #include <type_traits>
 #include <cassert>
+#include <exception>
+
 #define NAMESPACE_BEGIN(A) namespace A {
 #define NAMESPACE_END(A)   }
 #define GRID_NAMESPACE_BEGIN NAMESPACE_BEGIN(Grid)
 #define GRID_NAMESPACE_END   NAMESPACE_END(Grid)
 #define NAMESPACE_CHECK(x) struct namespaceTEST##x {};  static_assert(std::is_same<namespaceTEST##x, ::namespaceTEST##x>::value,"Not in :: at"  ); 
 
+#define EXCEPTION_CHECK_BEGIN(A) try {
+#define EXCEPTION_CHECK_END(A)   } catch ( std::exception e ) { BACKTRACEFP(stderr); std::cerr << __PRETTY_FUNCTION__ << " : " <<__LINE__<< " Caught exception "<<e.what()<<std::endl; throw; }
+
```
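The new EXCEPTION_CHECK_BEGIN/END pair wraps a region in a try/catch that logs a backtrace and the caught message, then rethrows. A minimal sketch of the intended usage, under my reading of the patch (the macro definitions are copied from the hunk above; BACKTRACEFP lives elsewhere in Grid, so it is stubbed here, and ExampleRegion is a hypothetical call site):

```cpp
#include <cstdio>
#include <exception>
#include <iostream>
#include <vector>

// Stub: the real BACKTRACEFP prints a stack trace to the given FILE*.
#define BACKTRACEFP(fp) std::fprintf(fp, "(backtrace elided in this sketch)\n")

// Copied from the patch above.
#define EXCEPTION_CHECK_BEGIN(A) try {
#define EXCEPTION_CHECK_END(A)   } catch ( std::exception e ) { BACKTRACEFP(stderr); std::cerr << __PRETTY_FUNCTION__ << " : " <<__LINE__<< " Caught exception "<<e.what()<<std::endl; throw; }

void ExampleRegion(void)   // hypothetical call site
{
  EXCEPTION_CHECK_BEGIN(ExampleRegion);
  std::vector<double> huge(~0ull);   // throws std::length_error or std::bad_alloc
  EXCEPTION_CHECK_END(ExampleRegion);
}

int main(void)
{
  try { ExampleRegion(); }
  catch (const std::exception &e) { std::cerr << "rethrown: " << e.what() << std::endl; }
  return 0;
}
```

One observation worth flagging: the macro catches `std::exception e` by value, which slices a derived exception before `e.what()` is called; catching `const std::exception &` would preserve the original message.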
```diff
@@ -89,9 +89,10 @@ public:
       gridblasHandle = theGridAccelerator;
 #endif
 #ifdef GRID_ONE_MKL
-      cl::sycl::cpu_selector selector;
+      cl::sycl::gpu_selector selector;
       cl::sycl::device selectedDevice { selector };
-      gridblasHandle =new sycl::queue (selectedDevice);
+      cl::sycl::property_list q_prop{cl::sycl::property::queue::in_order()};
+      gridblasHandle =new sycl::queue (selectedDevice,q_prop);
 #endif
       gridblasInit=1;
     }
@@ -207,6 +208,9 @@ public:
     assert(Bkn.size()==batchCount);
     assert(Cmn.size()==batchCount);
 
+    assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
+    assert(OpB!=GridBLAS_OP_T);
+
     int lda = m; // m x k column major
     int ldb = k; // k x n column major
     int ldc = m; // m x b column major
@@ -266,26 +270,130 @@ public:
     assert(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
-    //MKL’s cblas_<T>gemm_batch & OneAPI
-#warning "oneMKL implementation not built "
-#endif
-#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
-    // Need a default/reference implementation
-    int sda = lda*k;
-    int sdb = ldb*k;
-    int sdc = ldc*n;
-    for (int p = 0; p < batchCount; ++p) {
-      for (int mm = 0; mm < m; ++mm) {
-	for (int nn = 0; nn < n; ++nn) {
-	  ComplexD c_mn(0.0);
-	  for (int kk = 0; kk < k; ++kk)
-	    c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
-	  Cmn[p][mm + nn*ldc] =  (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
+      int64_t m64=m;
+      int64_t n64=n;
+      int64_t k64=k;
+      int64_t lda64=lda;
+      int64_t ldb64=ldb;
+      int64_t ldc64=ldc;
+      int64_t batchCount64=batchCount;
+
+      oneapi::mkl::transpose iOpA;
+      oneapi::mkl::transpose iOpB;
+      
+      if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
+      if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
+      if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
+      if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
+      if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
+      if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
+
+      oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
+						  &iOpA,
+						  &iOpB,
+						  &m64,&n64,&k64,
+						  (ComplexD *) &alpha_p[0],
+						  (const ComplexD **)&Amk[0], (const int64_t *)&lda64,
+						  (const ComplexD **)&Bkn[0], (const int64_t *)&ldb64,
+						  (ComplexD *) &beta_p[0],
+						  (ComplexD **)&Cmn[0], (const int64_t *)&ldc64,
+						  (int64_t)1,&batchCount64,std::vector<sycl::event>());
+      synchronise();
+#if 0
+      // This code was used to check the mat mul on Sunspot/OneMKL
+      std::cerr << " Called SYCL batched ZGEMM OpA "<< OpA << " OpB "<<OpB <<std::endl;
+      std::vector<ComplexD> A(m*k);  // pointer list to matrices
+      std::vector<ComplexD> B(k*n);
+      std::vector<ComplexD> C(m*n);
+      //      int sda = lda*k;
+      //      int sdb = ldb*k;
+      //      int sdc = ldc*n;
+      std::cerr << " Checking the GEMM results "<<std::endl;
+      for (int p = 0; p < 1; ++p) {
+	ComplexD * Amk_p;  // pointer list to matrices
+	ComplexD * Bkn_p;  // pointer list to matrices
+	ComplexD * Cmn_p;  // pointer list to matrices
+	acceleratorCopyFromDevice((void *)&Amk[p],(void *)&Amk_p,sizeof(ComplexD*));
+	acceleratorCopyFromDevice((void *)&Bkn[p],(void *)&Bkn_p,sizeof(ComplexD*));
+	acceleratorCopyFromDevice((void *)&Cmn[p],(void *)&Cmn_p,sizeof(ComplexD*));
+	std::cerr << " p " << p << " copied pointers "<<std::endl;
+	acceleratorCopyFromDevice((void *)Amk_p,(void *)&A[0],m*k*sizeof(ComplexD));
+	acceleratorCopyFromDevice((void *)Bkn_p,(void *)&B[0],k*n*sizeof(ComplexD));
+	acceleratorCopyFromDevice((void *)Cmn_p,(void *)&C[0],m*n*sizeof(ComplexD));
+	std::cerr << " p " << p << " copied matrices "<<std::endl;
+	std::cerr << " C[0] "<<C[0]<<std::endl;
+	std::cerr << " A[0] "<<A[0]<<std::endl;
+	std::cerr << " B[0] "<<B[0]<<std::endl;
+	std::cerr << " m "<<m<<std::endl;
+	std::cerr << " n "<<n<<std::endl;
+	std::cerr << " k "<<k<<std::endl;
+	for (int mm = 0; mm < m; ++mm) {
+	  for (int nn = 0; nn < n; ++nn) {
+	    ComplexD c_mn(0.0);
+	    for (int kk = 0; kk < k; ++kk) {
+	      int idx_a, idx_b;
+	      //    int lda = m; // m x k column major
+	      //    int ldb = k; // k x n column major
+	      //    int ldc = m; // m x b column major
+	      if(OpA!=GridBLAS_OP_N) {
+		idx_a =kk + mm*lda;
+	      } else {
+		idx_a =mm + kk*lda;
+	      }
+	      if(OpB!=GridBLAS_OP_N) {
+		idx_b =nn + kk*ldb;
+	      } else {
+		idx_b =kk + nn*ldb;
+	      }
+	      //	      std::cerr << " idx_a "<<idx_a<<" idx_b "<<idx_b<<std::endl;
+
+	      ComplexD Ac = A[idx_a];
+	      ComplexD Bc = B[idx_b];
+	      if(OpA==GridBLAS_OP_C) Ac = conjugate(Ac);
+	      if(OpB==GridBLAS_OP_C) Bc = conjugate(Bc);
+	      
+	      c_mn += Ac*Bc;
+	    }
+	    std::cerr << " beta "<<beta<<" alpha "<<alpha<<" C_"<<mm<<","<<nn<<" "<<c_mn<<" "<<C[mm + nn*ldc]<<std::endl;
+	  }
 	}
       }
-    }
 #endif
-    //    synchronise();
+#endif
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+    // Need a default/reference implementation; use Eigen
+      if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
+	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
+	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+        });
+      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
+	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
+	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+	  } );
+      } else { 
+	assert(0);
+      }
+#endif
      RealD t1=usecond();
      RealD flops = 8.0*m*n*k*batchCount;
      RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount;
@@ -306,6 +414,9 @@ public:
     RealD t2=usecond();
     int32_t batchCount = Amk.size();
 
+    assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
+    assert(OpB!=GridBLAS_OP_T);
+
     int lda = m; // m x k column major
     int ldb = k; // k x n column major
     int ldc = m; // m x b column major
@@ -366,26 +477,69 @@ public:
     assert(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
-    //MKL’s cblas_<T>gemm_batch & OneAPI
-#warning "oneMKL implementation not built "
+      int64_t m64=m;
+      int64_t n64=n;
+      int64_t k64=k;
+      int64_t lda64=lda;
+      int64_t ldb64=ldb;
+      int64_t ldc64=ldc;
+      int64_t batchCount64=batchCount;
+
+      oneapi::mkl::transpose iOpA;
+      oneapi::mkl::transpose iOpB;
+      
+      if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
+      if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
+      if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
+      if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
+      if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
+      if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
+
+      oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
+						  &iOpA,
+						  &iOpB,
+						  &m64,&n64,&k64,
+						  (ComplexF *) &alpha_p[0],
+						  (const ComplexF **)&Amk[0], (const int64_t *)&lda64,
+						  (const ComplexF **)&Bkn[0], (const int64_t *)&ldb64,
+						  (ComplexF *) &beta_p[0],
+						  (ComplexF **)&Cmn[0], (const int64_t *)&ldc64,
+						  (int64_t)1,&batchCount64,std::vector<sycl::event>());
+    synchronise();
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
-    int sda = lda*k;
-    int sdb = ldb*k;
-    int sdc = ldc*n;
-    ComplexF alphaf(real(alpha),imag(alpha));
-    ComplexF betaf(real(beta),imag(beta));
-    // Need a default/reference implementation
-    for (int p = 0; p < batchCount; ++p) {
-      for (int mm = 0; mm < m; ++mm) {
-	for (int nn = 0; nn < n; ++nn) {
-	  ComplexF c_mn(0.0);
-	  for (int kk = 0; kk < k; ++kk)
-	    c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
-	  Cmn[p][mm + nn*ldc] =  (alphaf)*c_mn + (betaf)*Cmn[p][mm + nn*ldc ];
-	}
+    // Need a default/reference implementation; use Eigen
+      if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
+	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
+	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
+	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
+	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+	  } );
+      } else { 
+	assert(0);
       }
-    }
 #endif
      RealD t1=usecond();
      RealD flops = 8.0*m*n*k*batchCount;
@@ -408,6 +562,9 @@ public:
     RealD t2=usecond();
     int32_t batchCount = Amk.size();
 
+    assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
+    assert(OpB!=GridBLAS_OP_C);
+
     int lda = m; // m x k column major
     int ldb = k; // k x n column major
     int ldc = m; // m x b column major
@@ -467,24 +624,69 @@ public:
     assert(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
-    //MKL’s cblas_<T>gemm_batch & OneAPI
-#warning "oneMKL implementation not built "
+      int64_t m64=m;
+      int64_t n64=n;
+      int64_t k64=k;
+      int64_t lda64=lda;
+      int64_t ldb64=ldb;
+      int64_t ldc64=ldc;
+      int64_t batchCount64=batchCount;
+
+      oneapi::mkl::transpose iOpA;
+      oneapi::mkl::transpose iOpB;
+      
+      if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
+      if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
+      if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
+      if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
+      if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
+      if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
+
+      oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
+						  &iOpA,
+						  &iOpB,
+						  &m64,&n64,&k64,
+						  (float *) &alpha_p[0],
+						  (const float **)&Amk[0], (const int64_t *)&lda64,
+						  (const float **)&Bkn[0], (const int64_t *)&ldb64,
+						  (float *) &beta_p[0],
+						  (float **)&Cmn[0], (const int64_t *)&ldc64,
+						  (int64_t)1,&batchCount64,std::vector<sycl::event>());
+      synchronise();
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
-    int sda = lda*k;
-    int sdb = ldb*k;
-    int sdc = ldc*n;
-    // Need a default/reference implementation
-    for (int p = 0; p < batchCount; ++p) {
-      for (int mm = 0; mm < m; ++mm) {
-	for (int nn = 0; nn < n; ++nn) {
-	  RealD c_mn(0.0);
-	  for (int kk = 0; kk < k; ++kk)
-	    c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
-	  Cmn[p][mm + nn*ldc] =  (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
-	}
+    // Need a default/reference implementation; use Eigen
+      if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
+	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
+	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
+	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
+	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+	  } );
+      } else { 
+	assert(0);
       }
-    }
 #endif
      RealD t1=usecond();
      RealD flops = 2.0*m*n*k*batchCount;
@@ -495,7 +697,6 @@ public:
   ///////////////////////////////////////////////////////////////////////////
   // Double precision real GEMM
   ///////////////////////////////////////////////////////////////////////////
-
   void gemmBatched(GridBLASOperation_t OpA,
 		   GridBLASOperation_t OpB,
 		   int m,int n, int k,
@@ -508,6 +709,9 @@ public:
     RealD t2=usecond();
     int32_t batchCount = Amk.size();
 
+    assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
+    assert(OpB!=GridBLAS_OP_C);
+
     int lda = m; // m x k column major
     int ldb = k; // k x n column major
     int ldc = m; // m x b column major
@@ -568,160 +772,124 @@ public:
     assert(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
-    /*
       int64_t m64=m;
       int64_t n64=n;
       int64_t k64=k;
+      int64_t lda64=lda;
+      int64_t ldb64=ldb;
+      int64_t ldc64=ldc;
       int64_t batchCount64=batchCount;
-      oneapi::mkl::blas::column_major::gemm_batch(*theGridAccelerator,
-      onemkl::transpose::N,
-      onemkl::transpose::N,
-      &m64,&n64,&k64,
-      (double *) &alpha_p[0],
-      (double **)&Amk[0], lda,
-      (double **)&Bkn[0], ldb,
-      (double *) &beta_p[0],
-      (double **)&Cmn[0], ldc,
-      1,&batchCount64);
-     */
-    //MKL’s cblas_<T>gemm_batch & OneAPI
-#warning "oneMKL implementation not built "
+
+      oneapi::mkl::transpose iOpA;
+      oneapi::mkl::transpose iOpB;
+      
+      if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
+      if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
+      if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
+      if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
+      if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
+      if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
+
+      oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
+						  &iOpA,
+						  &iOpB,
+						  &m64,&n64,&k64,
+						  (double *) &alpha_p[0],
+						  (const double **)&Amk[0], (const int64_t *)&lda64,
+						  (const double **)&Bkn[0], (const int64_t *)&ldb64,
+						  (double *) &beta_p[0],
+						  (double **)&Cmn[0], (const int64_t *)&ldc64,
+						  (int64_t)1,&batchCount64,std::vector<sycl::event>());
+      synchronise();
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
-    int sda = lda*k;
-    int sdb = ldb*k;
-    int sdc = ldc*n;
-    // Need a default/reference implementation
-    for (int p = 0; p < batchCount; ++p) {
-      for (int mm = 0; mm < m; ++mm) {
-	for (int nn = 0; nn < n; ++nn) {
-	  RealD c_mn(0.0);
-	  for (int kk = 0; kk < k; ++kk)
-	    c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
-	  Cmn[p][mm + nn*ldc] =  (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
-	}
+    // Need a default/reference implementation; use Eigen
+      if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
+	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
+	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
+	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
+	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+	  });
+      } else { 
+	assert(0);
       }
-    }
 #endif
      RealD t1=usecond();
      RealD flops = 2.0*m*n*k*batchCount;
      RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
   }
-  
-
-  
-  ////////////////////////////////////////////////////////////////////////////////////////////////
-  // Strided case used by benchmark, but generally unused in Grid
-  // Keep a code example in double complex, but don't generate the single and real variants for now
-  ////////////////////////////////////////////////////////////////////////////////////////////////
-  
-  void gemmStridedBatched(int m,int n, int k,
-			  ComplexD alpha,
-			  ComplexD* Amk,  // pointer list to matrices
-			  ComplexD* Bkn,
-			  ComplexD beta,
-			  ComplexD* Cmn,
-			  int batchCount)
-  {
-    // Use C-row major storage, so transpose calls
-    int lda = m; // m x k column major
-    int ldb = k; // k x n column major
-    int ldc = m; // m x b column major
-    int sda = m*k;
-    int sdb = k*n;
-    int sdc = m*n;
-    deviceVector<ComplexD> alpha_p(1);
-    deviceVector<ComplexD> beta_p(1);
-    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
-    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
-    //    std::cout << "blasZgemmStridedBatched mnk  "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
-    //    std::cout << "blasZgemmStridedBatched ld   "<<lda<<","<<ldb<<","<<ldc<<std::endl;
-    //    std::cout << "blasZgemmStridedBatched sd   "<<sda<<","<<sdb<<","<<sdc<<std::endl;
-#ifdef GRID_HIP
-    auto err = hipblasZgemmStridedBatched(gridblasHandle,
-					  HIPBLAS_OP_N,
-					  HIPBLAS_OP_N,
-					  m,n,k,
-					  (hipblasDoubleComplex *) &alpha_p[0],
-					  (hipblasDoubleComplex *) Amk, lda, sda,
-					  (hipblasDoubleComplex *) Bkn, ldb, sdb,
-					  (hipblasDoubleComplex *) &beta_p[0],
-					  (hipblasDoubleComplex *) Cmn, ldc, sdc,
-					  batchCount);
-    assert(err==HIPBLAS_STATUS_SUCCESS);
-#endif
-#ifdef GRID_CUDA
-    cublasZgemmStridedBatched(gridblasHandle,
-			      CUBLAS_OP_N,
-			      CUBLAS_OP_N,
-			      m,n,k,
-			      (cuDoubleComplex *) &alpha_p[0],
-			      (cuDoubleComplex *) Amk, lda, sda,
-			      (cuDoubleComplex *) Bkn, ldb, sdb,
-			      (cuDoubleComplex *) &beta_p[0],
-			      (cuDoubleComplex *) Cmn, ldc, sdc,
-			      batchCount);
-#endif
-#if defined(GRID_SYCL) || defined(GRID_ONE_MKL)
-    oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
-						oneapi::mkl::transpose::N,
-						oneapi::mkl::transpose::N,
-						m,n,k,
-						alpha,
-						(const ComplexD *)Amk,lda,sda,
-						(const ComplexD *)Bkn,ldb,sdb,
-						beta,
-						(ComplexD *)Cmn,ldc,sdc,
-						batchCount);
-#endif
-#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
-     // Need a default/reference implementation
-     for (int p = 0; p < batchCount; ++p) {
-       for (int mm = 0; mm < m; ++mm) {
-	 for (int nn = 0; nn < n; ++nn) {
-	   ComplexD c_mn(0.0);
-	   for (int kk = 0; kk < k; ++kk)
-	     c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
-	   Cmn[mm + nn*ldc + p*sdc] =  (alpha)*c_mn + (beta)*Cmn[mm + nn*ldc + p*sdc];
-	 }
-       }
-     }
-#endif
-  }
 
+  template<class CComplex>
   double benchmark(int M, int N, int K, int BATCH)
   {
     int32_t N_A = M*K*BATCH;
     int32_t N_B = K*N*BATCH;
     int32_t N_C = M*N*BATCH;
-    deviceVector<ComplexD> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD));
-    deviceVector<ComplexD> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD));
-    deviceVector<ComplexD> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD));
-    ComplexD alpha(1.0);
-    ComplexD beta (1.0);
+    deviceVector<CComplex> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(CComplex));
+    deviceVector<CComplex> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(CComplex));
+    deviceVector<CComplex> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(CComplex));
+    CComplex alpha(1.0);
+    CComplex beta (1.0);
     RealD flops = 8.0*M*N*K*BATCH;
-    int ncall=10;
+    int ncall=1000;
+    deviceVector<CComplex *> As(BATCH);
+    deviceVector<CComplex *> Bs(BATCH);
+    deviceVector<CComplex *> Cs(BATCH);
+    for(int b = 0 ; b < BATCH;b++) {
+      CComplex *ptr;
+      ptr = &A[b*M*K];      acceleratorPut(As[b],ptr);
+      ptr = &B[b*K*N];      acceleratorPut(Bs[b],ptr);
+      ptr = &C[b*M*N];      acceleratorPut(Cs[b],ptr);
+    }
+
+    // Warm up call
+    gemmBatched(M,N,K,
+		alpha,
+		As, // m x k 
+		Bs, // k x n
+		beta, 
+		Cs);
+    synchronise();
+
     RealD t0 = usecond();
     for(int i=0;i<ncall;i++){
-      gemmStridedBatched(M,N,K,
-			 alpha,
-			 &A[0], // m x k 
-			 &B[0], // k x n
-			 beta, 
-			 &C[0], // m x n
-			 BATCH);
+      gemmBatched(M,N,K,
+		  alpha,
+		  As, // m x k 
+		  Bs, // k x n
+		  beta, 
+		  Cs);
+      synchronise();
     }
-    synchronise();
     RealD t1 = usecond();
-    RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K)*BATCH;
+    RealD bytes = 1.0*sizeof(CComplex)*(M*N*2+N*K+M*K)*BATCH;
     flops = 8.0*M*N*K*BATCH*ncall;
     flops = flops/(t1-t0)/1.e3;
     return flops; // Returns gigaflops
   }
-
-
-
-
 };
 
 NAMESPACE_END(Grid);
```
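The CPU fallback path above replaces the hand-rolled triple loops with Eigen maps over the existing column-major buffers (thread_for is Grid's parallel loop macro). A standalone sketch, not Grid code, of the mapping convention this relies on, showing why an OpA of GridBLAS_OP_C maps A as k x m and applies .adjoint():

```cpp
#include <Eigen/Dense>
#include <complex>
#include <iostream>
#include <vector>

int main(void)
{
  const int m = 2, n = 2, k = 3;
  // Column-major storage, as in the GridBLAS calls: A is stored k x m because OpA = 'C'.
  std::vector<std::complex<double>> A(k*m), B(k*n), C(m*n, {0.0, 0.0});
  for (size_t i = 0; i < A.size(); i++) A[i] = {double(i), 1.0};
  for (size_t i = 0; i < B.size(); i++) B[i] = {1.0, double(i)};
  std::complex<double> alpha(1.0, 0.0), beta(0.0, 0.0);

  Eigen::Map<Eigen::MatrixXcd> eAmk(A.data(), k, m); // stored k x m, used as A^H
  Eigen::Map<Eigen::MatrixXcd> eBkn(B.data(), k, n);
  Eigen::Map<Eigen::MatrixXcd> eCmn(C.data(), m, n);

  // C = beta*C + alpha * A^H * B, i.e. the (OpA==C, OpB==N) branch above.
  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn;
  std::cout << eCmn << std::endl;
  return 0;
}
```

Writing through the Map updates the underlying buffer in place, which is what lets the batched fallback avoid any copies per matrix.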
```diff
@@ -279,11 +279,11 @@ public:
       Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
       diagonalize(eval2,lmd2,lme2,Nu,Nm,Nm,Qt,grid);
       _sort.push(eval2,Nm);
-      //      Glog << "#Ritz value before shift: "<< std::endl;
+      Glog << "#Ritz value before shift: "<< std::endl;
       for(int i=0; i<Nm; ++i){
-	//        std::cout.precision(13);
-	//        std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
-	//        std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
+	std::cout.precision(13);
+	std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
+	std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
       }
       
       //----------------------------------------------------------------------
@@ -297,7 +297,8 @@ public:
         
         unpackHermitBlockTriDiagMatToEigen(lmd,lme,Nu,Nblock_m,Nm,Nm,BTDM);
 
-        for(int ip=Nk; ip<Nm; ++ip){ 
+        for(int ip=Nk; ip<Nm; ++ip){
+	  Glog << " ip "<<ip<<" / "<<Nm<<std::endl;
           shiftedQRDecompEigen(BTDM,Nu,Nm,eval2[ip],Q);
         }
         
@@ -325,7 +326,7 @@ public:
         Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
         diagonalize(eval2,lmd2,lme2,Nu,Nk,Nm,Qt,grid);
         _sort.push(eval2,Nk);
-	//        Glog << "#Ritz value after shift: "<< std::endl;
+	Glog << "#Ritz value after shift: "<< std::endl;
         for(int i=0; i<Nk; ++i){
 	  //          std::cout.precision(13);
 	  //          std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
@@ -467,10 +468,10 @@ public:
   
     // set initial vector
     for (int i=0; i<Nu; ++i) {
-      //      Glog << "norm2(src[" << i << "])= "<< norm2(src[i]) << std::endl;
+      Glog << "norm2(src[" << i << "])= "<< norm2(src[i]) << std::endl;
       evec[i] = src[i];
       orthogonalize(evec[i],evec,i);
-      //      Glog << "norm2(evec[" << i << "])= "<< norm2(evec[i]) << std::endl;
+      Glog << "norm2(evec[" << i << "])= "<< norm2(evec[i]) << std::endl;
     }
 //    exit(-43);
     
@@ -506,11 +507,11 @@ public:
       Qt = Eigen::MatrixXcd::Identity(Nr,Nr);
       diagonalize(eval2,lmd2,lme2,Nu,Nr,Nr,Qt,grid);
       _sort.push(eval2,Nr);
-      //      Glog << "#Ritz value: "<< std::endl;
+      Glog << "#Ritz value: "<< std::endl;
       for(int i=0; i<Nr; ++i){
-	//        std::cout.precision(13);
-	//        std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
-	//        std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
+	std::cout.precision(13);
+	std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
+	std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
       }
       
       // Convergence test
@@ -570,6 +571,7 @@ public:
       Glog << fname + " NOT converged ; Summary :\n";
     } else {
       Glog << fname + " CONVERGED ; Summary :\n";
+      Nstop = Nconv_guess; // Just take them all
       // Sort convered eigenpairs.
       std::vector<Field>  Btmp(Nstop,grid); // waste of space replicating
 
@@ -642,7 +644,7 @@ private:
       //      for (int u=0; u<mrhs; ++u) Glog << " out["<<u<<"] = "<<norm2(out[u])<<std::endl;
       k_start +=mrhs;
     }
-    //    Glog << "LinAlg "<< std::endl;
+    Glog << "LinAlg "<< std::endl;
     
     if (b>0) {
      for (int u=0; u<Nu; ++u) {
@@ -676,7 +678,7 @@ private:
       }
       w_copy[u] = w[u];
     }
-    //    Glog << "LinAlg done"<< std::endl;
+    Glog << "LinAlg done"<< std::endl;
     
     // In block version, the steps 6 and 7 in Lanczos construction is
     // replaced by the QR decomposition of new basis block.
@@ -689,15 +691,15 @@ private:
     }
 
     // re-orthogonalization for numerical stability
-    //    Glog << "Gram Schmidt"<< std::endl;
+    Glog << "Gram Schmidt"<< std::endl;
     orthogonalize(w,Nu,evec,R);
     // QR part
     for (int u=1; u<Nu; ++u) {
       orthogonalize(w[u],w,u);
     }
-    //    Glog << "Gram Schmidt done "<< std::endl;
+    Glog << "Gram Schmidt done "<< std::endl;
     
-    //    Glog << "LinAlg "<< std::endl;
+    Glog << "LinAlg "<< std::endl;
     for (int u=0; u<Nu; ++u) {
       //for (int v=0; v<Nu; ++v) {
       for (int v=u; v<Nu; ++v) {
@@ -714,7 +716,7 @@ private:
 	//        Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl;
       }
     }
-    //    Glog << "LinAlg done "<< std::endl;
+    Glog << "LinAlg done "<< std::endl;
 
     if (b < Nm/Nu-1) {
       for (int u=0; u<Nu; ++u) {
@@ -779,7 +781,7 @@ private:
     
     for ( int u=0; u<Nu; ++u ) {
       for (int k=0; k<Nk; ++k ) {
-//        Glog << "lmd "<<u<<" "<<k<<" "<<lmd[u][k] -conjugate(lmd[u][k])<<std::endl;
+	//	Glog << "lmd "<<u<<" "<<k<<" "<<lmd[u][k] -conjugate(lmd[u][k])<<std::endl;
         BlockTriDiag(k,u+(k/Nu)*Nu) = lmd[u][k];
       }
     }
@@ -933,7 +935,7 @@ if (1){
          int Nu, int Nb, int Nk, int Nm,
          Eigen::MatrixXcd& M)
   {
-    //Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n'; 
+    Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n'; 
     assert( Nk%Nu == 0 && Nm%Nu == 0 );
     assert( Nk <= Nm );
     M = Eigen::MatrixXcd::Zero(Nk,Nk);
@@ -951,7 +953,7 @@ if (1){
         M(u+(k/Nu)*Nu,k-Nu) = lme[u][k-Nu];
       }
     }
-    //Glog << "unpackHermitBlockTriDiagMatToEigen() end" << endl; 
+    Glog << "unpackHermitBlockTriDiagMatToEigen() end" << std::endl; 
   }
 
 
@@ -961,7 +963,7 @@ if (1){
         int Nu, int Nb, int Nk, int Nm,
         Eigen::MatrixXcd& M)
   {
-    //Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n'; 
+    Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n'; 
     assert( Nk%Nu == 0 && Nm%Nu == 0 );
     assert( Nk <= Nm );
     
@@ -977,7 +979,7 @@ if (1){
         lme[u][k-Nu] = M(u+(k/Nu)*Nu,k-Nu);
       }
     }
-    //Glog << "packHermitBlockTriDiagMatfromEigen() end" << endl; 
+    Glog << "packHermitBlockTriDiagMatfromEigen() end" <<std::endl; 
   }
 
 
@@ -986,7 +988,7 @@ if (1){
 		            RealD Dsh,
 		            Eigen::MatrixXcd& Qprod)
   {
-    //Glog << "shiftedQRDecompEigen() begin" << '\n'; 
+    Glog << "shiftedQRDecompEigen() begin" << '\n'; 
     Eigen::MatrixXcd Q = Eigen::MatrixXcd::Zero(Nm,Nm);
     Eigen::MatrixXcd R = Eigen::MatrixXcd::Zero(Nm,Nm);
     Eigen::MatrixXcd Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
@@ -1002,6 +1004,7 @@ if (1){
                         // lower triangular part used to represent series
                         // of Q sequence.
 
+    Glog << "shiftedQRDecompEigen() Housholder & QR" << '\n'; 
     // equivalent operation of Qprod *= Q
     //M = Eigen::MatrixXcd::Zero(Nm,Nm);
     
@@ -1022,6 +1025,7 @@ if (1){
     
     Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
 
+    Glog << "shiftedQRDecompEigen() Mtmp create" << '\n'; 
     for (int i=0; i<Nm; ++i) {
       for (int j=0; j<Nm-(Nu+1); ++j) {
         for (int k=0; k<Nu+1+j; ++k) {
@@ -1029,6 +1033,7 @@ if (1){
         }
       }
     }
+    Glog << "shiftedQRDecompEigen() Mtmp loop1" << '\n'; 
     for (int i=0; i<Nm; ++i) {
       for (int j=Nm-(Nu+1); j<Nm; ++j) {
        for (int k=0; k<Nm; ++k) {
@@ -1036,6 +1041,7 @@ if (1){
         }
       }
     }
+    Glog << "shiftedQRDecompEigen() Mtmp loop2" << '\n'; 
     
     //static int ntimes = 2;
     //for (int j=0; j<Nm-(ntimes*Nu); ++j) {
@@ -1061,11 +1067,13 @@ if (1){
         Mtmp(j,i) = conj(Mtmp(i,j));
       }
     }
+    Glog << "shiftedQRDecompEigen() Mtmp loop3" << '\n'; 
 
     for (int i=0; i<Nm; ++i) {
       Mtmp(i,i) = real(Mtmp(i,i)) + Dsh;
     }
     
+    Glog << "shiftedQRDecompEigen() Mtmp loop4" << '\n'; 
     M = Mtmp;
 
     //M = Q.adjoint()*(M*Q);
@@ -1077,7 +1085,7 @@ if (1){
     //  }
     //}
     
-    //Glog << "shiftedQRDecompEigen() end" << endl; 
+    Glog << "shiftedQRDecompEigen() end" <<std::endl; 
   }
 
   void exampleQRDecompEigen(void)
```
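For orientation, shiftedQRDecompEigen performs one step of the shifted QR iteration used in the implicit restart: factor M - Dsh*I = QR, then apply the similarity transform M' = Q^H M Q while accumulating the product of Q factors in Qprod ("equivalent operation of Qprod *= Q" in the code). A standalone Eigen sketch of that single step, under my reading of the routine (this is the textbook operation, not the routine's band-exploiting loops):

```cpp
#include <Eigen/Dense>
#include <iostream>

int main(void)
{
  const int Nm = 4;
  Eigen::MatrixXcd M = Eigen::MatrixXcd::Random(Nm, Nm);
  M = (M + M.adjoint()).eval();            // Hermitian test matrix
  double Dsh = M(Nm-1, Nm-1).real();       // shift, e.g. an unwanted Ritz value

  Eigen::MatrixXcd I = Eigen::MatrixXcd::Identity(Nm, Nm);
  Eigen::HouseholderQR<Eigen::MatrixXcd> qr(M - Dsh*I);
  Eigen::MatrixXcd Q = qr.householderQ();

  // One shifted-QR step: a similarity transform, so the spectrum is preserved
  // while components near the shift are damped over repeated steps.
  Eigen::MatrixXcd Mnew = Q.adjoint() * M * Q;
  std::cout << "Hermiticity residual: " << (Mnew - Mnew.adjoint()).norm() << std::endl;
  return 0;
}
```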
```diff
@@ -499,6 +499,87 @@ namespace Grid {
       }
   };
 
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Site diagonal is identity, left preconditioned by Mee^inv
+  // ( 1 - Mee^inv Meo Moo^inv Moe ) phi = Mee_inv ( Mee - Meo Moo^inv Moe Mee^inv  ) phi =  Mee_inv eta
+  //
+  // Solve:
+  // ( 1 - Mee^inv Meo Moo^inv Moe )^dag ( 1 - Mee^inv Meo Moo^inv Moe ) phi = ( 1 - Mee^inv Meo Moo^inv Moe )^dag  Mee_inv eta
+  //
+  // Old notation e<->o
+  //
+  // Left precon by Moo^-1
+  //  b) (Doo^{dag} M_oo^-dag) (Moo^-1 Doo) psi_o =  [ (D_oo)^dag M_oo^-dag ] Moo^-1 L^{-1}  eta_o
+  //                                   eta_o'     = (D_oo)^dag  M_oo^-dag Moo^-1 (eta_o - Moe Mee^{-1} eta_e)
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  template<class Field> class SchurRedBlackDiagOneSolve : public SchurRedBlackBase<Field> {
+  public:
+    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
+
+    /////////////////////////////////////////////////////
+    // Wrap the usual normal equations Schur trick
+    /////////////////////////////////////////////////////
+  SchurRedBlackDiagOneSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false,
+      const bool _solnAsInitGuess = false)  
+    : SchurRedBlackBase<Field>(HermitianRBSolver,initSubGuess,_solnAsInitGuess) {};
+
+    virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+      GridBase *fgrid= _Matrix.Grid();
+
+      SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
+      
+      Field   tmp(grid);
+      Field  Mtmp(grid);
+
+      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Odd ,src_o,src);
+    
+      /////////////////////////////////////////////////////
+      // src_o = Mpcdag *MooeeInv * (source_o - Moe MeeInv source_e)
+      /////////////////////////////////////////////////////
+      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.Checkerboard() ==Even);
+      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.Checkerboard() ==Odd);     
+      Mtmp=src_o-Mtmp;                 
+      _Matrix.MooeeInv(Mtmp,tmp);      assert( tmp.Checkerboard() ==Odd);     
+      
+      // get the right MpcDag
+      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.Checkerboard() ==Odd);       
+    }
+
+    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+      GridBase *fgrid= _Matrix.Grid();
+
+      Field   tmp(grid);
+      Field   sol_e(grid);
+
+
+      ///////////////////////////////////////////////////
+      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
+      ///////////////////////////////////////////////////
+      _Matrix.Meooe(sol_o,tmp);    assert(  tmp.Checkerboard()   ==Even);
+      tmp = src_e-tmp;             assert(  src_e.Checkerboard() ==Even);
+      _Matrix.MooeeInv(tmp,sol_e); assert(  sol_e.Checkerboard() ==Even);
+     
+      setCheckerboard(sol,sol_e);  assert(  sol_e.Checkerboard() ==Even);
+      setCheckerboard(sol,sol_o);  assert(  sol_o.Checkerboard() ==Odd );
+    };
+
+    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o)
+    {
+      SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
+    };
+    virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o)
+    {
+      SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); 
+    }
+  };
+
   ///////////////////////////////////////////////////////////////////////////////////////////////////////
   // Site diagonal is identity, right preconditioned by Mee^inv
   // ( 1 - Meo Moo^inv Moe Mee^inv  ) phi =( 1 - Meo Moo^inv Moe Mee^inv  ) Mee psi =  = eta  = eta
```
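Restating the comment block at the head of the new class in equation form (my reading, same operator names; D_oo is the odd-odd Schur complement):

```latex
% Left-preconditioned red-black Schur solve, as sketched in the comment above.
\begin{align*}
  D_{oo} &= M_{oo} - M_{oe}\,M_{ee}^{-1}\,M_{eo}, &
  M_{oo}^{-1} D_{oo} &= 1 - M_{oo}^{-1} M_{oe}\,M_{ee}^{-1} M_{eo},\\
  \eta_o' &= \bigl(M_{oo}^{-1} D_{oo}\bigr)^{\dagger} M_{oo}^{-1}
             \bigl(\eta_o - M_{oe}\,M_{ee}^{-1}\,\eta_e\bigr), &
  \bigl(M_{oo}^{-1} D_{oo}\bigr)^{\dagger}\bigl(M_{oo}^{-1} D_{oo}\bigr)\,\psi_o &= \eta_o',
\end{align*}
after which \texttt{RedBlackSolution} reconstructs the even checkerboard as
\begin{equation*}
  \psi_e = M_{ee}^{-1}\bigl(\eta_e - M_{eo}\,\psi_o\bigr).
\end{equation*}
```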
| @@ -54,6 +54,9 @@ public: | |||||||
|     size_type bytes = __n*sizeof(_Tp); |     size_type bytes = __n*sizeof(_Tp); | ||||||
|     profilerAllocate(bytes); |     profilerAllocate(bytes); | ||||||
|     _Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes); |     _Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes); | ||||||
|  |     if ( (_Tp*)ptr == (_Tp *) NULL ) { | ||||||
|  |       printf("Grid CPU Allocator got NULL for %lu bytes\n",(unsigned long) bytes ); | ||||||
|  |     } | ||||||
|     assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); |     assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); | ||||||
|     return ptr; |     return ptr; | ||||||
|   } |   } | ||||||
| @@ -100,6 +103,9 @@ public: | |||||||
|     size_type bytes = __n*sizeof(_Tp); |     size_type bytes = __n*sizeof(_Tp); | ||||||
|     profilerAllocate(bytes); |     profilerAllocate(bytes); | ||||||
|     _Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes); |     _Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes); | ||||||
|  |     if ( (_Tp*)ptr == (_Tp *) NULL ) { | ||||||
|  |       printf("Grid Shared Allocator got NULL for %lu bytes\n",(unsigned long) bytes ); | ||||||
|  |     } | ||||||
|     assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); |     assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); | ||||||
|     return ptr; |     return ptr; | ||||||
|   } |   } | ||||||
| @@ -145,6 +151,9 @@ public: | |||||||
|     size_type bytes = __n*sizeof(_Tp); |     size_type bytes = __n*sizeof(_Tp); | ||||||
|     profilerAllocate(bytes); |     profilerAllocate(bytes); | ||||||
|     _Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes); |     _Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes); | ||||||
|  |     if ( (_Tp*)ptr == (_Tp *) NULL ) { | ||||||
|  |       printf("Grid Device Allocator got NULL for %lu bytes\n",(unsigned long) bytes ); | ||||||
|  |     } | ||||||
|     assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); |     assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); | ||||||
|     return ptr; |     return ptr; | ||||||
|   } |   } | ||||||
|   | |||||||
| @@ -16,6 +16,44 @@ NAMESPACE_BEGIN(Grid); | |||||||
| uint64_t total_shared; | uint64_t total_shared; | ||||||
| uint64_t total_device; | uint64_t total_device; | ||||||
uint64_t total_host; | uint64_t total_host; | ||||||
|  |  | ||||||
|  | #if defined(__has_feature) | ||||||
|  | #if __has_feature(leak_sanitizer) | ||||||
|  | #define ASAN_LEAK_CHECK | ||||||
|  | #endif | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | #ifdef ASAN_LEAK_CHECK | ||||||
|  | #include <sanitizer/asan_interface.h> | ||||||
|  | #include <sanitizer/common_interface_defs.h> | ||||||
|  | #include <sanitizer/lsan_interface.h> | ||||||
|  | #define LEAK_CHECK(A) { __lsan_do_recoverable_leak_check(); } | ||||||
|  | #else | ||||||
|  | #define LEAK_CHECK(A) { } | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | void MemoryManager::DisplayMallinfo(void) | ||||||
|  | { | ||||||
|  | #ifdef __linux__ | ||||||
|  |   struct mallinfo mi; // really want mallinfo2, but glibc version isn't uniform | ||||||
|  |    | ||||||
|  |   mi = mallinfo(); | ||||||
|  |  | ||||||
|  |   std::cout << "MemoryManager: Total non-mmapped bytes (arena):       "<< (size_t)mi.arena<<std::endl; | ||||||
|  |   std::cout << "MemoryManager: # of free chunks (ordblks):            "<< (size_t)mi.ordblks<<std::endl; | ||||||
|  |   std::cout << "MemoryManager: # of free fastbin blocks (smblks):     "<< (size_t)mi.smblks<<std::endl; | ||||||
|  |   std::cout << "MemoryManager: # of mapped regions (hblks):           "<< (size_t)mi.hblks<<std::endl; | ||||||
|  |   std::cout << "MemoryManager: Bytes in mapped regions (hblkhd):      "<< (size_t)mi.hblkhd<<std::endl; | ||||||
|  |   std::cout << "MemoryManager: Max. total allocated space (usmblks):  "<< (size_t)mi.usmblks<<std::endl; | ||||||
|  |   std::cout << "MemoryManager: Free bytes held in fastbins (fsmblks): "<< (size_t)mi.fsmblks<<std::endl; | ||||||
|  |   std::cout << "MemoryManager: Total allocated space (uordblks):      "<< (size_t)mi.uordblks<<std::endl; | ||||||
|  |   std::cout << "MemoryManager: Total free space (fordblks):           "<< (size_t)mi.fordblks<<std::endl; | ||||||
|  |   std::cout << "MemoryManager: Topmost releasable block (keepcost):   "<< (size_t)mi.keepcost<<std::endl; | ||||||
|  | #endif | ||||||
|  |   LEAK_CHECK(); | ||||||
|  |   | ||||||
|  | } | ||||||
|  |  | ||||||
| void MemoryManager::PrintBytes(void) | void MemoryManager::PrintBytes(void) | ||||||
| { | { | ||||||
|   std::cout << " MemoryManager : ------------------------------------ "<<std::endl; |   std::cout << " MemoryManager : ------------------------------------ "<<std::endl; | ||||||
| @@ -35,7 +73,7 @@ void MemoryManager::PrintBytes(void) | |||||||
| #ifdef GRID_CUDA | #ifdef GRID_CUDA | ||||||
|   cuda_mem(); |   cuda_mem(); | ||||||
| #endif | #endif | ||||||
|    |   DisplayMallinfo(); | ||||||
| } | } | ||||||
|  |  | ||||||
| uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; } | uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; } | ||||||
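DisplayMallinfo deliberately sticks with the legacy mallinfo(), whose counters are plain ints and can wrap above ~2 GiB, because (as its comment notes) mallinfo2 is not available on every glibc. A hedged version guard, assuming glibc is the only target that reaches this path; mallinfo2 appeared in glibc 2.33, and this guard is illustrative, not part of the patch:

    #if defined(__GLIBC__) && (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 33))
      struct mallinfo2 mi = mallinfo2();  // size_t counters, no wrap
    #else
      struct mallinfo  mi = mallinfo();   // int counters, may wrap above 2 GiB
    #endif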
|   | |||||||
| @@ -211,6 +211,7 @@ private: | |||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  public: |  public: | ||||||
|  |   static void DisplayMallinfo(void); | ||||||
|   static void NotifyDeletion(void * CpuPtr); |   static void NotifyDeletion(void * CpuPtr); | ||||||
|   static void Print(void); |   static void Print(void); | ||||||
|   static void PrintAll(void); |   static void PrintAll(void); | ||||||
|   | |||||||
| @@ -91,6 +91,7 @@ public: | |||||||
|   //////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////// | ||||||
|   virtual int CheckerBoarded(int dim)=0; |   virtual int CheckerBoarded(int dim)=0; | ||||||
|   virtual int CheckerBoard(const Coordinate &site)=0; |   virtual int CheckerBoard(const Coordinate &site)=0; | ||||||
|  |   virtual int CheckerDim(void){ return 0; }; | ||||||
|   virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0; |   virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0; | ||||||
|   virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0; |   virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0; | ||||||
|   virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0; |   virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0; | ||||||
|   | |||||||
| @@ -60,6 +60,7 @@ public: | |||||||
|   int              _checker_dim; |   int              _checker_dim; | ||||||
|   std::vector<int> _checker_board; |   std::vector<int> _checker_board; | ||||||
|  |  | ||||||
|  |   virtual int CheckerDim(void){ return _checker_dim; }; | ||||||
|   virtual int CheckerBoarded(int dim){ |   virtual int CheckerBoarded(int dim){ | ||||||
|     if( dim==_checker_dim) return 1; |     if( dim==_checker_dim) return 1; | ||||||
|     else return 0; |     else return 0; | ||||||
|   | |||||||
| @@ -236,11 +236,18 @@ public: | |||||||
|   template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){ |   template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){ | ||||||
|     vobj vtmp; |     vobj vtmp; | ||||||
|     vtmp = r; |     vtmp = r; | ||||||
|  | #if 1 | ||||||
|  |     auto me  = View(CpuWrite); | ||||||
|  |     thread_for(ss,me.size(),{ | ||||||
|  |        me[ss]= r; | ||||||
|  |       }); | ||||||
|  | #else     | ||||||
|     auto me  = View(AcceleratorWrite); |     auto me  = View(AcceleratorWrite); | ||||||
|     accelerator_for(ss,me.size(),vobj::Nsimd(),{ |     accelerator_for(ss,me.size(),vobj::Nsimd(),{ | ||||||
| 	auto stmp=coalescedRead(vtmp); | 	auto stmp=coalescedRead(vtmp); | ||||||
| 	coalescedWrite(me[ss],stmp); | 	coalescedWrite(me[ss],stmp); | ||||||
|     }); |     }); | ||||||
|  | #endif     | ||||||
|     me.ViewClose(); |     me.ViewClose(); | ||||||
|     return *this; |     return *this; | ||||||
|   } |   } | ||||||
|   | |||||||
| @@ -264,24 +264,8 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> & | |||||||
|   const uint64_t sites = grid->oSites(); |   const uint64_t sites = grid->oSites(); | ||||||
|    |    | ||||||
|   // Might make all code paths go this way. |   // Might make all code paths go this way. | ||||||
| #if 0 |  | ||||||
|   typedef decltype(innerProductD(vobj(),vobj())) inner_t; |  | ||||||
|   Vector<inner_t> inner_tmp(sites); |  | ||||||
|   auto inner_tmp_v = &inner_tmp[0]; |  | ||||||
|   { |  | ||||||
|     autoView( left_v , left, AcceleratorRead); |  | ||||||
|     autoView( right_v,right, AcceleratorRead); |  | ||||||
|     // This code could read coalesce |  | ||||||
|     // GPU - SIMT lane compliance... |  | ||||||
|     accelerator_for( ss, sites, nsimd,{ |  | ||||||
| 	auto x_l = left_v(ss); |  | ||||||
| 	auto y_l = right_v(ss); |  | ||||||
| 	coalescedWrite(inner_tmp_v[ss],innerProductD(x_l,y_l)); |  | ||||||
|     }); |  | ||||||
|   } |  | ||||||
| #else |  | ||||||
|   typedef decltype(innerProduct(vobj(),vobj())) inner_t; |   typedef decltype(innerProduct(vobj(),vobj())) inner_t; | ||||||
|   Vector<inner_t> inner_tmp(sites); |   deviceVector<inner_t> inner_tmp(sites); | ||||||
|   auto inner_tmp_v = &inner_tmp[0]; |   auto inner_tmp_v = &inner_tmp[0]; | ||||||
|      |      | ||||||
|   { |   { | ||||||
| @@ -295,7 +279,6 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> & | |||||||
| 	coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l)); | 	coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l)); | ||||||
|     }); |     }); | ||||||
|   } |   } | ||||||
| #endif |  | ||||||
|   // This is in single precision and fails some tests |   // This is in single precision and fails some tests | ||||||
|   auto anrm = sumD(inner_tmp_v,sites);   |   auto anrm = sumD(inner_tmp_v,sites);   | ||||||
|   nrm = anrm; |   nrm = anrm; | ||||||
| @@ -373,7 +356,8 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt | |||||||
|   nrm = real(TensorRemove(sum(inner_tmp_v,sites))); |   nrm = real(TensorRemove(sum(inner_tmp_v,sites))); | ||||||
| #else | #else | ||||||
|   typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t; |   typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t; | ||||||
|   Vector<inner_t> inner_tmp(sites); |   deviceVector<inner_t> inner_tmp; | ||||||
|  |   inner_tmp.resize(sites); | ||||||
|   auto inner_tmp_v = &inner_tmp[0]; |   auto inner_tmp_v = &inner_tmp[0]; | ||||||
|  |  | ||||||
|   accelerator_for( ss, sites, nsimd,{ |   accelerator_for( ss, sites, nsimd,{ | ||||||
|   | |||||||
| @@ -9,14 +9,18 @@ inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer os | |||||||
| { | { | ||||||
|   typedef typename vobj::scalar_object sobj; |   typedef typename vobj::scalar_object sobj; | ||||||
|   typedef typename vobj::scalar_objectD sobjD; |   typedef typename vobj::scalar_objectD sobjD; | ||||||
|   sobj *mysum =(sobj *) malloc_shared(sizeof(sobj),*theGridAccelerator); |   static Vector<sobj> mysum; | ||||||
|  |   mysum.resize(1); | ||||||
|  |   sobj *mysum_p = & mysum[0]; | ||||||
|   sobj identity; zeroit(identity); |   sobj identity; zeroit(identity); | ||||||
|  |   mysum[0] = identity; | ||||||
|   sobj ret ;  |   sobj ret ;  | ||||||
|  |  | ||||||
|   Integer nsimd= vobj::Nsimd(); |   Integer nsimd= vobj::Nsimd(); | ||||||
|    |  | ||||||
|  |   const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() }); | ||||||
|   theGridAccelerator->submit([&](cl::sycl::handler &cgh) { |   theGridAccelerator->submit([&](cl::sycl::handler &cgh) { | ||||||
|      auto Reduction = cl::sycl::reduction(mysum,identity,std::plus<>()); |     auto Reduction = cl::sycl::reduction(mysum_p,identity,std::plus<>(),PropList); | ||||||
|      cgh.parallel_for(cl::sycl::range<1>{osites}, |      cgh.parallel_for(cl::sycl::range<1>{osites}, | ||||||
| 		      Reduction, | 		      Reduction, | ||||||
| 		      [=] (cl::sycl::id<1> item, auto &sum) { | 		      [=] (cl::sycl::id<1> item, auto &sum) { | ||||||
| @@ -26,7 +30,7 @@ inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer os | |||||||
|    }); |    }); | ||||||
|   theGridAccelerator->wait(); |   theGridAccelerator->wait(); | ||||||
|   ret = mysum[0]; |   ret = mysum[0]; | ||||||
|   free(mysum,*theGridAccelerator); |   //  free(mysum,*theGridAccelerator); | ||||||
|   sobjD dret; convertType(dret,ret); |   sobjD dret; convertType(dret,ret); | ||||||
|   return dret; |   return dret; | ||||||
| } | } | ||||||
| @@ -73,19 +77,23 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite | |||||||
| template<class Word> Word svm_xor(Word *vec,uint64_t L) | template<class Word> Word svm_xor(Word *vec,uint64_t L) | ||||||
| { | { | ||||||
|   Word xorResult; xorResult = 0; |   Word xorResult; xorResult = 0; | ||||||
|   Word *d_sum =(Word *)cl::sycl::malloc_shared(sizeof(Word),*theGridAccelerator); |   static Vector<Word> d_sum; | ||||||
|  |   d_sum.resize(1); | ||||||
|  |   Word *d_sum_p=&d_sum[0]; | ||||||
|   Word identity;  identity=0; |   Word identity;  identity=0; | ||||||
|  |   d_sum[0] = identity; | ||||||
|  |   const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() }); | ||||||
|   theGridAccelerator->submit([&](cl::sycl::handler &cgh) { |   theGridAccelerator->submit([&](cl::sycl::handler &cgh) { | ||||||
|      auto Reduction = cl::sycl::reduction(d_sum,identity,std::bit_xor<>()); |     auto Reduction = cl::sycl::reduction(d_sum_p,identity,std::bit_xor<>(),PropList); | ||||||
|      cgh.parallel_for(cl::sycl::range<1>{L}, |      cgh.parallel_for(cl::sycl::range<1>{L}, | ||||||
| 		      Reduction, | 		      Reduction, | ||||||
| 		      [=] (cl::sycl::id<1> index, auto &sum) { | 		      [=] (cl::sycl::id<1> index, auto &sum) { | ||||||
| 	 sum ^=vec[index]; | 	 sum^=vec[index]; | ||||||
|      }); |      }); | ||||||
|    }); |    }); | ||||||
|   theGridAccelerator->wait(); |   theGridAccelerator->wait(); | ||||||
|   Word ret = d_sum[0]; |   Word ret = d_sum[0]; | ||||||
|   free(d_sum,*theGridAccelerator); |   //  free(d_sum,*theGridAccelerator); | ||||||
|   return ret; |   return ret; | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,5 +1,5 @@ | |||||||
| #pragma once | #pragma once | ||||||
| #include <type_traits> |  | ||||||
| #if defined(GRID_CUDA) | #if defined(GRID_CUDA) | ||||||
|  |  | ||||||
| #include <cub/cub.cuh> | #include <cub/cub.cuh> | ||||||
| @@ -90,8 +90,61 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V | |||||||
|    |    | ||||||
|  |  | ||||||
| } | } | ||||||
|  | #endif  | ||||||
|  |  | ||||||
| template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { |  | ||||||
|  | #if defined(GRID_SYCL) | ||||||
|  | template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data, Vector <vobj> &lvSum, const int  &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) | ||||||
|  | { | ||||||
|  |   size_t subvol_size = e1*e2; | ||||||
|  |  | ||||||
|  |   vobj *mysum = (vobj *) malloc_shared(rd*sizeof(vobj),*theGridAccelerator); | ||||||
|  |   vobj vobj_zero; | ||||||
|  |   zeroit(vobj_zero); | ||||||
|  |   for (int r = 0; r<rd; r++) {  | ||||||
|  |     mysum[r] = vobj_zero;  | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   commVector<vobj> reduction_buffer(rd*subvol_size);     | ||||||
|  |  | ||||||
|  |   auto rb_p = &reduction_buffer[0]; | ||||||
|  |  | ||||||
|  |   // autoView(Data_v, Data, AcceleratorRead); | ||||||
|  |  | ||||||
|  |   //prepare reduction buffer  | ||||||
|  |   accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{  | ||||||
|  |    | ||||||
|  |       int n = s / e2; | ||||||
|  |       int b = s % e2; | ||||||
|  |       int so=r*ostride; // base offset for start of plane  | ||||||
|  |       int ss= so+n*stride+b; | ||||||
|  |  | ||||||
|  |       coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss])); | ||||||
|  |  | ||||||
|  |   }); | ||||||
|  |  | ||||||
|  |   for (int r = 0; r < rd; r++) { | ||||||
|  |       theGridAccelerator->submit([&](cl::sycl::handler &cgh) { | ||||||
|  |           auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>()); | ||||||
|  |           cgh.parallel_for(cl::sycl::range<1>{subvol_size}, | ||||||
|  |           Reduction, | ||||||
|  |           [=](cl::sycl::id<1> item, auto &sum) { | ||||||
|  |               auto s = item[0]; | ||||||
|  |               sum += rb_p[r*subvol_size+s]; | ||||||
|  |           }); | ||||||
|  |       }); | ||||||
|  |        | ||||||
|  |       | ||||||
|  |   } | ||||||
|  |   theGridAccelerator->wait(); | ||||||
|  |   for (int r = 0; r < rd; r++) { | ||||||
|  |     lvSum[r] = mysum[r]; | ||||||
|  |   } | ||||||
|  |   free(mysum,*theGridAccelerator); | ||||||
|  | } | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { | ||||||
|   typedef typename vobj::vector_type vector; |   typedef typename vobj::vector_type vector; | ||||||
|   const int words = sizeof(vobj)/sizeof(vector); |   const int words = sizeof(vobj)/sizeof(vector); | ||||||
|   const int osites = rd*e1*e2; |   const int osites = rd*e1*e2; | ||||||
| @@ -106,8 +159,12 @@ template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, V | |||||||
| 	    buf[ss] = dat[ss*words+w]; | 	    buf[ss] = dat[ss*words+w]; | ||||||
|     }); |     }); | ||||||
|  |  | ||||||
|     sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd); |     #if defined(GRID_CUDA) || defined(GRID_HIP) | ||||||
|        |       sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd); | ||||||
|  |     #elif defined(GRID_SYCL) | ||||||
|  |       sliceSumReduction_sycl_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd); | ||||||
|  |     #endif | ||||||
|  |  | ||||||
|     for (int r = 0; r < rd; r++) { |     for (int r = 0; r < rd; r++) { | ||||||
|       lvSum_ptr[w+words*r]=lvSum_small[r]; |       lvSum_ptr[w+words*r]=lvSum_small[r]; | ||||||
|     } |     } | ||||||
| @@ -117,66 +174,24 @@ template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, V | |||||||
|    |    | ||||||
| } | } | ||||||
|  |  | ||||||
| template<class vobj> inline void sliceSumReduction_cub(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) | template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) | ||||||
| { | { | ||||||
|   autoView(Data_v, Data, AcceleratorRead); //hipcub/cub cannot deal with large vobjs so we split into small/large case. |   autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case. | ||||||
|     if constexpr (sizeof(vobj) <= 256) {  |     if constexpr (sizeof(vobj) <= 256) {  | ||||||
|       sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); |  | ||||||
|  |       #if defined(GRID_CUDA) || defined(GRID_HIP) | ||||||
|  |         sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||||
|  |       #elif defined (GRID_SYCL) | ||||||
|  |         sliceSumReduction_sycl_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||||
|  |       #endif | ||||||
|  |  | ||||||
|     } |     } | ||||||
|     else { |     else { | ||||||
|       sliceSumReduction_cub_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); |       sliceSumReduction_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||||
|     } |     } | ||||||
| } | } | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  |  | ||||||
| #if defined(GRID_SYCL) |  | ||||||
| template<class vobj> inline void sliceSumReduction_sycl(const Lattice<vobj> &Data, Vector <vobj> &lvSum, const int  &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) |  | ||||||
| { |  | ||||||
|   typedef typename vobj::scalar_object sobj; |  | ||||||
|   size_t subvol_size = e1*e2; |  | ||||||
|  |  | ||||||
|   vobj *mysum = (vobj *) malloc_shared(sizeof(vobj),*theGridAccelerator); |  | ||||||
|   vobj vobj_zero; |  | ||||||
|   zeroit(vobj_zero); |  | ||||||
|      |  | ||||||
|   commVector<vobj> reduction_buffer(rd*subvol_size);     |  | ||||||
|  |  | ||||||
|   auto rb_p = &reduction_buffer[0]; |  | ||||||
|  |  | ||||||
|   autoView(Data_v, Data, AcceleratorRead); |  | ||||||
|  |  | ||||||
|   //prepare reduction buffer  |  | ||||||
|   accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{  |  | ||||||
|    |  | ||||||
|       int n = s / e2; |  | ||||||
|       int b = s % e2; |  | ||||||
|       int so=r*ostride; // base offset for start of plane  |  | ||||||
|       int ss= so+n*stride+b; |  | ||||||
|  |  | ||||||
|       coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data_v[ss])); |  | ||||||
|  |  | ||||||
|   }); |  | ||||||
|  |  | ||||||
|   for (int r = 0; r < rd; r++) { |  | ||||||
|       mysum[0] = vobj_zero; //dirty hack: cannot pass vobj_zero as identity to sycl::reduction as its not device_copyable |  | ||||||
|       theGridAccelerator->submit([&](cl::sycl::handler &cgh) { |  | ||||||
|           auto Reduction = cl::sycl::reduction(mysum,std::plus<>()); |  | ||||||
|           cgh.parallel_for(cl::sycl::range<1>{subvol_size}, |  | ||||||
|           Reduction, |  | ||||||
|           [=](cl::sycl::id<1> item, auto &sum) { |  | ||||||
|               auto s = item[0]; |  | ||||||
|               sum += rb_p[r*subvol_size+s]; |  | ||||||
|           }); |  | ||||||
|       }); |  | ||||||
|       theGridAccelerator->wait(); |  | ||||||
|       lvSum[r] = mysum[0]; |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   free(mysum,*theGridAccelerator); |  | ||||||
| } |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) | template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) | ||||||
| { | { | ||||||
|   // sum over reduced dimension planes, breaking out orthog dir |   // sum over reduced dimension planes, breaking out orthog dir | ||||||
| @@ -195,13 +210,9 @@ template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data | |||||||
|  |  | ||||||
| template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)  | template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)  | ||||||
| { | { | ||||||
|   #if defined(GRID_CUDA) || defined(GRID_HIP) |   #if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL) | ||||||
|    |    | ||||||
|   sliceSumReduction_cub(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); |   sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||||
|    |  | ||||||
|   #elif defined(GRID_SYCL) |  | ||||||
|    |  | ||||||
|   sliceSumReduction_sycl(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); |  | ||||||
|    |    | ||||||
|   #else |   #else | ||||||
|   sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); |   sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||||
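After this hunk a single entry point, sliceSumReduction_gpu, serves both the cub (CUDA/HIP) and SYCL backends, with the threaded CPU path as the fallback. A hedged sketch of the public operation this machinery backs; the field type and direction are illustrative:

    LatticeComplex field(grid);     // some lattice scalar field
    std::vector<TComplex> slices;   // one partial sum per plane along Tdir
    sliceSum(field, slices, Tdir);  // dispatches through sliceSumReduction above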
|   | |||||||
| @@ -42,50 +42,21 @@ inline void subdivides(GridBase *coarse,GridBase *fine) | |||||||
|     assert((fine->_rdimensions[d] / coarse->_rdimensions[d])* coarse->_rdimensions[d]==fine->_rdimensions[d]);  |     assert((fine->_rdimensions[d] / coarse->_rdimensions[d])* coarse->_rdimensions[d]==fine->_rdimensions[d]);  | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
|   |   | ||||||
| //////////////////////////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| // remove and insert a half checkerboard | // remove and insert a half checkerboard | ||||||
| //////////////////////////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |  | ||||||
| template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full) | template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full) | ||||||
| { | { | ||||||
|   half.Checkerboard() = cb; |   acceleratorPickCheckerboard(cb,half,full); | ||||||
|  |  | ||||||
|   autoView( half_v, half, CpuWrite); |  | ||||||
|   autoView( full_v, full, CpuRead); |  | ||||||
|   thread_for(ss, full.Grid()->oSites(),{ |  | ||||||
|     int cbos; |  | ||||||
|     Coordinate coor; |  | ||||||
|     full.Grid()->oCoorFromOindex(coor,ss); |  | ||||||
|     cbos=half.Grid()->CheckerBoard(coor); |  | ||||||
|  |  | ||||||
|     if (cbos==cb) { |  | ||||||
|       int ssh=half.Grid()->oIndex(coor); |  | ||||||
|       half_v[ssh] = full_v[ss]; |  | ||||||
|     } |  | ||||||
|   }); |  | ||||||
| } | } | ||||||
| template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half) | template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half) | ||||||
| { | { | ||||||
|   int cb = half.Checkerboard(); |   acceleratorSetCheckerboard(full,half); | ||||||
|   autoView( half_v , half, CpuRead); |  | ||||||
|   autoView( full_v , full, CpuWrite); |  | ||||||
|   thread_for(ss,full.Grid()->oSites(),{ |  | ||||||
|  |  | ||||||
|     Coordinate coor; |  | ||||||
|     int cbos; |  | ||||||
|  |  | ||||||
|     full.Grid()->oCoorFromOindex(coor,ss); |  | ||||||
|     cbos=half.Grid()->CheckerBoard(coor); |  | ||||||
|        |  | ||||||
|     if (cbos==cb) { |  | ||||||
|       int ssh=half.Grid()->oIndex(coor); |  | ||||||
|       full_v[ss]=half_v[ssh]; |  | ||||||
|     } |  | ||||||
|   }); |  | ||||||
| } | } | ||||||
|  |  | ||||||
| template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int checker_dim_half=0) | template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int dummy=0) | ||||||
| { | { | ||||||
|   half.Checkerboard() = cb; |   half.Checkerboard() = cb; | ||||||
|   autoView(half_v, half, AcceleratorWrite); |   autoView(half_v, half, AcceleratorWrite); | ||||||
| @@ -95,6 +66,7 @@ template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj | |||||||
|   unsigned long ndim_half          = half.Grid()->_ndimension; |   unsigned long ndim_half          = half.Grid()->_ndimension; | ||||||
|   Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask; |   Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask; | ||||||
|   Coordinate ostride_half          = half.Grid()->_ostride; |   Coordinate ostride_half          = half.Grid()->_ostride; | ||||||
|  |   int checker_dim_half             = half.Grid()->CheckerDim(); | ||||||
|   accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{ |   accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{ | ||||||
|      |      | ||||||
|     Coordinate coor; |     Coordinate coor; | ||||||
| @@ -119,7 +91,7 @@ template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj | |||||||
|     } |     } | ||||||
|   }); |   }); | ||||||
| } | } | ||||||
| template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int checker_dim_half=0) | template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int dummy=0) | ||||||
| { | { | ||||||
|   int cb = half.Checkerboard(); |   int cb = half.Checkerboard(); | ||||||
|   autoView(half_v , half, AcceleratorRead); |   autoView(half_v , half, AcceleratorRead); | ||||||
| @@ -129,6 +101,7 @@ template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full, | |||||||
|   unsigned long ndim_half          = half.Grid()->_ndimension; |   unsigned long ndim_half          = half.Grid()->_ndimension; | ||||||
|   Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask; |   Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask; | ||||||
|   Coordinate ostride_half          = half.Grid()->_ostride; |   Coordinate ostride_half          = half.Grid()->_ostride; | ||||||
|  |   int checker_dim_half             = half.Grid()->CheckerDim(); | ||||||
|   accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{ |   accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{ | ||||||
|  |  | ||||||
|     Coordinate coor; |     Coordinate coor; | ||||||
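With this change the host-side pick/set loops are gone: both wrappers forward to the accelerator kernels, and the checkerboarded dimension is read off the half grid via CheckerDim() rather than passed in (the trailing int survives only as a dummy for source compatibility). A pick/set round trip therefore stays device-resident; a sketch with illustrative grids:

    LatticeFermion full(fGrid), half(rbGrid);
    pickCheckerboard(Odd, half, full);  // forwards to acceleratorPickCheckerboard
    // ... operate on the odd-site field ...
    setCheckerboard(full, half);        // forwards to acceleratorSetCheckerboard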
|   | |||||||
| @@ -90,6 +90,7 @@ public: | |||||||
|         exit(1); |         exit(1); | ||||||
|       } |       } | ||||||
|       Parameters.StartingType = arg; |       Parameters.StartingType = arg; | ||||||
|  |       std::cout <<GridLogMessage << " GenericHMCrunner --StartingType "<<arg<<std::endl; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     if (GridCmdOptionExists(argv, argv + argc, "--StartingTrajectory")) { |     if (GridCmdOptionExists(argv, argv + argc, "--StartingTrajectory")) { | ||||||
| @@ -97,6 +98,7 @@ public: | |||||||
|       std::vector<int> ivec(0); |       std::vector<int> ivec(0); | ||||||
|       GridCmdOptionIntVector(arg, ivec); |       GridCmdOptionIntVector(arg, ivec); | ||||||
|       Parameters.StartTrajectory = ivec[0]; |       Parameters.StartTrajectory = ivec[0]; | ||||||
|  |       std::cout <<GridLogMessage << " GenericHMCrunner --StartingTrajectory "<<ivec[0]<<std::endl; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     if (GridCmdOptionExists(argv, argv + argc, "--Trajectories")) { |     if (GridCmdOptionExists(argv, argv + argc, "--Trajectories")) { | ||||||
| @@ -104,6 +106,7 @@ public: | |||||||
|       std::vector<int> ivec(0); |       std::vector<int> ivec(0); | ||||||
|       GridCmdOptionIntVector(arg, ivec); |       GridCmdOptionIntVector(arg, ivec); | ||||||
|       Parameters.Trajectories = ivec[0]; |       Parameters.Trajectories = ivec[0]; | ||||||
|  |       std::cout << GridLogMessage<<" GenericHMCrunner Command Line --Trajectories "<<ivec[0]<<std::endl; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     if (GridCmdOptionExists(argv, argv + argc, "--Thermalizations")) { |     if (GridCmdOptionExists(argv, argv + argc, "--Thermalizations")) { | ||||||
| @@ -111,6 +114,7 @@ public: | |||||||
|       std::vector<int> ivec(0); |       std::vector<int> ivec(0); | ||||||
|       GridCmdOptionIntVector(arg, ivec); |       GridCmdOptionIntVector(arg, ivec); | ||||||
|       Parameters.NoMetropolisUntil = ivec[0]; |       Parameters.NoMetropolisUntil = ivec[0]; | ||||||
|  |       std::cout << GridLogMessage<<" GenericHMCrunner --Thermalizations "<<ivec[0]<<std::endl; | ||||||
|     } |     } | ||||||
|     if (GridCmdOptionExists(argv, argv + argc, "--ParameterFile")) { |     if (GridCmdOptionExists(argv, argv + argc, "--ParameterFile")) { | ||||||
|       arg = GridCmdOptionPayload(argv, argv + argc, "--ParameterFile"); |       arg = GridCmdOptionPayload(argv, argv + argc, "--ParameterFile"); | ||||||
|   | |||||||
| @@ -137,9 +137,11 @@ public: | |||||||
|  |  | ||||||
|       double start_force = usecond(); |       double start_force = usecond(); | ||||||
|  |  | ||||||
|  |       MemoryManager::Print(); | ||||||
|       as[level].actions.at(a)->deriv_timer_start(); |       as[level].actions.at(a)->deriv_timer_start(); | ||||||
|       as[level].actions.at(a)->deriv(Smearer, force);  // deriv should NOT include Ta |       as[level].actions.at(a)->deriv(Smearer, force);  // deriv should NOT include Ta | ||||||
|       as[level].actions.at(a)->deriv_timer_stop(); |       as[level].actions.at(a)->deriv_timer_stop(); | ||||||
|  |       MemoryManager::Print(); | ||||||
|  |  | ||||||
|       auto name = as[level].actions.at(a)->action_name(); |       auto name = as[level].actions.at(a)->action_name(); | ||||||
|  |  | ||||||
| @@ -246,7 +248,11 @@ public: | |||||||
|     } |     } | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|   virtual ~Integrator() {} |   virtual ~Integrator() | ||||||
|  |   { | ||||||
|  |     // Cleaning up the Level pointers here is awkward: their ownership | ||||||
|  |     // is unclear by design, as per the comment above in the constructor | ||||||
|  |   } | ||||||
|  |  | ||||||
|   virtual std::string integrator_name() = 0; |   virtual std::string integrator_name() = 0; | ||||||
|    |    | ||||||
| @@ -460,6 +466,7 @@ public: | |||||||
|     for (int level = 0; level < as.size(); ++level) { |     for (int level = 0; level < as.size(); ++level) { | ||||||
|       for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) { |       for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) { | ||||||
|  |  | ||||||
|  | 	MemoryManager::Print(); | ||||||
|         // get gauge field from the SmearingPolicy and |         // get gauge field from the SmearingPolicy and | ||||||
|         // based on the boolean is_smeared in actionID |         // based on the boolean is_smeared in actionID | ||||||
|         std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl; |         std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl; | ||||||
| @@ -468,6 +475,7 @@ public: | |||||||
|    	        as[level].actions.at(actionID)->S_timer_stop(); |    	        as[level].actions.at(actionID)->S_timer_stop(); | ||||||
|         std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl; |         std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl; | ||||||
|         H += Hterm; |         H += Hterm; | ||||||
|  | 	MemoryManager::Print(); | ||||||
|  |  | ||||||
|       } |       } | ||||||
|       as[level].apply(S_hireps, Representations, level, H); |       as[level].apply(S_hireps, Representations, level, H); | ||||||
|   | |||||||
| @@ -32,7 +32,9 @@ private: | |||||||
|   //  Smear_Stout<Gimpl> *StoutSmearing; |   //  Smear_Stout<Gimpl> *StoutSmearing; | ||||||
|   //  std::vector<GaugeField> SmearedSet; |   //  std::vector<GaugeField> SmearedSet; | ||||||
|    |    | ||||||
|  |   GridRedBlackCartesian * UrbGrid; // keep a copy of the redblack grid for life of object | ||||||
|   std::vector<LatticeLorentzComplex> masks; |   std::vector<LatticeLorentzComplex> masks; | ||||||
|  |   std::vector<int> cbs; | ||||||
|  |  | ||||||
|   typedef typename SU3Adjoint::AMatrix AdjMatrix; |   typedef typename SU3Adjoint::AMatrix AdjMatrix; | ||||||
|   typedef typename SU3Adjoint::LatticeAdjMatrix  AdjMatrixField; |   typedef typename SU3Adjoint::LatticeAdjMatrix  AdjMatrixField; | ||||||
| @@ -147,6 +149,25 @@ private: | |||||||
|     } |     } | ||||||
|     pokeLorentz(Fdet, Fdet_pol, nu); |     pokeLorentz(Fdet, Fdet_pol, nu); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |   void Compute_MpInvJx_dNxxdSy(int cb, | ||||||
|  | 			       const GaugeLinkField &PlaqL, | ||||||
|  | 			       const GaugeLinkField &PlaqR, | ||||||
|  | 			       AdjMatrixField MpInvJx, | ||||||
|  | 			       AdjVectorField &Fdet2 ) | ||||||
|  |   { | ||||||
|  |     GaugeLinkField PlaqLeo(UrbGrid); | ||||||
|  |     GaugeLinkField PlaqReo(UrbGrid); | ||||||
|  |     AdjMatrixField MpInvJxeo(UrbGrid); | ||||||
|  |     AdjVectorField Fdet2eo(UrbGrid); | ||||||
|  |     pickCheckerboard(cb,PlaqLeo,PlaqL); | ||||||
|  |     pickCheckerboard(cb,PlaqReo,PlaqR); | ||||||
|  |     pickCheckerboard(cb,MpInvJxeo,MpInvJx); | ||||||
|  |     Fdet2eo.Checkerboard()=cb; | ||||||
|  |     Compute_MpInvJx_dNxxdSy(PlaqLeo,PlaqReo,MpInvJxeo,Fdet2eo); | ||||||
|  |     setCheckerboard(Fdet2,Fdet2eo); | ||||||
|  |   } | ||||||
|  |    | ||||||
|   void Compute_MpInvJx_dNxxdSy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR, AdjMatrixField MpInvJx,AdjVectorField &Fdet2 ) |   void Compute_MpInvJx_dNxxdSy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR, AdjMatrixField MpInvJx,AdjVectorField &Fdet2 ) | ||||||
|   { |   { | ||||||
|     GaugeLinkField UtaU(PlaqL.Grid()); |     GaugeLinkField UtaU(PlaqL.Grid()); | ||||||
| @@ -278,8 +299,9 @@ public: | |||||||
|     //////////////////////////////////////////////////////////////////////////////// |     //////////////////////////////////////////////////////////////////////////////// | ||||||
|     // Mask the gauge field |     // Mask the gauge field | ||||||
|     //////////////////////////////////////////////////////////////////////////////// |     //////////////////////////////////////////////////////////////////////////////// | ||||||
|  |     int cb = cbs[smr]; | ||||||
|     auto mask=PeekIndex<LorentzIndex>(masks[smr],mu); // the cb mask |     auto mask=PeekIndex<LorentzIndex>(masks[smr],mu); // the cb mask | ||||||
|  |      | ||||||
|     Umsk = U; |     Umsk = U; | ||||||
|     ApplyMask(Umsk,smr); |     ApplyMask(Umsk,smr); | ||||||
|     Utmp = peekLorentz(Umsk,mu); |     Utmp = peekLorentz(Umsk,mu); | ||||||
| @@ -442,7 +464,7 @@ public: | |||||||
|     AdjMatrixField MpInvJx_nu(grid); |     AdjMatrixField MpInvJx_nu(grid); | ||||||
|     MpInvJx = (-1.0)*MpAdInv * JxAd;// rho is on the plaq factor |     MpInvJx = (-1.0)*MpAdInv * JxAd;// rho is on the plaq factor | ||||||
|  |  | ||||||
|     Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx,FdetV); |     Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx,FdetV); | ||||||
|     Fdet2_mu=FdetV; |     Fdet2_mu=FdetV; | ||||||
|     Fdet1_mu=Zero(); |     Fdet1_mu=Zero(); | ||||||
|      |      | ||||||
| @@ -499,7 +521,7 @@ public: | |||||||
|  |  | ||||||
| 	time=-usecond(); | 	time=-usecond(); | ||||||
| 	PlaqR=(-1.0)*PlaqR; | 	PlaqR=(-1.0)*PlaqR; | ||||||
| 	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx,FdetV); | 	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx,FdetV); | ||||||
| 	Fdet2_nu = FdetV; | 	Fdet2_nu = FdetV; | ||||||
| 	time+=usecond(); | 	time+=usecond(); | ||||||
| 	std::cout << GridLogMessage << "Compute_MpInvJx_dNxxSy (occurs 6x) took "<<time<< " us"<<std::endl; | 	std::cout << GridLogMessage << "Compute_MpInvJx_dNxxSy (occurs 6x) took "<<time<< " us"<<std::endl; | ||||||
| @@ -520,7 +542,7 @@ public: | |||||||
| 	 | 	 | ||||||
|  |  | ||||||
| 	MpInvJx_nu = Cshift(MpInvJx,mu,-1); | 	MpInvJx_nu = Cshift(MpInvJx,mu,-1); | ||||||
| 	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV); | 	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV); | ||||||
| 	Fdet2_nu = Fdet2_nu+FdetV; | 	Fdet2_nu = Fdet2_nu+FdetV; | ||||||
| 	 | 	 | ||||||
| 	///////////////// -ve nu ///////////////// | 	///////////////// -ve nu ///////////////// | ||||||
| @@ -539,7 +561,7 @@ public: | |||||||
| 	Fdet1_nu = Fdet1_nu + transpose(Nxy)*dJdXe_nMpInv_y; | 	Fdet1_nu = Fdet1_nu + transpose(Nxy)*dJdXe_nMpInv_y; | ||||||
|  |  | ||||||
| 	MpInvJx_nu = Cshift(MpInvJx,nu,1); | 	MpInvJx_nu = Cshift(MpInvJx,nu,1); | ||||||
| 	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV); | 	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV); | ||||||
| 	Fdet2_nu = Fdet2_nu+FdetV; | 	Fdet2_nu = Fdet2_nu+FdetV; | ||||||
| 	 | 	 | ||||||
| 	// x== | 	// x== | ||||||
| @@ -560,7 +582,7 @@ public: | |||||||
|  |  | ||||||
| 	MpInvJx_nu = Cshift(MpInvJx,mu,-1); | 	MpInvJx_nu = Cshift(MpInvJx,mu,-1); | ||||||
| 	MpInvJx_nu = Cshift(MpInvJx_nu,nu,1); | 	MpInvJx_nu = Cshift(MpInvJx_nu,nu,1); | ||||||
| 	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV); | 	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV); | ||||||
| 	Fdet2_nu = Fdet2_nu+FdetV; | 	Fdet2_nu = Fdet2_nu+FdetV; | ||||||
|  |  | ||||||
| 	///////////////////////////////////////////////////////////////////// | 	///////////////////////////////////////////////////////////////////// | ||||||
| @@ -589,7 +611,7 @@ public: | |||||||
|  |  | ||||||
| 	MpInvJx_nu = Cshift(MpInvJx,nu,-1); | 	MpInvJx_nu = Cshift(MpInvJx,nu,-1); | ||||||
|  |  | ||||||
| 	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV); | 	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV); | ||||||
| 	Fdet2_mu = Fdet2_mu+FdetV; | 	Fdet2_mu = Fdet2_mu+FdetV; | ||||||
|  |  | ||||||
| 	//  __ | 	//  __ | ||||||
| @@ -609,7 +631,7 @@ public: | |||||||
|  |  | ||||||
| 	MpInvJx_nu = Cshift(MpInvJx,nu,1); | 	MpInvJx_nu = Cshift(MpInvJx,nu,1); | ||||||
|  |  | ||||||
| 	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV); | 	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV); | ||||||
| 	Fdet2_mu = Fdet2_mu+FdetV; | 	Fdet2_mu = Fdet2_mu+FdetV; | ||||||
| 	 | 	 | ||||||
|       } |       } | ||||||
| @@ -931,6 +953,10 @@ private: | |||||||
| public: | public: | ||||||
|  |  | ||||||
|   /* Standard constructor */ |   /* Standard constructor */ | ||||||
|  |   virtual ~SmearedConfigurationMasked() | ||||||
|  |   { | ||||||
|  |     delete UrbGrid; | ||||||
|  |   } | ||||||
|   SmearedConfigurationMasked(GridCartesian* _UGrid, unsigned int Nsmear, Smear_Stout<Gimpl>& Stout) |   SmearedConfigurationMasked(GridCartesian* _UGrid, unsigned int Nsmear, Smear_Stout<Gimpl>& Stout) | ||||||
|     : SmearedConfiguration<Gimpl>(_UGrid, Nsmear,Stout) |     : SmearedConfiguration<Gimpl>(_UGrid, Nsmear,Stout) | ||||||
|   { |   { | ||||||
| @@ -939,7 +965,6 @@ public: | |||||||
|     // was resized in base class |     // was resized in base class | ||||||
|     assert(this->SmearedSet.size()==Nsmear); |     assert(this->SmearedSet.size()==Nsmear); | ||||||
|      |      | ||||||
|     GridRedBlackCartesian * UrbGrid; |  | ||||||
|     UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(_UGrid); |     UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(_UGrid); | ||||||
|     LatticeComplex one(_UGrid); one = ComplexD(1.0,0.0); |     LatticeComplex one(_UGrid); one = ComplexD(1.0,0.0); | ||||||
|     LatticeComplex tmp(_UGrid); |     LatticeComplex tmp(_UGrid); | ||||||
| @@ -947,10 +972,11 @@ public: | |||||||
|     for (unsigned int i = 0; i < this->smearingLevels; ++i) { |     for (unsigned int i = 0; i < this->smearingLevels; ++i) { | ||||||
|  |  | ||||||
|       masks.push_back(*(new LatticeLorentzComplex(_UGrid))); |       masks.push_back(*(new LatticeLorentzComplex(_UGrid))); | ||||||
|  |  | ||||||
|       int mu= (i/2) %Nd; |       int mu= (i/2) %Nd; | ||||||
|       int cb= (i%2); |       int cb= (i%2); | ||||||
|       LatticeComplex tmpcb(UrbGrid); |       LatticeComplex tmpcb(UrbGrid); | ||||||
|  |  | ||||||
|  |       cbs.push_back(cb); | ||||||
| 	 | 	 | ||||||
|       masks[i]=Zero(); |       masks[i]=Zero(); | ||||||
|       //////////////////// |       //////////////////// | ||||||
| @@ -962,7 +988,6 @@ public: | |||||||
|       PokeIndex<LorentzIndex>(masks[i],tmp, mu); |       PokeIndex<LorentzIndex>(masks[i],tmp, mu); | ||||||
| 	 | 	 | ||||||
|     } |     } | ||||||
|     delete UrbGrid; |  | ||||||
|   } |   } | ||||||
|    |    | ||||||
|   virtual void smeared_force(GaugeField &SigmaTilde)  |   virtual void smeared_force(GaugeField &SigmaTilde)  | ||||||
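The red-black grid is promoted from a constructor-local scratch object to a class member because the new checkerboarded Compute_MpInvJx_dNxxdSy overload allocates half-grid fields at force-evaluation time, long after construction. A condensed view of the lifetime change, with signatures abbreviated:

    // before: UrbGrid was created and deleted inside the constructor
    // after (this patch): member pointer, freed with the object
    SmearedConfigurationMasked(...) { UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(_UGrid); }
    virtual ~SmearedConfigurationMasked() { delete UrbGrid; }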
|   | |||||||
| @@ -418,32 +418,32 @@ static void LieAlgebraProject(LatticeAlgebraMatrix &out,const LatticeMatrix &in, | |||||||
|   int hNNm1= NNm1/2; |   int hNNm1= NNm1/2; | ||||||
|   RealD sqrt_2 = sqrt(2.0); |   RealD sqrt_2 = sqrt(2.0); | ||||||
|   Complex ci(0.0,1.0); |   Complex ci(0.0,1.0); | ||||||
|   for(int su2Index=0;su2Index<hNNm1;su2Index++){ |  | ||||||
|     int i1, i2; |   const int nsimd=  Matrix::Nsimd(); | ||||||
|     su2SubGroupIndex(i1, i2, su2Index); |   accelerator_for(ss,grid->oSites(),nsimd,{ | ||||||
|     int ax = su2Index*2; |       for(int su2Index=0;su2Index<hNNm1;su2Index++){ | ||||||
|     int ay = su2Index*2+1; | 	int i1, i2; | ||||||
|     accelerator_for(ss,grid->oSites(),1,{ | 	su2SubGroupIndex(i1, i2, su2Index); | ||||||
|  | 	int ax = su2Index*2; | ||||||
|  | 	int ay = su2Index*2+1; | ||||||
| 	// in is traceless ANTI-hermitian whereas Grid generators are Hermitian. | 	// in is traceless ANTI-hermitian whereas Grid generators are Hermitian. | ||||||
| 	// trace( Ta x Ci in) | 	// trace( Ta x Ci in) | ||||||
| 	// Bet I need to move to real part with mult by -i | 	// Bet I need to move to real part with mult by -i | ||||||
| 	out_v[ss]()()(ax,b) = 0.5*(real(in_v[ss]()()(i2,i1)) - real(in_v[ss]()()(i1,i2))); | 	coalescedWrite(out_v[ss]()()(ax,b),0.5*(real(in_v(ss)()()(i2,i1)) - real(in_v(ss)()()(i1,i2)))); | ||||||
| 	out_v[ss]()()(ay,b) = 0.5*(imag(in_v[ss]()()(i1,i2)) + imag(in_v[ss]()()(i2,i1))); | 	coalescedWrite(out_v[ss]()()(ay,b),0.5*(imag(in_v(ss)()()(i1,i2)) + imag(in_v(ss)()()(i2,i1)))); | ||||||
|       }); |       } | ||||||
|   } |       for(int diagIndex=0;diagIndex<N-1;diagIndex++){ | ||||||
|   for(int diagIndex=0;diagIndex<N-1;diagIndex++){ | 	int k = diagIndex + 1; // diagIndex starts from 0 | ||||||
|     int k = diagIndex + 1; // diagIndex starts from 0 | 	int a = NNm1+diagIndex; | ||||||
|     int a = NNm1+diagIndex; | 	RealD scale = 1.0/sqrt(2.0*k*(k+1)); | ||||||
|     RealD scale = 1.0/sqrt(2.0*k*(k+1)); | 	auto tmp = in_v(ss)()()(0,0); | ||||||
|     accelerator_for(ss,grid->oSites(),vComplex::Nsimd(),{ |  | ||||||
| 	auto tmp = in_v[ss]()()(0,0); |  | ||||||
| 	for(int i=1;i<k;i++){ | 	for(int i=1;i<k;i++){ | ||||||
| 	  tmp=tmp+in_v[ss]()()(i,i); | 	  tmp=tmp+in_v(ss)()()(i,i); | ||||||
| 	} | 	} | ||||||
| 	tmp = tmp - in_v[ss]()()(k,k)*k; | 	tmp = tmp - in_v(ss)()()(k,k)*k; | ||||||
| 	out_v[ss]()()(a,b) =imag(tmp) * scale; | 	coalescedWrite(out_v[ss]()()(a,b),imag(tmp) * scale); | ||||||
|       }); |       } | ||||||
|     } |     }); | ||||||
| } | } | ||||||
|  |  | ||||||
|    |    | ||||||
|   | |||||||
| @@ -118,7 +118,7 @@ static void generatorDiagonal(int diagIndex, iGroupMatrix<cplx> &ta) { | |||||||
| //////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////// | ||||||
| // Map a su2 subgroup number to the pair of rows that are non zero | // Map a su2 subgroup number to the pair of rows that are non zero | ||||||
| //////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////// | ||||||
| static void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) { | static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) { | ||||||
|   assert((su2_index >= 0) && (su2_index < (ncolour * (ncolour - 1)) / 2)); |   assert((su2_index >= 0) && (su2_index < (ncolour * (ncolour - 1)) / 2)); | ||||||
|  |  | ||||||
|   int spare = su2_index; |   int spare = su2_index; | ||||||
|   | |||||||
| @@ -99,6 +99,8 @@ using std::log; | |||||||
| using std::exp; | using std::exp; | ||||||
| using std::sin; | using std::sin; | ||||||
| using std::cos; | using std::cos; | ||||||
|  | using std::asin; | ||||||
|  | using std::acos; | ||||||
|  |  | ||||||
|  |  | ||||||
| accelerator_inline RealF    conjugate(const RealF  & r){ return r; } | accelerator_inline RealF    conjugate(const RealF  & r){ return r; } | ||||||
|   | |||||||
| @@ -460,3 +460,9 @@ void vprefetch(const iMatrix<v, N> &vv) { | |||||||
|  |  | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
|  |  | ||||||
|  |  | ||||||
|  | #ifdef GRID_SYCL | ||||||
|  | template<class vec> struct sycl::is_device_copyable<Grid::iScalar<vec> > : public std::true_type {}; | ||||||
|  | template<class vec,int N> struct sycl::is_device_copyable<Grid::iVector<vec,N> > : public std::true_type {}; | ||||||
|  | template<class vec,int N> struct sycl::is_device_copyable<Grid::iMatrix<vec,N> > : public std::true_type {}; | ||||||
|  | #endif | ||||||
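SYCL only lets kernels capture types that are trivially copyable or explicitly declared device-copyable; these specialisations declare the latter for Grid's tensor wrappers so they can be captured by value and handed to sycl::reduction. A hedged compile-time check of the effect:

    #ifdef GRID_SYCL
    static_assert(sycl::is_device_copyable<Grid::iScalar<Grid::vComplexD>>::value,
                  "tensor wrappers must be device copyable for SYCL kernel capture");
    #endif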
|   | |||||||
| @@ -539,12 +539,6 @@ inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize | |||||||
|  |  | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
| inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) |  | ||||||
| { |  | ||||||
|   acceleratorCopyDeviceToDeviceAsynch(from,to,bytes); |  | ||||||
|   acceleratorCopySynchronise(); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| ////////////////////////////////////////////// | ////////////////////////////////////////////// | ||||||
| // CPU Target - No accelerator just thread instead | // CPU Target - No accelerator just thread instead | ||||||
| ////////////////////////////////////////////// | ////////////////////////////////////////////// | ||||||
| @@ -553,7 +547,6 @@ inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) | |||||||
|  |  | ||||||
| #undef GRID_SIMT | #undef GRID_SIMT | ||||||
|  |  | ||||||
|  |  | ||||||
| inline void acceleratorMem(void) | inline void acceleratorMem(void) | ||||||
| { | { | ||||||
|   /* |   /* | ||||||
| @@ -656,6 +649,12 @@ accelerator_inline void acceleratorFence(void) | |||||||
|   return; |   return; | ||||||
| } | } | ||||||
|  |  | ||||||
|  | inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) | ||||||
|  | { | ||||||
|  |   acceleratorCopyDeviceToDeviceAsynch(from,to,bytes); | ||||||
|  |   acceleratorCopySynchronise(); | ||||||
|  | } | ||||||
|  |  | ||||||
| template<class T> void acceleratorPut(T& dev,T&host) | template<class T> void acceleratorPut(T& dev,T&host) | ||||||
| { | { | ||||||
|   acceleratorCopyToDevice(&host,&dev,sizeof(T)); |   acceleratorCopyToDevice(&host,&dev,sizeof(T)); | ||||||
|   | |||||||
238	HMC/ComputeWilsonFlow.cc	Normal file
							| @@ -0,0 +1,238 @@ | |||||||
|  | /************************************************************************************* | ||||||
|  |  | ||||||
|  | Grid physics library, www.github.com/paboyle/Grid | ||||||
|  |  | ||||||
|  | Source file: HMC/ComputeWilsonFlow.cc | ||||||
|  |  | ||||||
|  | Copyright (C) 2017 | ||||||
|  |  | ||||||
|  | Author: Guido Cossu <guido.cossu@ed.ac.uk> | ||||||
|  | Author: Shuhei Yamamoto <syamamoto@bnl.gov> | ||||||
|  |  | ||||||
|  | This program is free software; you can redistribute it and/or modify | ||||||
|  | it under the terms of the GNU General Public License as published by | ||||||
|  | the Free Software Foundation; either version 2 of the License, or | ||||||
|  | (at your option) any later version. | ||||||
|  |  | ||||||
|  | This program is distributed in the hope that it will be useful, | ||||||
|  | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  | GNU General Public License for more details. | ||||||
|  |  | ||||||
|  | You should have received a copy of the GNU General Public License along | ||||||
|  | with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  |  | ||||||
|  | See the full license in the file "LICENSE" in the top level distribution | ||||||
|  | directory | ||||||
|  | *************************************************************************************/ | ||||||
|  | /*  END LEGAL */ | ||||||
|  | #include <Grid/Grid.h> | ||||||
|  | #include <string> | ||||||
|  |  | ||||||
|  | namespace Grid{ | ||||||
|  |   struct WFParameters: Serializable { | ||||||
|  |     GRID_SERIALIZABLE_CLASS_MEMBERS(WFParameters, | ||||||
|  |             int, steps, | ||||||
|  |             double, step_size, | ||||||
|  |             int, meas_interval, | ||||||
|  | 	    double, maxTau, // for the adaptive algorithm | ||||||
|  | 	    int, meas_interval_density, | ||||||
|  | 	    std::string, path);  | ||||||
|  |         | ||||||
|  |  | ||||||
|  |     template <class ReaderClass > | ||||||
|  |     WFParameters(Reader<ReaderClass>& Reader){ | ||||||
|  |       read(Reader, "WilsonFlow", *this); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |   }; | ||||||
|  |  | ||||||
|  |   struct ConfParameters: Serializable { | ||||||
|  |     GRID_SERIALIZABLE_CLASS_MEMBERS(ConfParameters, | ||||||
|  | 	   std::string, conf_path, | ||||||
|  |            std::string, conf_prefix, | ||||||
|  | 	   std::string, conf_smr_prefix, | ||||||
|  |            std::string, rng_prefix, | ||||||
|  | 	   int, StartConfiguration, | ||||||
|  | 	   int, EndConfiguration, | ||||||
|  |            int, Skip); | ||||||
|  |    | ||||||
|  |     template <class ReaderClass > | ||||||
|  |     ConfParameters(Reader<ReaderClass>& Reader){ | ||||||
|  |       read(Reader, "Configurations", *this); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |   }; | ||||||
|  | } | ||||||
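Both parameter structs are read from one file: the driver below opens Serialiser Reader("input.xml", false, "root") and each constructor pulls its own node. A hedged sample of the expected layout, with every value illustrative only:

    <?xml version="1.0"?>
    <root>
      <WilsonFlow>
        <steps>200</steps>
        <step_size>0.01</step_size>
        <meas_interval>10</meas_interval>
        <maxTau>2.0</maxTau>
        <meas_interval_density>10</meas_interval_density>
        <path>./flow/</path>
      </WilsonFlow>
      <Configurations>
        <conf_path>./cfgs/</conf_path>
        <conf_prefix>ckpoint_lat</conf_prefix>
        <conf_smr_prefix>ckpoint_smr_lat</conf_smr_prefix>
        <rng_prefix>ckpoint_rng</rng_prefix>
        <StartConfiguration>100</StartConfiguration>
        <EndConfiguration>200</EndConfiguration>
        <Skip>10</Skip>
      </Configurations>
    </root>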
|  |  | ||||||
|  | template <class T> void writeFile(T& in, std::string const fname){   | ||||||
|  | #ifdef HAVE_LIME | ||||||
|  |   // Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111 | ||||||
|  |   std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl; | ||||||
|  |   Grid::emptyUserRecord record; | ||||||
|  |   Grid::ScidacWriter WR(in.Grid()->IsBoss()); | ||||||
|  |   WR.open(fname); | ||||||
|  |   WR.writeScidacFieldRecord(in,record,0); | ||||||
|  |   WR.close(); | ||||||
|  | #endif | ||||||
|  |   // TODO: decide how to report the error when HAVE_LIME is absent; currently this is a silent no-op | ||||||
|  | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  | int main(int argc, char **argv) { | ||||||
|  |   using namespace Grid; | ||||||
|  |    | ||||||
|  |   Grid_init(&argc, &argv); | ||||||
|  |   GridLogLayout(); | ||||||
|  |  | ||||||
|  |   auto latt_size   = GridDefaultLatt(); | ||||||
|  |   auto simd_layout = GridDefaultSimd(Nd, vComplex::Nsimd()); | ||||||
|  |   auto mpi_layout  = GridDefaultMpi(); | ||||||
|  |   GridCartesian               Grid(latt_size, simd_layout, mpi_layout); | ||||||
|  |    | ||||||
|  |   std::vector<int> seeds({1, 2, 3, 4, 5}); | ||||||
|  |   GridSerialRNG sRNG; | ||||||
|  |   GridParallelRNG pRNG(&Grid); | ||||||
|  |   pRNG.SeedFixedIntegers(seeds); | ||||||
|  |  | ||||||
|  |   LatticeGaugeField Umu(&Grid), Uflow(&Grid); | ||||||
|  |    | ||||||
|  |   typedef Grid::XmlReader       Serialiser; | ||||||
|  |   Serialiser Reader("input.xml", false, "root"); | ||||||
|  |   WFParameters WFPar(Reader); | ||||||
|  |   ConfParameters CPar(Reader); | ||||||
|  |   CheckpointerParameters CPPar(CPar.conf_path+CPar.conf_prefix, CPar.conf_path+CPar.conf_smr_prefix, CPar.conf_path+CPar.rng_prefix); | ||||||
|  |   NerscHmcCheckpointer<PeriodicGimplR> CPNersc(CPPar); | ||||||
|  |  | ||||||
|  |   for (int conf = CPar.StartConfiguration; conf <= CPar.EndConfiguration; conf+= CPar.Skip){ | ||||||
|  |  | ||||||
|  |   CPNersc.CheckpointRestore(conf, Umu, sRNG, pRNG); | ||||||
|  |  | ||||||
|  |   std::cout << std::setprecision(15); | ||||||
|  |   std::cout << GridLogMessage << "Initial plaquette: "<< WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu) << std::endl; | ||||||
|  |    | ||||||
|  |   std::string file_pre  = WFPar.path; | ||||||
|  |   std::string file_post = CPar.conf_prefix + "." + std::to_string(conf); | ||||||
|  |  | ||||||
|  |   WilsonFlow<PeriodicGimplR> WF(WFPar.step_size,WFPar.steps,WFPar.meas_interval); | ||||||
|  |   WF.addMeasurement(WFPar.meas_interval_density, [&file_pre,&file_post,&conf](int step, RealD t, const typename PeriodicGimplR::GaugeField &U){ | ||||||
|  |      | ||||||
|  |     typedef typename PeriodicGimplR::GaugeLinkField GaugeMat; | ||||||
|  |     typedef typename PeriodicGimplR::ComplexField ComplexField; | ||||||
|  |      | ||||||
|  |     assert(Nd == 4); | ||||||
|  |  | ||||||
|  |     // NOTE: | ||||||
|  |     // Ideally, turn the following into methods of the appropriate class | ||||||
|  |     /////////////   Compute Energy Density via Clover Leaf    ///////////////////////////////////////////////// | ||||||
|  |     ///// Taken from qcd/smearing/WilsonFlow.h | ||||||
|  |     //         For plq, use static sitePlaquette from class WilsonLoops in Grid/qcd/utils/WilsonLoops.h and divide it by #faces=(1.0 * Nd * (Nd - 1)) / 2.0, ncol=3 | ||||||
|  |     //E = 1/2 tr( F_munu F_munu ) | ||||||
|  |     //However as  F_numu = -F_munu, only need to sum the trace of the squares of the following 6 field strengths: | ||||||
|  |     //F_01 F_02 F_03   F_12 F_13  F_23 | ||||||
|  |     GaugeMat F(U.Grid()); | ||||||
|  |     //LatticeComplexD R(U.Grid()); | ||||||
|  |     ComplexField R(U.Grid()); | ||||||
|  |     R = Zero(); | ||||||
|  |    | ||||||
|  |     for(int mu=0;mu<3;mu++){ | ||||||
|  |       for(int nu=mu+1;nu<4;nu++){ | ||||||
|  | 	WilsonLoops<PeriodicGimplR>::FieldStrength(F, U, mu, nu); | ||||||
|  | 	R = R + trace(F*F); | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |     R = (-1.0) * R; | ||||||
|  |      | ||||||
|  |     //// Taken from qcd/utils/WilsonLoops.h | ||||||
|  |      | ||||||
|  |     // Bx = -iF(y,z), By = -iF(z,x), Bz = -iF(x,y) | ||||||
|  |     GaugeMat Bx(U.Grid()), By(U.Grid()), Bz(U.Grid()); | ||||||
|  |     WilsonLoops<PeriodicGimplR>::FieldStrength(Bx, U, Ydir, Zdir); | ||||||
|  |     WilsonLoops<PeriodicGimplR>::FieldStrength(By, U, Zdir, Xdir); | ||||||
|  |     WilsonLoops<PeriodicGimplR>::FieldStrength(Bz, U, Xdir, Ydir); | ||||||
|  |  | ||||||
|  |     // Ex = -iF(t,x), Ey = -iF(t,y), Ez = -iF(t,z) | ||||||
|  |     GaugeMat Ex(U.Grid()), Ey(U.Grid()), Ez(U.Grid()); | ||||||
|  |     WilsonLoops<PeriodicGimplR>::FieldStrength(Ex, U, Tdir, Xdir); | ||||||
|  |     WilsonLoops<PeriodicGimplR>::FieldStrength(Ey, U, Tdir, Ydir); | ||||||
|  |     WilsonLoops<PeriodicGimplR>::FieldStrength(Ez, U, Tdir, Zdir); | ||||||
|  |  | ||||||
|  |     double coeff = 8.0/(32.0*M_PI*M_PI); | ||||||
|  |     ComplexField qfield = coeff*trace(Bx*Ex + By*Ey + Bz*Ez); | ||||||
|  |     //ComplexField qfield Plq(U.Grid()); | ||||||
|  |     //WilsonLoops<PeriodicGimplR>::sitePlaquette(Plq, U); | ||||||
|  |     //double coeff = 2.0 / (1.0 * Nd * (Nd - 1)) / 3.0; | ||||||
|  |     //Plq = coeff * Plq; | ||||||
|  |  | ||||||
|  |     int tau = std::round(t); | ||||||
|  |     std::string efile = file_pre + "E_dnsty_" + std::to_string(tau) + "_" + file_post; | ||||||
|  |     writeFile(R,efile); | ||||||
|  |     std::string tfile = file_pre + "Top_dnsty_" + std::to_string(tau) + "_" + file_post; | ||||||
|  |     writeFile(qfield,tfile); | ||||||
|  |  | ||||||
|  |     RealD E = real(sum(R))/ RealD(U.Grid()->gSites()); | ||||||
|  |     RealD T = real( sum(qfield) ); | ||||||
|  |     Coordinate scoor; for (int mu=0; mu < Nd; mu++) scoor[mu] = 0; | ||||||
|  |     RealD E0 = real(peekSite(R,scoor)); | ||||||
|  |     RealD T0 = real(peekSite(qfield,scoor)); | ||||||
|  |     std::cout << GridLogMessage << "[WilsonFlow] Saved energy density (clover) & topo. charge density: "  << conf << " " << step << "  " << tau << "  " | ||||||
|  | 	      << "(E_avg,T_sum) " << E << " " << T << " (E, T at origin) " << E0 << " " << T0 << std::endl; | ||||||
|  |      | ||||||
|  |   }); | ||||||
|  |    | ||||||
|  |   int t=WFPar.maxTau;  // final flow time (note: truncated to int) | ||||||
|  |   WF.smear(Uflow, Umu); | ||||||
|  |  | ||||||
|  |   RealD WFlow_plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(Uflow); | ||||||
|  |   RealD WFlow_TC   = WilsonLoops<PeriodicGimplR>::TopologicalCharge(Uflow); | ||||||
|  |   RealD WFlow_T0   = WF.energyDensityPlaquette(t,Uflow); // plaquette-based energy density at flow time t | ||||||
|  |   RealD WFlow_EC   = WF.energyDensityCloverleaf(t,Uflow); | ||||||
|  |   std::cout << GridLogMessage << "Plaquette          "<< conf << "   " << WFlow_plaq << std::endl; | ||||||
|  |   std::cout << GridLogMessage << "T0                 "<< conf << "   " << WFlow_T0 << std::endl; | ||||||
|  |   std::cout << GridLogMessage << "TC0                 "<< conf << "   " << WFlow_EC << std::endl; | ||||||
|  |   std::cout << GridLogMessage << "TopologicalCharge  "<< conf << "   " << WFlow_TC   << std::endl; | ||||||
|  |  | ||||||
|  |   std::cout<< GridLogMessage << " Admissibility check:\n"; | ||||||
|  |   const double sp_adm = 0.067;                // admissible threshold | ||||||
|  |   const double pl_adm = 1.0-sp_adm/Nc; | ||||||
|  |   std::cout << GridLogMessage << "   (pl_adm =" << pl_adm << ")\n"; | ||||||
|  |  | ||||||
|  |   // Need min and reduce min for this function | ||||||
|  |   //double sp_max = NC_*(1.0-stpl.plaq_min(U,pl_adm)); | ||||||
|  |   double sp_ave = Nc*(1.0-WFlow_plaq); | ||||||
|  |  | ||||||
|  |   //std::cout<< GridLogMessage << "   sp_max = "        << sp_max <<"\n"; | ||||||
|  |   std::cout<< GridLogMessage << "   sp_ave = "        << sp_ave <<"\n"; | ||||||
|  |   std::cout<< GridLogMessage << "   (sp_admissible = "<< sp_adm <<")\n"; | ||||||
|  |   //std::cout<< GridLogMessage << "   sp_admissible - sp_max = "<<sp_adm-sp_max <<"\n"; | ||||||
|  |   std::cout<< GridLogMessage << "   sp_admissible - sp_ave = "<<sp_adm-sp_ave <<"\n"; | ||||||
|  |   } | ||||||
|  |   Grid_finalize(); | ||||||
|  | }  // main | ||||||
|  |  | ||||||
|  |  | ||||||
|  | /* | ||||||
|  | Input file example | ||||||
|  |  | ||||||
|  | NB: main() above reads "input.xml" through an XmlReader, so these | ||||||
|  | parameters must be supplied in the equivalent XML layout; the JSON form | ||||||
|  | is shown here only for compactness (values are illustrative). | ||||||
|  |  | ||||||
|  | JSON | ||||||
|  |  | ||||||
|  | { | ||||||
|  |     "WilsonFlow":{ | ||||||
|  | 	"steps": 200, | ||||||
|  | 	"step_size": 0.01, | ||||||
|  | 	"meas_interval": 50, | ||||||
|  | 	"meas_interval_density": 50, | ||||||
|  | 	"path": "./", | ||||||
|  | 	"maxTau": 2.0 | ||||||
|  |     }, | ||||||
|  |     "Configurations":{ | ||||||
|  | 	"conf_path": "./", | ||||||
|  | 	"conf_prefix": "ckpoint_lat", | ||||||
|  | 	"conf_smr_prefix": "ckpoint_lat_smr", | ||||||
|  | 	"rng_prefix": "ckpoint_rng", | ||||||
|  | 	"StartConfiguration": 3000, | ||||||
|  | 	"EndConfiguration": 3000, | ||||||
|  | 	"Skip": 5 | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | */ | ||||||
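
For reference, the densities accumulated by the measurement callback above correspond (up to the sign and normalisation conventions of WilsonLoops::FieldStrength, which returns an anti-hermitian field strength, so this is a sketch of the intended formulas rather than an authoritative statement) to

    E(x) = -\sum_{\mu<\nu} \mathrm{tr}\left( F_{\mu\nu}(x)\, F_{\mu\nu}(x) \right)
         = \tfrac{1}{2} \sum_{\mu,\nu} \mathrm{tr}\left( -F_{\mu\nu} F_{\mu\nu} \right),

    q(x) = \frac{1}{32\pi^2}\, \epsilon_{\mu\nu\rho\sigma}\, \mathrm{tr}\left( F_{\mu\nu}(x)\, F_{\rho\sigma}(x) \right)
         = \frac{8}{32\pi^2}\, \mathrm{tr}\left( B_x E_x + B_y E_y + B_z E_z \right),

which is where coeff = 8.0/(32.0*M_PI*M_PI) = 1/(4 pi^2) comes from. The admissibility check at the end of the loop implements the bound sp_ave = Nc (1 - <plaquette>) < sp_adm = 0.067, i.e. a configuration passes when its average plaquette exceeds pl_adm = 1 - sp_adm/Nc.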
| @@ -58,7 +58,7 @@ int main(int argc, char **argv) { | |||||||
|   HMCparameters HMCparams; |   HMCparameters HMCparams; | ||||||
|   HMCparams.StartTrajectory  = 0; |   HMCparams.StartTrajectory  = 0; | ||||||
|   HMCparams.Trajectories     = 200; |   HMCparams.Trajectories     = 200; | ||||||
|   HMCparams.NoMetropolisUntil=  20; |   HMCparams.NoMetropolisUntil=  0; | ||||||
|   // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; |   // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; | ||||||
|   HMCparams.StartingType     =std::string("ColdStart"); |   HMCparams.StartingType     =std::string("ColdStart"); | ||||||
|   HMCparams.MD = MD; |   HMCparams.MD = MD; | ||||||
| @@ -70,7 +70,7 @@ int main(int argc, char **argv) { | |||||||
|   CheckpointerParameters CPparams; |   CheckpointerParameters CPparams; | ||||||
|   CPparams.config_prefix = "ckpoint_EODWF_lat"; |   CPparams.config_prefix = "ckpoint_EODWF_lat"; | ||||||
|   CPparams.rng_prefix    = "ckpoint_EODWF_rng"; |   CPparams.rng_prefix    = "ckpoint_EODWF_rng"; | ||||||
|   CPparams.saveInterval  = 10; |   CPparams.saveInterval  = 1; | ||||||
|   CPparams.format        = "IEEE64BIG"; |   CPparams.format        = "IEEE64BIG"; | ||||||
|   TheHMC.Resources.LoadNerscCheckpointer(CPparams); |   TheHMC.Resources.LoadNerscCheckpointer(CPparams); | ||||||
|  |  | ||||||
| @@ -186,6 +186,8 @@ int main(int argc, char **argv) { | |||||||
|  |  | ||||||
|   ///////////////////////////////////////////////////////////// |   ///////////////////////////////////////////////////////////// | ||||||
|   // HMC parameters are serialisable |   // HMC parameters are serialisable | ||||||
|  |   TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file | ||||||
|  |   TheHMC.initializeGaugeFieldAndRNGs(U); | ||||||
|  |  | ||||||
|   std::cout << GridLogMessage << " Running the HMC "<< std::endl; |   std::cout << GridLogMessage << " Running the HMC "<< std::endl; | ||||||
|   TheHMC.Run();  // no smearing |   TheHMC.Run();  // no smearing | ||||||
|   | |||||||
| @@ -30,11 +30,13 @@ directory | |||||||
| #include <string> | #include <string> | ||||||
|  |  | ||||||
| template <class T> void readFile(T& out, std::string const fname){ | template <class T> void readFile(T& out, std::string const fname){ | ||||||
|  | #ifdef HAVE_LIME | ||||||
|   Grid::emptyUserRecord record; |   Grid::emptyUserRecord record; | ||||||
|   Grid::ScidacReader RD; |   Grid::ScidacReader RD; | ||||||
|   RD.open(fname); |   RD.open(fname); | ||||||
|   RD.readScidacFieldRecord(out,record); |   RD.readScidacFieldRecord(out,record); | ||||||
|   RD.close(); |   RD.close(); | ||||||
|  | #endif | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -31,11 +31,13 @@ directory | |||||||
|  |  | ||||||
| NAMESPACE_BEGIN(Grid); | NAMESPACE_BEGIN(Grid); | ||||||
| template <class T> void writeFile(T& out, std::string const fname){ | template <class T> void writeFile(T& out, std::string const fname){ | ||||||
|  | #ifdef HAVE_LIME | ||||||
|   emptyUserRecord record; |   emptyUserRecord record; | ||||||
|   ScidacWriter WR(out.Grid()->IsBoss()); |   ScidacWriter WR(out.Grid()->IsBoss()); | ||||||
|   WR.open(fname); |   WR.open(fname); | ||||||
|   WR.writeScidacFieldRecord(out,record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC); |   WR.writeScidacFieldRecord(out,record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC); | ||||||
|   WR.close(); |   WR.close(); | ||||||
|  | #endif | ||||||
| } | } | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
| int main(int argc, char **argv) { | int main(int argc, char **argv) { | ||||||
|   | |||||||
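
Both HAVE_LIME guards above, like the writeFile in the Wilson-flow test (which carries a TODO about error reporting), silently skip all I/O when Grid is configured without --with-lime. A minimal sketch of a louder alternative, assuming a hard stop is preferred to a silent no-op (illustrative only, not part of this patch; assert is used rather than throw because some build scripts in this branch compile with -fno-exceptions):

    #include <cassert>
    #include <iostream>
    #include <string>

    template <class T> void writeFile(T& out, std::string const fname){
    #ifdef HAVE_LIME
      Grid::emptyUserRecord record;
      Grid::ScidacWriter WR(out.Grid()->IsBoss());
      WR.open(fname);
      WR.writeScidacFieldRecord(out,record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
      WR.close();
    #else
      // Fail loudly: the caller asked for SciDAC output this build cannot produce.
      std::cout << Grid::GridLogError << "writeFile(" << fname
                << "): Grid was configured without LIME; no output written." << std::endl;
      assert(0);
    #endif
    }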
| @@ -261,23 +261,25 @@ public: | |||||||
|     fprintf(FP,"\n\n"); |     fprintf(FP,"\n\n"); | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|  |   template<class CComplex> | ||||||
|   static void BLAS(void) |   static void BLAS(void) | ||||||
|   { |   { | ||||||
|     //int nbasis, int nrhs, int coarseVol |     //int nbasis, int nrhs, int coarseVol | ||||||
|     int  basis[] = { 16,32,64 }; |     int  basis[] = { 16,32,64 }; | ||||||
|     int  rhs[]   = { 8,16,32 }; |     int  rhs[]   = { 8,12,16 }; | ||||||
|     int  vol  = 4*4*4*4; |     int  vol  = 8*8*8*8; | ||||||
|  |     int  blk  = 4*4*4*4; | ||||||
|  |  | ||||||
|     GridBLAS blas; |     GridBLAS blas; | ||||||
|      |  | ||||||
|  |     int fpbits = sizeof(CComplex)*4; | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|     std::cout<<GridLogMessage << "= batched GEMM (double precision) "<<std::endl; |     std::cout<<GridLogMessage << "= batched GEMM fp"<<fpbits<<std::endl; | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|     std::cout<<GridLogMessage << "  M  "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (coarse mrhs)"<<std::endl; |     std::cout<<GridLogMessage << "  M  "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (coarse mrhs)"<<std::endl; | ||||||
|     std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; |     std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; | ||||||
|    |    | ||||||
|     fprintf(FP,"GEMM\n\n M, N, K, BATCH, GF/s per rank\n"); |     fprintf(FP,"GEMM\n\n M, N, K, BATCH, GF/s per rank fp%d\n",fpbits); | ||||||
|  |  | ||||||
|     for(int b=0;b<3;b++){ |     for(int b=0;b<3;b++){ | ||||||
|     for(int r=0;r<3;r++){ |     for(int r=0;r<3;r++){ | ||||||
| @@ -285,7 +287,7 @@ public: | |||||||
|       int N=rhs[r]; |       int N=rhs[r]; | ||||||
|       int K=basis[b]; |       int K=basis[b]; | ||||||
|       int BATCH=vol; |       int BATCH=vol; | ||||||
|       double p=blas.benchmark(M,N,K,BATCH); |       double p=blas.benchmark<CComplex>(M,N,K,BATCH); | ||||||
|  |  | ||||||
|       fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p); |       fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p); | ||||||
|        |        | ||||||
| @@ -299,9 +301,9 @@ public: | |||||||
|     for(int r=0;r<3;r++){ |     for(int r=0;r<3;r++){ | ||||||
|       int M=basis[b]; |       int M=basis[b]; | ||||||
|       int N=rhs[r]; |       int N=rhs[r]; | ||||||
|       int K=vol; |       int K=blk; | ||||||
|       int BATCH=vol; |       int BATCH=vol; | ||||||
|       double p=blas.benchmark(M,N,K,BATCH); |       double p=blas.benchmark<CComplex>(M,N,K,BATCH); | ||||||
|  |  | ||||||
|       fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p); |       fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p); | ||||||
|       std::cout<<GridLogMessage<<std::setprecision(3)  |       std::cout<<GridLogMessage<<std::setprecision(3)  | ||||||
| @@ -313,10 +315,10 @@ public: | |||||||
|     for(int b=0;b<3;b++){ |     for(int b=0;b<3;b++){ | ||||||
|     for(int r=0;r<3;r++){ |     for(int r=0;r<3;r++){ | ||||||
|       int M=rhs[r]; |       int M=rhs[r]; | ||||||
|       int N=vol; |       int N=blk; | ||||||
|       int K=basis[b]; |       int K=basis[b]; | ||||||
|       int BATCH=vol; |       int BATCH=vol; | ||||||
|       double p=blas.benchmark(M,N,K,BATCH); |       double p=blas.benchmark<CComplex>(M,N,K,BATCH); | ||||||
|  |  | ||||||
|       fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p); |       fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p); | ||||||
|       std::cout<<GridLogMessage<<std::setprecision(3)  |       std::cout<<GridLogMessage<<std::setprecision(3)  | ||||||
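
A note on the rates these loops report: each call times BATCH independent complex GEMMs of shape M x N x K. Assuming the conventional count of 8 M N K real floating-point operations per complex GEMM (the normalisation inside GridBLAS::benchmark is not shown in this hunk), the printed figure is approximately

    \text{Gflop/s per rank} \approx \frac{8\, M N K \times \mathrm{BATCH}}{10^{9}\; t_{\mathrm{rank}}}

The fpbits = sizeof(CComplex)*4 line reports 64 for ComplexD (16 bytes, i.e. two 64-bit reals) and 32 for ComplexF, labelling the double- and single-precision passes that main() now drives via Benchmark::BLAS<ComplexD>() and Benchmark::BLAS<ComplexF>().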
| @@ -867,6 +869,7 @@ int main (int argc, char ** argv) | |||||||
|   int do_memory=1; |   int do_memory=1; | ||||||
|   int do_comms =1; |   int do_comms =1; | ||||||
|   int do_blas  =1; |   int do_blas  =1; | ||||||
|  |   int do_dslash=1; | ||||||
|  |  | ||||||
|   int sel=4; |   int sel=4; | ||||||
|   std::vector<int> L_list({8,12,16,24,32}); |   std::vector<int> L_list({8,12,16,24,32}); | ||||||
| @@ -877,6 +880,7 @@ int main (int argc, char ** argv) | |||||||
|   std::vector<double> staggered; |   std::vector<double> staggered; | ||||||
|  |  | ||||||
|   int Ls=1; |   int Ls=1; | ||||||
|  |   if (do_dslash){ | ||||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|   std::cout<<GridLogMessage << " Clover dslash 4D vectorised (temporarily Wilson)" <<std::endl; |   std::cout<<GridLogMessage << " Clover dslash 4D vectorised (temporarily Wilson)" <<std::endl; | ||||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
| @@ -901,6 +905,7 @@ int main (int argc, char ** argv) | |||||||
|     staggered.push_back(result); |     staggered.push_back(result); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |  | ||||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|   std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl; |   std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl; | ||||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
| @@ -909,8 +914,33 @@ int main (int argc, char ** argv) | |||||||
|     std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<std::endl; |     std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<std::endl; | ||||||
|   } |   } | ||||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |   } | ||||||
|  |  | ||||||
|   int NN=NN_global; |   int NN=NN_global; | ||||||
|  |   if(do_dslash){ | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered (GF/s per node)" <<std::endl; | ||||||
|  |     fprintf(FP,"Per node summary table\n"); | ||||||
|  |     fprintf(FP,"\n"); | ||||||
|  |     fprintf(FP,"L , Wilson, DWF4, Staggered, GF/s per node\n"); | ||||||
|  |     fprintf(FP,"\n"); | ||||||
|  |     for(int l=0;l<L_list.size();l++){ | ||||||
|  |       std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<std::endl; | ||||||
|  |       fprintf(FP,"%d , %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.); | ||||||
|  |     } | ||||||
|  |     fprintf(FP,"\n"); | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |  | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << " Comparison point     result: "  << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl; | ||||||
|  |     std::cout<<std::setprecision(3); | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |    | ||||||
|   if ( do_memory ) { |   if ( do_memory ) { | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|     std::cout<<GridLogMessage << " Memory benchmark " <<std::endl; |     std::cout<<GridLogMessage << " Memory benchmark " <<std::endl; | ||||||
| @@ -918,15 +948,6 @@ int main (int argc, char ** argv) | |||||||
|     Benchmark::Memory(); |     Benchmark::Memory(); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   if ( do_blas ) { |  | ||||||
| #if defined(GRID_CUDA) || defined(GRID_HIP)     || defined(GRID_SYCL)    |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << " Batched BLAS benchmark " <<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|     Benchmark::BLAS(); |  | ||||||
| #endif |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   if ( do_su4 ) { |   if ( do_su4 ) { | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|     std::cout<<GridLogMessage << " SU(4) benchmark " <<std::endl; |     std::cout<<GridLogMessage << " SU(4) benchmark " <<std::endl; | ||||||
| @@ -941,28 +962,14 @@ int main (int argc, char ** argv) | |||||||
|     Benchmark::Comms(); |     Benchmark::Comms(); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |   if ( do_blas ) { | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|     std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl; |     std::cout<<GridLogMessage << " Batched BLAS benchmark " <<std::endl; | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|     std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered (GF/s per node)" <<std::endl; |     Benchmark::BLAS<ComplexD>(); | ||||||
|     fprintf(FP,"Per node summary table\n"); |     Benchmark::BLAS<ComplexF>(); | ||||||
|     fprintf(FP,"\n"); |   } | ||||||
|     fprintf(FP,"L , Wilson, DWF4, Staggered, GF/s per node\n"); |    | ||||||
|     fprintf(FP,"\n"); |  | ||||||
|     for(int l=0;l<L_list.size();l++){ |  | ||||||
|       std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<std::endl; |  | ||||||
|       fprintf(FP,"%d , %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.); |  | ||||||
|     } |  | ||||||
|     fprintf(FP,"\n"); |  | ||||||
|  |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|  |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << " Comparison point     result: "  << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl; |  | ||||||
|     std::cout<<std::setprecision(3); |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|  |  | ||||||
|   Grid_finalize(); |   Grid_finalize(); | ||||||
|   fclose(FP); |   fclose(FP); | ||||||
| } | } | ||||||
|   | |||||||
| @@ -1,16 +1,18 @@ | |||||||
|  |  | ||||||
|  | export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl "  | ||||||
|  | export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel  -fsycl -fno-exceptions -fsycl-targets=spir64_gen -Xs -device -Xs pvc " | ||||||
| ../../configure \ | ../../configure \ | ||||||
| 	--enable-simd=GPU \ | 	--enable-simd=GPU \ | ||||||
| 	--enable-gen-simd-width=64 \ | 	--enable-gen-simd-width=64 \ | ||||||
| 	--enable-comms=mpi-auto \ | 	--enable-comms=mpi-auto \ | ||||||
|  | 	--enable-debug \ | ||||||
| 	--disable-gparity \ | 	--disable-gparity \ | ||||||
| 	--disable-fermion-reps \ | 	--disable-fermion-reps \ | ||||||
|  | 	--with-lime=$CLIME \ | ||||||
| 	--enable-shm=nvlink \ | 	--enable-shm=nvlink \ | ||||||
| 	--enable-accelerator=sycl \ | 	--enable-accelerator=sycl \ | ||||||
| 	--enable-accelerator-aware-mpi=yes\ | 	--enable-accelerator-aware-mpi=yes\ | ||||||
| 	--enable-unified=no \ | 	--enable-unified=no \ | ||||||
| 	MPICXX=mpicxx \ | 	MPICXX=mpicxx \ | ||||||
| 	CXX=icpx \ | 	CXX=icpx  | ||||||
| 	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl" \ |  | ||||||
| 	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel" |  | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
systems/Aurora/config-command-leak (new file, 23 lines)
							| @@ -0,0 +1,23 @@ | |||||||
|  | source ~/spack/share/spack/setup-env.sh  | ||||||
|  | spack load c-lime | ||||||
|  | export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' ` | ||||||
|  | export TCMALLOC=`spack find --paths gperftools | grep ^gperftools | awk '{print $2}' ` | ||||||
|  | export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH | ||||||
|  |  | ||||||
|  | ../../configure \ | ||||||
|  | 	--enable-debug \ | ||||||
|  | 	--enable-simd=GPU \ | ||||||
|  | 	--enable-gen-simd-width=64 \ | ||||||
|  | 	--enable-comms=mpi-auto \ | ||||||
|  | 	--disable-gparity \ | ||||||
|  | 	--disable-fermion-reps \ | ||||||
|  | 	--with-lime=$CLIME \ | ||||||
|  | 	--enable-shm=nvlink \ | ||||||
|  | 	--enable-accelerator=sycl \ | ||||||
|  | 	--enable-accelerator-aware-mpi=yes\ | ||||||
|  | 	--enable-unified=no \ | ||||||
|  | 	MPICXX=mpicxx \ | ||||||
|  | 	CXX=icpx \ | ||||||
|  | 	LDFLAGS="-fiopenmp -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl  -lsycl -Xarch_host -fsanitize=leak -fsycl-device-code-split=per_kernel" \ | ||||||
|  | 	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel -Xarch_host  -fsycl -fsanitize=leak " | ||||||
|  |  | ||||||
							
								
								
									
systems/Aurora/config-command-sanitize (new file, 22 lines)
							| @@ -0,0 +1,22 @@ | |||||||
|  | # -fsycl-targets=spir64_gen -Xs\" -device pvc \" | ||||||
|  | # -fsycl-targets=intel_gpu_pvc_vg,intel_gpu_pvc | ||||||
|  | # -fsycl-targets=intel_gpu_pvc | ||||||
|  |  | ||||||
|  | unset DEVICE | ||||||
|  | export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl -Xarch_host -fsanitize=address"  | ||||||
|  | export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel  -fsycl -fno-exceptions -Xarch_host -fsanitize=address  -fsycl-targets=spir64_gen -Xs -device -Xs pvc " | ||||||
|  | ../../configure \ | ||||||
|  | 	--enable-simd=GPU \ | ||||||
|  | 	--enable-gen-simd-width=64 \ | ||||||
|  | 	--enable-comms=mpi-auto \ | ||||||
|  | 	--enable-debug \ | ||||||
|  | 	--disable-gparity \ | ||||||
|  | 	--disable-fermion-reps \ | ||||||
|  | 	--with-lime=$CLIME \ | ||||||
|  | 	--enable-shm=nvlink \ | ||||||
|  | 	--enable-accelerator=sycl \ | ||||||
|  | 	--enable-accelerator-aware-mpi=yes\ | ||||||
|  | 	--enable-unified=no \ | ||||||
|  | 	MPICXX=mpicxx \ | ||||||
|  | 	CXX=icpx  | ||||||
|  |  | ||||||
| @@ -1,14 +1,22 @@ | |||||||
|  | source ~/spack/share/spack/setup-env.sh  | ||||||
|  | spack load c-lime | ||||||
|  | export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' ` | ||||||
|  | #spack load libefence | ||||||
|  | #export EFENCE=`spack find --paths libefence | grep ^libefence | awk '{print $2}' ` | ||||||
|  | #export LD_LIBRARY_PATH=${EFENCE}/lib:$LD_LIBRARY_PATH | ||||||
|  | #spack load gperftools | ||||||
|  | export TCMALLOC=/home/paboyle/gperftools/install | ||||||
|  | export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH | ||||||
|  | export INTELGT_AUTO_ATTACH_DISABLE=1 | ||||||
|  |  | ||||||
| #export ONEAPI_DEVICE_SELECTOR=level_zero:0.0 | #export ONEAPI_DEVICE_SELECTOR=level_zero:0.0 | ||||||
|  | #module load oneapi/release/2023.12.15.001 | ||||||
| module load oneapi/release/2023.12.15.001 |  | ||||||
|  |  | ||||||
| #module use /soft/modulefiles | #module use /soft/modulefiles | ||||||
| #module load intel_compute_runtime/release/agama-devel-682.22 | #module load intel_compute_runtime/release/agama-devel-682.22 | ||||||
|  |  | ||||||
| export FI_CXI_DEFAULT_CQ_SIZE=131072 | #export FI_CXI_DEFAULT_CQ_SIZE=131072 | ||||||
| export FI_CXI_CQ_FILL_PERCENT=20 | #export FI_CXI_CQ_FILL_PERCENT=20 | ||||||
|  | #export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" | ||||||
| export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" |  | ||||||
| #export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode" | #export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode" | ||||||
|  |  | ||||||
| # | # | ||||||
| @@ -16,13 +24,17 @@ export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" | |||||||
| # -ftarget-register-alloc-mode=pvc:small | # -ftarget-register-alloc-mode=pvc:small | ||||||
| # -ftarget-register-alloc-mode=pvc:large | # -ftarget-register-alloc-mode=pvc:large | ||||||
| # -ftarget-register-alloc-mode=pvc:auto | # -ftarget-register-alloc-mode=pvc:auto | ||||||
| # | #export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 | ||||||
|  |  | ||||||
| export HTTP_PROXY=http://proxy.alcf.anl.gov:3128 | export HTTP_PROXY=http://proxy.alcf.anl.gov:3128 | ||||||
| export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128 | export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128 | ||||||
| export http_proxy=http://proxy.alcf.anl.gov:3128 | export http_proxy=http://proxy.alcf.anl.gov:3128 | ||||||
| export https_proxy=http://proxy.alcf.anl.gov:3128 | export https_proxy=http://proxy.alcf.anl.gov:3128 | ||||||
| #export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 |  | ||||||
| git config --global http.proxy http://proxy.alcf.anl.gov:3128 | git config --global http.proxy http://proxy.alcf.anl.gov:3128 | ||||||
|  |  | ||||||
|  | #source ~/spack/share/spack/setup-env.sh | ||||||
|  | #spack load gperftools | ||||||
|  | #export TCMALLOC=`spack find --paths gperftools | grep ^gperftools | awk '{print $2}' ` | ||||||
|  | #export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH | ||||||
|  |  | ||||||
| export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" | export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" | ||||||
|   | |||||||
							
								
								
									
systems/Frontier/benchmarks/Benchmark_usqcd.csv (new file, 76 lines)
							| @@ -0,0 +1,76 @@ | |||||||
|  | Memory Bandwidth | ||||||
|  |  | ||||||
|  | Bytes, GB/s per node | ||||||
|  | 6291456, 379.297050 | ||||||
|  | 100663296, 3754.674992 | ||||||
|  | 509607936, 6521.472413 | ||||||
|  | 1610612736, 8513.456479 | ||||||
|  | 3932160000, 9018.901766 | ||||||
|  |  | ||||||
|  |  | ||||||
|  | GEMM | ||||||
|  |  | ||||||
|  |  M, N, K, BATCH, GF/s per rank | ||||||
|  | 16, 8, 16, 256, 0.564958 | ||||||
|  | 16, 16, 16, 256, 243.148058 | ||||||
|  | 16, 32, 16, 256, 440.346877 | ||||||
|  | 32, 8, 32, 256, 439.194136 | ||||||
|  | 32, 16, 32, 256, 847.334141 | ||||||
|  | 32, 32, 32, 256, 1430.892623 | ||||||
|  | 64, 8, 64, 256, 1242.756741 | ||||||
|  | 64, 16, 64, 256, 2196.689493 | ||||||
|  | 64, 32, 64, 256, 3697.458072 | ||||||
|  | 16, 8, 256, 256, 899.582627 | ||||||
|  | 16, 16, 256, 256, 1673.537756 | ||||||
|  | 16, 32, 256, 256, 2959.597089 | ||||||
|  | 32, 8, 256, 256, 1558.858630 | ||||||
|  | 32, 16, 256, 256, 2864.839445 | ||||||
|  | 32, 32, 256, 256, 4810.671254 | ||||||
|  | 64, 8, 256, 256, 2386.092942 | ||||||
|  | 64, 16, 256, 256, 4451.665937 | ||||||
|  | 64, 32, 256, 256, 5942.124095 | ||||||
|  | 8, 256, 16, 256, 799.867271 | ||||||
|  | 16, 256, 16, 256, 1584.624888 | ||||||
|  | 32, 256, 16, 256, 1949.422338 | ||||||
|  | 8, 256, 32, 256, 1389.417474 | ||||||
|  | 16, 256, 32, 256, 2668.344493 | ||||||
|  | 32, 256, 32, 256, 3234.162120 | ||||||
|  | 8, 256, 64, 256, 2150.925128 | ||||||
|  | 16, 256, 64, 256, 4012.488132 | ||||||
|  | 32, 256, 64, 256, 5154.785521 | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  | Communications | ||||||
|  |  | ||||||
|  | Packet bytes, direction, GB/s per node | ||||||
|  | 4718592, 1, 245.026198 | ||||||
|  | 4718592, 2, 251.180996 | ||||||
|  | 4718592, 3, 361.110977 | ||||||
|  | 4718592, 5, 247.898447 | ||||||
|  | 4718592, 6, 249.867523 | ||||||
|  | 4718592, 7, 359.033061 | ||||||
|  | 15925248, 1, 255.030946 | ||||||
|  | 15925248, 2, 264.453890 | ||||||
|  | 15925248, 3, 392.949183 | ||||||
|  | 15925248, 5, 256.040644 | ||||||
|  | 15925248, 6, 264.681896 | ||||||
|  | 15925248, 7, 392.102622 | ||||||
|  | 37748736, 1, 258.823333 | ||||||
|  | 37748736, 2, 268.181577 | ||||||
|  | 37748736, 3, 401.478191 | ||||||
|  | 37748736, 5, 258.995363 | ||||||
|  | 37748736, 6, 268.206586 | ||||||
|  | 37748736, 7, 400.397611 | ||||||
|  |  | ||||||
|  |  | ||||||
|  | Per node summary table | ||||||
|  |  | ||||||
|  | L , Wilson, DWF4, Staggered, GF/s per node | ||||||
|  |  | ||||||
|  | 8 , 155, 1386, 50 | ||||||
|  | 12 , 694, 4208, 230 | ||||||
|  | 16 , 1841, 6675, 609 | ||||||
|  | 24 , 3934, 8573, 1641 | ||||||
|  | 32 , 5083, 9771, 3086 | ||||||
|  |  | ||||||
| 
systems/Frontier/benchmarks/Benchmark_usqcd.log (new file, 702 lines)
							| @@ -0,0 +1,702 @@ | |||||||
|  | RANK 1 using GPU 1 | ||||||
|  | RANK 5 using GPU 6 | ||||||
|  | RANK 0 using GPU 0 | ||||||
|  | RANK 2 using GPU 2 | ||||||
|  | RANK 3 using GPU 3 | ||||||
|  | RANK 6 using GPU 5 | ||||||
|  | RANK 7 using GPU 4 | ||||||
|  | RANK 4 using GPU 7 | ||||||
|  | world_rank 0 has 1 devices | ||||||
|  | AcceleratorHipInit: ======================== | ||||||
|  | AcceleratorHipInit: Device Number    : 0 | ||||||
|  | AcceleratorHipInit: ======================== | ||||||
|  | AcceleratorHipInit: Device identifier: AMD Instinct MI250X | ||||||
|  | AcceleratorHipInit:   totalGlobalMem: 68702699520  | ||||||
|  | AcceleratorHipInit:   isMultiGpuBoard: 0  | ||||||
|  | AcceleratorHipInit:   warpSize: 64  | ||||||
|  | AcceleratorHipInit: using default device  | ||||||
|  | AcceleratorHipInit: assume user or srun sets ROCR_VISIBLE_DEVICES and numa binding  | ||||||
|  | AcceleratorHipInit: Configure options --enable-setdevice=no  | ||||||
|  | local rank 0 device 0 bus id: 0000:c1:00.0 | ||||||
|  | AcceleratorHipInit: ================================================ | ||||||
|  | SharedMemoryMpi:  World communicator of size 8 | ||||||
|  | SharedMemoryMpi:  Node  communicator of size 8 | ||||||
|  | 0SharedMemoryMpi:  SharedMemoryMPI.cc acceleratorAllocDevice 4294967296bytes at 0x7ff651800000 - 7ff7517fffff for comms buffers  | ||||||
|  | Setting up IPC | ||||||
|  |  | ||||||
|  | __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||||
|  | __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||||
|  | __|_ |  |  |  |  |  |  |  |  |  |  |  | _|__ | ||||||
|  | __|_                                    _|__ | ||||||
|  | __|_   GGGG    RRRR    III    DDDD      _|__ | ||||||
|  | __|_  G        R   R    I     D   D     _|__ | ||||||
|  | __|_  G        R   R    I     D    D    _|__ | ||||||
|  | __|_  G  GG    RRRR     I     D    D    _|__ | ||||||
|  | __|_  G   G    R  R     I     D   D     _|__ | ||||||
|  | __|_   GGGG    R   R   III    DDDD      _|__ | ||||||
|  | __|_                                    _|__ | ||||||
|  | __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||||
|  | __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||||
|  |   |  |  |  |  |  |  |  |  |  |  |  |  |  |   | ||||||
|  |  | ||||||
|  |  | ||||||
|  | Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors | ||||||
|  |  | ||||||
|  | This program is free software; you can redistribute it and/or modify | ||||||
|  | it under the terms of the GNU General Public License as published by | ||||||
|  | the Free Software Foundation; either version 2 of the License, or | ||||||
|  | (at your option) any later version. | ||||||
|  |  | ||||||
|  | This program is distributed in the hope that it will be useful, | ||||||
|  | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  | GNU General Public License for more details. | ||||||
|  | Current Grid git commit hash=9a1ad6a5eb29a369d74784e7483c60e578323d76: (HEAD -> develop, origin/develop, origin/HEAD) clean | ||||||
|  |  | ||||||
|  | Grid : Message : ================================================  | ||||||
|  | Grid : Message : MPI is initialised and logging filters activated  | ||||||
|  | Grid : Message : ================================================  | ||||||
|  | Grid : Message : This rank is running on host frontier01320 | ||||||
|  | Grid : Message : Requested 4294967296 byte stencil comms buffers  | ||||||
|  | Grid : Message : MemoryManager Cache 54962159616 bytes  | ||||||
|  | Grid : Message : MemoryManager::Init() setting up | ||||||
|  | Grid : Message : MemoryManager::Init() cache pool for recent host   allocations: SMALL 8 LARGE 2 HUGE 0 | ||||||
|  | Grid : Message : MemoryManager::Init() cache pool for recent device allocations: SMALL 16 LARGE 8 Huge 0 | ||||||
|  | Grid : Message : MemoryManager::Init() cache pool for recent shared allocations: SMALL 16 LARGE 8 Huge 0 | ||||||
|  | Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory | ||||||
|  | Grid : Message : MemoryManager::Init() Using hipMalloc | ||||||
|  | Grid : Message : 0.293720 s : ================================================================================== | ||||||
|  | Grid : Message : 0.293790 s : = Grid is setup to use 1 threads | ||||||
|  | Grid : Message : 0.293800 s : ================================================================================== | ||||||
|  | Grid : Message : 0.293810 s : Grid Default Decomposition patterns | ||||||
|  | Grid : Message : 0.293810 s : 	OpenMP threads : 1 | ||||||
|  | Grid : Message : 0.293820 s : 	MPI tasks      : 1 2 2 2  | ||||||
|  | Grid : Message : 0.293870 s : 	vReal          : 512bits ; 1 2 2 2  | ||||||
|  | Grid : Message : 0.293890 s : 	vRealF         : 512bits ; 2 2 2 2  | ||||||
|  | Grid : Message : 0.293910 s : 	vRealD         : 512bits ; 1 2 2 2  | ||||||
|  | Grid : Message : 0.293920 s : 	vComplex       : 512bits ; 1 1 2 2  | ||||||
|  | Grid : Message : 0.293930 s : 	vComplexF      : 512bits ; 1 2 2 2  | ||||||
|  | Grid : Message : 0.293960 s : 	vComplexD      : 512bits ; 1 1 2 2  | ||||||
|  | Grid : Message : 0.293970 s : ================================================================================== | ||||||
|  | Grid : Message : 0.293980 s : ================================================================================== | ||||||
|  | Grid : Message : 0.293990 s :  Clover dslash 4D vectorised (temporarily Wilson) | ||||||
|  | Grid : Message : 0.294000 s : ================================================================================== | ||||||
|  | Grid : Message : 0.301330 s : ================================================================================== | ||||||
|  | Grid : Message : 0.301360 s : Benchmark DWF on 8^4 local volume  | ||||||
|  | Grid : Message : 0.301370 s : * Nc             : 3 | ||||||
|  | Grid : Message : 0.301380 s : * Global volume  : 8 16 16 16  | ||||||
|  | Grid : Message : 0.301410 s : * Ls             : 1 | ||||||
|  | Grid : Message : 0.301420 s : * ranks          : 8 | ||||||
|  | Grid : Message : 0.301430 s : * nodes          : 1 | ||||||
|  | Grid : Message : 0.301440 s : * ranks/node     : 8 | ||||||
|  | Grid : Message : 0.301450 s : * ranks geom     : 1 2 2 2  | ||||||
|  | Grid : Message : 0.301460 s : * Using 1 threads | ||||||
|  | Grid : Message : 0.301470 s : ================================================================================== | ||||||
|  | Grid : Message : 0.345030 s : Initialised RNGs | ||||||
|  | Grid : Message : 0.158302 s : ================================================================================== | ||||||
|  | Grid : Message : 0.158310 s : * Using GENERIC Nc WilsonKernels | ||||||
|  | Grid : Message : 0.158311 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 0.158312 s : * SINGLE precision  | ||||||
|  | Grid : Message : 0.158313 s : ================================================================================== | ||||||
|  | Grid : Message : 0.240681 s : Deo FlopsPerSite is 1344 | ||||||
|  | Grid : Message : 0.240711 s : Deo mflop/s =   154914.0 (130.8) 139367.7-159565.9 | ||||||
|  | Grid : Message : 0.240715 s : Deo mflop/s per rank   19364.3 | ||||||
|  | Grid : Message : 0.240716 s : Deo mflop/s per node   154914.0 | ||||||
|  | Grid : Message : 0.240718 s : ================================================================================== | ||||||
|  | Grid : Message : 0.240719 s : * Using UNROLLED WilsonKernels | ||||||
|  | Grid : Message : 0.240719 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 0.240719 s : * SINGLE precision  | ||||||
|  | Grid : Message : 0.240719 s : ================================================================================== | ||||||
|  | Grid : Message : 0.315028 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 0.315033 s : Deo mflop/s =   151459.5 (142.0) 131856.9-157286.4 | ||||||
|  | Grid : Message : 0.315036 s : Deo mflop/s per rank   18932.4 | ||||||
|  | Grid : Message : 0.315037 s : Deo mflop/s per node   151459.5 | ||||||
|  | Grid : Message : 0.315038 s : ================================================================================== | ||||||
|  | Grid : Message : 0.315040 s : 8^4 x 1 Deo Best  mflop/s        =   154914.0 ; 154914.0 per node  | ||||||
|  | Grid : Message : 0.315042 s : 8^4 x 1 Deo Worst mflop/s        =   151459.5 ; 151459.5 per node  | ||||||
|  | Grid : Message : 0.315043 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||||
|  | Grid : Message : 0.315043 s : 154914.0 ; 151459.5 ;  | ||||||
|  | Grid : Message : 0.315044 s : ================================================================================== | ||||||
|  | Grid : Message : 0.316507 s : ================================================================================== | ||||||
|  | Grid : Message : 0.316510 s : Benchmark DWF on 12^4 local volume  | ||||||
|  | Grid : Message : 0.316511 s : * Nc             : 3 | ||||||
|  | Grid : Message : 0.316512 s : * Global volume  : 12 24 24 24  | ||||||
|  | Grid : Message : 0.316515 s : * Ls             : 1 | ||||||
|  | Grid : Message : 0.316516 s : * ranks          : 8 | ||||||
|  | Grid : Message : 0.316517 s : * nodes          : 1 | ||||||
|  | Grid : Message : 0.316518 s : * ranks/node     : 8 | ||||||
|  | Grid : Message : 0.316518 s : * ranks geom     : 1 2 2 2  | ||||||
|  | Grid : Message : 0.316519 s : * Using 1 threads | ||||||
|  | Grid : Message : 0.316520 s : ================================================================================== | ||||||
|  | Grid : Message : 0.327883 s : Initialised RNGs | ||||||
|  | Grid : Message : 0.786395 s : ================================================================================== | ||||||
|  | Grid : Message : 0.786404 s : * Using GENERIC Nc WilsonKernels | ||||||
|  | Grid : Message : 0.786405 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 0.786406 s : * SINGLE precision  | ||||||
|  | Grid : Message : 0.786406 s : ================================================================================== | ||||||
|  | Grid : Message : 0.871646 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 0.871659 s : Deo mflop/s =   684982.2 (632.4) 609162.5-714594.5 | ||||||
|  | Grid : Message : 0.871663 s : Deo mflop/s per rank   85622.8 | ||||||
|  | Grid : Message : 0.871664 s : Deo mflop/s per node   684982.2 | ||||||
|  | Grid : Message : 0.871665 s : ================================================================================== | ||||||
|  | Grid : Message : 0.871665 s : * Using UNROLLED WilsonKernels | ||||||
|  | Grid : Message : 0.871665 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 0.871665 s : * SINGLE precision  | ||||||
|  | Grid : Message : 0.871665 s : ================================================================================== | ||||||
|  | Grid : Message : 0.953697 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 0.953702 s : Deo mflop/s =   693556.6 (576.5) 663552.0-719204.7 | ||||||
|  | Grid : Message : 0.953705 s : Deo mflop/s per rank   86694.6 | ||||||
|  | Grid : Message : 0.953706 s : Deo mflop/s per node   693556.6 | ||||||
|  | Grid : Message : 0.953707 s : ================================================================================== | ||||||
|  | Grid : Message : 0.953708 s : 12^4 x 1 Deo Best  mflop/s        =   693556.6 ; 693556.6 per node  | ||||||
|  | Grid : Message : 0.953710 s : 12^4 x 1 Deo Worst mflop/s        =   684982.2 ; 684982.2 per node  | ||||||
|  | Grid : Message : 0.953712 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||||
|  | Grid : Message : 0.953712 s : 684982.2 ; 693556.6 ;  | ||||||
|  | Grid : Message : 0.953713 s : ================================================================================== | ||||||
|  | Grid : Message : 0.957609 s : ================================================================================== | ||||||
|  | Grid : Message : 0.957613 s : Benchmark DWF on 16^4 local volume  | ||||||
|  | Grid : Message : 0.957614 s : * Nc             : 3 | ||||||
|  | Grid : Message : 0.957615 s : * Global volume  : 16 32 32 32  | ||||||
|  | Grid : Message : 0.957620 s : * Ls             : 1 | ||||||
|  | Grid : Message : 0.957621 s : * ranks          : 8 | ||||||
|  | Grid : Message : 0.957622 s : * nodes          : 1 | ||||||
|  | Grid : Message : 0.957623 s : * ranks/node     : 8 | ||||||
|  | Grid : Message : 0.957623 s : * ranks geom     : 1 2 2 2  | ||||||
|  | Grid : Message : 0.957624 s : * Using 1 threads | ||||||
|  | Grid : Message : 0.957625 s : ================================================================================== | ||||||
|  | Grid : Message : 0.985828 s : Initialised RNGs | ||||||
|  | Grid : Message : 2.379761 s : ================================================================================== | ||||||
|  | Grid : Message : 2.379772 s : * Using GENERIC Nc WilsonKernels | ||||||
|  | Grid : Message : 2.379773 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 2.379774 s : * SINGLE precision  | ||||||
|  | Grid : Message : 2.379775 s : ================================================================================== | ||||||
|  | Grid : Message : 2.486712 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 2.486725 s : Deo mflop/s =   1803226.1 (1139.4) 1646362.3-1864135.1 | ||||||
|  | Grid : Message : 2.486729 s : Deo mflop/s per rank   225403.3 | ||||||
|  | Grid : Message : 2.486731 s : Deo mflop/s per node   1803226.1 | ||||||
|  | Grid : Message : 2.486732 s : ================================================================================== | ||||||
|  | Grid : Message : 2.486732 s : * Using UNROLLED WilsonKernels | ||||||
|  | Grid : Message : 2.486732 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 2.486732 s : * SINGLE precision  | ||||||
|  | Grid : Message : 2.486732 s : ================================================================================== | ||||||
|  | Grid : Message : 2.584407 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 2.584412 s : Deo mflop/s =   1840587.3 (1119.6) 1779401.7-1914791.0 | ||||||
|  | Grid : Message : 2.584415 s : Deo mflop/s per rank   230073.4 | ||||||
|  | Grid : Message : 2.584416 s : Deo mflop/s per node   1840587.3 | ||||||
|  | Grid : Message : 2.584417 s : ================================================================================== | ||||||
|  | Grid : Message : 2.584418 s : 16^4 x 1 Deo Best  mflop/s        =   1840587.3 ; 1840587.3 per node  | ||||||
|  | Grid : Message : 2.584420 s : 16^4 x 1 Deo Worst mflop/s        =   1803226.1 ; 1803226.1 per node  | ||||||
|  | Grid : Message : 2.584422 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||||
|  | Grid : Message : 2.584422 s : 1803226.1 ; 1840587.3 ;  | ||||||
|  | Grid : Message : 2.584423 s : ================================================================================== | ||||||
|  | Grid : Message : 2.592858 s : ================================================================================== | ||||||
|  | Grid : Message : 2.592862 s : Benchmark DWF on 24^4 local volume  | ||||||
|  | Grid : Message : 2.592863 s : * Nc             : 3 | ||||||
|  | Grid : Message : 2.592864 s : * Global volume  : 24 48 48 48  | ||||||
|  | Grid : Message : 2.592869 s : * Ls             : 1 | ||||||
|  | Grid : Message : 2.592870 s : * ranks          : 8 | ||||||
|  | Grid : Message : 2.592871 s : * nodes          : 1 | ||||||
|  | Grid : Message : 2.592872 s : * ranks/node     : 8 | ||||||
|  | Grid : Message : 2.592872 s : * ranks geom     : 1 2 2 2  | ||||||
|  | Grid : Message : 2.592873 s : * Using 1 threads | ||||||
|  | Grid : Message : 2.592874 s : ================================================================================== | ||||||
|  | Grid : Message : 2.715623 s : Initialised RNGs | ||||||
|  | Grid : Message : 9.608838 s : ================================================================================== | ||||||
|  | Grid : Message : 9.608852 s : * Using GENERIC Nc WilsonKernels | ||||||
|  | Grid : Message : 9.608853 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 9.608854 s : * SINGLE precision  | ||||||
|  | Grid : Message : 9.608855 s : ================================================================================== | ||||||
|  | Grid : Message : 9.870294 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 9.870309 s : Deo mflop/s =   3861903.3 (1708.9) 3511078.3-3937368.2 | ||||||
|  | Grid : Message : 9.870313 s : Deo mflop/s per rank   482737.9 | ||||||
|  | Grid : Message : 9.870314 s : Deo mflop/s per node   3861903.3 | ||||||
|  | Grid : Message : 9.870315 s : ================================================================================== | ||||||
|  | Grid : Message : 9.870316 s : * Using UNROLLED WilsonKernels | ||||||
|  | Grid : Message : 9.870316 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 9.870317 s : * SINGLE precision  | ||||||
|  | Grid : Message : 9.870317 s : ================================================================================== | ||||||
|  | Grid : Message : 10.101619 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 10.101624 s : Deo mflop/s =   3933599.5 (1412.7) 3835758.7-4008152.3 | ||||||
|  | Grid : Message : 10.101627 s : Deo mflop/s per rank   491699.9 | ||||||
|  | Grid : Message : 10.101628 s : Deo mflop/s per node   3933599.5 | ||||||
|  | Grid : Message : 10.101629 s : ================================================================================== | ||||||
|  | Grid : Message : 10.101629 s : 24^4 x 1 Deo Best  mflop/s        =   3933599.5 ; 3933599.5 per node  | ||||||
|  | Grid : Message : 10.101631 s : 24^4 x 1 Deo Worst mflop/s        =   3861903.3 ; 3861903.3 per node  | ||||||
|  | Grid : Message : 10.101633 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||||
|  | Grid : Message : 10.101633 s : 3861903.3 ; 3933599.5 ;  | ||||||
|  | Grid : Message : 10.101634 s : ================================================================================== | ||||||
|  | Grid : Message : 10.139642 s : ================================================================================== | ||||||
|  | Grid : Message : 10.139652 s : Benchmark DWF on 32^4 local volume  | ||||||
|  | Grid : Message : 10.139653 s : * Nc             : 3 | ||||||
|  | Grid : Message : 10.139654 s : * Global volume  : 32 64 64 64  | ||||||
|  | Grid : Message : 10.139661 s : * Ls             : 1 | ||||||
|  | Grid : Message : 10.139661 s : * ranks          : 8 | ||||||
|  | Grid : Message : 10.139662 s : * nodes          : 1 | ||||||
|  | Grid : Message : 10.139662 s : * ranks/node     : 8 | ||||||
|  | Grid : Message : 10.139662 s : * ranks geom     : 1 2 2 2  | ||||||
|  | Grid : Message : 10.139663 s : * Using 1 threads | ||||||
|  | Grid : Message : 10.139663 s : ================================================================================== | ||||||
|  | Grid : Message : 10.502161 s : Initialised RNGs | ||||||
|  | Grid : Message : 32.211092 s : ================================================================================== | ||||||
|  | Grid : Message : 32.211107 s : * Using GENERIC Nc WilsonKernels | ||||||
|  | Grid : Message : 32.211108 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 32.211109 s : * SINGLE precision  | ||||||
|  | Grid : Message : 32.211110 s : ================================================================================== | ||||||
|  | Grid : Message : 32.841718 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 32.841732 s : Deo mflop/s =   4988499.9 (2722.5) 4244837.8-5120022.3 | ||||||
|  | Grid : Message : 32.841736 s : Deo mflop/s per rank   623562.5 | ||||||
|  | Grid : Message : 32.841737 s : Deo mflop/s per node   4988499.9 | ||||||
|  | Grid : Message : 32.841738 s : ================================================================================== | ||||||
|  | Grid : Message : 32.841739 s : * Using UNROLLED WilsonKernels | ||||||
|  | Grid : Message : 32.841739 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 32.841740 s : * SINGLE precision  | ||||||
|  | Grid : Message : 32.841740 s : ================================================================================== | ||||||
|  | Grid : Message : 33.407434 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 33.407442 s : Deo mflop/s =   5082758.0 (1883.1) 4971027.0-5205119.6 | ||||||
|  | Grid : Message : 33.407446 s : Deo mflop/s per rank   635344.7 | ||||||
|  | Grid : Message : 33.407447 s : Deo mflop/s per node   5082758.0 | ||||||
|  | Grid : Message : 33.407448 s : ================================================================================== | ||||||
|  | Grid : Message : 33.407448 s : 32^4 x 1 Deo Best  mflop/s        =   5082758.0 ; 5082758.0 per node  | ||||||
|  | Grid : Message : 33.407450 s : 32^4 x 1 Deo Worst mflop/s        =   4988499.9 ; 4988499.9 per node  | ||||||
|  | Grid : Message : 33.407452 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||||
|  | Grid : Message : 33.407452 s : 4988499.9 ; 5082758.0 ;  | ||||||
|  | Grid : Message : 33.407453 s : ================================================================================== | ||||||
|  | Grid : Message : 33.506785 s : ================================================================================== | ||||||
|  | Grid : Message : 33.506798 s :  Domain wall dslash 4D vectorised | ||||||
|  | Grid : Message : 33.506799 s : ================================================================================== | ||||||
|  | Grid : Message : 33.530686 s : ================================================================================== | ||||||
|  | Grid : Message : 33.530689 s : Benchmark DWF on 8^4 local volume  | ||||||
|  | Grid : Message : 33.530690 s : * Nc             : 3 | ||||||
|  | Grid : Message : 33.530691 s : * Global volume  : 8 16 16 16  | ||||||
|  | Grid : Message : 33.530698 s : * Ls             : 12 | ||||||
|  | Grid : Message : 33.530699 s : * ranks          : 8 | ||||||
|  | Grid : Message : 33.530700 s : * nodes          : 1 | ||||||
|  | Grid : Message : 33.530701 s : * ranks/node     : 8 | ||||||
|  | Grid : Message : 33.530702 s : * ranks geom     : 1 2 2 2  | ||||||
|  | Grid : Message : 33.530703 s : * Using 1 threads | ||||||
|  | Grid : Message : 33.530704 s : ================================================================================== | ||||||
|  | Grid : Message : 33.545465 s : Initialised RNGs | ||||||
|  | Grid : Message : 33.752384 s : ================================================================================== | ||||||
|  | Grid : Message : 33.752397 s : * Using GENERIC Nc WilsonKernels | ||||||
|  | Grid : Message : 33.752398 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 33.752399 s : * SINGLE precision  | ||||||
|  | Grid : Message : 33.752400 s : ================================================================================== | ||||||
|  | Grid : Message : 33.851964 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 33.851977 s : Deo mflop/s =   1383287.7 (849.8) 1321205.8-1420651.4 | ||||||
|  | Grid : Message : 33.851981 s : Deo mflop/s per rank   172911.0 | ||||||
|  | Grid : Message : 33.851983 s : Deo mflop/s per node   1383287.7 | ||||||
|  | Grid : Message : 33.851984 s : ================================================================================== | ||||||
|  | Grid : Message : 33.851984 s : * Using UNROLLED WilsonKernels | ||||||
|  | Grid : Message : 33.851984 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 33.851984 s : * SINGLE precision  | ||||||
|  | Grid : Message : 33.851984 s : ================================================================================== | ||||||
|  | Grid : Message : 33.949235 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 33.949240 s : Deo mflop/s =   1386335.8 (734.6) 1341325.6-1428330.6 | ||||||
|  | Grid : Message : 33.949243 s : Deo mflop/s per rank   173292.0 | ||||||
|  | Grid : Message : 33.949244 s : Deo mflop/s per node   1386335.8 | ||||||
|  | Grid : Message : 33.949245 s : ================================================================================== | ||||||
|  | Grid : Message : 33.949245 s : 8^4 x 12 Deo Best  mflop/s        =   1386335.8 ; 1386335.8 per node  | ||||||
|  | Grid : Message : 33.949247 s : 8^4 x 12 Deo Worst mflop/s        =   1383287.7 ; 1383287.7 per node  | ||||||
|  | Grid : Message : 33.949249 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||||
|  | Grid : Message : 33.949249 s : 1383287.7 ; 1386335.8 ;  | ||||||
|  | Grid : Message : 33.949250 s : ================================================================================== | ||||||
|  | Grid : Message : 33.952789 s : ================================================================================== | ||||||
|  | Grid : Message : 33.952793 s : Benchmark DWF on 12^4 local volume  | ||||||
|  | Grid : Message : 33.952794 s : * Nc             : 3 | ||||||
|  | Grid : Message : 33.952795 s : * Global volume  : 12 24 24 24  | ||||||
|  | Grid : Message : 33.952800 s : * Ls             : 12 | ||||||
|  | Grid : Message : 33.952801 s : * ranks          : 8 | ||||||
|  | Grid : Message : 33.952802 s : * nodes          : 1 | ||||||
|  | Grid : Message : 33.952803 s : * ranks/node     : 8 | ||||||
|  | Grid : Message : 33.952803 s : * ranks geom     : 1 2 2 2  | ||||||
|  | Grid : Message : 33.952804 s : * Using 1 threads | ||||||
|  | Grid : Message : 33.952805 s : ================================================================================== | ||||||
|  | Grid : Message : 34.362200 s : Initialised RNGs | ||||||
|  | Grid : Message : 34.969821 s : ================================================================================== | ||||||
|  | Grid : Message : 34.969832 s : * Using GENERIC Nc WilsonKernels | ||||||
|  | Grid : Message : 34.969833 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 34.969834 s : * SINGLE precision  | ||||||
|  | Grid : Message : 34.969835 s : ================================================================================== | ||||||
|  | Grid : Message : 35.135545 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 35.135558 s : Deo mflop/s =   4208495.6 (2165.0) 4053699.5-4315228.5 | ||||||
|  | Grid : Message : 35.135562 s : Deo mflop/s per rank   526062.0 | ||||||
|  | Grid : Message : 35.135563 s : Deo mflop/s per node   4208495.6 | ||||||
|  | Grid : Message : 35.135564 s : ================================================================================== | ||||||
|  | Grid : Message : 35.135565 s : * Using UNROLLED WilsonKernels | ||||||
|  | Grid : Message : 35.135565 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 35.135565 s : * SINGLE precision  | ||||||
|  | Grid : Message : 35.135565 s : ================================================================================== | ||||||
|  | Grid : Message : 35.299710 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 35.299715 s : Deo mflop/s =   4156968.7 (1450.2) 4053699.5-4219939.5 | ||||||
|  | Grid : Message : 35.299718 s : Deo mflop/s per rank   519621.1 | ||||||
|  | Grid : Message : 35.299719 s : Deo mflop/s per node   4156968.7 | ||||||
|  | Grid : Message : 35.299721 s : ================================================================================== | ||||||
|  | Grid : Message : 35.299721 s : 12^4 x 12 Deo Best  mflop/s        =   4208495.6 ; 4208495.6 per node  | ||||||
|  | Grid : Message : 35.299723 s : 12^4 x 12 Deo Worst mflop/s        =   4156968.7 ; 4156968.7 per node  | ||||||
|  | Grid : Message : 35.299725 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||||
|  | Grid : Message : 35.299725 s : 4208495.6 ; 4156968.7 ;  | ||||||
|  | Grid : Message : 35.299726 s : ================================================================================== | ||||||
|  | Grid : Message : 35.309687 s : ================================================================================== | ||||||
|  | Grid : Message : 35.309693 s : Benchmark DWF on 16^4 local volume  | ||||||
|  | Grid : Message : 35.309694 s : * Nc             : 3 | ||||||
|  | Grid : Message : 35.309695 s : * Global volume  : 16 32 32 32  | ||||||
|  | Grid : Message : 35.309701 s : * Ls             : 12 | ||||||
|  | Grid : Message : 35.309702 s : * ranks          : 8 | ||||||
|  | Grid : Message : 35.309703 s : * nodes          : 1 | ||||||
|  | Grid : Message : 35.309704 s : * ranks/node     : 8 | ||||||
|  | Grid : Message : 35.309704 s : * ranks geom     : 1 2 2 2  | ||||||
|  | Grid : Message : 35.309705 s : * Using 1 threads | ||||||
|  | Grid : Message : 35.309706 s : ================================================================================== | ||||||
|  | Grid : Message : 35.448780 s : Initialised RNGs | ||||||
|  | Grid : Message : 38.468764 s : ================================================================================== | ||||||
|  | Grid : Message : 38.468777 s : * Using GENERIC Nc WilsonKernels | ||||||
|  | Grid : Message : 38.468778 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 38.468779 s : * SINGLE precision  | ||||||
|  | Grid : Message : 38.468780 s : ================================================================================== | ||||||
|  | Grid : Message : 38.801024 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 38.801040 s : Deo mflop/s =   6674673.6 (2168.6) 6484445.4-6797200.1 | ||||||
|  | Grid : Message : 38.801044 s : Deo mflop/s per rank   834334.2 | ||||||
|  | Grid : Message : 38.801045 s : Deo mflop/s per node   6674673.6 | ||||||
|  | Grid : Message : 38.801046 s : ================================================================================== | ||||||
|  | Grid : Message : 38.801047 s : * Using UNROLLED WilsonKernels | ||||||
|  | Grid : Message : 38.801048 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 38.801049 s : * SINGLE precision  | ||||||
|  | Grid : Message : 38.801049 s : ================================================================================== | ||||||
|  | Grid : Message : 39.129777 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 39.129783 s : Deo mflop/s =   6560128.4 (2117.4) 6405846.1-6679081.3 | ||||||
|  | Grid : Message : 39.129786 s : Deo mflop/s per rank   820016.1 | ||||||
|  | Grid : Message : 39.129787 s : Deo mflop/s per node   6560128.4 | ||||||
|  | Grid : Message : 39.129788 s : ================================================================================== | ||||||
|  | Grid : Message : 39.129788 s : 16^4 x 12 Deo Best  mflop/s        =   6674673.6 ; 6674673.6 per node  | ||||||
|  | Grid : Message : 39.129790 s : 16^4 x 12 Deo Worst mflop/s        =   6560128.4 ; 6560128.4 per node  | ||||||
|  | Grid : Message : 39.129792 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||||
|  | Grid : Message : 39.129793 s : 6674673.6 ; 6560128.4 ;  | ||||||
|  | Grid : Message : 39.129795 s : ================================================================================== | ||||||
|  | Grid : Message : 39.161251 s : ================================================================================== | ||||||
|  | Grid : Message : 39.161265 s : Benchmark DWF on 24^4 local volume  | ||||||
|  | Grid : Message : 39.161266 s : * Nc             : 3 | ||||||
|  | Grid : Message : 39.161267 s : * Global volume  : 24 48 48 48  | ||||||
|  | Grid : Message : 39.161274 s : * Ls             : 12 | ||||||
|  | Grid : Message : 39.161275 s : * ranks          : 8 | ||||||
|  | Grid : Message : 39.161276 s : * nodes          : 1 | ||||||
|  | Grid : Message : 39.161277 s : * ranks/node     : 8 | ||||||
|  | Grid : Message : 39.161277 s : * ranks geom     : 1 2 2 2  | ||||||
|  | Grid : Message : 39.161278 s : * Using 1 threads | ||||||
|  | Grid : Message : 39.161279 s : ================================================================================== | ||||||
|  | Grid : Message : 39.911996 s : Initialised RNGs | ||||||
|  | Grid : Message : 54.971914 s : ================================================================================== | ||||||
|  | Grid : Message : 54.971928 s : * Using GENERIC Nc WilsonKernels | ||||||
|  | Grid : Message : 54.971929 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 54.971930 s : * SINGLE precision  | ||||||
|  | Grid : Message : 54.971931 s : ================================================================================== | ||||||
|  | Grid : Message : 56.309445 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 56.309462 s : Deo mflop/s =   8572660.7 (1374.9) 8483366.4-8644399.6 | ||||||
|  | Grid : Message : 56.309467 s : Deo mflop/s per rank   1071582.6 | ||||||
|  | Grid : Message : 56.309468 s : Deo mflop/s per node   8572660.7 | ||||||
|  | Grid : Message : 56.309469 s : ================================================================================== | ||||||
|  | Grid : Message : 56.309471 s : * Using UNROLLED WilsonKernels | ||||||
|  | Grid : Message : 56.309472 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 56.309473 s : * SINGLE precision  | ||||||
|  | Grid : Message : 56.309474 s : ================================================================================== | ||||||
|  | Grid : Message : 57.640707 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 57.640714 s : Deo mflop/s =   8200141.3 (1445.8) 8113545.6-8286307.9 | ||||||
|  | Grid : Message : 57.640717 s : Deo mflop/s per rank   1025017.7 | ||||||
|  | Grid : Message : 57.640718 s : Deo mflop/s per node   8200141.3 | ||||||
|  | Grid : Message : 57.640719 s : ================================================================================== | ||||||
|  | Grid : Message : 57.640720 s : 24^4 x 12 Deo Best  mflop/s        =   8572660.7 ; 8572660.7 per node  | ||||||
|  | Grid : Message : 57.640723 s : 24^4 x 12 Deo Worst mflop/s        =   8200141.3 ; 8200141.3 per node  | ||||||
|  | Grid : Message : 57.640725 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||||
|  | Grid : Message : 57.640725 s : 8572660.7 ; 8200141.3 ;  | ||||||
|  | Grid : Message : 57.640727 s : ================================================================================== | ||||||
|  | Grid : Message : 57.806175 s : ================================================================================== | ||||||
|  | Grid : Message : 57.806190 s : Benchmark DWF on 32^4 local volume  | ||||||
|  | Grid : Message : 57.806191 s : * Nc             : 3 | ||||||
|  | Grid : Message : 57.806192 s : * Global volume  : 32 64 64 64  | ||||||
|  | Grid : Message : 57.806200 s : * Ls             : 12 | ||||||
|  | Grid : Message : 57.806200 s : * ranks          : 8 | ||||||
|  | Grid : Message : 57.806200 s : * nodes          : 1 | ||||||
|  | Grid : Message : 57.806200 s : * ranks/node     : 8 | ||||||
|  | Grid : Message : 57.806200 s : * ranks geom     : 1 2 2 2  | ||||||
|  | Grid : Message : 57.806201 s : * Using 1 threads | ||||||
|  | Grid : Message : 57.806201 s : ================================================================================== | ||||||
|  | Grid : Message : 60.313153 s : Initialised RNGs | ||||||
|  | Grid : Message : 107.830286 s : ================================================================================== | ||||||
|  | Grid : Message : 107.830306 s : * Using GENERIC Nc WilsonKernels | ||||||
|  | Grid : Message : 107.830307 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 107.830308 s : * SINGLE precision  | ||||||
|  | Grid : Message : 107.830309 s : ================================================================================== | ||||||
|  | Grid : Message : 111.479603 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 111.479625 s : Deo mflop/s =   9771387.8 (1000.8) 9688589.9-9830800.0 | ||||||
|  | Grid : Message : 111.479629 s : Deo mflop/s per rank   1221423.5 | ||||||
|  | Grid : Message : 111.479630 s : Deo mflop/s per node   9771387.8 | ||||||
|  | Grid : Message : 111.479631 s : ================================================================================== | ||||||
|  | Grid : Message : 111.479631 s : * Using UNROLLED WilsonKernels | ||||||
|  | Grid : Message : 111.479631 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 111.479631 s : * SINGLE precision  | ||||||
|  | Grid : Message : 111.479631 s : ================================================================================== | ||||||
|  | Grid : Message : 115.406559 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 115.406573 s : Deo mflop/s =   8785297.3 (1739.6) 8628282.5-8911307.5 | ||||||
|  | Grid : Message : 115.406576 s : Deo mflop/s per rank   1098162.2 | ||||||
|  | Grid : Message : 115.406577 s : Deo mflop/s per node   8785297.3 | ||||||
|  | Grid : Message : 115.406578 s : ================================================================================== | ||||||
|  | Grid : Message : 115.406578 s : 32^4 x 12 Deo Best  mflop/s        =   9771387.8 ; 9771387.8 per node  | ||||||
|  | Grid : Message : 115.406580 s : 32^4 x 12 Deo Worst mflop/s        =   8785297.3 ; 8785297.3 per node  | ||||||
|  | Grid : Message : 115.406581 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||||
|  | Grid : Message : 115.406581 s : 9771387.8 ; 8785297.3 ;  | ||||||
|  | Grid : Message : 115.406582 s : ================================================================================== | ||||||
|  | Grid : Message : 115.918888 s : ================================================================================== | ||||||
|  | Grid : Message : 115.918902 s :  Improved Staggered dslash 4D vectorised | ||||||
|  | Grid : Message : 115.918903 s : ================================================================================== | ||||||
|  | Grid : Message : 115.920344 s : ================================================================================== | ||||||
|  | Grid : Message : 115.920346 s : Benchmark ImprovedStaggered on 8^4 local volume  | ||||||
|  | Grid : Message : 115.920347 s : * Global volume  : 8 16 16 16  | ||||||
|  | Grid : Message : 115.920354 s : * ranks          : 8 | ||||||
|  | Grid : Message : 115.920355 s : * nodes          : 1 | ||||||
|  | Grid : Message : 115.920356 s : * ranks/node     : 8 | ||||||
|  | Grid : Message : 115.920357 s : * ranks geom     : 1 2 2 2  | ||||||
|  | Grid : Message : 115.920376 s : * Using 1 threads | ||||||
|  | Grid : Message : 115.920377 s : ================================================================================== | ||||||
|  | Grid : Message : 115.923522 s : Initialised RNGs | ||||||
|  | Grid : Message : 116.904870 s : ================================================================================== | ||||||
|  | Grid : Message : 116.904950 s : * Using GENERIC Nc StaggeredKernels | ||||||
|  | Grid : Message : 116.904960 s : * SINGLE precision  | ||||||
|  | Grid : Message : 116.904970 s : ================================================================================== | ||||||
|  | Grid : Message : 116.288979 s : Deo mflop/s =   49708.9 (22.9) 44075.3-50609.3 | ||||||
|  | Grid : Message : 116.289000 s : Deo mflop/s per rank   6213.6 | ||||||
|  | Grid : Message : 116.289002 s : Deo mflop/s per node   49708.9 | ||||||
|  | Grid : Message : 116.289003 s : ================================================================================== | ||||||
|  | Grid : Message : 116.289004 s : * SINGLE precision  | ||||||
|  | Grid : Message : 116.289005 s : ================================================================================== | ||||||
|  | Grid : Message : 116.481632 s : Deo mflop/s =   49737.1 (13.5) 48517.0-50338.0 | ||||||
|  | Grid : Message : 116.481639 s : Deo mflop/s per rank   6217.1 | ||||||
|  | Grid : Message : 116.481640 s : Deo mflop/s per node   49737.1 | ||||||
|  | Grid : Message : 116.481641 s : ================================================================================== | ||||||
|  | Grid : Message : 116.481642 s : 8^4  Deo Best  mflop/s        =   49737.1 ; 49737.1 per node  | ||||||
|  | Grid : Message : 116.481644 s : 8^4  Deo Worst mflop/s        =   49708.9 ; 49708.9 per node  | ||||||
|  | Grid : Message : 116.481646 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||||
|  | Grid : Message : 116.481646 s : 49708.9 ; 49737.1 ;  | ||||||
|  | Grid : Message : 116.481647 s : ================================================================================== | ||||||
|  | Grid : Message : 116.483458 s : ================================================================================== | ||||||
|  | Grid : Message : 116.483461 s : Benchmark ImprovedStaggered on 12^4 local volume  | ||||||
|  | Grid : Message : 116.483462 s : * Global volume  : 12 24 24 24  | ||||||
|  | Grid : Message : 116.483465 s : * ranks          : 8 | ||||||
|  | Grid : Message : 116.483466 s : * nodes          : 1 | ||||||
|  | Grid : Message : 116.483466 s : * ranks/node     : 8 | ||||||
|  | Grid : Message : 116.483466 s : * ranks geom     : 1 2 2 2  | ||||||
|  | Grid : Message : 116.483467 s : * Using 1 threads | ||||||
|  | Grid : Message : 116.483468 s : ================================================================================== | ||||||
|  | Grid : Message : 116.489279 s : Initialised RNGs | ||||||
|  | Grid : Message : 116.945016 s : ================================================================================== | ||||||
|  | Grid : Message : 116.945025 s : * Using GENERIC Nc StaggeredKernels | ||||||
|  | Grid : Message : 116.945026 s : * SINGLE precision  | ||||||
|  | Grid : Message : 116.945027 s : ================================================================================== | ||||||
|  | Grid : Message : 117.159821 s : Deo mflop/s =   229778.4 (89.5) 223656.1-233547.5 | ||||||
|  | Grid : Message : 117.159835 s : Deo mflop/s per rank   28722.3 | ||||||
|  | Grid : Message : 117.159837 s : Deo mflop/s per node   229778.4 | ||||||
|  | Grid : Message : 117.159838 s : ================================================================================== | ||||||
|  | Grid : Message : 117.159838 s : * SINGLE precision  | ||||||
|  | Grid : Message : 117.159838 s : ================================================================================== | ||||||
|  | Grid : Message : 117.371102 s : Deo mflop/s =   229516.6 (61.8) 225781.1-233547.5 | ||||||
|  | Grid : Message : 117.371109 s : Deo mflop/s per rank   28689.6 | ||||||
|  | Grid : Message : 117.371110 s : Deo mflop/s per node   229516.6 | ||||||
|  | Grid : Message : 117.371111 s : ================================================================================== | ||||||
|  | Grid : Message : 117.371111 s : 12^4  Deo Best  mflop/s        =   229778.4 ; 229778.4 per node  | ||||||
|  | Grid : Message : 117.371113 s : 12^4  Deo Worst mflop/s        =   229516.6 ; 229516.6 per node  | ||||||
|  | Grid : Message : 117.371115 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||||
|  | Grid : Message : 117.371115 s : 229778.4 ; 229516.6 ;  | ||||||
|  | Grid : Message : 117.371116 s : ================================================================================== | ||||||
|  | Grid : Message : 117.373669 s : ================================================================================== | ||||||
|  | Grid : Message : 117.373673 s : Benchmark ImprovedStaggered on 16^4 local volume  | ||||||
|  | Grid : Message : 117.373674 s : * Global volume  : 16 32 32 32  | ||||||
|  | Grid : Message : 117.373678 s : * ranks          : 8 | ||||||
|  | Grid : Message : 117.373679 s : * nodes          : 1 | ||||||
|  | Grid : Message : 117.373679 s : * ranks/node     : 8 | ||||||
|  | Grid : Message : 117.373679 s : * ranks geom     : 1 2 2 2  | ||||||
|  | Grid : Message : 117.373680 s : * Using 1 threads | ||||||
|  | Grid : Message : 117.373681 s : ================================================================================== | ||||||
|  | Grid : Message : 117.386495 s : Initialised RNGs | ||||||
|  | Grid : Message : 118.755695 s : ================================================================================== | ||||||
|  | Grid : Message : 118.755706 s : * Using GENERIC Nc StaggeredKernels | ||||||
|  | Grid : Message : 118.755707 s : * SINGLE precision  | ||||||
|  | Grid : Message : 118.755708 s : ================================================================================== | ||||||
|  | Grid : Message : 119.178990 s : Deo mflop/s =   608844.0 (126.1) 596065.5-615608.7 | ||||||
|  | Grid : Message : 119.179160 s : Deo mflop/s per rank   76105.5 | ||||||
|  | Grid : Message : 119.179180 s : Deo mflop/s per node   608844.0 | ||||||
|  | Grid : Message : 119.179190 s : ================================================================================== | ||||||
|  | Grid : Message : 119.179200 s : * SINGLE precision  | ||||||
|  | Grid : Message : 119.179200 s : ================================================================================== | ||||||
|  | Grid : Message : 119.271093 s : Deo mflop/s =   605259.7 (188.7) 591372.1-614349.7 | ||||||
|  | Grid : Message : 119.271101 s : Deo mflop/s per rank   75657.5 | ||||||
|  | Grid : Message : 119.271103 s : Deo mflop/s per node   605259.7 | ||||||
|  | Grid : Message : 119.271104 s : ================================================================================== | ||||||
|  | Grid : Message : 119.271105 s : 16^4  Deo Best  mflop/s        =   608844.0 ; 608844.0 per node  | ||||||
|  | Grid : Message : 119.271107 s : 16^4  Deo Worst mflop/s        =   605259.7 ; 605259.7 per node  | ||||||
|  | Grid : Message : 119.271109 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||||
|  | Grid : Message : 119.271109 s : 608844.0 ; 605259.7 ;  | ||||||
|  | Grid : Message : 119.271110 s : ================================================================================== | ||||||
|  | Grid : Message : 119.275303 s : ================================================================================== | ||||||
|  | Grid : Message : 119.275308 s : Benchmark ImprovedStaggered on 24^4 local volume  | ||||||
|  | Grid : Message : 119.275309 s : * Global volume  : 24 48 48 48  | ||||||
|  | Grid : Message : 119.275315 s : * ranks          : 8 | ||||||
|  | Grid : Message : 119.275316 s : * nodes          : 1 | ||||||
|  | Grid : Message : 119.275317 s : * ranks/node     : 8 | ||||||
|  | Grid : Message : 119.275317 s : * ranks geom     : 1 2 2 2  | ||||||
|  | Grid : Message : 119.275318 s : * Using 1 threads | ||||||
|  | Grid : Message : 119.275319 s : ================================================================================== | ||||||
|  | Grid : Message : 119.328765 s : Initialised RNGs | ||||||
|  | Grid : Message : 126.866160 s : ================================================================================== | ||||||
|  | Grid : Message : 126.866270 s : * Using GENERIC Nc StaggeredKernels | ||||||
|  | Grid : Message : 126.866280 s : * SINGLE precision  | ||||||
|  | Grid : Message : 126.866290 s : ================================================================================== | ||||||
|  | Grid : Message : 126.604376 s : Deo mflop/s =   1641161.6 (335.5) 1619660.5-1663961.9 | ||||||
|  | Grid : Message : 126.604392 s : Deo mflop/s per rank   205145.2 | ||||||
|  | Grid : Message : 126.604394 s : Deo mflop/s per node   1641161.6 | ||||||
|  | Grid : Message : 126.604395 s : ================================================================================== | ||||||
|  | Grid : Message : 126.604396 s : * SINGLE precision  | ||||||
|  | Grid : Message : 126.604396 s : ================================================================================== | ||||||
|  | Grid : Message : 127.829420 s : Deo mflop/s =   1620972.4 (344.9) 1602593.4-1644174.3 | ||||||
|  | Grid : Message : 127.829520 s : Deo mflop/s per rank   202621.6 | ||||||
|  | Grid : Message : 127.829530 s : Deo mflop/s per node   1620972.4 | ||||||
|  | Grid : Message : 127.829540 s : ================================================================================== | ||||||
|  | Grid : Message : 127.829550 s : 24^4  Deo Best  mflop/s        =   1641161.6 ; 1641161.6 per node  | ||||||
|  | Grid : Message : 127.829570 s : 24^4  Deo Worst mflop/s        =   1620972.4 ; 1620972.4 per node  | ||||||
|  | Grid : Message : 127.829590 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||||
|  | Grid : Message : 127.829590 s : 1641161.6 ; 1620972.4 ;  | ||||||
|  | Grid : Message : 127.829600 s : ================================================================================== | ||||||
|  | Grid : Message : 127.107891 s : ================================================================================== | ||||||
|  | Grid : Message : 127.107903 s : Benchmark ImprovedStaggered on 32^4 local volume  | ||||||
|  | Grid : Message : 127.107904 s : * Global volume  : 32 64 64 64  | ||||||
|  | Grid : Message : 127.107912 s : * ranks          : 8 | ||||||
|  | Grid : Message : 127.107913 s : * nodes          : 1 | ||||||
|  | Grid : Message : 127.107914 s : * ranks/node     : 8 | ||||||
|  | Grid : Message : 127.107914 s : * ranks geom     : 1 2 2 2  | ||||||
|  | Grid : Message : 127.107915 s : * Using 1 threads | ||||||
|  | Grid : Message : 127.107916 s : ================================================================================== | ||||||
|  | Grid : Message : 127.257116 s : Initialised RNGs | ||||||
|  | Grid : Message : 148.527930 s : ================================================================================== | ||||||
|  | Grid : Message : 148.527941 s : * Using GENERIC Nc StaggeredKernels | ||||||
|  | Grid : Message : 148.527942 s : * SINGLE precision  | ||||||
|  | Grid : Message : 148.527943 s : ================================================================================== | ||||||
|  | Grid : Message : 149.401625 s : Deo mflop/s =   3085543.7 (956.0) 2934476.4-3115147.4 | ||||||
|  | Grid : Message : 149.401643 s : Deo mflop/s per rank   385693.0 | ||||||
|  | Grid : Message : 149.401645 s : Deo mflop/s per node   3085543.7 | ||||||
|  | Grid : Message : 149.401646 s : ================================================================================== | ||||||
|  | Grid : Message : 149.401647 s : * SINGLE precision  | ||||||
|  | Grid : Message : 149.401648 s : ================================================================================== | ||||||
|  | Grid : Message : 150.204533 s : Deo mflop/s =   3053468.5 (343.9) 3030688.8-3077255.0 | ||||||
|  | Grid : Message : 150.204540 s : Deo mflop/s per rank   381683.6 | ||||||
|  | Grid : Message : 150.204541 s : Deo mflop/s per node   3053468.5 | ||||||
|  | Grid : Message : 150.204542 s : ================================================================================== | ||||||
|  | Grid : Message : 150.204543 s : 32^4  Deo Best  mflop/s        =   3085543.7 ; 3085543.7 per node  | ||||||
|  | Grid : Message : 150.204545 s : 32^4  Deo Worst mflop/s        =   3053468.5 ; 3053468.5 per node  | ||||||
|  | Grid : Message : 150.204547 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||||
|  | Grid : Message : 150.204547 s : 3085543.7 ; 3053468.5 ;  | ||||||
|  | Grid : Message : 150.204548 s : ================================================================================== | ||||||
|  | Grid : Message : 150.292848 s : ================================================================================== | ||||||
|  | Grid : Message : 150.292864 s :  Summary table Ls=12 | ||||||
|  | Grid : Message : 150.292866 s : ================================================================================== | ||||||
|  | Grid : Message : 150.292866 s : L 		 Clover 		 DWF4 		 Staggered | ||||||
|  | Grid : Message : 150.292867 s : 8 		 154914.0 		 1386335.8 		 49737.1 | ||||||
|  | Grid : Message : 150.292880 s : 12 		 693556.6 		 4208495.6 		 229778.4 | ||||||
|  | Grid : Message : 150.292882 s : 16 		 1840587.3 		 6674673.6 		 608844.0 | ||||||
|  | Grid : Message : 150.292884 s : 24 		 3933599.5 		 8572660.7 		 1641161.6 | ||||||
|  | Grid : Message : 150.292886 s : 32 		 5082758.0 		 9771387.8 		 3085543.7 | ||||||
|  | Grid : Message : 150.292888 s : ================================================================================== | ||||||
|  | Grid : Message : 150.292888 s : ================================================================================== | ||||||
|  | Grid : Message : 150.292888 s :  Memory benchmark  | ||||||
|  | Grid : Message : 150.292888 s : ================================================================================== | ||||||
|  | Grid : Message : 150.295495 s : ================================================================================== | ||||||
|  | Grid : Message : 150.295497 s : = Benchmarking a*x + y bandwidth | ||||||
|  | Grid : Message : 150.295498 s : ================================================================================== | ||||||
|  | Grid : Message : 150.295499 s :   L  		bytes			GB/s		Gflop/s		 seconds		GB/s / node | ||||||
|  | Grid : Message : 150.295500 s : ---------------------------------------------------------- | ||||||
|  | Grid : Message : 160.682233 s : 8		6291456.000   		379.297		31.608		10.367		379.297 | ||||||
|  | Grid : Message : 161.851979 s : 16		100663296.000   		3754.675		312.890		1.047		3754.675 | ||||||
|  | Grid : Message : 162.458098 s : 24		509607936.000   		6521.472		543.456		0.603		6521.472 | ||||||
|  | Grid : Message : 162.924116 s : 32		1610612736.000   		8513.456		709.455		0.462		8513.456 | ||||||
|  | Grid : Message : 163.363877 s : 40		3932160000.000   		9018.902		751.575		0.436		9018.902 | ||||||
|  | Grid : Message : 163.363976 s : ================================================================================== | ||||||
|  | Grid : Message : 163.363978 s :  Batched BLAS benchmark  | ||||||
|  | Grid : Message : 163.363979 s : ================================================================================== | ||||||
|  | hipblasCreate | ||||||
|  | Grid : Message : 163.364046 s : ================================================================================== | ||||||
|  | Grid : Message : 163.364048 s : = batched GEMM (double precision)  | ||||||
|  | Grid : Message : 163.364048 s : ================================================================================== | ||||||
|  | Grid : Message : 163.364048 s :   M  		N			K		BATCH		Gflop/s / rank (coarse mrhs) | ||||||
|  | Grid : Message : 163.364049 s : ---------------------------------------------------------- | ||||||
|  | Grid : Message : 163.438476 s : 16		8		16		256		0.565 | ||||||
|  | Grid : Message : 163.438944 s : 16		16		16		256		243.148 | ||||||
|  | Grid : Message : 163.439501 s : 16		32		16		256		440.347 | ||||||
|  | Grid : Message : 163.440003 s : 32		8		32		256		439.194 | ||||||
|  | Grid : Message : 163.440463 s : 32		16		32		256		847.334 | ||||||
|  | Grid : Message : 163.441051 s : 32		32		32		256		1430.893 | ||||||
|  | Grid : Message : 163.441679 s : 64		8		64		256		1242.757 | ||||||
|  | Grid : Message : 163.442354 s : 64		16		64		256		2196.689 | ||||||
|  | Grid : Message : 163.443196 s : 64		32		64		256		3697.458 | ||||||
|  | Grid : Message : 163.443200 s : ---------------------------------------------------------- | ||||||
|  | Grid : Message : 163.443201 s :   M  		N			K		BATCH		Gflop/s / rank (block project) | ||||||
|  | Grid : Message : 163.443202 s : ---------------------------------------------------------- | ||||||
|  | Grid : Message : 163.444013 s : 16		8		256		256		899.583 | ||||||
|  | Grid : Message : 163.444933 s : 16		16		256		256		1673.538 | ||||||
|  | Grid : Message : 163.446013 s : 16		32		256		256		2959.597 | ||||||
|  | Grid : Message : 163.446951 s : 32		8		256		256		1558.859 | ||||||
|  | Grid : Message : 163.447970 s : 32		16		256		256		2864.839 | ||||||
|  | Grid : Message : 163.449240 s : 32		32		256		256		4810.671 | ||||||
|  | Grid : Message : 163.450524 s : 64		8		256		256		2386.093 | ||||||
|  | Grid : Message : 163.451877 s : 64		16		256		256		4451.666 | ||||||
|  | Grid : Message : 163.453806 s : 64		32		256		256		5942.124 | ||||||
|  | Grid : Message : 163.453809 s : ---------------------------------------------------------- | ||||||
|  | Grid : Message : 163.453810 s :   M  		N			K		BATCH		Gflop/s / rank (block promote) | ||||||
|  | Grid : Message : 163.453811 s : ---------------------------------------------------------- | ||||||
|  | Grid : Message : 163.454716 s : 8		256		16		256		799.867 | ||||||
|  | Grid : Message : 163.455690 s : 16		256		16		256		1584.625 | ||||||
|  | Grid : Message : 163.457209 s : 32		256		16		256		1949.422 | ||||||
|  | Grid : Message : 163.458254 s : 8		256		32		256		1389.417 | ||||||
|  | Grid : Message : 163.459339 s : 16		256		32		256		2668.344 | ||||||
|  | Grid : Message : 163.461158 s : 32		256		32		256		3234.162 | ||||||
|  | Grid : Message : 163.462566 s : 8		256		64		256		2150.925 | ||||||
|  | Grid : Message : 163.464066 s : 16		256		64		256		4012.488 | ||||||
|  | Grid : Message : 163.466272 s : 32		256		64		256		5154.786 | ||||||
|  | Grid : Message : 163.466276 s : ================================================================================== | ||||||
|  | Grid : Message : 163.466277 s : ================================================================================== | ||||||
|  | Grid : Message : 163.466278 s :  Communications benchmark  | ||||||
|  | Grid : Message : 163.466279 s : ================================================================================== | ||||||
|  | Grid : Message : 163.466280 s : ==================================================================================================== | ||||||
|  | Grid : Message : 163.466280 s : = Benchmarking threaded STENCIL halo exchange in 3 dimensions | ||||||
|  | Grid : Message : 163.466281 s : ==================================================================================================== | ||||||
|  | Grid : Message : 163.466281 s :  L  	 Ls  	bytes	 MB/s uni  		 MB/s bidi  | ||||||
|  | Grid : Message : 163.521339 s : 16	12	 4718592 	 122513.099		245026.198 | ||||||
|  | Grid : Message : 163.551417 s : 16	12	 4718592 	 125590.498		251180.996 | ||||||
|  | Grid : Message : 163.572339 s : 16	12	 4718592 	 180555.489		361110.977 | ||||||
|  | Grid : Message : 163.602810 s : 16	12	 4718592 	 123949.223		247898.447 | ||||||
|  | Grid : Message : 163.633041 s : 16	12	 4718592 	 124933.761		249867.523 | ||||||
|  | Grid : Message : 163.654084 s : 16	12	 4718592 	 179516.530		359033.061 | ||||||
|  | Grid : Message : 163.756280 s : 24	12	 15925248 	 127515.473		255030.946 | ||||||
|  | Grid : Message : 163.852651 s : 24	12	 15925248 	 132226.945		264453.890 | ||||||
|  | Grid : Message : 163.917510 s : 24	12	 15925248 	 196474.591		392949.183 | ||||||
|  | Grid : Message : 164.170390 s : 24	12	 15925248 	 128020.322		256040.644 | ||||||
|  | Grid : Message : 164.113321 s : 24	12	 15925248 	 132340.948		264681.896 | ||||||
|  | Grid : Message : 164.178314 s : 24	12	 15925248 	 196051.311		392102.622 | ||||||
|  | Grid : Message : 164.413983 s : 32	12	 37748736 	 129411.666		258823.333 | ||||||
|  | Grid : Message : 164.639218 s : 32	12	 37748736 	 134090.789		268181.577 | ||||||
|  | Grid : Message : 164.789675 s : 32	12	 37748736 	 200739.096		401478.191 | ||||||
|  | Grid : Message : 165.228910 s : 32	12	 37748736 	 129497.681		258995.363 | ||||||
|  | Grid : Message : 165.248096 s : 32	12	 37748736 	 134103.293		268206.586 | ||||||
|  | Grid : Message : 165.398958 s : 32	12	 37748736 	 200198.805		400397.611 | ||||||
|  | Grid : Message : 165.399411 s : ================================================================================== | ||||||
|  | Grid : Message : 165.399413 s :  Per Node Summary table Ls=12 | ||||||
|  | Grid : Message : 165.399414 s : ================================================================================== | ||||||
|  | Grid : Message : 165.399414 s :  L 		 Clover		 DWF4		 Staggered (GF/s per node) | ||||||
|  | Grid : Message : 165.399417 s : 8 		 154914.003 	 1386335.817 	 49737.127 | ||||||
|  | Grid : Message : 165.399423 s : 12 		 693556.579 	 4208495.611 	 229778.435 | ||||||
|  | Grid : Message : 165.399426 s : 16 		 1840587.280 	 6674673.647 	 608844.000 | ||||||
|  | Grid : Message : 165.399429 s : 24 		 3933599.545 	 8572660.656 	 1641161.613 | ||||||
|  | Grid : Message : 165.399432 s : 32 		 5082757.996 	 9771387.820 	 3085543.742 | ||||||
|  | Grid : Message : 165.399435 s : ================================================================================== | ||||||
|  | Grid : Message : 165.399435 s : ================================================================================== | ||||||
|  | Grid : Message : 165.399435 s :  Comparison point     result: 9172024.238 Mflop/s per node | ||||||
|  | Grid : Message : 165.399436 s :  Comparison point is 0.5*(9771387.820+8572660.656)  | ||||||
|  | Grid : Message : 165.399438 s : ================================================================================== | ||||||
|  | Grid : Message : 165.399438 s : ******************************************* | ||||||
|  | Grid : Message : 165.399438 s : ******* Grid Finalize                ****** | ||||||
|  | Grid : Message : 165.399438 s : ******************************************* | ||||||
systems/Frontier/benchmarks/benchusqcd.slurm (new executable file)
							| @@ -0,0 +1,38 @@ | |||||||
|  | #!/bin/bash -l | ||||||
|  | #SBATCH --job-name=bench | ||||||
|  | ##SBATCH --partition=small-g | ||||||
|  | ##SBATCH -q debug | ||||||
|  | #SBATCH --nodes=1 | ||||||
|  | #SBATCH --ntasks-per-node=8 | ||||||
|  | #SBATCH --cpus-per-task=7 | ||||||
|  | #SBATCH --gpus-per-node=8 | ||||||
|  | #SBATCH --time=00:30:00 | ||||||
|  | #SBATCH --account=phy157_dwf | ||||||
|  | #SBATCH --gpu-bind=none | ||||||
|  | #SBATCH --exclusive | ||||||
|  | #SBATCH --mem=0 | ||||||
|  |  | ||||||
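|  | # select_gpu: per-rank wrapper that pins each local MPI rank to one GCD (via HIP_VISIBLE_DEVICES) | ||||||
|  | # and to that GCD's nearest NUMA domain (via numactl) | ||||||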
|  | cat << EOF > select_gpu | ||||||
|  | #!/bin/bash | ||||||
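|  | # GPU_MAP/NUMA_MAP encode the Frontier node's GCD-to-NUMA affinity: GCDs 0-1 sit on NUMA 3, | ||||||
|  | # 2-3 on NUMA 1, 6-7 on NUMA 2 and 4-5 on NUMA 0, so each rank's memory stays local to its GPU | ||||||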
|  | export GPU_MAP=(0 1 2 3 7 6 5 4) | ||||||
|  | export NUMA_MAP=(3 3 1 1 2 2 0 0) | ||||||
|  | export GPU=\${GPU_MAP[\$SLURM_LOCALID]} | ||||||
|  | export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]} | ||||||
|  | export HIP_VISIBLE_DEVICES=\$GPU | ||||||
|  | unset ROCR_VISIBLE_DEVICES | ||||||
|  | echo RANK \$SLURM_LOCALID using GPU \$GPU     | ||||||
|  | exec numactl -m \$NUMA -N \$NUMA \$* | ||||||
|  | EOF | ||||||
|  |  | ||||||
|  | chmod +x ./select_gpu | ||||||
|  |  | ||||||
|  | root=$HOME/Frontier/Grid/systems/Frontier/ | ||||||
|  | source ${root}/sourceme.sh | ||||||
|  |  | ||||||
|  | export OMP_NUM_THREADS=7 | ||||||
|  | export MPICH_GPU_SUPPORT_ENABLED=1 | ||||||
|  | #export MPICH_SMP_SINGLE_COPY_MODE=XPMEM | ||||||
|  |  | ||||||
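|  | # One rank per GCD; --shm 4096 gives Grid a 4096 MB shared-memory segment for intra-node comms | ||||||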
|  | srun ./select_gpu ./Benchmark_usqcd --grid 32.32.32.32 --mpi 1.2.2.2 --accelerator-threads 8 --comms-overlap --shm 4096 --shm-mpi 0  > Benchmark_usqcd.log | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -3,7 +3,7 @@ spack load c-lime | |||||||
| module load emacs  | module load emacs  | ||||||
| module load PrgEnv-gnu | module load PrgEnv-gnu | ||||||
| module load rocm | module load rocm | ||||||
| module load cray-mpich/8.1.23 | module load cray-mpich | ||||||
| module load gmp | module load gmp | ||||||
| module load cray-fftw | module load cray-fftw | ||||||
| module load craype-accel-amd-gfx90a | module load craype-accel-amd-gfx90a | ||||||
|   | |||||||
| @@ -2,11 +2,11 @@ | |||||||
|     --enable-comms=mpi \ |     --enable-comms=mpi \ | ||||||
|     --enable-simd=GPU \ |     --enable-simd=GPU \ | ||||||
|     --enable-shm=nvlink \ |     --enable-shm=nvlink \ | ||||||
|     --enable-gen-simd-width=64 \ |  | ||||||
|     --enable-accelerator=cuda \ |     --enable-accelerator=cuda \ | ||||||
|  |     --enable-gen-simd-width=64 \ | ||||||
|  |     --disable-gparity \ | ||||||
|     --with-lime=/mnt/lustre/tursafs1/home/tc002/tc002/dc-boyl1/spack/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/c-lime-2-3-9-e6wxqrid6rqmd45z7n32dxkvkykpvyez \ |     --with-lime=/mnt/lustre/tursafs1/home/tc002/tc002/dc-boyl1/spack/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/c-lime-2-3-9-e6wxqrid6rqmd45z7n32dxkvkykpvyez \ | ||||||
|     --enable-accelerator-cshift \ |  | ||||||
|     --disable-unified \ |     --disable-unified \ | ||||||
|     CXX=nvcc \ |     CXX=nvcc \ | ||||||
|     LDFLAGS="-cudart shared " \ |     LDFLAGS="-cudart shared -lcublas " \ | ||||||
|     CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++14 -cudart shared" |     CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++17 -cudart shared --diag-suppress 177,550,611" | ||||||
|   | |||||||
| @@ -1,6 +1,7 @@ | |||||||
| module load cuda/11.4.1  openmpi/4.1.1-cuda11.4.1  ucx/1.12.0-cuda11.4.1   | module load cuda/12.3  | ||||||
| #module load cuda/11.4.1 openmpi/4.1.1 ucx/1.10.1 | module load ucx/1.15.0-cuda12.3   | ||||||
| export PREFIX=/home/tc002/tc002/shared/env/prefix/ | module load openmpi/4.1.5-cuda12.3 | ||||||
| export LD_LIBRARY_PATH=$PREFIX/lib/:$LD_LIBRARY_PATH | source /home/dp207/dp207/shared/env/production/env-base.sh  | ||||||
|  | source /home/dp207/dp207/shared/env/production/env-gpu.sh  | ||||||
| unset SBATCH_EXPORT | unset SBATCH_EXPORT | ||||||
|  |  | ||||||
|   | |||||||
| @@ -142,7 +142,9 @@ int main (int argc, char ** argv) | |||||||
|   std:: cout << " CG    site flops = "<< CGsiteflops <<std::endl; |   std:: cout << " CG    site flops = "<< CGsiteflops <<std::endl; | ||||||
|   int iters; |   int iters; | ||||||
|  |  | ||||||
|  |   time_t now; | ||||||
|   time_t start = time(NULL); |   time_t start = time(NULL); | ||||||
|  |   UGrid->Broadcast(0,(void *)&start,sizeof(start)); | ||||||
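|  |   // Broadcast rank 0's clock so every rank shares one start time and exits the timed loops together | ||||||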
|  |  | ||||||
|   FlightRecorder::ContinueOnFail = 0; |   FlightRecorder::ContinueOnFail = 0; | ||||||
|   FlightRecorder::PrintEntireLog = 0; |   FlightRecorder::PrintEntireLog = 0; | ||||||
| @@ -162,9 +164,9 @@ int main (int argc, char ** argv) | |||||||
|     } |     } | ||||||
|     std::cerr << "******************* SINGLE PRECISION SOLVE "<<iter<<std::endl; |     std::cerr << "******************* SINGLE PRECISION SOLVE "<<iter<<std::endl; | ||||||
|     result_o = Zero(); |     result_o = Zero(); | ||||||
|     t1=usecond(); |     t1=usecond();  | ||||||
|     mCG(src_o,result_o); |     mCG(src_o,result_o); | ||||||
|     t2=usecond(); |     t2=usecond();  | ||||||
|     iters = mCG.TotalInnerIterations; //Number of inner CG iterations |     iters = mCG.TotalInnerIterations; //Number of inner CG iterations | ||||||
|     flops = MdagMsiteflops*4*FrbGrid->gSites()*iters; |     flops = MdagMsiteflops*4*FrbGrid->gSites()*iters; | ||||||
|     flops+= CGsiteflops*FrbGrid->gSites()*iters; |     flops+= CGsiteflops*FrbGrid->gSites()*iters; | ||||||
| @@ -176,7 +178,8 @@ int main (int argc, char ** argv) | |||||||
|  |  | ||||||
|     std::cout << " FlightRecorder is OK! "<<std::endl; |     std::cout << " FlightRecorder is OK! "<<std::endl; | ||||||
|     iter ++; |     iter ++; | ||||||
|   } while (time(NULL) < (start + nsecs/10) ); |     now = time(NULL); UGrid->Broadcast(0,(void *)&now,sizeof(now)); | ||||||
|  |   } while (now < (start + nsecs/10) ); | ||||||
|      |      | ||||||
|   std::cout << GridLogMessage << "::::::::::::: Starting double precision CG" << std::endl; |   std::cout << GridLogMessage << "::::::::::::: Starting double precision CG" << std::endl; | ||||||
|   ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000); |   ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000); | ||||||
| @@ -189,7 +192,7 @@ int main (int argc, char ** argv) | |||||||
|     } |     } | ||||||
|     std::cerr << "******************* DOUBLE PRECISION SOLVE "<<i<<std::endl; |     std::cerr << "******************* DOUBLE PRECISION SOLVE "<<i<<std::endl; | ||||||
|     result_o_2 = Zero(); |     result_o_2 = Zero(); | ||||||
|     t1=usecond(); |     t1=usecond();  | ||||||
|     CG(HermOpEO,src_o,result_o_2); |     CG(HermOpEO,src_o,result_o_2); | ||||||
|     t2=usecond(); |     t2=usecond(); | ||||||
|     iters = CG.IterationsToComplete; |     iters = CG.IterationsToComplete; | ||||||
| @@ -201,8 +204,9 @@ int main (int argc, char ** argv) | |||||||
|     std::cout << " DoublePrecision error count "<< FlightRecorder::ErrorCount()<<std::endl; |     std::cout << " DoublePrecision error count "<< FlightRecorder::ErrorCount()<<std::endl; | ||||||
|     assert(FlightRecorder::ErrorCount()==0); |     assert(FlightRecorder::ErrorCount()==0); | ||||||
|     std::cout << " FlightRecorder is OK! "<<std::endl; |     std::cout << " FlightRecorder is OK! "<<std::endl; | ||||||
|  |     now = time(NULL); UGrid->Broadcast(0,(void *)&now,sizeof(now)); | ||||||
|     i++; |     i++; | ||||||
|   } while (time(NULL) < (start + nsecs) ); |   } while (now < (start + nsecs) ); | ||||||
|  |  | ||||||
|   LatticeFermionD diff_o(FrbGrid); |   LatticeFermionD diff_o(FrbGrid); | ||||||
|   RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2); |   RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2); | ||||||
|   | |||||||
tests/debug/Test_8888.cc (new file)
							| @@ -0,0 +1,118 @@ | |||||||
|  | /************************************************************************************* | ||||||
|  |  | ||||||
|  |     Grid physics library, www.github.com/paboyle/Grid  | ||||||
|  |  | ||||||
|  |     Source file: ./tests/debug/Test_8888.cc | ||||||
|  |  | ||||||
|  |     Copyright (C) 2023 | ||||||
|  |  | ||||||
|  | Author: Peter Boyle <pboyle@bnl.gov> | ||||||
|  |  | ||||||
|  |     This program is free software; you can redistribute it and/or modify | ||||||
|  |     it under the terms of the GNU General Public License as published by | ||||||
|  |     the Free Software Foundation; either version 2 of the License, or | ||||||
|  |     (at your option) any later version. | ||||||
|  |  | ||||||
|  |     This program is distributed in the hope that it will be useful, | ||||||
|  |     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  |     GNU General Public License for more details. | ||||||
|  |  | ||||||
|  |     You should have received a copy of the GNU General Public License along | ||||||
|  |     with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  |  | ||||||
|  |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|  |     *************************************************************************************/ | ||||||
|  |     /*  END LEGAL */ | ||||||
|  | #include <Grid/Grid.h> | ||||||
|  | #include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h> | ||||||
|  | #include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h> | ||||||
|  | #include <Grid/algorithms/iterative/AdefMrhs.h> | ||||||
|  |  | ||||||
|  | using namespace std; | ||||||
|  | using namespace Grid; | ||||||
|  |  | ||||||
|  | int main (int argc, char ** argv) | ||||||
|  | { | ||||||
|  |   Grid_init(&argc,&argv); | ||||||
|  |  | ||||||
|  |   const int Ls=8; | ||||||
|  |   const int nbasis = 40; | ||||||
|  |   const int cb = 0 ; | ||||||
|  |   RealD mass=0.01; | ||||||
|  |   RealD M5=1.8; | ||||||
|  |   RealD b=1.0; | ||||||
|  |   RealD c=0.0; | ||||||
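|  |   // Mobius with b=1, c=0 reduces to the plain Shamir domain wall kernel | ||||||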
|  |  | ||||||
|  |   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), | ||||||
|  | 								   GridDefaultSimd(Nd,vComplex::Nsimd()), | ||||||
|  | 								   GridDefaultMpi()); | ||||||
|  |   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); | ||||||
|  |   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); | ||||||
|  |   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); | ||||||
|  |  | ||||||
|  |   ///////////////////////// RNGs ///////////////////////////////// | ||||||
|  |   std::vector<int> seeds4({1,2,3,4}); | ||||||
|  |   std::vector<int> seeds5({5,6,7,8}); | ||||||
|  |   std::vector<int> cseeds({5,6,7,8}); | ||||||
|  |  | ||||||
|  |   GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5); | ||||||
|  |   GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4); | ||||||
|  |  | ||||||
|  |   ///////////////////////// Configuration ///////////////////////////////// | ||||||
|  |   LatticeGaugeField Umu(UGrid); | ||||||
|  |  | ||||||
|  |   FieldMetaData header; | ||||||
|  |   std::string file("ckpoint_EODWF_lat.125"); | ||||||
|  |   NerscIO::readConfiguration(Umu,header,file); | ||||||
|  |  | ||||||
|  |   //////////////////////// Fermion action ////////////////////////////////// | ||||||
|  |   MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c); | ||||||
|  |  | ||||||
|  |   MdagMLinearOperator<MobiusFermionD, LatticeFermion> HermOp(Ddwf); | ||||||
|  |  | ||||||
|  |    | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   std::cout << "         Fine Power method            "<<std::endl; | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |  | ||||||
|  |   LatticeFermionD pm_src(FGrid); | ||||||
|  |   pm_src = ComplexD(1.0); | ||||||
|  |   PowerMethod<LatticeFermionD>       fPM; | ||||||
|  |   fPM(HermOp,pm_src); | ||||||
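|  |   // Power method estimates the largest eigenvalue of MdagM; it should lie within the Chebyshev window chosen below | ||||||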
|  |  | ||||||
|  |    | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   std::cout << "         Fine Lanczos  (poly, low)    "<<std::endl; | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |    | ||||||
|  |   int Nk=80; | ||||||
|  |   int Nm=Nk*3; | ||||||
|  |   int Nstop=8; | ||||||
|  |   int Nconv_test_interval=1; | ||||||
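|  |   // Seek Nk converged vectors, with a restart space of Nm = 3*Nk | ||||||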
|  |    | ||||||
|  |   //  Chebyshev<LatticeFermionD>      IRLChebyLo(0.2,64.0,201);  // 1 iter | ||||||
|  |   Chebyshev<LatticeFermionD>      IRLChebyLo(0.0,55.0,101);  // 1 iter | ||||||
|  |   FunctionHermOp<LatticeFermionD>    PolyOp(IRLChebyLo,HermOp); | ||||||
|  |   PlainHermOp<LatticeFermionD>          Op(HermOp); | ||||||
|  |  | ||||||
|  |   ImplicitlyRestartedLanczos IRL(PolyOp, | ||||||
|  | 				 Op, | ||||||
|  | 				 Nk, // sought vecs | ||||||
|  | 				 Nk, // sought vecs | ||||||
|  | 				 Nm, // spare vecs | ||||||
|  | 				 1.0e-8, | ||||||
|  | 				 10 // Max iterations | ||||||
|  | 				 ); | ||||||
|  |  | ||||||
|  |   int Nconv; | ||||||
|  |   std::vector<RealD>            eval(Nm); | ||||||
|  |   std::vector<LatticeFermionD>     evec(Nm,FGrid); | ||||||
|  |   LatticeFermionD     irl_src(FGrid); | ||||||
|  |   gaussian(RNG5,irl_src); // the Lanczos start vector must be non-zero | ||||||
|  |  | ||||||
|  |   IRL.calc(eval,evec,irl_src,Nconv); | ||||||
|  |  | ||||||
|  |   Grid_finalize(); | ||||||
|  |   return 0; | ||||||
|  | } | ||||||
| @@ -392,9 +392,27 @@ void  TestCGschur(What & Ddwf, | |||||||
| 		   GridParallelRNG *RNG5) | 		   GridParallelRNG *RNG5) | ||||||
| { | { | ||||||
|   LatticeFermion src   (FGrid); random(*RNG5,src); |   LatticeFermion src   (FGrid); random(*RNG5,src); | ||||||
|   LatticeFermion result(FGrid); result=Zero(); |   LatticeFermion result1(FGrid); result1=Zero(); | ||||||
|  |   LatticeFermion result2(FGrid); result2=Zero(); | ||||||
|  |   LatticeFermion result3(FGrid); result3=Zero(); | ||||||
|  |  | ||||||
|   ConjugateGradient<LatticeFermion> CG(1.0e-8,10000); |   ConjugateGradient<LatticeFermion> CG(1.0e-8,10000); | ||||||
|   SchurRedBlackDiagMooeeSolve<LatticeFermion> SchurSolver(CG); |   SchurRedBlackDiagMooeeSolve<LatticeFermion> SchurSolver(CG); | ||||||
|   SchurSolver(Ddwf,src,result); |   SchurSolver(Ddwf,src,result1); | ||||||
|  |  | ||||||
|  |   SchurRedBlackDiagOneSolve<LatticeFermion> SchurSolverSymm1(CG); | ||||||
|  |   SchurSolverSymm1(Ddwf,src,result2); | ||||||
|  |  | ||||||
|  |   SchurRedBlackDiagTwoSolve<LatticeFermion> SchurSolverSymm2(CG); | ||||||
|  |   SchurSolverSymm2(Ddwf,src,result3); | ||||||
|  |  | ||||||
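|  |   // All three red-black Schur factorisations solve the same system; the diffs below should sit at the 1e-8 CG tolerance | ||||||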
|  |   std::cout << GridLogMessage << " Standard " <<norm2(result1)<<std::endl; | ||||||
|  |  | ||||||
|  |   std::cout << GridLogMessage << " Symm1    " <<norm2(result2)<<std::endl;  | ||||||
|  |   result2=result2-result1; | ||||||
|  |   std::cout << GridLogMessage << " diff " <<norm2(result2) <<std::endl;  | ||||||
|  |  | ||||||
|  |   std::cout << GridLogMessage << " Symm2    " <<norm2(result3)<<std::endl;  | ||||||
|  |   result3=result3-result1; | ||||||
|  |   std::cout << GridLogMessage << " diff " <<norm2(result3) <<std::endl;  | ||||||
| } | } | ||||||
|   | |||||||
| @@ -244,7 +244,7 @@ int main (int argc, char ** argv) | |||||||
|  |  | ||||||
|   GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi);  |   GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi);  | ||||||
|  |  | ||||||
|    | #if 0   | ||||||
|   MultiGeneralCoarsenedMatrix mrhs(LittleDiracOp,CoarseMrhs); |   MultiGeneralCoarsenedMatrix mrhs(LittleDiracOp,CoarseMrhs); | ||||||
|   typedef decltype(mrhs) MultiGeneralCoarsenedMatrix_t; |   typedef decltype(mrhs) MultiGeneralCoarsenedMatrix_t; | ||||||
|    |    | ||||||
| @@ -307,7 +307,8 @@ int main (int argc, char ** argv) | |||||||
|     rh_res= Zero(); |     rh_res= Zero(); | ||||||
|     mrhsCG(MrhsCoarseOp,rh_src,rh_res); |     mrhsCG(MrhsCoarseOp,rh_src,rh_res); | ||||||
|   } |   } | ||||||
|    |  | ||||||
|  | #endif | ||||||
|   std::cout<<GridLogMessage<<std::endl; |   std::cout<<GridLogMessage<<std::endl; | ||||||
|   std::cout<<GridLogMessage<<std::endl; |   std::cout<<GridLogMessage<<std::endl; | ||||||
|   std::cout<<GridLogMessage<<"*******************************************"<<std::endl; |   std::cout<<GridLogMessage<<"*******************************************"<<std::endl; | ||||||
|   | |||||||
| @@ -1,4 +1,4 @@ | |||||||
|     /************************************************************************************* | /************************************************************************************* | ||||||
|  |  | ||||||
|     Grid physics library, www.github.com/paboyle/Grid  |     Grid physics library, www.github.com/paboyle/Grid  | ||||||
|  |  | ||||||
| @@ -26,84 +26,13 @@ Author: Peter Boyle <pboyle@bnl.gov> | |||||||
|     *************************************************************************************/ |     *************************************************************************************/ | ||||||
|     /*  END LEGAL */ |     /*  END LEGAL */ | ||||||
| #include <Grid/Grid.h> | #include <Grid/Grid.h> | ||||||
| #include <Grid/lattice/PaddedCell.h> | #include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h> | ||||||
| #include <Grid/stencil/GeneralLocalStencil.h> | #include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h> | ||||||
| //#include <Grid/algorithms/GeneralCoarsenedMatrix.h> | #include <Grid/algorithms/iterative/AdefMrhs.h> | ||||||
| #include <Grid/algorithms/iterative/AdefGeneric.h> |  | ||||||
|  |  | ||||||
| using namespace std; | using namespace std; | ||||||
| using namespace Grid; | using namespace Grid; | ||||||
|  |  | ||||||
| template<class Coarsened> |  | ||||||
| void SaveOperator(Coarsened &Operator,std::string file) |  | ||||||
| { |  | ||||||
| #ifdef HAVE_LIME |  | ||||||
|   emptyUserRecord record; |  | ||||||
|   ScidacWriter WR(Operator.Grid()->IsBoss()); |  | ||||||
|   assert(Operator._A.size()==Operator.geom.npoint); |  | ||||||
|   WR.open(file); |  | ||||||
|   for(int p=0;p<Operator._A.size();p++){ |  | ||||||
|     auto tmp = Operator.Cell.Extract(Operator._A[p]); |  | ||||||
|     WR.writeScidacFieldRecord(tmp,record); |  | ||||||
|   } |  | ||||||
|   WR.close(); |  | ||||||
| #endif |  | ||||||
| } |  | ||||||
| template<class Coarsened> |  | ||||||
| void LoadOperator(Coarsened &Operator,std::string file) |  | ||||||
| { |  | ||||||
| #ifdef HAVE_LIME |  | ||||||
|   emptyUserRecord record; |  | ||||||
|   Grid::ScidacReader RD ; |  | ||||||
|   RD.open(file); |  | ||||||
|   assert(Operator._A.size()==Operator.geom.npoint); |  | ||||||
|   for(int p=0;p<Operator.geom.npoint;p++){ |  | ||||||
|     conformable(Operator._A[p].Grid(),Operator.CoarseGrid()); |  | ||||||
|     RD.readScidacFieldRecord(Operator._A[p],record); |  | ||||||
|   }     |  | ||||||
|   RD.close(); |  | ||||||
|   Operator.ExchangeCoarseLinks(); |  | ||||||
| #endif |  | ||||||
| } |  | ||||||
| template<class aggregation> |  | ||||||
| void SaveBasis(aggregation &Agg,std::string file) |  | ||||||
| { |  | ||||||
| #ifdef HAVE_LIME |  | ||||||
|   emptyUserRecord record; |  | ||||||
|   ScidacWriter WR(Agg.FineGrid->IsBoss()); |  | ||||||
|   WR.open(file); |  | ||||||
|   for(int b=0;b<Agg.subspace.size();b++){ |  | ||||||
|     WR.writeScidacFieldRecord(Agg.subspace[b],record); |  | ||||||
|   } |  | ||||||
|   WR.close(); |  | ||||||
| #endif |  | ||||||
| } |  | ||||||
| template<class aggregation> |  | ||||||
| void LoadBasis(aggregation &Agg, std::string file) |  | ||||||
| { |  | ||||||
| #ifdef HAVE_LIME |  | ||||||
|   emptyUserRecord record; |  | ||||||
|   ScidacReader RD ; |  | ||||||
|   RD.open(file); |  | ||||||
|   for(int b=0;b<Agg.subspace.size();b++){ |  | ||||||
|     RD.readScidacFieldRecord(Agg.subspace[b],record); |  | ||||||
|   }     |  | ||||||
|   RD.close(); |  | ||||||
| #endif |  | ||||||
| } |  | ||||||
|  |  | ||||||
|  |  | ||||||
| template<class Field> class TestSolver : public LinearFunction<Field> { |  | ||||||
| public: |  | ||||||
|   TestSolver() {}; |  | ||||||
|   void operator() (const Field &in, Field &out){    out = Zero();  }      |  | ||||||
| }; |  | ||||||
|  |  | ||||||
|  |  | ||||||
| RealD InverseApproximation(RealD x){ |  | ||||||
|   return 1.0/x; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| // Want Op in CoarsenOp to call MatPcDagMatPc | // Want Op in CoarsenOp to call MatPcDagMatPc | ||||||
| template<class Field> | template<class Field> | ||||||
| class HermOpAdaptor : public LinearOperatorBase<Field> | class HermOpAdaptor : public LinearOperatorBase<Field> | ||||||
| @@ -119,33 +48,37 @@ public: | |||||||
|   void OpDirAll  (const Field &in, std::vector<Field> &out)  {    assert(0);  }; |   void OpDirAll  (const Field &in, std::vector<Field> &out)  {    assert(0);  }; | ||||||
|   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  } |   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  } | ||||||
| }; | }; | ||||||
| template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field> |  | ||||||
|  | template<class Field> class CGSmoother : public LinearFunction<Field> | ||||||
| { | { | ||||||
| public: | public: | ||||||
|   using LinearFunction<Field>::operator(); |   using LinearFunction<Field>::operator(); | ||||||
|   typedef LinearOperatorBase<Field> FineOperator; |   typedef LinearOperatorBase<Field> FineOperator; | ||||||
|   FineOperator   & _SmootherOperator; |   FineOperator   & _SmootherOperator; | ||||||
|   Chebyshev<Field> Cheby; |   int iters; | ||||||
|   ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator) : |   CGSmoother(int _iters, FineOperator &SmootherOperator) : | ||||||
|     _SmootherOperator(SmootherOperator), |     _SmootherOperator(SmootherOperator), | ||||||
|     Cheby(_lo,_hi,_ord,InverseApproximation) |     iters(_iters) | ||||||
|   { |   { | ||||||
|     std::cout << GridLogMessage<<" Chebyshev smoother order "<<_ord<<" ["<<_lo<<","<<_hi<<"]"<<std::endl; |     std::cout << GridLogMessage<<" Mirs smoother order "<<iters<<std::endl; | ||||||
|   }; |   }; | ||||||
|   void operator() (const Field &in, Field &out)  |   void operator() (const Field &in, Field &out)  | ||||||
|   { |   { | ||||||
|     Field tmp(in.Grid()); |     ConjugateGradient<Field>  CG(0.0,iters,false); // non-convergence is fine in a smoother | ||||||
|     tmp = in; |  | ||||||
|     Cheby(_SmootherOperator,tmp,out); |     out=Zero(); | ||||||
|  |  | ||||||
|  |     CG(_SmootherOperator,in,out); | ||||||
|   } |   } | ||||||
| }; | }; | ||||||
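Editor's note: the replacement smoother is a fixed-cost inner CG: with tolerance 0.0 and the convergence check disabled, it always runs exactly iters iterations. Later in this patch it is applied to a shifted fine operator; the wiring, for reference (the shift value mirrors the lo used below):

    RealD MirsShift = 2.0;   // plays the role of the old Chebyshev window's low edge
    ShiftedHermOpLinearOperator<LatticeFermionD> ShiftedFineHermOp(HermOpEO,MirsShift);
    CGSmoother<LatticeFermionD> CGsmooth(7,ShiftedFineHermOp);  // 7 CG iterations per application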
|  |  | ||||||
|  |  | ||||||
| int main (int argc, char ** argv) | int main (int argc, char ** argv) | ||||||
| { | { | ||||||
|   Grid_init(&argc,&argv); |   Grid_init(&argc,&argv); | ||||||
|  |  | ||||||
|   const int Ls=24; |   const int Ls=24; | ||||||
|   const int nbasis = 40; |   const int nbasis = 60; | ||||||
|   const int cb = 0 ; |   const int cb = 0 ; | ||||||
|   RealD mass=0.00078; |   RealD mass=0.00078; | ||||||
|   RealD M5=1.8; |   RealD M5=1.8; | ||||||
| @@ -160,10 +93,12 @@ int main (int argc, char ** argv) | |||||||
|   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); |   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); | ||||||
|  |  | ||||||
|   // Construct a coarsened grid with 4^4 cell |   // Construct a coarsened grid with 4^4 cell | ||||||
|  |   Coordinate Block({4,4,4,4}); | ||||||
|   Coordinate clatt = GridDefaultLatt(); |   Coordinate clatt = GridDefaultLatt(); | ||||||
|   for(int d=0;d<clatt.size();d++){ |   for(int d=0;d<clatt.size();d++){ | ||||||
|     clatt[d] = clatt[d]/4; |     clatt[d] = clatt[d]/Block[d]; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, |   GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, | ||||||
| 							    GridDefaultSimd(Nd,vComplex::Nsimd()), | 							    GridDefaultSimd(Nd,vComplex::Nsimd()), | ||||||
| 							    GridDefaultMpi());; | 							    GridDefaultMpi());; | ||||||
| @@ -182,7 +117,7 @@ int main (int argc, char ** argv) | |||||||
|   LatticeGaugeField Umu(UGrid); |   LatticeGaugeField Umu(UGrid); | ||||||
|  |  | ||||||
|   FieldMetaData header; |   FieldMetaData header; | ||||||
|   std::string file("ckpoint_lat.4000"); |   std::string file("ckpoint_EODWF_lat.125"); | ||||||
|   NerscIO::readConfiguration(Umu,header,file); |   NerscIO::readConfiguration(Umu,header,file); | ||||||
|  |  | ||||||
|   //////////////////////// Fermion action ////////////////////////////////// |   //////////////////////// Fermion action ////////////////////////////////// | ||||||
| @@ -192,15 +127,7 @@ int main (int argc, char ** argv) | |||||||
|  |  | ||||||
|   typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix; |   typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix; | ||||||
|   HermFineMatrix FineHermOp(HermOpEO); |   HermFineMatrix FineHermOp(HermOpEO); | ||||||
|    |  | ||||||
|   LatticeFermion result(FrbGrid); result=Zero(); |  | ||||||
|  |  | ||||||
|   LatticeFermion    src(FrbGrid); random(RNG5,src); |  | ||||||
|  |  | ||||||
|   // Run power method on FineHermOp |  | ||||||
|   PowerMethod<LatticeFermion>       PM;   PM(HermOpEO,src); |  | ||||||
|  |  | ||||||
|   |  | ||||||
|   //////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////// | ||||||
|   ///////////// Coarse basis and Little Dirac Operator /////// |   ///////////// Coarse basis and Little Dirac Operator /////// | ||||||
|   //////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////// | ||||||
| @@ -208,219 +135,170 @@ int main (int argc, char ** argv) | |||||||
|   typedef LittleDiracOperator::CoarseVector CoarseVector; |   typedef LittleDiracOperator::CoarseVector CoarseVector; | ||||||
|  |  | ||||||
|   NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d); |   NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d); | ||||||
|   NearestStencilGeometry5D geom_nn(Coarse5d); |  | ||||||
|    |  | ||||||
|   // Warning: This routine calls PVdagM.Op, not PVdagM.HermOp |  | ||||||
|   typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace; |   typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace; | ||||||
|   Subspace Aggregates(Coarse5d,FrbGrid,cb); |   Subspace Aggregates(Coarse5d,FrbGrid,cb); | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////// | ||||||
|   // Need to check about red-black grid coarsening |   // Need to check about red-black grid coarsening | ||||||
|   //////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////// | ||||||
|   LittleDiracOperator LittleDiracOp(geom,FrbGrid,Coarse5d); |  | ||||||
|  |  | ||||||
|   bool load=false; |   int refine=1; | ||||||
|   if ( load ) { |     //    Aggregates.CreateSubspaceMultishift(RNG5,HermOpEO, | ||||||
|     LoadBasis(Aggregates,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.scidac"); |     //    					0.0003,1.0e-5,2000); // Lo, tol, maxit | ||||||
|     LoadOperator(LittleDiracOp,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.scidac"); |     //    Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,95.,0.01,1500);// <== last run | ||||||
|   } else { |   std::cout << "**************************************"<<std::endl; | ||||||
|     Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis, |   std::cout << "Create Subspace"<<std::endl; | ||||||
| 				       95.0,0.1, |   std::cout << "**************************************"<<std::endl; | ||||||
| 				       //				     400,200,200 -- 48 iters |   Aggregates.CreateSubspaceChebyshevNew(RNG5,HermOpEO,95.);  | ||||||
| 				       //				     600,200,200 -- 38 iters, 162s |  | ||||||
| 				       //				     600,200,100 -- 38 iters, 169s |  | ||||||
| 				       //				     600,200,50  -- 88 iters. 370s  |  | ||||||
| 				       800, |  | ||||||
| 				       200, |  | ||||||
| 				       100, |  | ||||||
| 				       0.0); |  | ||||||
|     LittleDiracOp.CoarsenOperator(FineHermOp,Aggregates); |  | ||||||
|     SaveBasis(Aggregates,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.scidac"); |  | ||||||
|     SaveOperator(LittleDiracOp,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.scidac"); |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   // Try projecting to one hop only |  | ||||||
|   LittleDiracOperator LittleDiracOpProj(geom_nn,FrbGrid,Coarse5d); |  | ||||||
|   LittleDiracOpProj.ProjectNearestNeighbour(0.01,LittleDiracOp); // smaller shift 0.02? n |  | ||||||
|  |  | ||||||
|   typedef HermitianLinearOperator<LittleDiracOperator,CoarseVector> HermMatrix; |   std::cout << "**************************************"<<std::endl; | ||||||
|   HermMatrix CoarseOp     (LittleDiracOp); |   std::cout << "Refine Subspace"<<std::endl; | ||||||
|   HermMatrix CoarseOpProj (LittleDiracOpProj); |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   Aggregates.RefineSubspace(HermOpEO,0.001,1.0e-3,3000); // 172 iters | ||||||
|    |    | ||||||
|   ////////////////////////////////////////// |   std::cout << "**************************************"<<std::endl; | ||||||
|   // Build a coarse lanczos |   std::cout << "Coarsen after refine"<<std::endl; | ||||||
|   ////////////////////////////////////////// |   std::cout << "**************************************"<<std::endl; | ||||||
|   Chebyshev<CoarseVector>      IRLCheby(0.2,40.0,71);  // 1 iter |   Aggregates.Orthogonalise(); | ||||||
|   FunctionHermOp<CoarseVector> IRLOpCheby(IRLCheby,CoarseOp); |  | ||||||
|   PlainHermOp<CoarseVector>    IRLOp    (CoarseOp); |   std::cout << "**************************************"<<std::endl; | ||||||
|   int Nk=48; |   std::cout << "Building MultiRHS Coarse operator"<<std::endl; | ||||||
|   int Nm=64; |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   ConjugateGradient<CoarseVector>  coarseCG(4.0e-2,20000,true); | ||||||
|  |      | ||||||
|  |   const int nrhs=12; | ||||||
|  |      | ||||||
|  |   Coordinate mpi=GridDefaultMpi(); | ||||||
|  |   Coordinate rhMpi ({1,1,mpi[0],mpi[1],mpi[2],mpi[3]}); | ||||||
|  |   Coordinate rhLatt({nrhs,1,clatt[0],clatt[1],clatt[2],clatt[3]}); | ||||||
|  |   Coordinate rhSimd({vComplex::Nsimd(),1, 1,1,1,1}); | ||||||
|  |      | ||||||
|  |   GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi);  | ||||||
|  |   typedef MultiGeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> MultiGeneralCoarsenedMatrix_t; | ||||||
|  |   MultiGeneralCoarsenedMatrix_t mrhs(geom,CoarseMrhs); | ||||||
|  |  | ||||||
|  |   mrhs.CoarsenOperator(FineHermOp,Aggregates,Coarse5d); | ||||||
|  |    | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   std::cout << "         Coarse Lanczos               "<<std::endl; | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |  | ||||||
|  |   typedef HermitianLinearOperator<MultiGeneralCoarsenedMatrix_t,CoarseVector> MrhsHermMatrix; | ||||||
|  |   Chebyshev<CoarseVector>      IRLCheby(0.01,42.0,301);  // 1 iter | ||||||
|  |   MrhsHermMatrix MrhsCoarseOp     (mrhs); | ||||||
|  |  | ||||||
|  |   CoarseVector pm_src(CoarseMrhs); | ||||||
|  |   pm_src = ComplexD(1.0); | ||||||
|  |   PowerMethod<CoarseVector>       cPM; | ||||||
|  |   cPM(MrhsCoarseOp,pm_src); | ||||||
|  |  | ||||||
|  |   int Nk=192; | ||||||
|  |   int Nm=384; | ||||||
|   int Nstop=Nk; |   int Nstop=Nk; | ||||||
|   ImplicitlyRestartedLanczos<CoarseVector> IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-5,20); |   int Nconv_test_interval=1; | ||||||
|  |    | ||||||
|  |   ImplicitlyRestartedBlockLanczosCoarse<CoarseVector> IRL(MrhsCoarseOp, | ||||||
|  | 							  Coarse5d, | ||||||
|  | 							  CoarseMrhs, | ||||||
|  | 							  nrhs, | ||||||
|  | 							  IRLCheby, | ||||||
|  | 							  Nstop, | ||||||
|  | 							  Nconv_test_interval, | ||||||
|  | 							  nrhs, | ||||||
|  | 							  Nk, | ||||||
|  | 							  Nm, | ||||||
|  | 							  1e-5,10); | ||||||
|  |  | ||||||
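Editor's annotation of the constructor call above; the parameter roles are inferred from the call site rather than from class documentation:

    // ImplicitlyRestartedBlockLanczosCoarse(Op,          multi-RHS coarse operator
    //                                       Coarse5d,    single-RHS coarse grid
    //                                       CoarseMrhs,  packed multi-RHS grid
    //                                       nrhs,        block width
    //                                       IRLCheby,    spectral filter
    //                                       Nstop, Nconv_test_interval,
    //                                       nrhs, Nk, Nm,
    //                                       tol=1e-5, MaxRestarts=10)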
|   int Nconv; |   int Nconv; | ||||||
|   std::vector<RealD>            eval(Nm); |   std::vector<RealD>            eval(Nm); | ||||||
|   std::vector<CoarseVector>     evec(Nm,Coarse5d); |   std::vector<CoarseVector>     evec(Nm,Coarse5d); | ||||||
|   CoarseVector c_src(Coarse5d); |   std::vector<CoarseVector>     c_src(nrhs,Coarse5d); | ||||||
|   //c_src=1.0; |  | ||||||
|   random(CRNG,c_src); |  | ||||||
|  |  | ||||||
|   CoarseVector c_res(Coarse5d);  |  | ||||||
|   CoarseVector c_ref(Coarse5d);  |  | ||||||
|  |  | ||||||
|   PowerMethod<CoarseVector>       cPM;   cPM(CoarseOp,c_src); |  | ||||||
|  |  | ||||||
|   IRL.calc(eval,evec,c_src,Nconv); |  | ||||||
|   DeflatedGuesser<CoarseVector> DeflCoarseGuesser(evec,eval); |  | ||||||
|  |  | ||||||
|   ////////////////////////////////////////// |   ////////////////////////////////////////// | ||||||
|   // Build a coarse space solver |   // Block projector for coarse/fine | ||||||
|   ////////////////////////////////////////// |   ////////////////////////////////////////// | ||||||
|   int maxit=20000; |  | ||||||
|   ConjugateGradient<CoarseVector>  CG(1.0e-8,maxit,false); |  | ||||||
|   ConjugateGradient<LatticeFermionD>  CGfine(1.0e-8,10000,false); |  | ||||||
|   ZeroGuesser<CoarseVector> CoarseZeroGuesser; |  | ||||||
|  |  | ||||||
|   //  HPDSolver<CoarseVector> HPDSolve(CoarseOp,CG,CoarseZeroGuesser); |   std::cout << "**************************************"<<std::endl; | ||||||
|   HPDSolver<CoarseVector> HPDSolve(CoarseOp,CG,DeflCoarseGuesser); |   std::cout << "Calling mRHS HDCG"<<std::endl; | ||||||
|   c_res=Zero(); |   std::cout << "**************************************"<<std::endl; | ||||||
|   HPDSolve(c_src,c_res); c_ref = c_res; |   MultiRHSBlockProject<LatticeFermionD> MrhsProjector; | ||||||
|   std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl; |   MrhsProjector.Allocate(nbasis,FrbGrid,Coarse5d); | ||||||
|   std::cout << GridLogMessage<<"ref norm "<<norm2(c_ref)<<std::endl; |   MrhsProjector.ImportBasis(Aggregates.subspace); | ||||||
|   ////////////////////////////////////////////////////////////////////////// |  | ||||||
|   // Deflated (with real op EV's) solve for the projected coarse op |  | ||||||
|   // Work towards ADEF1 in the coarse space |  | ||||||
|   ////////////////////////////////////////////////////////////////////////// |  | ||||||
|   HPDSolver<CoarseVector> HPDSolveProj(CoarseOpProj,CG,DeflCoarseGuesser); |  | ||||||
|   c_res=Zero(); |  | ||||||
|   HPDSolveProj(c_src,c_res); |  | ||||||
|   std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl; |  | ||||||
|   std::cout << GridLogMessage<<"res norm "<<norm2(c_res)<<std::endl; |  | ||||||
|   c_res = c_res - c_ref; |  | ||||||
|   std::cout << "Projected solver error "<<norm2(c_res)<<std::endl; |  | ||||||
|  |  | ||||||
|   ////////////////////////////////////////////////////////////////////// |   std::cout << "**************************************"<<std::endl; | ||||||
|   // Coarse ADEF1 with deflation space |   std::cout << " Recompute coarse evecs  "<<std::endl; | ||||||
|   ////////////////////////////////////////////////////////////////////// |   std::cout << "**************************************"<<std::endl; | ||||||
|   ChebyshevSmoother<CoarseVector,HermMatrix > |   evec.resize(Nm,Coarse5d); | ||||||
|     CoarseSmoother(1.0,37.,8,CoarseOpProj);  // just go to sloppy 0.1 convergence |   eval.resize(Nm); | ||||||
|     //  CoarseSmoother(0.1,37.,8,CoarseOpProj);  // |   for(int r=0;r<nrhs;r++){ | ||||||
|   //  CoarseSmoother(0.5,37.,6,CoarseOpProj);  //  8 iter 0.36s |     random(CRNG,c_src[r]); | ||||||
|   //    CoarseSmoother(0.5,37.,12,CoarseOpProj);  // 8 iter, 0.55s |  | ||||||
|   //    CoarseSmoother(0.5,37.,8,CoarseOpProj);// 7-9 iter |  | ||||||
|   //  CoarseSmoother(1.0,37.,8,CoarseOpProj); // 0.4 - 0.5s solve to 0.04, 7-9 iter |  | ||||||
|   //  ChebyshevSmoother<CoarseVector,HermMatrix > CoarseSmoother(0.5,36.,10,CoarseOpProj);  // 311 |  | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////// |  | ||||||
|   // CG, Cheby mode spacing 200,200 |  | ||||||
|   // Unprojected Coarse CG solve to 1e-8 : 190 iters, 4.9s |  | ||||||
|   // Unprojected Coarse CG solve to 4e-2 :  33 iters, 0.8s |  | ||||||
|   // Projected Coarse CG solve to 1e-8 : 100 iters, 0.36s |  | ||||||
|   //////////////////////////////////////////////////////// |  | ||||||
|   // CoarseSmoother(1.0,48.,8,CoarseOpProj); 48 evecs  |  | ||||||
|   //////////////////////////////////////////////////////// |  | ||||||
|   // ADEF1 Coarse solve to 1e-8 : 44 iters, 2.34s  2.1x gain |  | ||||||
|   // ADEF1 Coarse solve to 4e-2 : 7 iters, 0.4s |  | ||||||
|   // HDCG 38 iters 162s |  | ||||||
|   // |  | ||||||
|   // CoarseSmoother(1.0,40.,8,CoarseOpProj); 48 evecs  |  | ||||||
|   // ADEF1 Coarse solve to 1e-8 : 37 iters, 2.0s  2.1x gain |  | ||||||
|   // ADEF1 Coarse solve to 4e-2 : 6 iters, 0.36s |  | ||||||
|   // HDCG 38 iters 169s |  | ||||||
|  |  | ||||||
|   TwoLevelADEF1defl<CoarseVector> |  | ||||||
|     cADEF1(1.0e-8, 500, |  | ||||||
| 	   CoarseOp, |  | ||||||
| 	   CoarseSmoother, |  | ||||||
| 	   evec,eval); |  | ||||||
|  |  | ||||||
|   c_res=Zero(); |  | ||||||
|   cADEF1(c_src,c_res); |  | ||||||
|   std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl; |  | ||||||
|   std::cout << GridLogMessage<<"cADEF1 res norm "<<norm2(c_res)<<std::endl; |  | ||||||
|   c_res = c_res - c_ref; |  | ||||||
|   std::cout << "cADEF1 solver error "<<norm2(c_res)<<std::endl; |  | ||||||
|    |  | ||||||
|   //  cADEF1.Tolerance = 4.0e-2; |  | ||||||
|   //  cADEF1.Tolerance = 1.0e-1; |  | ||||||
|   cADEF1.Tolerance = 5.0e-2; |  | ||||||
|   c_res=Zero(); |  | ||||||
|   cADEF1(c_src,c_res); |  | ||||||
|   std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl; |  | ||||||
|   std::cout << GridLogMessage<<"cADEF1 res norm "<<norm2(c_res)<<std::endl; |  | ||||||
|   c_res = c_res - c_ref; |  | ||||||
|   std::cout << "cADEF1 solver error "<<norm2(c_res)<<std::endl; |  | ||||||
|    |  | ||||||
|   ////////////////////////////////////////// |  | ||||||
|   // Build a smoother |  | ||||||
|   ////////////////////////////////////////// |  | ||||||
|   //  ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(10.0,100.0,10,FineHermOp); //499 |  | ||||||
|   //  ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(3.0,100.0,10,FineHermOp);  //383 |  | ||||||
|   //  ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(1.0,100.0,10,FineHermOp);  //328 |  | ||||||
|   //  std::vector<RealD> los({0.5,1.0,3.0}); // 147/142/146 nbasis 1 |  | ||||||
|   //  std::vector<RealD> los({1.0,2.0}); // Nbasis 24: 88,86 iterations |  | ||||||
|   //  std::vector<RealD> los({2.0,4.0}); // Nbasis 32 == 52, iters |  | ||||||
|   //  std::vector<RealD> los({2.0,4.0}); // Nbasis 40 == 36,36 iters |  | ||||||
|  |  | ||||||
|   // |  | ||||||
|   // Turns approx 2700 iterations into 340 fine multiplies with Nbasis 40 |  | ||||||
|   // Need to measure cost of coarse space. |  | ||||||
|   // |  | ||||||
|   // -- i) Reduce coarse residual   -- 0.04 |  | ||||||
|   // -- ii) Lanczos on coarse space -- done |  | ||||||
|   // -- iii) Possible 1 hop project and/or preconditioning it - easy - PrecCG it and |  | ||||||
|   //         use a limited stencil. Reread BFM code to check on evecs / deflation strategy with prec |  | ||||||
|   // |  | ||||||
|   std::vector<RealD> los({3.0}); // Nbasis 40 == 36,36 iters |  | ||||||
|  |  | ||||||
|   //  std::vector<int> ords({7,8,10}); // Nbasis 40 == 40,38,36 iters (320,342,396 mults) |  | ||||||
|   std::vector<int> ords({7}); // Nbasis 40 == 40 iters (320 mults)   |  | ||||||
|  |  | ||||||
|   for(int l=0;l<los.size();l++){ |  | ||||||
|  |  | ||||||
|     RealD lo = los[l]; |  | ||||||
|  |  | ||||||
|     for(int o=0;o<ords.size();o++){ |  | ||||||
|  |  | ||||||
|       ConjugateGradient<CoarseVector>  CGsloppy(4.0e-2,maxit,false); |  | ||||||
|       HPDSolver<CoarseVector> HPDSolveSloppy(CoarseOp,CGsloppy,DeflCoarseGuesser); |  | ||||||
|        |  | ||||||
|       //    ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(lo,92,10,FineHermOp); // 36 best case |  | ||||||
|       ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(lo,92,ords[o],FineHermOp);  // 311 |  | ||||||
|  |  | ||||||
|       ////////////////////////////////////////// |  | ||||||
|       // Build a HDCG solver |  | ||||||
|       ////////////////////////////////////////// |  | ||||||
|       TwoLevelADEF2<LatticeFermion,CoarseVector,Subspace> |  | ||||||
| 	HDCG(1.0e-8, 100, |  | ||||||
| 	     FineHermOp, |  | ||||||
| 	     Smoother, |  | ||||||
| 	     HPDSolveSloppy, |  | ||||||
| 	     HPDSolve, |  | ||||||
| 	     Aggregates); |  | ||||||
|  |  | ||||||
|       TwoLevelADEF2<LatticeFermion,CoarseVector,Subspace> |  | ||||||
| 	HDCGdefl(1.0e-8, 100, |  | ||||||
| 		 FineHermOp, |  | ||||||
| 		 Smoother, |  | ||||||
| 		 cADEF1, |  | ||||||
| 		 HPDSolve, |  | ||||||
| 		 Aggregates); |  | ||||||
|        |  | ||||||
|       result=Zero(); |  | ||||||
|       HDCGdefl(src,result); |  | ||||||
|  |  | ||||||
|       result=Zero(); |  | ||||||
|       HDCG(src,result); |  | ||||||
|  |  | ||||||
|        |  | ||||||
|     } |  | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   // Standard CG |   IRL.calc(eval,evec,c_src,Nconv,LanczosType::irbl); | ||||||
|   result=Zero(); |  | ||||||
|   CGfine(HermOpEO, src, result); |   /////////////////////// | ||||||
|  |   // Deflation guesser object | ||||||
|  |   /////////////////////// | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   std::cout << " Reimport coarse evecs  "<<std::endl; | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   MultiRHSDeflation<CoarseVector> MrhsGuesser; | ||||||
|  |   MrhsGuesser.ImportEigenBasis(evec,eval); | ||||||
|  |        | ||||||
|  |   ////////////////////////// | ||||||
|  |   // Extra HDCG parameters | ||||||
|  |   ////////////////////////// | ||||||
|  |   int maxit=3000; | ||||||
|  |   ConjugateGradient<CoarseVector>  CG(5.0e-2,maxit,false); | ||||||
|  |   RealD lo=2.0; | ||||||
|  |   int ord = 7; | ||||||
|  |  | ||||||
|  |   DoNothingGuesser<CoarseVector> DoNothing; | ||||||
|  |   HPDSolver<CoarseVector> HPDSolveMrhs(MrhsCoarseOp,CG,DoNothing); | ||||||
|  |  | ||||||
|  |   ///////////////////////////////////////////////// | ||||||
|  |   // Mirs smoother | ||||||
|  |   ///////////////////////////////////////////////// | ||||||
|  |   RealD MirsShift = lo; | ||||||
|  |   ShiftedHermOpLinearOperator<LatticeFermionD> ShiftedFineHermOp(HermOpEO,MirsShift); | ||||||
|  |   CGSmoother<LatticeFermionD> CGsmooth(ord,ShiftedFineHermOp) ; | ||||||
|  |  | ||||||
|  |   TwoLevelADEF2mrhs<LatticeFermion,CoarseVector> | ||||||
|  |     HDCGmrhs(1.0e-8, 500, | ||||||
|  | 	     FineHermOp, | ||||||
|  | 	     CGsmooth, | ||||||
|  | 	     HPDSolveMrhs,    // Used in M1 | ||||||
|  | 	     HPDSolveMrhs,          // Used in Vstart | ||||||
|  | 	     MrhsProjector, | ||||||
|  | 	     MrhsGuesser, | ||||||
|  | 	     CoarseMrhs); | ||||||
|  |      | ||||||
|  |   std::vector<LatticeFermionD> src_mrhs(nrhs,FrbGrid); | ||||||
|  |   std::vector<LatticeFermionD> res_mrhs(nrhs,FrbGrid); | ||||||
|    |    | ||||||
|  |   for(int r=0;r<nrhs;r++){ | ||||||
|  |     random(RNG5,src_mrhs[r]); | ||||||
|  |     res_mrhs[r]=Zero(); | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   HDCGmrhs(src_mrhs,res_mrhs); | ||||||
|  |  | ||||||
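Editor's note: a hedged post-solve check that could be appended here (not in the patch) applies the fine operator to each returned solution and reports the true residual:

    for(int r=0;r<nrhs;r++){
      LatticeFermionD Ax(FrbGrid);
      FineHermOp.HermOp(res_mrhs[r],Ax);   // A x
      Ax = Ax - src_mrhs[r];               // A x - b
      std::cout << GridLogMessage << " rhs "<<r<<" true residual "
                << std::sqrt(norm2(Ax)/norm2(src_mrhs[r])) << std::endl;
    }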
|  |   // Standard CG | ||||||
|  | #if 1 | ||||||
|  |   { | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   std::cout << "Calling red black CG"<<std::endl; | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |        | ||||||
|  |     LatticeFermion result(FrbGrid); result=Zero(); | ||||||
|  |     LatticeFermion    src(FrbGrid); random(RNG5,src); | ||||||
|  |     result=Zero(); | ||||||
|  |  | ||||||
|  |     ConjugateGradient<LatticeFermionD>  CGfine(1.0e-8,30000,false); | ||||||
|  |     CGfine(HermOpEO, src, result); | ||||||
|  |   } | ||||||
|  | #endif   | ||||||
|   Grid_finalize(); |   Grid_finalize(); | ||||||
|   return 0; |   return 0; | ||||||
| } | } | ||||||
|   | |||||||
| @@ -145,7 +145,7 @@ int main (int argc, char ** argv) | |||||||
|   Grid_init(&argc,&argv); |   Grid_init(&argc,&argv); | ||||||
|  |  | ||||||
|   const int Ls=24; |   const int Ls=24; | ||||||
|   const int nbasis = 60; |   const int nbasis = 62; | ||||||
|   const int cb = 0 ; |   const int cb = 0 ; | ||||||
|   RealD mass=0.00078; |   RealD mass=0.00078; | ||||||
|   RealD M5=1.8; |   RealD M5=1.8; | ||||||
| @@ -160,7 +160,7 @@ int main (int argc, char ** argv) | |||||||
|   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); |   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); | ||||||
|  |  | ||||||
|   // Construct a coarsened grid with 4^4 cell |   // Construct a coarsened grid with 4^4 cell | ||||||
|   Coordinate Block({4,4,4,4}); |   Coordinate Block({4,4,6,4}); | ||||||
|   Coordinate clatt = GridDefaultLatt(); |   Coordinate clatt = GridDefaultLatt(); | ||||||
|   for(int d=0;d<clatt.size();d++){ |   for(int d=0;d<clatt.size();d++){ | ||||||
|     clatt[d] = clatt[d]/Block[d]; |     clatt[d] = clatt[d]/Block[d]; | ||||||
|   | |||||||
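Editor's note: the hunk above swaps the blocking from {4,4,4,4} to {4,4,6,4}; the coarse extent in each direction is the fine extent divided by the block. A worked example on a hypothetical 48.48.48.96 global lattice:

    // clatt[d] = GridDefaultLatt()[d] / Block[d]
    // fine {48,48,48,96} / Block {4,4,6,4}  ->  coarse {12,12,8,24}
    // each coarse site aggregates a 4x4x6x4 cell of fine sites,
    // with nbasis=62 vectors spanning the near-null space per cell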
396  tests/debug/Test_general_coarse_hdcg_phys96_mixed.cc  Normal file
| @@ -0,0 +1,396 @@ | |||||||
|  | /************************************************************************************* | ||||||
|  |  | ||||||
|  |     Grid physics library, www.github.com/paboyle/Grid  | ||||||
|  |  | ||||||
|  |     Source file: ./tests/Test_general_coarse_hdcg.cc | ||||||
|  |  | ||||||
|  |     Copyright (C) 2023 | ||||||
|  |  | ||||||
|  | Author: Peter Boyle <pboyle@bnl.gov> | ||||||
|  |  | ||||||
|  |     This program is free software; you can redistribute it and/or modify | ||||||
|  |     it under the terms of the GNU General Public License as published by | ||||||
|  |     the Free Software Foundation; either version 2 of the License, or | ||||||
|  |     (at your option) any later version. | ||||||
|  |  | ||||||
|  |     This program is distributed in the hope that it will be useful, | ||||||
|  |     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  |     GNU General Public License for more details. | ||||||
|  |  | ||||||
|  |     You should have received a copy of the GNU General Public License along | ||||||
|  |     with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  |  | ||||||
|  |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|  |     *************************************************************************************/ | ||||||
|  |     /*  END LEGAL */ | ||||||
|  | #include <Grid/Grid.h> | ||||||
|  | #include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h> | ||||||
|  | #include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h> | ||||||
|  | #include <Grid/algorithms/iterative/AdefMrhs.h> | ||||||
|  |  | ||||||
|  | using namespace std; | ||||||
|  | using namespace Grid; | ||||||
|  |  | ||||||
|  | template<class aggregation> | ||||||
|  | void SaveBasis(aggregation &Agg,std::string file) | ||||||
|  | { | ||||||
|  | #ifdef HAVE_LIME | ||||||
|  |   emptyUserRecord record; | ||||||
|  |   ScidacWriter WR(Agg.FineGrid->IsBoss()); | ||||||
|  |   WR.open(file); | ||||||
|  |   for(int b=0;b<Agg.subspace.size();b++){ | ||||||
|  |     WR.writeScidacFieldRecord(Agg.subspace[b],record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC); | ||||||
|  |     //    WR.writeScidacFieldRecord(Agg.subspace[b],record); | ||||||
|  |   } | ||||||
|  |   WR.close(); | ||||||
|  | #endif | ||||||
|  | } | ||||||
|  | template<class aggregation> | ||||||
|  | void LoadBasis(aggregation &Agg, std::string file) | ||||||
|  | { | ||||||
|  | #ifdef HAVE_LIME | ||||||
|  |   emptyUserRecord record; | ||||||
|  |   ScidacReader RD ; | ||||||
|  |   RD.open(file); | ||||||
|  |   for(int b=0;b<Agg.subspace.size();b++){ | ||||||
|  |     RD.readScidacFieldRecord(Agg.subspace[b],record,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC); | ||||||
|  |     //    RD.readScidacFieldRecord(Agg.subspace[b],record,0); | ||||||
|  |   }     | ||||||
|  |   RD.close(); | ||||||
|  | #endif | ||||||
|  | } | ||||||
|  | template<class CoarseVector> | ||||||
|  | void SaveEigenvectors(std::vector<RealD>            &eval, | ||||||
|  | 		      std::vector<CoarseVector>     &evec, | ||||||
|  | 		      std::string evec_file, | ||||||
|  | 		      std::string eval_file) | ||||||
|  | { | ||||||
|  | #ifdef HAVE_LIME | ||||||
|  |   emptyUserRecord record; | ||||||
|  |   ScidacWriter WR(evec[0].Grid()->IsBoss()); | ||||||
|  |   WR.open(evec_file); | ||||||
|  |   for(int b=0;b<evec.size();b++){ | ||||||
|  |     WR.writeScidacFieldRecord(evec[b],record,0,0); | ||||||
|  |   } | ||||||
|  |   WR.close(); | ||||||
|  |   XmlWriter WRx(eval_file); | ||||||
|  |   write(WRx,"evals",eval); | ||||||
|  | #endif | ||||||
|  | } | ||||||
|  | template<class CoarseVector> | ||||||
|  | void LoadEigenvectors(std::vector<RealD>            &eval, | ||||||
|  | 		      std::vector<CoarseVector>     &evec, | ||||||
|  | 		      std::string evec_file, | ||||||
|  | 		      std::string eval_file) | ||||||
|  | { | ||||||
|  | #ifdef HAVE_LIME | ||||||
|  |     XmlReader RDx(eval_file); | ||||||
|  |     read(RDx,"evals",eval); | ||||||
|  |     emptyUserRecord record; | ||||||
|  |  | ||||||
|  |     Grid::ScidacReader RD ; | ||||||
|  |     RD.open(evec_file); | ||||||
|  |     assert(evec.size()==eval.size()); | ||||||
|  |     for(int k=0;k<eval.size();k++) { | ||||||
|  |       RD.readScidacFieldRecord(evec[k],record); | ||||||
|  |     } | ||||||
|  |     RD.close(); | ||||||
|  | #endif | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // Want Op in CoarsenOp to call MatPcDagMatPc | ||||||
|  | template<class Field> | ||||||
|  | class HermOpAdaptor : public LinearOperatorBase<Field> | ||||||
|  | { | ||||||
|  |   LinearOperatorBase<Field> & wrapped; | ||||||
|  | public: | ||||||
|  |   HermOpAdaptor(LinearOperatorBase<Field> &wrapme) : wrapped(wrapme)  {}; | ||||||
|  |   void Op     (const Field &in, Field &out)   { wrapped.HermOp(in,out);  } | ||||||
|  |   void HermOp(const Field &in, Field &out)    { wrapped.HermOp(in,out); } | ||||||
|  |   void AdjOp     (const Field &in, Field &out){ wrapped.HermOp(in,out);  } | ||||||
|  |   void OpDiag (const Field &in, Field &out)                  {    assert(0);  } | ||||||
|  |   void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  } | ||||||
|  |   void OpDirAll  (const Field &in, std::vector<Field> &out)  {    assert(0);  }; | ||||||
|  |   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  } | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | template<class Field> class CGSmoother : public LinearFunction<Field> | ||||||
|  | { | ||||||
|  | public: | ||||||
|  |   using LinearFunction<Field>::operator(); | ||||||
|  |   typedef LinearOperatorBase<Field> FineOperator; | ||||||
|  |   FineOperator   & _SmootherOperator; | ||||||
|  |   int iters; | ||||||
|  |   CGSmoother(int _iters, FineOperator &SmootherOperator) : | ||||||
|  |     _SmootherOperator(SmootherOperator), | ||||||
|  |     iters(_iters) | ||||||
|  |   { | ||||||
|  |     std::cout << GridLogMessage<<" Mirs smoother iterations "<<iters<<std::endl; | ||||||
|  |   }; | ||||||
|  |   void operator() (const Field &in, Field &out)  | ||||||
|  |   { | ||||||
|  |     ConjugateGradient<Field>  CG(0.0,iters,false); // non-convergence is fine in a smoother | ||||||
|  |  | ||||||
|  |     out=Zero(); | ||||||
|  |  | ||||||
|  |     CG(_SmootherOperator,in,out); | ||||||
|  |   } | ||||||
|  | }; | ||||||
|  |  | ||||||
|  |  | ||||||
|  | int main (int argc, char ** argv) | ||||||
|  | { | ||||||
|  |   Grid_init(&argc,&argv); | ||||||
|  |  | ||||||
|  |   const int Ls=24; | ||||||
|  |   const int nbasis = 60; | ||||||
|  |   const int cb = 0 ; | ||||||
|  |   RealD mass=0.00078; | ||||||
|  |   RealD M5=1.8; | ||||||
|  |   RealD b=1.5; | ||||||
|  |   RealD c=0.5; | ||||||
|  |  | ||||||
|  |   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), | ||||||
|  | 								   GridDefaultSimd(Nd,vComplex::Nsimd()), | ||||||
|  | 								   GridDefaultMpi()); | ||||||
|  |   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); | ||||||
|  |   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); | ||||||
|  |   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); | ||||||
|  |  | ||||||
|  |   // Construct a coarsened grid with 4^4 cell | ||||||
|  |   //  Coordinate Block({4,4,6,4}); | ||||||
|  |   Coordinate Block({4,4,4,4}); | ||||||
|  |   Coordinate clatt = GridDefaultLatt(); | ||||||
|  |   for(int d=0;d<clatt.size();d++){ | ||||||
|  |     clatt[d] = clatt[d]/Block[d]; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, | ||||||
|  | 							    GridDefaultSimd(Nd,vComplex::Nsimd()), | ||||||
|  | 							    GridDefaultMpi());; | ||||||
|  |   GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d); | ||||||
|  |  | ||||||
|  |   ///////////////////////// RNGs ///////////////////////////////// | ||||||
|  |   std::vector<int> seeds4({1,2,3,4}); | ||||||
|  |   std::vector<int> seeds5({5,6,7,8}); | ||||||
|  |   std::vector<int> cseeds({5,6,7,8}); | ||||||
|  |  | ||||||
|  |   GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5); | ||||||
|  |   GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4); | ||||||
|  |   GridParallelRNG          CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds); | ||||||
|  |  | ||||||
|  |   ///////////////////////// Configuration ///////////////////////////////// | ||||||
|  |   LatticeGaugeField Umu(UGrid); | ||||||
|  |  | ||||||
|  |   FieldMetaData header; | ||||||
|  |   std::string file("/lustre/orion/phy157/proj-shared/phy157_dwf/lehner/ensemble-Ha/ckpoint_lat.2250"); | ||||||
|  |   NerscIO::readConfiguration(Umu,header,file); | ||||||
|  |  | ||||||
|  |   //////////////////////// Fermion action ////////////////////////////////// | ||||||
|  |   MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c); | ||||||
|  |  | ||||||
|  |   SchurDiagMooeeOperator<MobiusFermionD, LatticeFermion> HermOpEO(Ddwf); | ||||||
|  |  | ||||||
|  |   typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix; | ||||||
|  |   HermFineMatrix FineHermOp(HermOpEO); | ||||||
|  |  | ||||||
|  |   //////////////////////////////////////////////////////////// | ||||||
|  |   ///////////// Coarse basis and Little Dirac Operator /////// | ||||||
|  |   //////////////////////////////////////////////////////////// | ||||||
|  |   typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator; | ||||||
|  |   typedef LittleDiracOperator::CoarseVector CoarseVector; | ||||||
|  |  | ||||||
|  |   NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d); | ||||||
|  |  | ||||||
|  |   typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace; | ||||||
|  |   Subspace Aggregates(Coarse5d,FrbGrid,cb); | ||||||
|  |  | ||||||
|  |   //////////////////////////////////////////////////////////// | ||||||
|  |   // Need to check about red-black grid coarsening | ||||||
|  |   //////////////////////////////////////////////////////////// | ||||||
|  |  | ||||||
|  |   //  std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys96.mixed.2500.60"); | ||||||
|  |   std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Refine.phys96.mixed.2500.60"); | ||||||
|  |   std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Refine.phys96.mixed.2500.60_v2"); | ||||||
|  |   std::string ldop_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.phys96.mixed.60"); | ||||||
|  |   std::string evec_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/evecs.scidac"); | ||||||
|  |   std::string eval_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/eval.xml"); | ||||||
|  |   bool load_agg=true; | ||||||
|  |   bool load_refine=true; | ||||||
|  |   bool load_mat=false; | ||||||
|  |   bool load_evec=false; | ||||||
|  |  | ||||||
|  |   int refine=1; | ||||||
|  |   if ( load_agg ) { | ||||||
|  |     if ( !(refine) || (!load_refine) ) {  | ||||||
|  |       LoadBasis(Aggregates,subspace_file); | ||||||
|  |     } | ||||||
|  |   } else { | ||||||
|  |     Aggregates.CreateSubspaceChebyshevNew(RNG5,HermOpEO,95.);  | ||||||
|  |     SaveBasis(Aggregates,subspace_file); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   if ( load_refine ) { | ||||||
|  |     std::cout << " Load Refine "<< refine_file <<std::endl; | ||||||
|  |     LoadBasis(Aggregates,refine_file); | ||||||
|  |   } else { | ||||||
|  |     Aggregates.RefineSubspace(HermOpEO,0.001,3.0e-4,3000); // 172 iters | ||||||
|  |     //    Aggregates.RefineSubspace(HermOpEO,0.001,3.0e-4,2500); // 172 iters | ||||||
|  |     SaveBasis(Aggregates,refine_file); | ||||||
|  |   } | ||||||
|  |   Aggregates.Orthogonalise(); | ||||||
|  |    | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   std::cout << "Building MultiRHS Coarse operator"<<std::endl; | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |      | ||||||
|  |   const int nrhs=12; | ||||||
|  |      | ||||||
|  |   Coordinate mpi=GridDefaultMpi(); | ||||||
|  |   Coordinate rhMpi ({1,1,mpi[0],mpi[1],mpi[2],mpi[3]}); | ||||||
|  |   Coordinate rhLatt({nrhs,1,clatt[0],clatt[1],clatt[2],clatt[3]}); | ||||||
|  |   Coordinate rhSimd({vComplex::Nsimd(),1, 1,1,1,1}); | ||||||
|  |      | ||||||
|  |   GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi);  | ||||||
|  |   typedef MultiGeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> MultiGeneralCoarsenedMatrix_t; | ||||||
|  |   MultiGeneralCoarsenedMatrix_t mrhs(geom,CoarseMrhs); | ||||||
|  |  | ||||||
|  |   /////////////////////// | ||||||
|  |   // Deflation guesser object | ||||||
|  |   /////////////////////// | ||||||
|  |   MultiRHSDeflation<CoarseVector> MrhsGuesser; | ||||||
|  |  | ||||||
|  |   ////////////////////////////////////////// | ||||||
|  |   // Block projector for coarse/fine | ||||||
|  |   ////////////////////////////////////////// | ||||||
|  |   MultiRHSBlockProject<LatticeFermionD> MrhsProjector; | ||||||
|  |  | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   std::cout << "Coarsen after refine"<<std::endl; | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   mrhs.CoarsenOperator(FineHermOp,Aggregates,Coarse5d); | ||||||
|  |  | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   std::cout << "         Coarse Lanczos               "<<std::endl; | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |  | ||||||
|  |   typedef HermitianLinearOperator<MultiGeneralCoarsenedMatrix_t,CoarseVector> MrhsHermMatrix; | ||||||
|  |   //  Chebyshev<CoarseVector>      IRLCheby(0.0012,42.0,301);  // 4.4.6.4 | ||||||
|  |   //  Chebyshev<CoarseVector>      IRLCheby(0.0012,42.0,501);  // for 4.4.4.4 blocking 350 evs | ||||||
|  |   //  Chebyshev<CoarseVector>      IRLCheby(0.0014,42.0,501);  // for 4.4.4.4 blocking 700 evs | ||||||
|  |   //  Chebyshev<CoarseVector>      IRLCheby(0.002,42.0,501);  // for 4.4.4.4 blocking 1226 evs | ||||||
|  |   //  Chebyshev<CoarseVector>      IRLCheby(0.0025,42.0,501);  // for 4.4.4.4 blocking 1059 evs | ||||||
|  |   //  3e-4,2); | ||||||
|  |   Chebyshev<CoarseVector>      IRLCheby(0.0018,42.0,301);  // for 4.4.4.4 blocking  // 790 evs | ||||||
|  |    | ||||||
|  |   MrhsHermMatrix MrhsCoarseOp     (mrhs); | ||||||
|  |  | ||||||
|  |   CoarseVector pm_src(CoarseMrhs); | ||||||
|  |   pm_src = ComplexD(1.0); | ||||||
|  |   PowerMethod<CoarseVector>       cPM;   cPM(MrhsCoarseOp,pm_src); | ||||||
|  |  | ||||||
|  |   //  int Nk=nrhs*30; // 4.4.6.4 | ||||||
|  |   //  int Nk=nrhs*80; | ||||||
|  |   int Nk=nrhs*60; // 720 | ||||||
|  |   int Nm=Nk*4;    // 2880 ; generally finishes at 1440 | ||||||
|  |   int Nstop=512; | ||||||
|  |   int Nconv_test_interval=1; | ||||||
|  |    | ||||||
|  |   ImplicitlyRestartedBlockLanczosCoarse<CoarseVector> IRL(MrhsCoarseOp, | ||||||
|  | 							  Coarse5d, | ||||||
|  | 							  CoarseMrhs, | ||||||
|  | 							  nrhs, | ||||||
|  | 							  IRLCheby, | ||||||
|  | 							  Nstop, | ||||||
|  | 							  Nconv_test_interval, | ||||||
|  | 							  nrhs, | ||||||
|  | 							  Nk, | ||||||
|  | 							  Nm, | ||||||
|  | 							  3e-4,2); | ||||||
|  |  | ||||||
|  |   std::vector<RealD>            eval(Nm); | ||||||
|  |   std::vector<CoarseVector>     evec(Nm,Coarse5d); | ||||||
|  |   std::vector<CoarseVector>     c_src(nrhs,Coarse5d); | ||||||
|  |  | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   std::cout << " Recompute coarse evecs  "<<std::endl; | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   evec.resize(Nm,Coarse5d); | ||||||
|  |   eval.resize(Nm); | ||||||
|  |   for(int r=0;r<nrhs;r++){ | ||||||
|  |     random(CRNG,c_src[r]); | ||||||
|  |   } | ||||||
|  |   int Nconv; | ||||||
|  |   IRL.calc(eval,evec,c_src,Nconv,LanczosType::rbl); | ||||||
|  |   Nconv = eval.size(); | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   std::cout << " import coarse evecs  "<<std::endl; | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   MrhsGuesser.ImportEigenBasis(evec,eval); | ||||||
|  |  | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   std::cout << "Calling mRHS HDCG"<<std::endl; | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   MrhsProjector.Allocate(nbasis,FrbGrid,Coarse5d); | ||||||
|  |   MrhsProjector.ImportBasis(Aggregates.subspace); | ||||||
|  |  | ||||||
|  |   ////////////////////////// | ||||||
|  |   // Extra HDCG parameters | ||||||
|  |   ////////////////////////// | ||||||
|  |   int maxit=3000; | ||||||
|  |   ConjugateGradient<CoarseVector>  CG(7.5e-2,maxit,false); | ||||||
|  |   RealD lo=2.0; | ||||||
|  |   int ord = 7; | ||||||
|  |  | ||||||
|  |   DoNothingGuesser<CoarseVector> DoNothing; | ||||||
|  |   HPDSolver<CoarseVector> HPDSolveMrhs(MrhsCoarseOp,CG,DoNothing); | ||||||
|  |   HPDSolver<CoarseVector> HPDSolveMrhsRefine(MrhsCoarseOp,CG,DoNothing); | ||||||
|  |  | ||||||
|  |   ///////////////////////////////////////////////// | ||||||
|  |   // Mirs smoother | ||||||
|  |   ///////////////////////////////////////////////// | ||||||
|  |   RealD MirsShift = lo; | ||||||
|  |   ShiftedHermOpLinearOperator<LatticeFermionD> ShiftedFineHermOp(HermOpEO,MirsShift); | ||||||
|  |   CGSmoother<LatticeFermionD> CGsmooth(ord,ShiftedFineHermOp) ; | ||||||
|  |    | ||||||
|  |    | ||||||
|  |   TwoLevelADEF2mrhs<LatticeFermion,CoarseVector> | ||||||
|  |     HDCGmrhs(1.0e-8, 500, | ||||||
|  | 	     FineHermOp, | ||||||
|  | 	     CGsmooth, | ||||||
|  | 	     HPDSolveMrhs,    // Used in M1 | ||||||
|  | 	     HPDSolveMrhs,          // Used in Vstart | ||||||
|  | 	     MrhsProjector, | ||||||
|  | 	     MrhsGuesser, | ||||||
|  | 	     CoarseMrhs); | ||||||
|  |      | ||||||
|  |   std::vector<LatticeFermionD> src_mrhs(nrhs,FrbGrid); | ||||||
|  |   std::vector<LatticeFermionD> res_mrhs(nrhs,FrbGrid); | ||||||
|  |    | ||||||
|  |   for(int r=0;r<nrhs;r++){ | ||||||
|  |     random(RNG5,src_mrhs[r]); | ||||||
|  |     res_mrhs[r]=Zero(); | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   HDCGmrhs(src_mrhs,res_mrhs); | ||||||
|  |  | ||||||
|  |   // Standard CG | ||||||
|  | #if 0 | ||||||
|  |   { | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   std::cout << "Calling red black CG"<<std::endl; | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |        | ||||||
|  |     LatticeFermion result(FrbGrid); result=Zero(); | ||||||
|  |     LatticeFermion    src(FrbGrid); random(RNG5,src); | ||||||
|  |     result=Zero(); | ||||||
|  |  | ||||||
|  |     ConjugateGradient<LatticeFermionD>  CGfine(1.0e-8,30000,false); | ||||||
|  |     CGfine(HermOpEO, src, result); | ||||||
|  |   } | ||||||
|  | #endif   | ||||||
|  |   Grid_finalize(); | ||||||
|  |   return 0; | ||||||
|  | } | ||||||
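Editor's note: the file defines a load_evec flag but, as committed, always recomputes the coarse eigenpairs. A hedged sketch of how the Save/LoadEigenvectors helpers defined at the top of this file would slot in around the IRL.calc call:

    if ( load_evec ) {
      LoadEigenvectors(eval,evec,evec_file,eval_file);
      Nconv = eval.size();
    } else {
      IRL.calc(eval,evec,c_src,Nconv,LanczosType::rbl);
      SaveEigenvectors(eval,evec,evec_file,eval_file);
    }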