mirror of https://github.com/paboyle/Grid.git (synced 2025-07-15 20:46:54 +01:00)
Merge with Christoph GPT checksum debug
@@ -65,6 +65,7 @@ NAMESPACE_BEGIN(Grid);
 #endif
 
 enum GridBLASOperation_t { GridBLAS_OP_N, GridBLAS_OP_T, GridBLAS_OP_C } ;
+enum GridBLASPrecision_t { GridBLAS_PRECISION_DEFAULT, GridBLAS_PRECISION_16F, GridBLAS_PRECISION_16BF, GridBLAS_PRECISION_TF32 };
 
 class GridBLAS {
 public:
@@ -98,6 +99,20 @@ public:
     }
   }
 
+#ifdef GRID_CUDA
+  cublasComputeType_t toDataType(GridBLASPrecision_t p) {
+    switch (p) {
+    case GridBLAS_PRECISION_16F:
+      return CUBLAS_COMPUTE_32F_FAST_16F;
+    case GridBLAS_PRECISION_16BF:
+      return CUBLAS_COMPUTE_32F_FAST_16BF;
+    case GridBLAS_PRECISION_TF32:
+      return CUBLAS_COMPUTE_32F_FAST_TF32;
+    default:
+      assert(0);
+    }
+  }
+#endif
   // Force construct once
   GridBLAS() { Init(); };
   ~GridBLAS() { };
@@ -138,8 +153,10 @@ public:
 		   deviceVector<ComplexD*> &Amk,  // pointer list to matrices
 		   deviceVector<ComplexD*> &Bkn,
 		   ComplexD beta,
-		   deviceVector<ComplexD*> &Cmn)
+		   deviceVector<ComplexD*> &Cmn,
+		   GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT)
   {
+    assert(precision == GridBLAS_PRECISION_DEFAULT);
     gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
 		m,n,k,
 		alpha,
@@ -201,8 +218,10 @@ public:
 		   deviceVector<ComplexD*> &Amk,  // pointer list to matrices
 		   deviceVector<ComplexD*> &Bkn,
 		   ComplexD beta,
-		   deviceVector<ComplexD*> &Cmn)
+		   deviceVector<ComplexD*> &Cmn,
+		   GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT)
   {
+    assert(precision == GridBLAS_PRECISION_DEFAULT);
     RealD t2=usecond();
     int32_t batchCount = Amk.size();
     assert(Bkn.size()==batchCount);
@@ -448,7 +467,8 @@ public:
 		   deviceVector<ComplexF*> &Amk,  // pointer list to matrices
 		   deviceVector<ComplexF*> &Bkn,
 		   ComplexF beta,
-		   deviceVector<ComplexF*> &Cmn)
+		   deviceVector<ComplexF*> &Cmn,
+		   GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT)
   {
     RealD t2=usecond();
     int32_t batchCount = Amk.size();
@@ -473,6 +493,7 @@ public:
     assert(Bkn.size()==batchCount);
     assert(Cmn.size()==batchCount);
 #ifdef GRID_HIP
+    assert(precision == GridBLAS_PRECISION_DEFAULT);
     hipblasOperation_t hOpA;
     hipblasOperation_t hOpB;
     if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
@@ -503,50 +524,67 @@ public:
     if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
     if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
     if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
-    auto err = cublasCgemmBatched(gridblasHandle,
-				  hOpA,
-				  hOpB,
-				  m,n,k,
-				  (cuComplex *) &alpha_p[0],
-				  (cuComplex **)&Amk[0], lda,
-				  (cuComplex **)&Bkn[0], ldb,
-				  (cuComplex *) &beta_p[0],
-				  (cuComplex **)&Cmn[0], ldc,
-				  batchCount);
+    cublasStatus_t err;
+    if (precision == GridBLAS_PRECISION_DEFAULT) {
+      err = cublasCgemmBatched(gridblasHandle,
+			       hOpA,
+			       hOpB,
+			       m,n,k,
+			       (cuComplex *) &alpha_p[0],
+			       (cuComplex **)&Amk[0], lda,
+			       (cuComplex **)&Bkn[0], ldb,
+			       (cuComplex *) &beta_p[0],
+			       (cuComplex **)&Cmn[0], ldc,
+			       batchCount);
+    } else {
+      cublasComputeType_t compute_precision = toDataType(precision);
+      err = cublasGemmBatchedEx(gridblasHandle,
+				hOpA,
+				hOpB,
+				m,n,k,
+				(void *) &alpha_p[0],
+				(void **)&Amk[0], CUDA_C_32F, lda,
+				(void **)&Bkn[0], CUDA_C_32F, ldb,
+				(void *) &beta_p[0],
+				(void **)&Cmn[0], CUDA_C_32F, ldc,
+				batchCount, compute_precision, CUBLAS_GEMM_DEFAULT);
+    }
     assert(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
-    int64_t m64=m;
-    int64_t n64=n;
-    int64_t k64=k;
-    int64_t lda64=lda;
-    int64_t ldb64=ldb;
-    int64_t ldc64=ldc;
-    int64_t batchCount64=batchCount;
+    assert(precision == GridBLAS_PRECISION_DEFAULT);
+    int64_t m64=m;
+    int64_t n64=n;
+    int64_t k64=k;
+    int64_t lda64=lda;
+    int64_t ldb64=ldb;
+    int64_t ldc64=ldc;
+    int64_t batchCount64=batchCount;
 
-    oneapi::mkl::transpose iOpA;
-    oneapi::mkl::transpose iOpB;
+    oneapi::mkl::transpose iOpA;
+    oneapi::mkl::transpose iOpB;
 
-    if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
-    if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
-    if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
-    if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
-    if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
-    if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
+    if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
+    if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
+    if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
+    if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
+    if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
+    if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
 
-    oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
-						&iOpA,
-						&iOpB,
-						&m64,&n64,&k64,
-						(ComplexF *) &alpha_p[0],
-						(const ComplexF **)&Amk[0], (const int64_t *)&lda64,
-						(const ComplexF **)&Bkn[0], (const int64_t *)&ldb64,
-						(ComplexF *) &beta_p[0],
-						(ComplexF **)&Cmn[0], (const int64_t *)&ldc64,
-						(int64_t)1,&batchCount64,std::vector<sycl::event>());
+    oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
+						&iOpA,
+						&iOpB,
+						&m64,&n64,&k64,
+						(ComplexF *) &alpha_p[0],
+						(const ComplexF **)&Amk[0], (const int64_t *)&lda64,
+						(const ComplexF **)&Bkn[0], (const int64_t *)&ldb64,
+						(ComplexF *) &beta_p[0],
+						(ComplexF **)&Cmn[0], (const int64_t *)&ldc64,
+						(int64_t)1,&batchCount64,std::vector<sycl::event>());
     synchronise();
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+    assert(precision == GridBLAS_PRECISION_DEFAULT);
     // Need a default/reference implementation; use Eigen
     if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
       thread_for (p, batchCount, {
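Note that the precision argument never changes the storage type: in the CUDA branch above, cublasGemmBatchedEx still reads and writes CUDA_C_32F data and only the compute type is lowered, while the HIP, SYCL and Eigen paths assert GridBLAS_PRECISION_DEFAULT. A minimal call-site sketch, assuming the single-precision overload from this diff (the pointer lists A, B, C and the dimensions are illustrative names, not from the commit):

  // Batched C = A*B in ComplexF storage, TF32 tensor-core arithmetic on CUDA.
  // On non-CUDA builds a non-default precision asserts, per the paths above.
  GridBLAS blas;
  blas.gemmBatched(m, n, k,
                   ComplexF(1.0, 0.0),       // alpha
                   A, B,                     // deviceVector<ComplexF*> pointer lists
                   ComplexF(0.0, 0.0),       // beta
                   C,
                   GridBLAS_PRECISION_TF32); // routes to cublasGemmBatchedEx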
@@ -946,6 +984,336 @@ public:
     RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
   }
 
+  /*
+    Inverse and Determinant
+
+    - CPU version uses Eigen
+    - GPU version uses LAPACK-compatible getrf / getri
+
+    Design comment: Eigen does not expose getrf / getri in a LAPACK compatible manner.
+    Overhead to go through getrf / getri for CPU version too large.
+    Current interface therefore only guarantees the inverse and determinant
+    functions on all platforms but not the getrf / getri ones.
+  */
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+
+  void inverseBatched(int64_t n,
+		      deviceVector<ComplexD*> &Ann,
+		      deviceVector<ComplexD*> &Cnn) {
+
+    int64_t batchCount = Ann.size();
+    assert(batchCount == Cnn.size());
+    thread_for(p,batchCount, {
+      Eigen::Map<Eigen::MatrixXcd> eAnn(Ann[p],n,n);
+      Eigen::Map<Eigen::MatrixXcd> eCnn(Cnn[p],n,n);
+      eCnn = eAnn.inverse();
+    });
+  }
+
+  void inverseBatched(int64_t n,
+		      deviceVector<ComplexF*> &Ann,
+		      deviceVector<ComplexF*> &Cnn) {
+
+    int64_t batchCount = Ann.size();
+    assert(batchCount == Cnn.size());
+    thread_for(p,batchCount, {
+      Eigen::Map<Eigen::MatrixXcf> eAnn(Ann[p],n,n);
+      Eigen::Map<Eigen::MatrixXcf> eCnn(Cnn[p],n,n);
+      eCnn = eAnn.inverse();
+    });
+  }
+
+  void determinantBatched(int64_t n,
+			  deviceVector<ComplexD*> &Ann,
+			  deviceVector<ComplexD*> &C) {
+
+    int64_t batchCount = Ann.size();
+    assert(batchCount == C.size());
+    thread_for(p,batchCount, {
+      Eigen::Map<Eigen::MatrixXcd> eAnn(Ann[p],n,n);
+      *C[p] = eAnn.determinant();
+    });
+  }
+
+  void determinantBatched(int64_t n,
+			  deviceVector<ComplexF*> &Ann,
+			  deviceVector<ComplexF*> &C) {
+
+    int64_t batchCount = Ann.size();
+    assert(batchCount == C.size());
+    thread_for(p,batchCount, {
+      Eigen::Map<Eigen::MatrixXcf> eAnn(Ann[p],n,n);
+      *C[p] = eAnn.determinant();
+    });
+  }
+
+#else
+
+#ifdef GRID_SYCL
+  template<typename T>
+  void getrfBatchedSYCL(int64_t n,
+			deviceVector<T*> &Ann,
+			deviceVector<int64_t> &ipiv,
+			deviceVector<int64_t> &info) {
+
+    int64_t batchCount = Ann.size();
+
+    static deviceVector<T> scratchpad;
+    int64_t sp_size = oneapi::mkl::lapack::getrf_batch_scratchpad_size<T>(*gridblasHandle, &n, &n, &n, (int64_t)1, &batchCount);
+    if (sp_size > scratchpad.size())
+      scratchpad.resize(sp_size);
+
+    static deviceVector<int64_t*> _ipiv;
+    if (batchCount > _ipiv.size())
+      _ipiv.resize(batchCount);
+    int64_t** p_ipiv = &_ipiv[0];
+    int64_t* pipiv = &ipiv[0];
+
+    accelerator_for(i, batchCount, 1, { p_ipiv[i] = &pipiv[i*n]; });
+
+    oneapi::mkl::lapack::getrf_batch(*gridblasHandle,
+				     &n, &n,
+				     (T **)&Ann[0],
+				     &n,
+				     (int64_t**)&_ipiv[0],
+				     (int64_t)1, &batchCount,
+				     (T*)&scratchpad[0], (int64_t)scratchpad.size(),
+				     std::vector<sycl::event>());
+    synchronise();
+  }
+#endif
+
+  void getrfBatched(int64_t n,
+		    deviceVector<ComplexD*> &Ann,
+		    deviceVector<int64_t> &ipiv,
+		    deviceVector<int64_t> &info)
+  {
+    int64_t batchCount = Ann.size();
+    assert(ipiv.size()==batchCount*n);
+    assert(info.size()==batchCount);
+
+#ifdef GRID_HIP
+    auto err = hipblasZgetrfBatched(gridblasHandle,(int)n,
+				    (hipblasDoubleComplex **)&Ann[0], (int)n,
+				    (int*) &ipiv[0],
+				    (int*) &info[0],
+				    (int)batchCount);
+    assert(err==HIPBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_CUDA
+    auto err = cublasZgetrfBatched(gridblasHandle, (int)n,
+				   (cuDoubleComplex **)&Ann[0], (int)n,
+				   (int*) &ipiv[0],
+				   (int*) &info[0],
+				   (int)batchCount);
+    assert(err==CUBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_SYCL
+    getrfBatchedSYCL(n, Ann, ipiv, info);
+#endif
+  }
+
+  void getrfBatched(int64_t n,
+		    deviceVector<ComplexF*> &Ann,
+		    deviceVector<int64_t> &ipiv,
+		    deviceVector<int64_t> &info)
+  {
+    int64_t batchCount = Ann.size();
+    assert(ipiv.size()==batchCount*n);
+    assert(info.size()==batchCount);
+
+#ifdef GRID_HIP
+    auto err = hipblasCgetrfBatched(gridblasHandle,(int)n,
+				    (hipblasComplex **)&Ann[0], (int)n,
+				    (int*) &ipiv[0],
+				    (int*) &info[0],
+				    (int)batchCount);
+    assert(err==HIPBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_CUDA
+    auto err = cublasCgetrfBatched(gridblasHandle, (int)n,
+				   (cuComplex **)&Ann[0], (int)n,
+				   (int*) &ipiv[0],
+				   (int*) &info[0],
+				   (int)batchCount);
+    assert(err==CUBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_SYCL
+    getrfBatchedSYCL(n, Ann, ipiv, info);
+#endif
+  }
+
+#ifdef GRID_SYCL
+  template<typename T>
+  void getriBatchedSYCL(int64_t n,
+			deviceVector<T*> &Ann,
+			deviceVector<int64_t> &ipiv,
+			deviceVector<int64_t> &info,
+			deviceVector<T*> &Cnn) {
+
+    int64_t batchCount = Ann.size();
+
+    static deviceVector<T> scratchpad;
+    int64_t sp_size = oneapi::mkl::lapack::getri_batch_scratchpad_size<T>(*gridblasHandle, &n, &n, (int64_t)1, &batchCount);
+    if (sp_size > scratchpad.size())
+      scratchpad.resize(sp_size);
+
+    static deviceVector<int64_t*> _ipiv;
+    if (batchCount > _ipiv.size())
+      _ipiv.resize(batchCount);
+    int64_t** p_ipiv = &_ipiv[0];
+    int64_t* pipiv = &ipiv[0];
+
+    accelerator_for(i, batchCount, 1, { p_ipiv[i] = &pipiv[i*n]; });
+
+    oneapi::mkl::lapack::getri_batch(*gridblasHandle,
+				     &n,
+				     (T **)&Ann[0],
+				     &n,
+				     (int64_t**)p_ipiv,
+				     (int64_t)1, &batchCount,
+				     (T *)&scratchpad[0], (int64_t)scratchpad.size(),
+				     std::vector<sycl::event>());
+
+    synchronise();
+
+    T** pA = &Ann[0];
+    T** pC = &Cnn[0];
+    accelerator_for(i, batchCount*n*n, 1, {
+      auto j = i / batchCount;
+      auto k = i % batchCount;
+      pC[k][j] = pA[k][j];
+    });
+  }
+
+#endif
+
+  void getriBatched(int64_t n,
+		    deviceVector<ComplexD*> &Ann,
+		    deviceVector<int64_t> &ipiv,
+		    deviceVector<int64_t> &info,
+		    deviceVector<ComplexD*> &Cnn)
+  {
+    int64_t batchCount = Ann.size();
+    assert(ipiv.size()==batchCount*n);
+    assert(info.size()==batchCount);
+    assert(Cnn.size()==batchCount);
+
+#ifdef GRID_HIP
+    auto err = hipblasZgetriBatched(gridblasHandle,(int)n,
+				    (hipblasDoubleComplex **)&Ann[0], (int)n,
+				    (int*) &ipiv[0],
+				    (hipblasDoubleComplex **)&Cnn[0], (int)n,
+				    (int*) &info[0],
+				    (int)batchCount);
+    assert(err==HIPBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_CUDA
+    auto err = cublasZgetriBatched(gridblasHandle, (int)n,
+				   (cuDoubleComplex **)&Ann[0], (int)n,
+				   (int*) &ipiv[0],
+				   (cuDoubleComplex **)&Cnn[0], (int)n,
+				   (int*) &info[0],
+				   (int)batchCount);
+    assert(err==CUBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_SYCL
+    getriBatchedSYCL(n, Ann, ipiv, info, Cnn);
+#endif
+  }
+
+  void getriBatched(int64_t n,
+		    deviceVector<ComplexF*> &Ann,
+		    deviceVector<int64_t> &ipiv,
+		    deviceVector<int64_t> &info,
+		    deviceVector<ComplexF*> &Cnn)
+  {
+    int64_t batchCount = Ann.size();
+    assert(ipiv.size()==batchCount*n);
+    assert(info.size()==batchCount);
+    assert(Cnn.size()==batchCount);
+
+#ifdef GRID_HIP
+    auto err = hipblasCgetriBatched(gridblasHandle,(int)n,
+				    (hipblasComplex **)&Ann[0], (int)n,
+				    (int*) &ipiv[0],
+				    (hipblasComplex **)&Cnn[0], (int)n,
+				    (int*) &info[0],
+				    (int)batchCount);
+    assert(err==HIPBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_CUDA
+    auto err = cublasCgetriBatched(gridblasHandle, (int)n,
+				   (cuComplex **)&Ann[0], (int)n,
+				   (int*) &ipiv[0],
+				   (cuComplex **)&Cnn[0], (int)n,
+				   (int*) &info[0],
+				   (int)batchCount);
+    assert(err==CUBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_SYCL
+    getriBatchedSYCL(n, Ann, ipiv, info, Cnn);
+#endif
+  }
+
+  template<typename dtype>
+  void inverseBatched(int64_t n,
+		      deviceVector<dtype*> &Ann, // this will be overwritten with LU decomposition
+		      deviceVector<dtype*> &Cnn  // this will be overwritten with the inverse
+		      ) {
+
+    int64_t batchCount = Ann.size();
+    RealD t0 = usecond();
+    deviceVector<int64_t> ipiv(batchCount*n);
+    deviceVector<int64_t> info(batchCount);
+
+    //RealD t1 = usecond();
+    getrfBatched(n, Ann, ipiv, info);
+    // test info for non-invertibility? set to nan if yes?
+    getriBatched(n, Ann, ipiv, info, Cnn);
+    //synchronise();
+    //RealD t2 = usecond();
+    //std::cout << GridLogMessage << "Temp " << t1-t0 << " rf/ri " << t2-t1 << std::endl;
+  }
+
+  template<typename dtype>
+  void determinantBatched(int64_t n,
+			  deviceVector<dtype*> &Ann, // this will be overwritten with LU decomposition
+			  deviceVector<dtype*> &C    // this will be overwritten with determinant
+			  ) {
+
+    int64_t batchCount = Ann.size();
+    //RealD t0 = usecond();
+    deviceVector<int64_t> ipiv(batchCount*n);
+    deviceVector<int64_t> info(batchCount);
+
+    dtype** pAnn = (dtype**)&Ann[0];
+    dtype** pC = (dtype**)&C[0];
+#if defined(GRID_CUDA) || defined(GRID_HIP)
+    int* pipiv = (int*)&ipiv[0];
+#else
+    int64_t* pipiv = (int64_t*)&ipiv[0];
+#endif
+
+    //RealD t1 = usecond();
+    getrfBatched(n, Ann, ipiv, info);
+    //RealD t2 = usecond();
+    accelerator_for(i,batchCount,1,{
+      dtype det = 1.0;
+      for (int64_t j=0;j<n;j++) {
+	det *= pAnn[i][n*j + j];
+	// branchless signs
+	det *= (pipiv[i*n + j] == j+1) ? (1.0) : (-1.0);
+      }
+      *pC[i] = det;
+    });
+
+    //RealD t3 = usecond();
+    //std::cout << GridLogMessage << "Temp " << t1 - t0 << " rf/ri " << t2-t1 << "final" << t3 - t2 << std::endl;
+  }
+#endif
+
 
 template<class CComplex>
 double benchmark(int M, int N, int K, int BATCH)
 {
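The design comment in the hunk above limits the guaranteed cross-platform surface to inverseBatched and determinantBatched; getrf/getri are only reachable on the GPU paths. A minimal usage sketch under that interface (sizes, fill code and variable names are illustrative, not from the commit):

  // Invert a batch of B complex n x n matrices.
  // Each Ann[p] / Cnn[p] must point at device storage for n*n ComplexD;
  // on the GPU paths Ann is overwritten with its LU factors, so keep a copy
  // if the input is still needed (determinantBatched overwrites it the same way).
  int64_t n = 16, B = 1024;
  deviceVector<ComplexD*> Ann(B), Cnn(B);
  // ... populate Ann[p] and Cnn[p] with device pointers ...
  GridBLAS blas;
  blas.inverseBatched(n, Ann, Cnn);   // Cnn[p] = inverse of the original Ann[p]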
@@ -32,6 +32,10 @@ NAMESPACE_BEGIN(Grid);
 
 
 Grid_MPI_Comm       CartesianCommunicator::communicator_world;
+#ifdef GRID_CHECKSUM_COMMS
+extern void * Grid_backtrace_buffer[_NBACKTRACE];
+uint64_t checksum_index = 1;
+#endif
 
 ////////////////////////////////////////////
 // First initialise of comms system
@@ -568,6 +572,11 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
    * - post device - host send buffer transfer asynch
    */
 
+#ifdef GRID_CHECKSUM_COMMS
+  rbytes += 8;
+  xbytes += 8;
+#endif
+
   if ( dor ) {
     if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
       tag= dir+from*32;
@@ -580,6 +589,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
       srq.req  = rrq;
       srq.host_buf = host_recv;
       srq.device_buf = recv;
+      srq.tag = tag;
       list.push_back(srq);
       off_node_bytes+=rbytes;
     }
@@ -593,7 +603,16 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
       host_xmit = this->HostBufferMalloc(xbytes);
       CommsRequest_t srq;
 
-      srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
+#ifdef GRID_CHECKSUM_COMMS
+      uint64_t xbytes_data = xbytes - 8;
+      srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes_data); // Make this Asynch
+      assert(xbytes % 8 == 0);
+      // flip one bit so that a zero buffer is not consistent
+      uint64_t xsum = checksum_gpu((uint64_t*)xmit, xbytes_data / 8) ^ (checksum_index + 1 + 1000 * tag);
+      *(uint64_t*)(((char*)host_xmit) + xbytes_data) = xsum;
+#else
+      srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
+#endif
 
       //      ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
       //      assert(ierr==0);
@@ -635,7 +654,11 @@ void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequ
 
     if ( flag ) {
       //      std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl;
+#ifdef GRID_CHECKSUM_COMMS
+      acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes - 8);
+#else
       acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes);
+#endif
       list[idx].PacketType=InterNodeReceiveHtoD;
     } else {
       pending ++;
@@ -798,7 +821,40 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
   //	acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
   //      }
   //    }
+#ifdef GRID_CHECKSUM_COMMS
+  for(int r=0;r<list.size();r++){
+    if ( list[r].PacketType == InterNodeReceiveHtoD ) {
+      uint64_t rbytes_data = list[r].bytes - 8;
+      uint64_t expected_cs = *(uint64_t*)(((char*)list[r].host_buf) + rbytes_data);
+      uint64_t computed_cs = checksum_gpu((uint64_t*)list[r].device_buf, rbytes_data / 8) ^ (checksum_index + 1 + 1000 * list[r].tag);
+      if (expected_cs != computed_cs) {
+	// TODO: error message, backtrace, quit
+
+	fprintf(stderr, "GRID_CHECKSUM_COMMS error:\n");
+	fprintf(stderr, "  processor  = %d\n", (int)_processor);
+	for(int d=0;d<_processors.size();d++)
+	  fprintf(stderr, "  processor_coord[%d] = %d\n", d, _processor_coor[d]);
+	fprintf(stderr, "  hostname: %s\n", GridHostname());
+	fprintf(stderr, "  expected_cs: %ld\n", expected_cs);
+	fprintf(stderr, "  computed_cs: %ld\n", computed_cs);
+	fprintf(stderr, "  dest: %d\n", list[r].dest);
+	fprintf(stderr, "  tag: %d\n", list[r].tag);
+	fprintf(stderr, "  commdir: %d\n", list[r].commdir);
+	fprintf(stderr, "  bytes: %ld\n", (uint64_t)list[r].bytes);
+
+	fflush(stderr);
+
+	// backtrace
+	int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE);
+	backtrace_symbols_fd(Grid_backtrace_buffer,symbols, 2);
+
+	exit(1);
+      }
+    }
+  }
+
+  checksum_index += 1;
+#endif
 
   list.resize(0); // Delete the list
   this->HostBufferFreeAll(); // Clean up the buffer allocs
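Taken together, the Prepare/Poll/Complete hooks define a small wire format: each message grows by one trailing uint64_t carrying a device-computed checksum of the payload, salted with the running checksum_index epoch and the MPI tag, so a stale or misrouted (yet internally intact) buffer still fails the compare. A host-side sketch of that framing with illustrative helper names (device_checksum stands in for checksum_gpu; nothing here is verbatim from the commit):

  // Sketch of the GRID_CHECKSUM_COMMS framing; payload_bytes must be a multiple of 8.
  uint64_t device_checksum(const uint64_t *payload, uint64_t words); // stand-in for checksum_gpu

  void seal(char *host_buf, const uint64_t *dev_payload, uint64_t payload_bytes,
            uint64_t checksum_index, uint64_t tag) {
    uint64_t xsum = device_checksum(dev_payload, payload_bytes / 8)
                  ^ (checksum_index + 1 + 1000 * tag);  // per-epoch, per-tag salt
    *(uint64_t *)(host_buf + payload_bytes) = xsum;     // 8-byte trailer; send payload_bytes + 8
  }

  bool verify(const char *host_buf, const uint64_t *dev_payload,
              uint64_t payload_bytes, uint64_t checksum_index, uint64_t tag) {
    uint64_t expected = *(const uint64_t *)(host_buf + payload_bytes);
    uint64_t computed = device_checksum(dev_payload, payload_bytes / 8)
                      ^ (checksum_index + 1 + 1000 * tag);
    return expected == computed;                        // mismatch => corrupted transfer
  }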
@@ -43,10 +43,6 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #define GRID_SYCL_LEVEL_ZERO_IPC
 #define SHM_SOCKETS
 #else
-#ifdef HAVE_NUMAIF_H
-#warning " Using NUMAIF "
-#include <numaif.h>
-#endif
 #endif
 #include <syscall.h>
 #endif
@@ -202,7 +202,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,deviceVector<
 {
   auto buffer_p = & buffer[0];
   auto table = MapCshiftTable();
-  autoView( rhs_v, rhs, AcceleratorWrite);
+  autoView( rhs_v, rhs, AcceleratorWriteDiscard);
   accelerator_for(i,ent,vobj::Nsimd(),{
     coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
   });
@@ -228,7 +228,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
   if(cbmask ==0x3 ) {
     int _slice_stride = rhs.Grid()->_slice_stride[dimension];
     int _slice_block = rhs.Grid()->_slice_block[dimension];
-    autoView( rhs_v , rhs, AcceleratorWrite);
+    autoView( rhs_v , rhs, AcceleratorWriteDiscard);
     accelerator_for(nn,e1*e2,1,{
       int n = nn%e1;
       int b = nn/e1;
@@ -302,7 +302,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
 {
   auto table = MapCshiftTable();
   autoView(rhs_v , rhs, AcceleratorRead);
-  autoView(lhs_v , lhs, AcceleratorWrite);
+  autoView(lhs_v , lhs, AcceleratorWriteDiscard);
   accelerator_for(i,ent,vobj::Nsimd(),{
     coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
   });
@@ -29,8 +29,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef _GRID_CSHIFT_MPI_H_
 #define _GRID_CSHIFT_MPI_H_
 
 
 NAMESPACE_BEGIN(Grid);
 
+#ifdef GRID_CHECKSUM_COMMS
+extern uint64_t checksum_index;
+#endif
+
 const int Cshift_verbose=0;
 template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
 {
@@ -126,8 +130,9 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
   static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
   static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
 #ifndef ACCELERATOR_AWARE_MPI
-  static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
-  static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
+  int pad = (8 + sizeof(vobj) - 1) / sizeof(vobj);
+  static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size+pad);
+  static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size+pad);
 #endif
 
   int cb= (cbmask==0x2)? Odd : Even;
@@ -180,20 +185,39 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
 #else
     // bouncy bouncy
     acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
 
+#ifdef GRID_CHECKSUM_COMMS
+    assert(bytes % 8 == 0);
+    checksum_index++;
+    uint64_t xsum = checksum_gpu((uint64_t*)&send_buf[0], bytes / 8) ^ (1 + checksum_index);
+    *(uint64_t*)(((char*)&hsend_buf[0]) + bytes) = xsum;
+    bytes += 8;
+#endif
+
     grid->SendToRecvFrom((void *)&hsend_buf[0],
 			 xmit_to_rank,
 			 (void *)&hrecv_buf[0],
 			 recv_from_rank,
 			 bytes);
 
+#ifdef GRID_CHECKSUM_COMMS
+    bytes -= 8;
     acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
+    uint64_t expected_cs = *(uint64_t*)(((char*)&hrecv_buf[0]) + bytes);
+    uint64_t computed_cs = checksum_gpu((uint64_t*)&recv_buf[0], bytes / 8) ^ (1 + checksum_index);
     std::cout << GridLogComms<< " Cshift: "
 	      <<" dim"<<dimension
 	      <<" shift "<<shift
 	      << " rank "<< grid->ThisRank()
 	      <<" Coor "<<grid->ThisProcessorCoor()
-	      <<" send "<<hsend_buf[0]<<" to "<<xmit_to_rank
-	      <<" recv "<<hrecv_buf[0]<<" from "<<recv_from_rank
+	      <<" send "<<xsum<<" to "<<xmit_to_rank
+	      <<" recv "<<computed_cs<<" from "<<recv_from_rank
 	      <<std::endl;
+    assert(expected_cs == computed_cs);
+#else
+    acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
+#endif
+
 #endif
     FlightRecorder::StepLog("Cshift_SendRecv_complete");
@@ -264,8 +288,16 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
     recv_buf_extract[s].resize(buffer_size);
   }
 #ifndef ACCELERATOR_AWARE_MPI
-  hostVector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
-  hostVector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
+#ifdef GRID_CHECKSUM_COMMS
+  buffer_size += (8 + sizeof(vobj) - 1) / sizeof(vobj);
+#endif
+
+  static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
+  static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
+
+#ifdef GRID_CHECKSUM_COMMS
+  buffer_size -= (8 + sizeof(vobj) - 1) / sizeof(vobj);
+#endif
 #endif
 
   int bytes = buffer_size*sizeof(scalar_object);
@@ -328,21 +360,37 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
 #else
       // bouncy bouncy
       acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes);
+#ifdef GRID_CHECKSUM_COMMS
+      assert(bytes % 8 == 0);
+      checksum_index++;
+      uint64_t xsum = checksum_gpu((uint64_t*)send_buf_extract_mpi, bytes / 8) ^ (1 + checksum_index);
+      *(uint64_t*)(((char*)&hsend_buf[0]) + bytes) = xsum;
+      bytes += 8;
+#endif
       grid->SendToRecvFrom((void *)&hsend_buf[0],
 			   xmit_to_rank,
 			   (void *)&hrecv_buf[0],
 			   recv_from_rank,
 			   bytes);
+#ifdef GRID_CHECKSUM_COMMS
+      bytes -= 8;
       acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
+      uint64_t expected_cs = *(uint64_t*)(((char*)&hrecv_buf[0]) + bytes);
+      uint64_t computed_cs = checksum_gpu((uint64_t*)recv_buf_extract_mpi, bytes / 8) ^ (1 + checksum_index);
+
       std::cout << GridLogComms<< " Cshift_comms_simd: "
 		<<" dim"<<dimension
 		<<" shift "<<shift
 		<< " rank "<< grid->ThisRank()
 		<<" Coor "<<grid->ThisProcessorCoor()
-		<<" send "<<hsend_buf[0]<<" to "<<xmit_to_rank
-		<<" recv "<<hrecv_buf[0]<<" from "<<recv_from_rank
+		<<" send "<<xsum<<" to "<<xmit_to_rank
+		<<" recv "<<computed_cs<<" from "<<recv_from_rank
 		<<std::endl;
+      assert(expected_cs == computed_cs);
+#else
+      acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
+#endif
+
 #endif
 
       xbytes+=bytes;
@@ -87,6 +87,25 @@ template<class Word> Word svm_xor(Word *vec,uint64_t L)
   theGridAccelerator->wait();
   return ret;
 }
+template<class Word> Word checksum_gpu(Word *vec,uint64_t L)
+{
+  Word identity;  identity=0;
+  Word ret = 0;
+  {
+    sycl::buffer<Word, 1> abuff(&ret, {1});
+    theGridAccelerator->submit([&](sycl::handler &cgh) {
+      auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
+      cgh.parallel_for(sycl::range<1>{L},
+		       Reduction,
+		       [=] (sycl::id<1> index, auto &sum) {
+			 auto l = index % 61;
+			 sum ^= vec[index]<<l | vec[index]>>(64-l);
+		       });
+    });
+  }
+  theGridAccelerator->wait();
+  return ret;
+}
 
 NAMESPACE_END(Grid);
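A plain XOR reduction would be order-insensitive, so two buffers with transposed words would collide; rotating each word by index % 61 before XOR makes the sum position-sensitive at negligible cost. A scalar reference sketch for cross-checking the kernel against a host copy (test scaffolding is illustrative; the rotation is guarded for l == 0, where a 64-bit shift would be undefined):

  // Reference loop matching the SYCL reduction above.
  uint64_t checksum_ref(const uint64_t *v, uint64_t L) {
    uint64_t sum = 0;
    for (uint64_t i = 0; i < L; i++) {
      uint64_t l = i % 61;
      sum ^= l ? ((v[i] << l) | (v[i] >> (64 - l))) : v[i];
    }
    return sum;
  }
  // e.g. assert(checksum_ref(host_copy, L) == checksum_gpu(device_ptr, L));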
@@ -106,6 +106,47 @@ public:
   }
 
 };
 
+#ifdef GRID_LOG_VIEWS
+// Little autoscope assister
+template<class View>
+class ViewCloser
+{
+  View v;  // Take a copy of view and call view close when I go out of scope automatically
+  const char* filename; int line, mode;
+ public:
+  ViewCloser(View &_v, const char* _filename, int _line, int _mode) :
+    v(_v), filename(_filename), line(_line), mode(_mode) {
+
+    switch (mode){
+    case AcceleratorRead:
+    case AcceleratorWrite:
+    case CpuRead:
+    case CpuWrite:
+      ViewLogger::Log(filename, line, 1, mode, &v[0], v.size() * sizeof(v[0]));
+      break;
+    }
+  };
+  ~ViewCloser() {
+
+    switch (mode) {
+    case AcceleratorWriteDiscard:
+    case AcceleratorWrite:
+    case CpuWrite:
+      ViewLogger::Log(filename, line, -1, mode, &v[0], v.size() * sizeof(v[0]));
+      break;
+    }
+
+    v.ViewClose();
+  }
+};
+
+#define autoView(l_v,l,mode)						\
+  auto l_v = l.View(mode);						\
+  ViewCloser<decltype(l_v)> _autoView##l_v(l_v,__FILE__,__LINE__,mode);
+
+#else
 // Little autoscope assister
 template<class View>
 class ViewCloser
@@ -119,6 +160,7 @@ class ViewCloser
 #define autoView(l_v,l,mode)						\
   auto l_v = l.View(mode);						\
   ViewCloser<decltype(l_v)> _autoView##l_v(l_v);
+#endif
 
 /////////////////////////////////////////////////////////////////////////////////////////
 // Lattice expression types used by ET to assemble the AST
@@ -57,7 +57,7 @@ public:
 
   {
     // RealD eps = 1.0;
-    std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl;
+    // std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl;
     std::vector<Coeff_t> zgamma(this->Ls);
     for(int s=0;s<this->Ls;s++){
       zgamma[s] = gamma[s];
@@ -535,7 +535,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
   {
     autoView(U_v  ,  U, AcceleratorRead);
     autoView(in_v ,  in, AcceleratorRead);
-    autoView(out_v, out, AcceleratorWrite);
+    autoView(out_v, out, AcceleratorWriteDiscard);
     autoView(st_v ,  st, AcceleratorRead);
     KERNEL_CALL_ID(GenericDhopSite);
   }
@@ -118,7 +118,7 @@ protected:
 			    GaugeK);   // derivative of SmearBase
     return SigmaK;
   }
-
+ public:
   /*! @brief Returns smeared configuration at level 'Level' */
   const GaugeField &get_smeared_conf(int Level) const
   {
@@ -819,7 +819,7 @@ public:
     } // if smearingLevels = 0 do nothing
   }
 
-private:
+public:
   //====================================================================
   // Override base clas here to mask it
   virtual void fill_smearedSet(GaugeField &U)
@@ -192,6 +192,11 @@ public:
 
   void ViewClose(void) { }
 
+#ifdef GRID_LOG_VIEWS
+  size_t size() { return 0; };
+  uint64_t & operator[](size_t i) { static uint64_t v=0; return v; };
+#endif
+
 };
 
 ////////////////////////////////////////
@@ -372,4 +372,53 @@ void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
   }
 }
 
+#ifdef GRID_LOG_VIEWS
+
+bool ViewLogger::Enabled = false;
+std::vector<ViewLogger::Entry_t> ViewLogger::LogVector;
+
+void ViewLogger::Begin() { Enabled = true; LogVector.resize(0); }
+void ViewLogger::End() { Enabled = false; }
+
+void ViewLogger::Log(const char* filename, int line, int index, int mode, void* data, uint64_t bytes)
+{
+  if (!Enabled)
+    return;
+
+  size_t i = LogVector.size();
+  LogVector.resize(i + 1);
+  auto & n = LogVector[i];
+
+  n.filename = filename;
+  n.line = line;
+  n.index = index;
+
+  if (bytes < sizeof(uint64_t)) {
+
+    n.head = n.tail = 0;
+
+  } else {
+
+    switch (mode) {
+    case AcceleratorRead:
+    case AcceleratorWrite:
+    case AcceleratorWriteDiscard:
+      acceleratorCopyFromDevice((char*)data, &n.head, sizeof(uint64_t));
+      acceleratorCopyFromDevice((char*)data + bytes - sizeof(uint64_t), &n.tail, sizeof(uint64_t));
+      break;
+
+    case CpuRead:
+    case CpuWrite:
+      //case CpuWriteDiscard:
+      n.head = *(uint64_t*)data;
+      n.tail = *(uint64_t*)((char*)data + bytes - sizeof(uint64_t));
+      break;
+    }
+  }
+}
+
+#endif
+
 NAMESPACE_END(Grid);
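One way this logger might be driven: bracket a suspect region, run it twice (or on two ranks), and diff the recorded head/tail words per view event to localize where results first diverge. A sketch, assuming a GRID_LOG_VIEWS build (the printing loop is illustrative, not part of the commit):

  ViewLogger::Begin();                  // start recording autoView opens/closes
  // ... run the kernel(s) under suspicion ...
  ViewLogger::End();
  for (auto &e : ViewLogger::LogVector) // index +1 = view open, -1 = view close
    printf("%s:%d %+d head=%016llx tail=%016llx\n",
           e.filename, e.line, e.index,
           (unsigned long long)e.head, (unsigned long long)e.tail);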
@@ -42,5 +42,22 @@ class FlightRecorder {
   static void xmitLog(void *,uint64_t bytes);
   static void recvLog(void *,uint64_t bytes,int rank);
 };
+#ifdef GRID_LOG_VIEWS
+class ViewLogger {
+  struct Entry_t {
+    const char* filename;
+    int line;
+    int index;
+    uint64_t head, tail;
+  };
+
+ public:
+  static bool Enabled;
+  static std::vector<Entry_t> LogVector;
+  static void Begin();
+  static void End();
+  static void Log(const char* filename, int line, int index, int mode, void* data, uint64_t bytes);
+};
+#endif
 NAMESPACE_END(Grid);
|
@@ -416,12 +416,28 @@ void Grid_init(int *argc,char ***argv)
|
||||
} else {
|
||||
FILE *fp;
|
||||
std::ostringstream fname;
|
||||
|
||||
int rank = CartesianCommunicator::RankWorld();
|
||||
int radix=64;
|
||||
char* root = getenv("GRID_STDOUT_ROOT");
|
||||
if (root) {
|
||||
fname << root ;
|
||||
mkdir(fname.str().c_str(), S_IRWXU );
|
||||
fname << "/";
|
||||
}
|
||||
fname << (rank/radix)*radix ;
|
||||
mkdir(fname.str().c_str(), S_IRWXU );
|
||||
fname << "/";
|
||||
fname<<"Grid.stdout.";
|
||||
fname<<CartesianCommunicator::RankWorld();
|
||||
fp=freopen(fname.str().c_str(),"w",stdout);
|
||||
assert(fp!=(FILE *)NULL);
|
||||
|
||||
std::ostringstream ename;
|
||||
if (root){
|
||||
ename << root << "/";
|
||||
}
|
||||
ename << (rank/radix)*radix << "/";
|
||||
ename<<"Grid.stderr.";
|
||||
ename<<CartesianCommunicator::RankWorld();
|
||||
fp=freopen(ename.str().c_str(),"w",stderr);
|
||||
|
configure.ac
@@ -222,7 +222,7 @@ esac
 
 ############### Symplectic group
 AC_ARG_ENABLE([Sp],
-              [AC_HELP_STRING([--enable-Sp=yes|no], [enable gauge group Sp2n])],
+              [AS_HELP_STRING([--enable-Sp=yes|no],[enable gauge group Sp2n])],
               [ac_ENABLE_SP=${enable_Sp}], [ac_ENABLE_SP=no])
 
 AM_CONDITIONAL(BUILD_SP, [ test "${ac_ENABLE_SP}X" == "yesX" ])
@@ -263,6 +263,28 @@ case ${ac_ACCELERATOR_AWARE_MPI} in
   *);;
 esac
 
+############### CHECKSUM COMMS
+AC_ARG_ENABLE([checksum-comms],
+              [AS_HELP_STRING([--enable-checksum-comms=yes|no],[checksum all communication])],
+              [ac_CHECKSUM_COMMS=${enable_checksum_comms}], [ac_CHECKSUM_COMMS=yes])
+
+case ${ac_CHECKSUM_COMMS} in
+  yes)
+    AC_DEFINE([GRID_CHECKSUM_COMMS],[1],[checksum all communication]);;
+  *);;
+esac
+
+############### LOG VIEWS
+AC_ARG_ENABLE([log-views],
+              [AS_HELP_STRING([--enable-log-views=yes|no],[log information on all view open/close])],
+              [ac_LOG_VIEWS=${enable_log_views}], [ac_LOG_VIEWS=yes])
+
+case ${ac_LOG_VIEWS} in
+  yes)
+    AC_DEFINE([GRID_LOG_VIEWS],[1],[log information on all view open/close]);;
+  *);;
+esac
+
 ############### SYCL/CUDA/HIP/none
 AC_ARG_ENABLE([accelerator],
               [AS_HELP_STRING([--enable-accelerator=cuda|sycl|hip|none],[enable none,cuda,sycl,hip acceleration])],
@@ -10,10 +10,11 @@ export UNWIND=`spack find --paths libunwind | grep ^libunwind | awk '{print $2 
 	--disable-gparity \
 	--disable-fermion-reps \
 	--enable-shm=nvlink \
+	--enable-checksum-comms=yes \
+	--enable-log-views=yes \
 	--enable-accelerator=sycl \
 	--enable-accelerator-aware-mpi=no \
 	--enable-unified=no \
-	--enable-debug \
 	--with-lime=$CLIME \
 	--with-gmp=$GMP \
 	--with-mpfr=$MPFR \