From 16c2a99965d6b3dc14d4c947e03dbec5e7df0195 Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Mon, 11 Oct 2021 13:31:26 -0700
Subject: [PATCH 1/2] Overlap cudamemcpy - didn't set up stream right

---
 Grid/communicator/Communicator_mpi3.cc | 2 +-
 Grid/threads/Accelerator.cc            | 2 +-
 Grid/threads/Accelerator.h             | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc
index 01335b41..305a3a9b 100644
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -389,7 +389,6 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReque
     void *shm = (void *) this->ShmBufferTranslate(dest,recv);
     assert(shm!=NULL);
     acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
-    acceleratorCopySynchronise(); // MPI prob slower
   }
 
   if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
@@ -405,6 +404,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequ
   int nreq=list.size();
 
   std::vector<MPI_Status> status(nreq);
+  acceleratorCopySynchronise();
   int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
   assert(ierr==0);
   list.resize(0);
diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc
index 14e07248..9f27f12b 100644
--- a/Grid/threads/Accelerator.cc
+++ b/Grid/threads/Accelerator.cc
@@ -95,7 +95,7 @@ void acceleratorInit(void)
 #endif
 
   cudaSetDevice(device);
-
+  cudaStreamCreate(&copyStream);
   const int len=64;
   char busid[len];
   if( rank == world_rank ) {
diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index 2c08b76b..cec0600f 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -95,6 +95,7 @@ void acceleratorInit(void);
 //////////////////////////////////////////////
 #ifdef GRID_CUDA
 
+#include <cuda.h>
 #ifdef __CUDA_ARCH__
 #define GRID_SIMT
 #endif

From 1f9688417ae1fc1fcb680d37d05983e6dccbfc9d Mon Sep 17 00:00:00 2001
From: Fabian Joswig
Date: Wed, 13 Oct 2021 20:45:46 +0100
Subject: [PATCH 2/2] Error message added when attempting to sum object which
 is too large for the shared memory

---
 Grid/lattice/Lattice_reduction_gpu.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h
index d8a47ae1..c2875052 100644
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -42,7 +42,6 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
   std::cout << GridLogDebug << "\twarpSize            = " << warpSize << std::endl;
   std::cout << GridLogDebug << "\tsharedMemPerBlock   = " << sharedMemPerBlock << std::endl;
   std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << maxThreadsPerBlock << std::endl;
-  std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << warpSize << std::endl;
   std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;
 
   if (warpSize != WARP_SIZE) {
@@ -52,6 +51,10 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
 
   // let the number of threads in a block be a multiple of 2, starting from warpSize
   threads = warpSize;
+  if ( threads*sizeofsobj > sharedMemPerBlock ) {
+    std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
+    exit(EXIT_FAILURE);
+  }
   while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
   // keep all the streaming multiprocessors busy
   blocks = nextPow2(multiProcessorCount);
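
Note on PATCH 1/2: moving acceleratorCopySynchronise() out of
StencilSendToRecvFromBegin and into StencilSendToRecvFromComplete lets the
intranode device-to-device copy, issued on the newly created copyStream, run
concurrently with the internode MPI transfers instead of blocking before they
start. A minimal sketch of this begin/complete overlap pattern in plain CUDA
runtime and MPI calls; the function and parameter names below are
illustrative, not Grid's API:

  #include <cuda_runtime.h>
  #include <mpi.h>
  #include <vector>

  static cudaStream_t copyStream;   // created once at init, as acceleratorInit() now does

  void exampleInit(void) {
    cudaStreamCreate(&copyStream);  // the line this patch adds to Accelerator.cc
  }

  void exampleBegin(void *xmit, void *shm, size_t bytes,
                    void *recv, int rbytes, int from, MPI_Comm comm,
                    std::vector<MPI_Request> &list) {
    // Intranode neighbour: asynchronous device-to-device copy on the private stream.
    cudaMemcpyAsync(shm, xmit, bytes, cudaMemcpyDeviceToDevice, copyStream);
    // Internode neighbours: post MPI immediately; the copy proceeds concurrently.
    MPI_Request rrq;
    MPI_Irecv(recv, rbytes, MPI_CHAR, from, 0, comm, &rrq);
    list.push_back(rrq);
  }

  void exampleComplete(std::vector<MPI_Request> &list) {
    // Wait for the copy only now, after MPI has had time to make progress.
    cudaStreamSynchronize(copyStream);
    std::vector<MPI_Status> status(list.size());
    MPI_Waitall((int)list.size(), list.data(), status.data());
    list.resize(0);
  }

Because the copy goes onto a dedicated non-default stream rather than stream 0,
it does not serialise against unrelated kernels, which is why creating
copyStream in acceleratorInit() is part of the same fix.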
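
Note on PATCH 2/2: getNumBlocksAndThreads starts from warpSize threads per
block and doubles while threads*sizeofsobj still fits in sharedMemPerBlock;
the new guard catches the case where even a single warp's worth of objects
does not fit, and aborts with a clear error instead of proceeding to a
reduction kernel with insufficient shared memory. A standalone sketch of that
selection logic under the same limits (the helper name and the worked numbers
are illustrative):

  #include <cstdio>
  #include <cstdlib>

  size_t pickThreads(size_t warpSize, size_t sizeofsobj,
                     size_t sharedMemPerBlock, size_t maxThreadsPerBlock) {
    size_t threads = warpSize;
    // Guard added by the patch: a block cannot use fewer than warpSize
    // threads, so one warp's shared-memory footprint must fit, or we abort.
    if (threads * sizeofsobj > sharedMemPerBlock) {
      fprintf(stderr, "The object is too large for the shared memory.\n");
      exit(EXIT_FAILURE);
    }
    // Double the block size while both the shared-memory and the
    // threads-per-block limits are respected.
    while (2 * threads * sizeofsobj < sharedMemPerBlock &&
           2 * threads <= maxThreadsPerBlock)
      threads *= 2;
    return threads;
  }

  // Example: warpSize 32, 1 KiB objects, 48 KiB shared memory, 1024 threads max:
  // 32 threads need 32 KiB (fits); 64 would need 64 KiB (does not); returns 32.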