Merge branch 'develop' of https://www.github.com/paboyle/Grid into develop

2025-12-20 20:54:30 +00:00 · 2021-10-15 20:46:51 +01:00
parent cfe9e870d3 e9c4f06cbf
commit 7e0057d2c4
3 changed files with 6 additions and 2 deletions
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -389,7 +389,6 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
    void *shm = (void *) this->ShmBufferTranslate(dest,recv);
    assert(shm!=NULL);
    acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
-    acceleratorCopySynchronise(); // MPI prob slower
  }

  if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
@@ -405,6 +404,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
  if (nreq==0) return;

  std::vector<MPI_Status> status(nreq);
+  acceleratorCopySynchronise(); 
  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
  assert(ierr==0);
  list.resize(0);
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -42,7 +42,6 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
  std::cout << GridLogDebug << "\twarpSize            = " << warpSize << std::endl;
  std::cout << GridLogDebug << "\tsharedMemPerBlock   = " << sharedMemPerBlock << std::endl;
  std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << maxThreadsPerBlock << std::endl;
-  std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << warpSize << std::endl;
  std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;
  
  if (warpSize != WARP_SIZE) {
@@ -52,6 +51,10 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
  
  // let the number of threads in a block be a multiple of 2, starting from warpSize
  threads = warpSize;
+  if ( threads*sizeofsobj > sharedMemPerBlock ) {
+    std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
+    exit(EXIT_FAILURE);
+  }
  while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
  // keep all the streaming multiprocessors busy
  blocks = nextPow2(multiProcessorCount);
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -95,6 +95,7 @@ void     acceleratorInit(void);
 //////////////////////////////////////////////

 #ifdef GRID_CUDA
+
 #include <cuda.h>

 #ifdef __CUDA_ARCH__