From 16c2a99965d6b3dc14d4c947e03dbec5e7df0195 Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Mon, 11 Oct 2021 13:31:26 -0700
Subject: [PATCH 1/2] Overlap cudamemcpy - didn't set up stream right

---
 Grid/communicator/Communicator_mpi3.cc | 2 +-
 Grid/threads/Accelerator.cc            | 2 +-
 Grid/threads/Accelerator.h             | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc
index 01335b41..305a3a9b 100644
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -389,7 +389,6 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReque
     void *shm = (void *) this->ShmBufferTranslate(dest,recv);
     assert(shm!=NULL);
     acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
-    acceleratorCopySynchronise(); // MPI prob slower
   }
 
   if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
@@ -405,6 +404,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequ
   int nreq=list.size();
 
   std::vector<MPI_Status> status(nreq);
+  acceleratorCopySynchronise();
   int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
   assert(ierr==0);
   list.resize(0);
diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc
index 14e07248..9f27f12b 100644
--- a/Grid/threads/Accelerator.cc
+++ b/Grid/threads/Accelerator.cc
@@ -95,7 +95,7 @@ void acceleratorInit(void)
 #endif
 
   cudaSetDevice(device);
-
+  cudaStreamCreate(&copyStream);
   const int len=64;
   char busid[len];
   if( rank == world_rank ) {
diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index 2c08b76b..cec0600f 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -95,6 +95,7 @@ void acceleratorInit(void);
 //////////////////////////////////////////////
 #ifdef GRID_CUDA
 
+#include <cuda.h>
 #ifdef __CUDA_ARCH__
 #define GRID_SIMT
 #endif

From 1f9688417ae1fc1fcb680d37d05983e6dccbfc9d Mon Sep 17 00:00:00 2001
From: Fabian Joswig
Date: Wed, 13 Oct 2021 20:45:46 +0100
Subject: [PATCH 2/2] Error message added when attempting to sum object which
 is too large for the shared memory

---
 Grid/lattice/Lattice_reduction_gpu.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h
index d8a47ae1..c2875052 100644
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -42,7 +42,6 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
   std::cout << GridLogDebug << "\twarpSize            = " << warpSize << std::endl;
   std::cout << GridLogDebug << "\tsharedMemPerBlock   = " << sharedMemPerBlock << std::endl;
   std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << maxThreadsPerBlock << std::endl;
-  std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << warpSize << std::endl;
   std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;
 
   if (warpSize != WARP_SIZE) {
@@ -52,6 +51,10 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
 
   // let the number of threads in a block be a multiple of 2, starting from warpSize
   threads = warpSize;
+  if ( threads*sizeofsobj > sharedMemPerBlock ) {
+    std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
+    exit(EXIT_FAILURE);
+  }
   while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
   // keep all the streaming multiprocessors busy
   blocks = nextPow2(multiProcessorCount);
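
Note on PATCH 1/2: moving acceleratorCopySynchronise() out of
StencilSendToRecvFromBegin and into StencilSendToRecvFromComplete lets the
intranode device-to-device copy, issued on the newly created copyStream, run
concurrently with the internode MPI transfers instead of blocking before they
start. A minimal sketch of this begin/complete overlap pattern in plain CUDA
runtime and MPI calls; the function and parameter names below are
illustrative, not Grid's API:

  #include <cuda_runtime.h>
  #include <mpi.h>
  #include <vector>

  static cudaStream_t copyStream;   // created once at init, as acceleratorInit() now does

  void exampleInit(void) {
    cudaStreamCreate(&copyStream);  // the line this patch adds to Accelerator.cc
  }

  void exampleBegin(void *xmit, void *shm, size_t bytes,
                    void *recv, int rbytes, int from, MPI_Comm comm,
                    std::vector<MPI_Request> &list) {
    // Intranode neighbour: asynchronous device-to-device copy on the private stream.
    cudaMemcpyAsync(shm, xmit, bytes, cudaMemcpyDeviceToDevice, copyStream);
    // Internode neighbours: post MPI immediately; the copy proceeds concurrently.
    MPI_Request rrq;
    MPI_Irecv(recv, rbytes, MPI_CHAR, from, 0, comm, &rrq);
    list.push_back(rrq);
  }

  void exampleComplete(std::vector<MPI_Request> &list) {
    // Wait for the copy only now, after MPI has had time to make progress.
    cudaStreamSynchronize(copyStream);
    std::vector<MPI_Status> status(list.size());
    MPI_Waitall((int)list.size(), list.data(), status.data());
    list.resize(0);
  }

Because the copy goes onto a dedicated non-default stream rather than stream 0,
it does not serialise against unrelated kernels, which is why creating
copyStream in acceleratorInit() is part of the same fix.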
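
Note on PATCH 2/2: getNumBlocksAndThreads starts from warpSize threads per
block and doubles while threads*sizeofsobj still fits in sharedMemPerBlock;
the new guard catches the case where even a single warp's worth of objects
does not fit, and aborts with a clear error instead of proceeding to a
reduction kernel with insufficient shared memory. A standalone sketch of that
selection logic under the same limits (the helper name and the worked numbers
are illustrative):

  #include <cstdio>
  #include <cstdlib>

  size_t pickThreads(size_t warpSize, size_t sizeofsobj,
                     size_t sharedMemPerBlock, size_t maxThreadsPerBlock) {
    size_t threads = warpSize;
    // Guard added by the patch: a block cannot use fewer than warpSize
    // threads, so one warp's shared-memory footprint must fit, or we abort.
    if (threads * sizeofsobj > sharedMemPerBlock) {
      fprintf(stderr, "The object is too large for the shared memory.\n");
      exit(EXIT_FAILURE);
    }
    // Double the block size while both the shared-memory and the
    // threads-per-block limits are respected.
    while (2 * threads * sizeofsobj < sharedMemPerBlock &&
           2 * threads <= maxThreadsPerBlock)
      threads *= 2;
    return threads;
  }

  // Example: warpSize 32, 1 KiB objects, 48 KiB shared memory, 1024 threads max:
  // 32 threads need 32 KiB (fits); 64 would need 64 KiB (does not); returns 32.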