From 4f89f603aef1329def2d80da1d05847ee771cb64 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Fri, 4 Apr 2025 18:34:36 -0400
Subject: [PATCH] Changes to add back shared memory test on GPU

---
 Grid/communicator/SharedMemoryMPI.cc | 18 +++------
 Grid/threads/Accelerator.h           | 55 ++++++++++++++++++++++------
 2 files changed, 48 insertions(+), 25 deletions(-)

diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc
index 1efa6eed..c61470f5 100644
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -959,6 +959,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
     MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
     ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
+    // std::cerr << " SetCommunicator rank "<<r<<" comm "<<ShmCommBufs[r] <<std::endl;
   }
   ShmBufferFreeAll();
@@ -1011,29 +1012,20 @@ void SharedMemory::SharedMemoryTest(void)
       check[0]=GlobalSharedMemory::WorldNode;
       check[1]=r;
       check[2]=magic;
-      // std::cerr << " ShmRank "<<ShmRank<<" storing "<<GlobalSharedMemory::WorldNode<<","<<r<<","<<std::hex<<magic<<" to buf "<<ShmCommBufs[r]
-      //           <<std::dec<<std::endl;
-      acceleratorPut(ShmCommBufs[r][0],check[0]);
-      acceleratorPut(ShmCommBufs[r][1],check[1]);
-      acceleratorPut(ShmCommBufs[r][2],check[2]);
-      // GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t));
+      acceleratorCopyToDevice(check,ShmCommBufs[r],3*sizeof(uint64_t));
     }
   }
   ShmBarrier();
   for(uint64_t r=0;r<ShmSize;r++){
-    ShmBarrier();
-    // GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
+    acceleratorCopyFromDevice(ShmCommBufs[r],check,3*sizeof(uint64_t));
+    // accelerator_barrier();
     // std::cerr << " ShmRank "<<ShmRank<<" read "<<check[0]<<","<<check[1]<<","<<std::hex<<check[2]<<" from buf "<<ShmCommBufs[r]
     //           <<std::dec<<std::endl;
-    check[0] = acceleratorGet(ShmCommBufs[r][0]);
-    check[1] = acceleratorGet(ShmCommBufs[r][1]);
-    check[2] = acceleratorGet(ShmCommBufs[r][2]);
-    ShmBarrier();
     assert(check[0]==GlobalSharedMemory::WorldNode);
     assert(check[1]==r);
     assert(check[2]==magic);
-    ShmBarrier();
   }
+  ShmBarrier();
 }
 
 void *SharedMemory::ShmBuffer(int rank)
diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index b2a40e7b..0cf887f9 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -242,19 +242,33 @@ inline void *acceleratorAllocDevice(size_t bytes)
   return ptr;
 };
+typedef int acceleratorEvent_t;
+
 inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
 inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);};
 inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
 inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
-inline void acceleratorCopyToDeviceAsync(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
-inline void acceleratorCopyFromDeviceAsync(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
-inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch
+inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
+  acceleratorCopyToDevice(from,to,bytes);
+  return 0;
+}
+inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
+  acceleratorCopyFromDevice(from,to,bytes);
+  return 0;
+}
+inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch
 {
   cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
+  return 0;
 }
 inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
+inline void acceleratorEventWait(acceleratorEvent_t ev)
+{
+  //auto discard=cudaStreamSynchronize(ev);
+}
+inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}
 
 inline int acceleratorIsCommunicable(void *ptr)
@@ -478,7 +492,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
 inline void *acceleratorAllocHost(size_t bytes)
 {
   void *ptr=NULL;
-  auto err = hipMallocHost((void **)&ptr,bytes);
+  auto err = hipHostMalloc((void **)&ptr,bytes);
   if( err != hipSuccess ) {
     ptr = (void *) NULL;
     fprintf(stderr," hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
@@ -516,18 +530,30 @@ inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ a
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}
 
-inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch
+typedef int acceleratorEvent_t;
+
+inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch
 {
   auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream);
+  return 0;
 }
-inline void acceleratorCopyToDeviceAsync(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
-  auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream);
+inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
+  acceleratorCopyToDevice(from,to,bytes);
+  return 0;
 }
-inline void acceleratorCopyFromDeviceAsync(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
-  auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream);
+inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
+  acceleratorCopyFromDevice(from,to,bytes);
+  return 0;
 }
 inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); };
+inline void acceleratorEventWait(acceleratorEvent_t ev)
+{
+  // auto discard=hipStreamSynchronize(ev);
+}
+inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}
+
+
 #endif
 
 inline void acceleratorPin(void *ptr,unsigned long bytes)
@@ -564,6 +590,8 @@ inline void acceleratorPin(void *ptr,unsigned long bytes)
 
 #undef GRID_SIMT
 
+typedef int acceleratorEvent_t;
+
 inline void acceleratorMem(void)
 {
 /*
@@ -583,9 +611,12 @@ inline void acceleratorMem(void)
 accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
 inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); }
 inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
-inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes);}
+inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(const void *from,void *to,size_t bytes) { acceleratorCopyToDevice(from,to,bytes); return 0; }
+inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(const void *from,void *to,size_t bytes) { acceleratorCopyFromDevice(from,to,bytes); return 0; }
+inline void acceleratorEventWait(acceleratorEvent_t ev){}
+inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev); return 1;}
+inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); return 0;}
+
 inline void acceleratorCopySynchronise(void) {};
 inline int acceleratorIsCommunicable(void *ptr){ return 1; }
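
Usage note (not part of the patch): the sketch below shows how the event-returning copy API added to Accelerator.h is driven from caller code, mirroring the round-trip done in SharedMemoryTest. It is illustrative only; the buffer size, the 0xdeadbeef fill value and the standalone main() are assumptions rather than Grid code, and it relies on the fact that with acceleratorEvent_t currently a plain int the wait is a no-op, so the copies are effectively synchronous (or are drained by acceleratorCopySynchronise() on the device-to-device path).

  // Illustrative round trip through the new Asynch/event API.
  // Assumes a Grid build with this patch applied; compile and link against libGrid as usual.
  #include <Grid/Grid.h>
  #include <cassert>
  #include <cstdint>
  #include <vector>

  using namespace Grid;

  int main(int argc, char **argv)
  {
    Grid_init(&argc, &argv);

    std::vector<uint64_t> check(3, 0xdeadbeefULL);   // host staging buffer (values illustrative)
    uint64_t *dev = (uint64_t *) acceleratorAllocDevice(3*sizeof(uint64_t));

    // Host -> device. The returned event is a placeholder int in the current implementation.
    acceleratorEvent_t ev = acceleratorCopyToDeviceAsynch(check.data(), dev, 3*sizeof(uint64_t));
    acceleratorEventWait(ev);                        // currently a no-op
    assert(acceleratorEventIsComplete(ev) == 1);

    // Device -> host, then drain copyStream (needed for the device-to-device path on CUDA/HIP).
    acceleratorCopyFromDeviceAsynch(dev, check.data(), 3*sizeof(uint64_t));
    acceleratorCopySynchronise();

    for (auto v : check) assert(v == 0xdeadbeefULL); // data survived the round trip

    acceleratorFreeDevice(dev);
    Grid_finalize();
    return 0;
  }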