Changes to add back shared memory test on GPU

2025-12-13 09:14:40 +00:00 · 2025-04-04 18:34:36 -04:00
parent 11dc2c5e1d
commit 4f89f603ae
2 changed files with 48 additions and 25 deletions
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -959,6 +959,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
    MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
    ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
    //    std::cerr << " SetCommunicator rank "<<r<<" comm "<<ShmCommBufs[r] <<std::endl;
  }
  ShmBufferFreeAll();
@@ -1011,29 +1012,20 @@ void SharedMemory::SharedMemoryTest(void)
       check[0]=GlobalSharedMemory::WorldNode;
       check[1]=r;
       check[2]=magic;
-       //       std::cerr << " ShmRank "<<ShmRank<<" storing "<<GlobalSharedMemory::WorldNode<<","<<r<<","<<std::hex<<magic<<" to buf "<<ShmCommBufs[r]
+       acceleratorCopyToDevice(check,ShmCommBufs[r],3*sizeof(uint64_t));
       //		 <<std::dec<<std::endl;
       acceleratorPut(ShmCommBufs[r][0],check[0]);
       acceleratorPut(ShmCommBufs[r][1],check[1]);
       acceleratorPut(ShmCommBufs[r][2],check[2]);
       //       GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t));
    }
  }
  ShmBarrier();
  for(uint64_t r=0;r<ShmSize;r++){
-    ShmBarrier();
+    acceleratorCopyFromDevice(ShmCommBufs[r],check,3*sizeof(uint64_t));
-    //    GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
+    //      accelerator_barrier();
    //    std::cerr << " ShmRank "<<ShmRank<<" read "<<check[0]<<","<<check[1]<<","<<std::hex<<check[2]<<" from buf "<<ShmCommBufs[r]
    //	      <<std::dec<<std::endl;
    check[0] = acceleratorGet(ShmCommBufs[r][0]);
    check[1] = acceleratorGet(ShmCommBufs[r][1]);
    check[2] = acceleratorGet(ShmCommBufs[r][2]);
    ShmBarrier();
    assert(check[0]==GlobalSharedMemory::WorldNode);
    assert(check[1]==r);
    assert(check[2]==magic);
    ShmBarrier();
  }
  ShmBarrier();
 }
 void *SharedMemory::ShmBuffer(int rank)
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -242,19 +242,33 @@ inline void *acceleratorAllocDevice(size_t bytes)
  return ptr;
 };
 typedef int acceleratorEvent_t;
 // Release helpers: each of the three forwards the pointer to cudaFree and ignores the returned status.
 inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
 // NOTE(review): the matching host allocator is not visible in this hunk. If it uses
 // cudaMallocHost/cudaHostAlloc, the release call should be cudaFreeHost rather than cudaFree -- confirm.
 inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);};
 // Host->device copy of `bytes` bytes through cudaMemcpy. Arguments are (from,to), i.e. the reverse of
 // cudaMemcpy's (dst,src) order; the returned status is ignored.
 inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
 // Device->host counterpart, same (from,to) convention, status ignored.
 inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
 // cudaMemcpyAsync variants issued on `stream` (copyStream when the caller passes none). Nothing here waits
 // for the copy, and the returned status is ignored.
 inline void acceleratorCopyToDeviceAsync(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
 inline void acceleratorCopyFromDeviceAsync(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
 // Fills `bytes` bytes at `base` with `value` via cudaMemset; status ignored.
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
-inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch
+// Copy `bytes` bytes from host memory `from` to device memory `to` and return a placeholder event (always 0).
+// The copy is delegated to acceleratorCopyToDevice (plain cudaMemcpy); it is not enqueued on `stream`,
+// which is accepted only so the signature lines up with the other Asynch helpers.
+inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
  // acceleratorCopyToDevice takes exactly (from,to,bytes) and supplies cudaMemcpyHostToDevice itself.
  acceleratorCopyToDevice(from,to,bytes);
  return 0;
 }
 // Device->host copy with an event-returning signature. Despite the Asynch name the body forwards to
 // acceleratorCopyFromDevice; `stream` is not used and the returned event is always 0.
 inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
  acceleratorCopyFromDevice(from,to,bytes);
  return 0;
 }
 // Issues a device-to-device cudaMemcpyAsync of `bytes` bytes on copyStream and returns 0 as the event.
 // The function does not wait for the copy; the cudaMemcpyAsync status is ignored.
 inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch
 {
  cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
  return 0;
 }
 // Waits on copyStream via cudaStreamSynchronize; the status is ignored.
 inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
 // Currently a no-op: the only statement in the body is commented out, so `ev` is unused.
 inline void acceleratorEventWait(acceleratorEvent_t ev)
 {
  //auto discard=cudaStreamSynchronize(ev);
 }
 // Calls acceleratorEventWait (a no-op above) and then unconditionally reports completion (1).
 inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}
 inline int  acceleratorIsCommunicable(void *ptr)
@@ -478,7 +492,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
 inline void *acceleratorAllocHost(size_t bytes)
 {
  void *ptr=NULL;
-  auto err = hipMallocHost((void **)&ptr,bytes);
+  auto err = hipHostMalloc((void **)&ptr,bytes);
  if( err != hipSuccess ) {
    ptr = (void *) NULL;
    fprintf(stderr," hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
@@ -516,18 +530,30 @@ inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ a
 // Fills `bytes` bytes at `base` with `value` via hipMemset; the status is stored in `discard` and not examined.
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}
-inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch
+typedef int acceleratorEvent_t;
 // Issues hipMemcpyDtoDAsync for `bytes` bytes on copyStream and returns 0 as the event.
 // The function does not wait for the copy; the status lands in `discard` and is not examined.
 inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch
 {
  auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream);
  return 0;
 }
-inline void acceleratorCopyToDeviceAsync(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
+inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
  // Added line forwards to acceleratorCopyToDevice(from,to,bytes); on that path `stream` is not used.
  // The removed line issued hipMemcpyAsync on `stream` instead. The returned event is always 0.
-  auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream);
+  acceleratorCopyToDevice(from,to,bytes);
  return 0;
 }
-inline void acceleratorCopyFromDeviceAsync(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
+inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
  // Added line forwards to acceleratorCopyFromDevice(from,to,bytes); on that path `stream` is not used.
  // The removed line issued hipMemcpyAsync on `stream` instead. The returned event is always 0.
-  auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream);
+  acceleratorCopyFromDevice(from,to,bytes);
  return 0;
 }
 // Waits on copyStream via hipStreamSynchronize; the status is stored in `discard` and not examined.
 inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); };
 // Currently a no-op: the only statement in the body is commented out, so `ev` is unused.
 inline void acceleratorEventWait(acceleratorEvent_t ev)
 {
  //  auto discard=hipStreamSynchronize(ev);
 }
 // Calls acceleratorEventWait (a no-op above) and then unconditionally reports completion (1).
 inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}
 #endif
 inline void acceleratorPin(void *ptr,unsigned long bytes)
@@ -564,6 +590,8 @@ inline void acceleratorPin(void *ptr,unsigned long bytes)
 #undef GRID_SIMT
 typedef int acceleratorEvent_t;
 inline void acceleratorMem(void)
 {
  /*
@@ -583,9 +611,12 @@ inline void acceleratorMem(void)
 // Helpers in this section (presumably the non-GPU fallback, given the thread_bcopy calls -- confirm):
 // the Asynch copy variants finish the copy inside the call by forwarding to the plain copy functions
 // or to thread_bcopy, and return 0 as the event. acceleratorEventWait has an empty body,
 // acceleratorEventIsComplete always returns 1, and acceleratorCopySynchronise does nothing.
 accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
-inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes); }
+inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(const void *from,void *to,size_t bytes)        { acceleratorCopyToDevice(from,to,bytes); return 0; }
-inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
+inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(const void *from,void *to,size_t bytes)      { acceleratorCopyFromDevice(from,to,bytes); return 0; }
-inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes);}
+inline void acceleratorEventWait(acceleratorEvent_t ev){}
 inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev); return 1;}
 inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes); return 0;}
 inline void acceleratorCopySynchronise(void) {};
 // Always reports the pointer as communicable (returns 1) without inspecting `ptr`.
 inline int  acceleratorIsCommunicable(void *ptr){ return 1; }