From 4f89f603aef1329def2d80da1d05847ee771cb64 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Fri, 4 Apr 2025 18:34:36 -0400
Subject: [PATCH] Changes to add back shared memory test on GPU

---
 Grid/communicator/SharedMemoryMPI.cc | 18 +++------
 Grid/threads/Accelerator.h           | 55 ++++++++++++++++++++++------
 2 files changed, 48 insertions(+), 25 deletions(-)

diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc
index 1efa6eed..c61470f5 100644
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -959,6 +959,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
     MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
 
     ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
+    //    std::cerr << " SetCommunicator rank "<<r<<" comm "<<ShmCommBufs[r] <<std::endl;
   }
   ShmBufferFreeAll();
 
@@ -1011,29 +1012,20 @@ void SharedMemory::SharedMemoryTest(void)
        check[0]=GlobalSharedMemory::WorldNode;
        check[1]=r;
        check[2]=magic;
-       //       std::cerr << " ShmRank "<<ShmRank<<" storing "<<GlobalSharedMemory::WorldNode<<","<<r<<","<<std::hex<<magic<<" to buf "<<ShmCommBufs[r]
-       //		 <<std::dec<<std::endl;
-       acceleratorPut(ShmCommBufs[r][0],check[0]);
-       acceleratorPut(ShmCommBufs[r][1],check[1]);
-       acceleratorPut(ShmCommBufs[r][2],check[2]);
-       //       GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t));
+       acceleratorCopyToDevice(check,ShmCommBufs[r],3*sizeof(uint64_t));
     }
   }
   ShmBarrier();
   for(uint64_t r=0;r<ShmSize;r++){
-    ShmBarrier();
-    //    GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
+    acceleratorCopyFromDevice(ShmCommBufs[r],check,3*sizeof(uint64_t));
+    //      accelerator_barrier();
     //    std::cerr << " ShmRank "<<ShmRank<<" read "<<check[0]<<","<<check[1]<<","<<std::hex<<check[2]<<" from buf "<<ShmCommBufs[r]
     //	      <<std::dec<<std::endl;
-    check[0] = acceleratorGet(ShmCommBufs[r][0]);
-    check[1] = acceleratorGet(ShmCommBufs[r][1]);
-    check[2] = acceleratorGet(ShmCommBufs[r][2]);
-    ShmBarrier();
     assert(check[0]==GlobalSharedMemory::WorldNode);
     assert(check[1]==r);
     assert(check[2]==magic);
-    ShmBarrier();
   }
+  ShmBarrier();
 }
 
 void *SharedMemory::ShmBuffer(int rank)
diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index b2a40e7b..0cf887f9 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -242,19 +242,33 @@ inline void *acceleratorAllocDevice(size_t bytes)
   return ptr;
 };
 
+typedef int acceleratorEvent_t;
+
 inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
 inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);};
 inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
 inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
-inline void acceleratorCopyToDeviceAsync(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
-inline void acceleratorCopyFromDeviceAsync(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
-inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch
+inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
+  acceleratorCopyToDevice(to,from,bytes, cudaMemcpyHostToDevice);
+  return 0;
+}
+inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
+  acceleratorCopyFromDevice(from,to,bytes);
+  return 0;
+}
+inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch
 {
   cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
+  return 0;
 }
 inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
+inline void acceleratorEventWait(acceleratorEvent_t ev)
+{
+  //auto discard=cudaStreamSynchronize(ev);
+}
+inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}
 
 
 inline int  acceleratorIsCommunicable(void *ptr)
@@ -478,7 +492,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
 inline void *acceleratorAllocHost(size_t bytes)
 {
   void *ptr=NULL;
-  auto err = hipMallocHost((void **)&ptr,bytes);
+  auto err = hipHostMalloc((void **)&ptr,bytes);
   if( err != hipSuccess ) {
     ptr = (void *) NULL;
     fprintf(stderr," hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
@@ -516,18 +530,30 @@ inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ a
 
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}
 
-inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch
+typedef int acceleratorEvent_t;
+
+inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch
 {
   auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream);
+  return 0;
 }
-inline void acceleratorCopyToDeviceAsync(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
-  auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream);
+inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
+  acceleratorCopyToDevice(from,to,bytes);
+  return 0;
 }
-inline void acceleratorCopyFromDeviceAsync(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
-  auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream);
+inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
+  acceleratorCopyFromDevice(from,to,bytes);
+  return 0;
 }
 inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); };
 
+inline void acceleratorEventWait(acceleratorEvent_t ev)
+{
+  //  auto discard=hipStreamSynchronize(ev);
+}
+inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}
+
+
 #endif
 
 inline void acceleratorPin(void *ptr,unsigned long bytes)
@@ -564,6 +590,8 @@ inline void acceleratorPin(void *ptr,unsigned long bytes)
 
 #undef GRID_SIMT
 
+typedef int acceleratorEvent_t;
+
 inline void acceleratorMem(void)
 {
   /*
@@ -583,9 +611,12 @@ inline void acceleratorMem(void)
 
 accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
 
-inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes); }
-inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
-inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes);}
+inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(const void *from,void *to,size_t bytes)        { acceleratorCopyToDevice(from,to,bytes); return 0; }
+inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(const void *from,void *to,size_t bytes)      { acceleratorCopyFromDevice(from,to,bytes); return 0; }
+inline void acceleratorEventWait(acceleratorEvent_t ev){}
+inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev); return 1;}
+inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes); return 0;}
+
 inline void acceleratorCopySynchronise(void) {};
 
 inline int  acceleratorIsCommunicable(void *ptr){ return 1; }