Peer to peer on GPU's setup

2024-09-20 17:25:37 +01:00 · 2018-09-10 11:26:20 +01:00 · 2018-09-10 11:26:20 +01:00 · f02c7ea534
commit f02c7ea534
parent bc503b60e6
2 changed files with 101 additions and 29 deletions
--- a/lib/communicator/SharedMemory.h
+++ b/lib/communicator/SharedMemory.h
@ -96,6 +96,8 @@ public:
  ///////////////////////////////////////////////////
  static void SharedMemoryAllocate(uint64_t bytes, int flags);
  static void SharedMemoryFree(void);
+  static void SharedMemoryCopy(void *dest,const void *src,size_t bytes);
+  static void SharedMemoryZero(void *dest,size_t bytes);

 };

@ -135,6 +137,7 @@ public:
  // Call on any instance
  ///////////////////////////////////////////////////
  void SharedMemoryTest(void);
+  
  void *ShmBufferSelf(void);
  void *ShmBuffer    (int rank);
  void *ShmBufferTranslate(int rank,void * local_p);
--- a/lib/communicator/SharedMemoryMPI.cc
+++ b/lib/communicator/SharedMemoryMPI.cc
@ -28,6 +28,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 #include <Grid/GridCore.h>

+#ifdef GRID_NVCC
+#include <cuda_runtime_api.h>
+#endif
+
 NAMESPACE_BEGIN(Grid); 

 /*Construct from an MPI communicator*/
@ -42,15 +46,12 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
  /////////////////////////////////////////////////////////////////////
  // Split into groups that can share memory
  /////////////////////////////////////////////////////////////////////
-#ifdef GRID_NVCC
-  MPI_Comm_split(comm, WorldRank , 0,&WorldShmComm);
-#else 
  MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
-#endif
  MPI_Comm_rank(WorldShmComm     ,&WorldShmRank);
  MPI_Comm_size(WorldShmComm     ,&WorldShmSize);
-  std::cout << " World  communicator of size " <<WorldSize << std::endl;
-  std::cout << " Shared communicator of size " <<WorldShmSize << std::endl;
+
+  std::cout << " GRID initialisation: World communicator of size " <<WorldSize << std::endl;  
+  std::cout << " GRID initialisation: Node  communicator of size " <<WorldShmSize << std::endl;
  // WorldShmComm, WorldShmSize, WorldShmRank

  // WorldNodes
@ -192,26 +193,74 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);

+  //////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // allocate the pointer array for shared windows for our group
+  //////////////////////////////////////////////////////////////////////////////////////////////////////////
+  MPI_Barrier(WorldShmComm);
+  WorldShmCommBufs.resize(WorldShmSize);

-  //std::cerr << " allocating "<<bytes <<"  bytes "<< std::endl;
-  auto err =  cudaMallocManaged(&ShmCommBuf, bytes);
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Each MPI rank should allocate our own buffer
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
+  auto err =  cudaMalloc(&ShmCommBuf, bytes);
  if ( err !=  cudaSuccess) {
    std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
    exit(EXIT_FAILURE);  
  }
-
  if (ShmCommBuf == (void *)NULL ) {
    std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed NULL pointer for " << bytes<<" bytes " << std::endl;
    exit(EXIT_FAILURE);  
  }
-  std::cerr << " Cuda allocated managed memory at "<<std::hex<<ShmCommBuf <<" - " << ((uint64_t)ShmCommBuf + bytes)<<std::dec<< std::endl;
-  bzero(ShmCommBuf,bytes);
+  SharedMemoryZero(ShmCommBuf,bytes);
+
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Loop over ranks/gpu's on our node
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
+  for(int r=0;r<WorldShmSize;r++){
+    
+    //////////////////////////////////////////////////
+    // If it is me, pass around the IPC access key
+    //////////////////////////////////////////////////
+    cudaIpcMemHandle_t handle;
+
+    if ( r==WorldShmRank ) { 
+      err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
+      if ( err !=  cudaSuccess) {
+	std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
+	exit(EXIT_FAILURE);
+      }
+    }
+    //////////////////////////////////////////////////
+    // Share this IPC handle across the Shm Comm
+    //////////////////////////////////////////////////
+    { 
+      int ierr=MPI_Bcast(&handle,
+			 sizeof(handle),
+			 MPI_BYTE,
+			 r,
+			 WorldShmComm);
+      assert(ierr==0);
+    }
+    
+    ///////////////////////////////////////////////////////////////
+    // If I am not the source, overwrite thisBuf with remote buffer
+    ///////////////////////////////////////////////////////////////
+    void * thisBuf = ShmCommBuf;
+    if ( r!=WorldShmRank ) { 
+      err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
+      if ( err !=  cudaSuccess) {
+	std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
+	exit(EXIT_FAILURE);
+      }
+    }
+    ///////////////////////////////////////////////////////////////
+    // Save a copy of the device buffers
+    ///////////////////////////////////////////////////////////////
+    WorldShmCommBufs[r] = thisBuf;
+  }

-  WorldShmCommBufs.resize(1);
-  WorldShmCommBufs[0] = ShmCommBuf;
  _ShmAllocBytes=bytes;
  _ShmAlloc=1;
-  //std::cerr << " cudaMallocManaged Returning ; "<< bytes <<" bytes allocated at "<<std::hex<<ShmCommBuf <<std::dec<< std::endl;
 }
 #else 
 #ifdef GRID_MPI3_SHMMMAP
@ -326,7 +375,27 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
  _ShmAllocBytes = bytes;
 }
 #endif
+#endif // End NVCC case for GPU device buffers
+
+/////////////////////////////////////////////////////////////////////////
+// Routines accessing shared memory should route through for GPU safety
+/////////////////////////////////////////////////////////////////////////
+void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
+{
+#ifdef GRID_NVCC
+  cudaMemset(dest,0,bytes);
+#else
+  bzero(dest,bytes);
 #endif
+}
+void GlobalSharedMemory::SharedMemoryCopy(void *dest,const void *src,size_t bytes)
+{
+#ifdef GRID_NVCC
+  cudaMemcpy(dest,src,bytes,cudaMemcpyDefault);
+#else   
+  bcopy(src,dest,bytes);
+#endif
+}
 ////////////////////////////////////////////////////////
 // Global shared functionality finished
 // Now move to per communicator functionality
@ -341,11 +410,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
  /////////////////////////////////////////////////////////////////////
  // Split into groups that can share memory
  /////////////////////////////////////////////////////////////////////
-#ifdef GRID_NVCC
-  MPI_Comm_split(comm, rank, 0,&ShmComm);
-#else 
  MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
-#endif
  MPI_Comm_rank(ShmComm     ,&ShmRank);
  MPI_Comm_size(ShmComm     ,&ShmSize);
  ShmCommBufs.resize(ShmSize);
@ -374,6 +439,8 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)

  std::vector<int> ranks(size);   for(int r=0;r<size;r++) ranks[r]=r;
  MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]); 
+
+  SharedMemoryTest();
 }
 //////////////////////////////////////////////////////////////////
 // On node barrier
@ -388,24 +455,26 @@ void SharedMemory::ShmBarrier(void)
 void SharedMemory::SharedMemoryTest(void)
 {
  ShmBarrier();
+  uint64_t check[3];
+  uint64_t magic = 0x5A5A5A;
  if ( ShmRank == 0 ) {
-    for(int r=0;r<ShmSize;r++){
-      uint64_t * check = (uint64_t *) ShmCommBufs[r];
-      check[0] = GlobalSharedMemory::WorldNode;
-      check[1] = r;
-      check[2] = 0x5A5A5A;
+    for(uint64_t r=0;r<ShmSize;r++){
+       check[0]=GlobalSharedMemory::WorldNode;
+       check[1]=r;
+       check[2]=magic;
+       GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t));
    }
  }
  ShmBarrier();
-  for(int r=0;r<ShmSize;r++){
-    uint64_t * check = (uint64_t *) ShmCommBufs[r];
-    
+  for(uint64_t r=0;r<ShmSize;r++){
+    ShmBarrier();
+    GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
+    ShmBarrier();
    assert(check[0]==GlobalSharedMemory::WorldNode);
    assert(check[1]==r);
-    assert(check[2]==0x5A5A5A);
-    
+    assert(check[2]==magic);
+    ShmBarrier();
  }
-  ShmBarrier();
 }

 void *SharedMemory::ShmBuffer(int rank)