
Some prep work for GPU shared memory. Need to be careful, as we will try GPUDirect RDMA and inter-GPU memory sharing on Summit later.
paboyle 2018-06-13 20:24:06 +01:00
parent 2075b177ef
commit 94d1ae4c82


@@ -42,9 +42,14 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
   /////////////////////////////////////////////////////////////////////
   // Split into groups that can share memory
   /////////////////////////////////////////////////////////////////////
+#ifdef GRID_NVCC
+  MPI_Comm_split(comm, WorldRank , 0,&WorldShmComm);
+#else
   MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
+#endif
   MPI_Comm_rank(WorldShmComm ,&WorldShmRank);
   MPI_Comm_size(WorldShmComm ,&WorldShmSize);
+  std::cout << " Shared communicator of size " <<WorldShmSize << std::endl;
   // WorldShmComm, WorldShmSize, WorldShmRank
   // WorldNodes
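The GRID_NVCC branch above swaps the node-level MPI_Comm_split_type(MPI_COMM_TYPE_SHARED, ...) split for a plain MPI_Comm_split keyed on WorldRank, so every rank lands in its own single-member shared-memory communicator and no host shared-memory window is assumed between ranks. A minimal standalone sketch (not part of this commit; names and output are illustrative) contrasting the two splits:

#include <mpi.h>
#include <cstdio>

// Illustrative only: contrast a per-node split (ranks on one node share
// the communicator) with a per-rank split (each rank alone), as in the
// GRID_NVCC branch of GlobalSharedMemory::Init above.
int main(int argc, char **argv)
{
  MPI_Init(&argc, &argv);
  int world_rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

  // Per-node split: all ranks sharing host memory land in node_comm.
  MPI_Comm node_comm;
  MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                      MPI_INFO_NULL, &node_comm);

  // Per-rank split: using the world rank as the colour gives every rank
  // a distinct colour, hence a communicator of size 1.
  MPI_Comm self_comm;
  MPI_Comm_split(MPI_COMM_WORLD, world_rank, 0, &self_comm);

  int node_size, self_size;
  MPI_Comm_size(node_comm, &node_size);
  MPI_Comm_size(self_comm, &self_size);   // always 1
  printf("rank %d: per-node size %d, per-rank size %d\n",
         world_rank, node_size, self_size);

  MPI_Comm_free(&self_comm);
  MPI_Comm_free(&node_comm);
  MPI_Finalize();
  return 0;
}

With a size-1 shared communicator every remote rank is reached through MPI rather than a host mmap window, which matches the commit message's note that GPUDirect RDMA and inter-GPU sharing are to be tried later.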
@@ -179,6 +184,34 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Hugetlbfs mapping intended
 ////////////////////////////////////////////////////////////////////////////////////////////
+#ifdef GRID_NVCC
+void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
+{
+  void * ShmCommBuf ;
+  assert(_ShmSetup==1);
+  assert(_ShmAlloc==0);
+  //std::cerr << " allocating "<<bytes <<" bytes "<< std::endl;
+  if ( cudaMallocManaged(&ShmCommBuf, bytes) != cudaSuccess) {
+    perror("cudaMallocManaged failed ");
+    exit(EXIT_FAILURE);
+  }
+  if (ShmCommBuf == (void *)NULL ) {
+    perror("cudaMallocManaged failed ");
+    exit(EXIT_FAILURE);
+  }
+  std::cerr << " Cuda allocated managed memory at "<<std::hex<<ShmCommBuf <<" - " << ((uint64_t)ShmCommBuf + bytes)<<std::dec<< std::endl;
+  bzero(ShmCommBuf,bytes);
+  WorldShmCommBufs.resize(1);
+  WorldShmCommBufs[0] = ShmCommBuf;
+  _ShmAllocBytes=bytes;
+  _ShmAlloc=1;
+  //std::cerr << " cudaMallocManaged Returning ; "<< bytes <<" bytes allocated at "<<std::hex<<ShmCommBuf <<std::dec<< std::endl;
+}
+#else
 #ifdef GRID_MPI3_SHMMMAP
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
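Under GRID_NVCC the SharedMemoryAllocate above replaces the mmap/shm-based paths with a single cudaMallocManaged buffer stored as WorldShmCommBufs[0]. A standalone sketch of the same allocation pattern, assuming the CUDA runtime API; the function and variable names here are illustrative, and the error path uses cudaGetErrorString because cudaMallocManaged reports failures through its cudaError_t return value rather than errno:

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>
#include <cstring>

// Illustrative only: allocate a zeroed, unified (managed) communication
// buffer, mirroring the GRID_NVCC SharedMemoryAllocate path above.
static void *AllocateManagedCommBuffer(size_t bytes)
{
  void *buf = nullptr;
  cudaError_t err = cudaMallocManaged(&buf, bytes);
  if (err != cudaSuccess || buf == nullptr) {
    fprintf(stderr, "cudaMallocManaged(%zu) failed: %s\n",
            bytes, cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }
  // Managed memory is visible to both host and device; zero it from the host.
  memset(buf, 0, bytes);
  return buf;
}

Managed memory lets the same pointer be handed to MPI on the host and to kernels on the device, a convenient stand-in until the GPUDirect RDMA and inter-GPU sharing mentioned in the commit message are wired in.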
@@ -291,7 +324,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
   _ShmAllocBytes = bytes;
 }
 #endif
+#endif
 ////////////////////////////////////////////////////////
 // Global shared functionality finished
 // Now move to per communicator functionality
@@ -306,7 +339,11 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
   /////////////////////////////////////////////////////////////////////
   // Split into groups that can share memory
   /////////////////////////////////////////////////////////////////////
+#ifdef GRID_NVCC
+  MPI_Comm_split(comm, rank, 0,&ShmComm);
+#else
   MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
+#endif
   MPI_Comm_rank(ShmComm ,&ShmRank);
   MPI_Comm_size(ShmComm ,&ShmSize);
   ShmCommBufs.resize(ShmSize);
@@ -318,11 +355,11 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
   heap_size = GlobalSharedMemory::ShmAllocBytes();
   for(int r=0;r<ShmSize;r++){
-    uint32_t sr = (r==ShmRank) ? GlobalSharedMemory::WorldRank : 0 ;
-    MPI_Allreduce(MPI_IN_PLACE,&sr,1,MPI_UINT32_T,MPI_SUM,comm);
-    ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[sr];
+    uint32_t wsr = (r==ShmRank) ? GlobalSharedMemory::WorldShmRank : 0 ;
+    MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,comm);
+    ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
   }
   ShmBufferFreeAll();
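The hunk above fixes the buffer bookkeeping to exchange WorldShmRank rather than WorldRank, so ShmCommBufs[r] picks the right entry of WorldShmCommBufs. The exchange itself is an in-place allreduce in which only the owning rank contributes a non-zero value; a generic standalone sketch of that pattern (hypothetical values, not Grid code):

#include <mpi.h>
#include <cstdint>
#include <cstdio>
#include <vector>

// Illustrative only: "one-hot" allreduce.  In iteration r only rank r
// contributes its value, everyone else contributes 0, so after MPI_SUM
// every rank holds rank r's value.
int main(int argc, char **argv)
{
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  uint32_t my_value = 100 + rank;          // stand-in for WorldShmRank
  std::vector<uint32_t> gathered(size);

  for (int r = 0; r < size; r++) {
    uint32_t v = (r == rank) ? my_value : 0;
    MPI_Allreduce(MPI_IN_PLACE, &v, 1, MPI_UINT32_T, MPI_SUM, MPI_COMM_WORLD);
    gathered[r] = v;                       // now known on every rank
  }

  if (rank == 0)
    for (int r = 0; r < size; r++)
      printf("value of rank %d = %u\n", r, (unsigned)gathered[r]);

  MPI_Finalize();
  return 0;
}

The loop in SetCommunicator uses the same trick to publish each shared-memory rank's index into WorldShmCommBufs.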
@@ -380,7 +417,6 @@ void *SharedMemory::ShmBuffer(int rank)
 }
 void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
 {
-  static int count =0;
   int gpeer = ShmRanks[rank];
   assert(gpeer!=ShmRank); // never send to self
   if (gpeer == MPI_UNDEFINED){