1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-04 19:25:56 +01:00

Level 0 IPC set up

This commit is contained in:
peterx.a.boyle 2021-08-10 05:35:15 -07:00
parent 29a22ae603
commit 50181f16e5
3 changed files with 62 additions and 6 deletions

View File

@ -73,6 +73,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
WorldNodes = WorldSize/WorldShmSize; WorldNodes = WorldSize/WorldShmSize;
assert( (WorldNodes * WorldShmSize) == WorldSize ); assert( (WorldNodes * WorldShmSize) == WorldSize );
// FIXME: Check all WorldShmSize are the same ? // FIXME: Check all WorldShmSize are the same ?
///////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////
@ -451,7 +452,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
#if defined(GRID_CUDA) ||defined(GRID_HIP) || defined(GRID_SYCL) #if defined(GRID_CUDA) ||defined(GRID_HIP) || defined(GRID_SYCL)
#if defined(GRID_SYCL) //if defined(GRID_SYCL)
#if 0
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
void * ShmCommBuf ; void * ShmCommBuf ;
@ -488,7 +490,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
} }
#endif #endif
#if defined(GRID_CUDA) ||defined(GRID_HIP) #if defined(GRID_CUDA) ||defined(GRID_HIP) ||defined(GRID_SYCL)
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
void * ShmCommBuf ; void * ShmCommBuf ;
@ -511,8 +513,16 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer // Each MPI rank should allocate our own buffer
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
auto zeDevice = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device());
auto zeContext= cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context());
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
ze_device_mem_alloc_desc_t zeDesc = {};
zeMemAllocDevice(zeContext,&zeDesc,bytes,2*1024*1024,zeDevice,&ShmCommBuf);
std::cout << WorldRank << header " SharedMemoryMPI.cc zeMemAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
#else
ShmCommBuf = acceleratorAllocDevice(bytes); ShmCommBuf = acceleratorAllocDevice(bytes);
#endif
if (ShmCommBuf == (void *)NULL ) { if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl; std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
@ -522,8 +532,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl; << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
} }
SharedMemoryZero(ShmCommBuf,bytes); // SharedMemoryZero(ShmCommBuf,bytes);
std::cout<< "Setting up IPC"<<std::endl;
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Loop over ranks/gpu's on our node // Loop over ranks/gpu's on our node
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -533,6 +543,23 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
////////////////////////////////////////////////// //////////////////////////////////////////////////
// If it is me, pass around the IPC access key // If it is me, pass around the IPC access key
////////////////////////////////////////////////// //////////////////////////////////////////////////
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
ze_ipc_mem_handle_t handle;
if ( r==WorldShmRank ) {
auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&handle);
if ( err != ZE_RESULT_SUCCESS ) {
std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE);
} else {
std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
}
std::cerr<<"Allocated IpcHandle rank "<<r<<" (hex) ";
for(int c=0;c<ZE_MAX_IPC_HANDLE_SIZE;c++){
std::cerr<<std::hex<<(uint32_t)((uint8_t)handle.data[c])<<std::dec;
}
std::cerr<<std::endl;
}
#endif
#ifdef GRID_CUDA #ifdef GRID_CUDA
cudaIpcMemHandle_t handle; cudaIpcMemHandle_t handle;
if ( r==WorldShmRank ) { if ( r==WorldShmRank ) {
@ -569,6 +596,25 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
// If I am not the source, overwrite thisBuf with remote buffer // If I am not the source, overwrite thisBuf with remote buffer
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
void * thisBuf = ShmCommBuf; void * thisBuf = ShmCommBuf;
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
if ( r!=WorldShmRank ) {
thisBuf = nullptr;
std::cerr<<"Using IpcHandle rank "<<r<<" ";
for(int c=0;c<ZE_MAX_IPC_HANDLE_SIZE;c++){
std::cerr<<std::hex<<(uint32_t)((uint8_t)handle.data[c])<<std::dec;
}
std::cerr<<std::endl;
auto err = zeMemOpenIpcHandle(zeContext,zeDevice,handle,0,&thisBuf);
if ( err != ZE_RESULT_SUCCESS ) {
std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE);
} else {
std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
}
assert(thisBuf!=nullptr);
}
#endif
#ifdef GRID_CUDA #ifdef GRID_CUDA
if ( r!=WorldShmRank ) { if ( r!=WorldShmRank ) {
auto err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess); auto err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);

View File

@ -171,7 +171,6 @@ void acceleratorInit(void)
#ifdef GRID_SYCL #ifdef GRID_SYCL
cl::sycl::queue *theGridAccelerator; cl::sycl::queue *theGridAccelerator;
void acceleratorInit(void) void acceleratorInit(void)
{ {
int nDevices = 1; int nDevices = 1;
@ -179,6 +178,10 @@ void acceleratorInit(void)
cl::sycl::device selectedDevice { selector }; cl::sycl::device selectedDevice { selector };
theGridAccelerator = new sycl::queue (selectedDevice); theGridAccelerator = new sycl::queue (selectedDevice);
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
zeInit(0);
#endif
char * localRankStr = NULL; char * localRankStr = NULL;
int rank = 0, world_rank=0; int rank = 0, world_rank=0;
#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK" #define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"

View File

@ -233,6 +233,13 @@ inline int acceleratorIsCommunicable(void *ptr)
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
#include <CL/sycl.hpp> #include <CL/sycl.hpp>
#include <CL/sycl/usm.hpp> #include <CL/sycl/usm.hpp>
#define GRID_SYCL_LEVEL_ZERO_IPC
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
#include <level_zero/ze_api.h>
#include <CL/sycl/backend/level_zero.hpp>
#endif
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
extern cl::sycl::queue *theGridAccelerator; extern cl::sycl::queue *theGridAccelerator;