1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-13 01:05:36 +00:00

Merge branch 'develop' of https://github.com/paboyle/Grid into develop

This commit is contained in:
Peter Boyle 2021-09-22 06:03:06 -07:00
commit cffc736bb3
2 changed files with 36 additions and 33 deletions

View File

@ -513,26 +513,16 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer // Each MPI rank should allocate our own buffer
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
auto zeDevice = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device());
auto zeContext= cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context());
ze_device_mem_alloc_desc_t zeDesc = {};
zeMemAllocDevice(zeContext,&zeDesc,bytes,2*1024*1024,zeDevice,&ShmCommBuf);
std::cout << WorldRank << header " SharedMemoryMPI.cc zeMemAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
#else
ShmCommBuf = acceleratorAllocDevice(bytes); ShmCommBuf = acceleratorAllocDevice(bytes);
#endif
if (ShmCommBuf == (void *)NULL ) { if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl; std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
// if ( WorldRank == 0 ){ if ( WorldRank == 0 ){
if ( 1 ){
std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl; << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
} }
// SharedMemoryZero(ShmCommBuf,bytes); SharedMemoryZero(ShmCommBuf,bytes);
std::cout<< "Setting up IPC"<<std::endl; std::cout<< "Setting up IPC"<<std::endl;
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Loop over ranks/gpu's on our node // Loop over ranks/gpu's on our node
@ -546,20 +536,24 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
void * thisBuf = ShmCommBuf; void * thisBuf = ShmCommBuf;
if(!Stencil_force_mpi) { if(!Stencil_force_mpi) {
#ifdef GRID_SYCL_LEVEL_ZERO_IPC #ifdef GRID_SYCL_LEVEL_ZERO_IPC
ze_ipc_mem_handle_t handle; typedef struct { int fd; pid_t pid ; } clone_mem_t;
auto zeDevice = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device());
auto zeContext = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context());
ze_ipc_mem_handle_t ihandle;
clone_mem_t handle;
if ( r==WorldShmRank ) { if ( r==WorldShmRank ) {
auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&handle); auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&ihandle);
if ( err != ZE_RESULT_SUCCESS ) { if ( err != ZE_RESULT_SUCCESS ) {
std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} else { } else {
std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
} }
std::cerr<<"Allocated IpcHandle rank "<<r<<" (hex) "; memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
for(int c=0;c<ZE_MAX_IPC_HANDLE_SIZE;c++){ handle.pid = getpid();
std::cerr<<std::hex<<(uint32_t)((uint8_t)handle.data[c])<<std::dec;
}
std::cerr<<std::endl;
} }
#endif #endif
#ifdef GRID_CUDA #ifdef GRID_CUDA
@ -602,18 +596,27 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef GRID_SYCL_LEVEL_ZERO_IPC #ifdef GRID_SYCL_LEVEL_ZERO_IPC
if ( r!=WorldShmRank ) { if ( r!=WorldShmRank ) {
thisBuf = nullptr; thisBuf = nullptr;
std::cerr<<"Using IpcHandle rank "<<r<<" "; std::cout<<"mapping seeking remote pid/fd "
for(int c=0;c<ZE_MAX_IPC_HANDLE_SIZE;c++){ <<handle.pid<<"/"
std::cerr<<std::hex<<(uint32_t)((uint8_t)handle.data[c])<<std::dec; <<handle.fd<<std::endl;
}
std::cerr<<std::endl; int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
auto err = zeMemOpenIpcHandle(zeContext,zeDevice,handle,0,&thisBuf); std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
// int myfd = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
int myfd = syscall(438,pidfd,handle.fd,0);
std::cout<<"Using IpcHandle myfd "<<myfd<<"\n";
memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));
auto err = zeMemOpenIpcHandle(zeContext,zeDevice,ihandle,0,&thisBuf);
if ( err != ZE_RESULT_SUCCESS ) { if ( err != ZE_RESULT_SUCCESS ) {
std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl; std::cout << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} else { } else {
std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle pointer is "<<std::hex<<thisBuf<<std::dec<<std::endl;
} }
assert(thisBuf!=nullptr); assert(thisBuf!=nullptr);
} }
@ -640,7 +643,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
// Save a copy of the device buffers // Save a copy of the device buffers
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
} }
WorldShmCommBufs[r] = thisBuf; WorldShmCommBufs[r] = thisBuf;
#else #else
WorldShmCommBufs[r] = ShmCommBuf; WorldShmCommBufs[r] = ShmCommBuf;
#endif #endif

View File

@ -276,7 +276,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
if(nt < 8)nt=8; \ if(nt < 8)nt=8; \
cl::sycl::range<3> local {nt,1,nsimd}; \ cl::sycl::range<3> local {nt,1,nsimd}; \
cl::sycl::range<3> global{unum1,unum2,nsimd}; \ cl::sycl::range<3> global{unum1,unum2,nsimd}; \
cgh.parallel_for<class dslash>( \ cgh.parallel_for( \
cl::sycl::nd_range<3>(global,local), \ cl::sycl::nd_range<3>(global,local), \
[=] (cl::sycl::nd_item<3> item) /*mutable*/ \ [=] (cl::sycl::nd_item<3> item) /*mutable*/ \
[[intel::reqd_sub_group_size(8)]] \ [[intel::reqd_sub_group_size(8)]] \
@ -445,7 +445,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA spec
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);} inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);} inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);}
inline void acceleratorCopySynchronize(void) {}; inline void acceleratorCopySynchronise(void) {};
inline int acceleratorIsCommunicable(void *ptr){ return 1; } inline int acceleratorIsCommunicable(void *ptr){ return 1; }
inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);} inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);}