Mirror of https://github.com/paboyle/Grid.git (synced 2024-11-13 01:05:36 +00:00)

Commit cffc736bb3: Merge branch 'develop' of https://github.com/paboyle/Grid into develop
@@ -513,26 +513,16 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
   ///////////////////////////////////////////////////////////////////////////////////////////////////////////
   // Each MPI rank should allocate our own buffer
   ///////////////////////////////////////////////////////////////////////////////////////////////////////////
-#ifdef GRID_SYCL_LEVEL_ZERO_IPC
-  auto zeDevice = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device());
-  auto zeContext= cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context());
-  ze_device_mem_alloc_desc_t zeDesc = {};
-  zeMemAllocDevice(zeContext,&zeDesc,bytes,2*1024*1024,zeDevice,&ShmCommBuf);
-  std::cout << WorldRank << header " SharedMemoryMPI.cc zeMemAllocDevice "<< bytes
-            << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
-#else
   ShmCommBuf = acceleratorAllocDevice(bytes);
-#endif
   if (ShmCommBuf == (void *)NULL ) {
     std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
     exit(EXIT_FAILURE);
   }
-  //  if ( WorldRank == 0 ){
-  if ( 1 ){
+  if ( WorldRank == 0 ){
     std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
               << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
   }
-  //  SharedMemoryZero(ShmCommBuf,bytes);
+  SharedMemoryZero(ShmCommBuf,bytes);
   std::cout<< "Setting up IPC"<<std::endl;
   ///////////////////////////////////////////////////////////////////////////////////////////////////////////
   // Loop over ranks/gpu's on our node
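This first hunk in SharedMemoryMPI.cc drops the Level-Zero-specific zeMemAllocDevice path: every rank now obtains its comms buffer through the generic acceleratorAllocDevice call, only rank 0 prints the allocation message, and SharedMemoryZero is re-enabled. A stand-alone sketch of that allocate / check / zero pattern follows (not Grid code; acceleratorAllocDevice and acceleratorMemSet are stubbed with malloc/memset so it compiles without a GPU back-end, whereas in Grid they dispatch to the active back-end):

// Stand-alone sketch, not Grid code.
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>

static void *acceleratorAllocDevice(uint64_t bytes) { return std::malloc(bytes); }      // stand-in
static void  acceleratorMemSet(void *p, int v, uint64_t b) { std::memset(p, v, b); }    // stand-in

int main(void)
{
  uint64_t bytes = 2ULL*1024*1024;                     // size of the comms buffer
  void *ShmCommBuf = acceleratorAllocDevice(bytes);    // one allocation per MPI rank
  if (ShmCommBuf == nullptr) {
    std::fprintf(stderr, "acceleratorAllocDevice failed for %llu bytes\n",
                 (unsigned long long)bytes);
    std::exit(EXIT_FAILURE);
  }
  acceleratorMemSet(ShmCommBuf, 0, bytes);             // plays the role of SharedMemoryZero
  std::printf("comms buffer at %p\n", ShmCommBuf);
  std::free(ShmCommBuf);
  return 0;
}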
@@ -546,20 +536,24 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
       void * thisBuf = ShmCommBuf;
       if(!Stencil_force_mpi) {
 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
-      ze_ipc_mem_handle_t handle;
+      typedef struct { int fd; pid_t pid ; } clone_mem_t;
+
+      auto zeDevice  = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device());
+      auto zeContext = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context());
+
+      ze_ipc_mem_handle_t ihandle;
+      clone_mem_t handle;
+
       if ( r==WorldShmRank ) {
-        auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&handle);
+        auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&ihandle);
         if ( err != ZE_RESULT_SUCCESS ) {
-          std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
+          std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
           exit(EXIT_FAILURE);
         } else {
-          std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
+          std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
         }
-        std::cerr<<"Allocated IpcHandle rank "<<r<<" (hex) ";
-        for(int c=0;c<ZE_MAX_IPC_HANDLE_SIZE;c++){
-          std::cerr<<std::hex<<(uint32_t)((uint8_t)handle.data[c])<<std::dec;
-        }
-        std::cerr<<std::endl;
+        memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
+        handle.pid = getpid();
       }
 #endif
 #ifdef GRID_CUDA
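The export side no longer ships the raw ze_ipc_mem_handle_t between ranks. It relies on the opaque handle returned by zeMemGetIpcHandle beginning with a dma-buf file descriptor: the exporting rank copies that fd out of the handle and pairs it with its own pid in the small POD struct clone_mem_t, which is what gets communicated to the other ranks on the node. A sketch of that packing step under the same layout assumption (not Grid code; ihandle is just an opaque byte buffer standing in for the Level Zero handle):

// Sketch of the fd/pid packing, assuming the first sizeof(int) bytes of the
// Level Zero IPC handle hold a dma-buf file descriptor (the assumption the patch makes).
#include <cstring>
#include <unistd.h>

typedef struct { int fd; pid_t pid; } clone_mem_t;

clone_mem_t pack_ipc_handle(const void *ihandle)   // ihandle: stand-in for ze_ipc_mem_handle_t
{
  clone_mem_t handle;
  std::memcpy(&handle.fd, ihandle, sizeof(int));   // extract the embedded dma-buf fd
  handle.pid = getpid();                           // exporter's pid, needed by pidfd_open on the importer
  return handle;
}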
@@ -602,18 +596,27 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
       if ( r!=WorldShmRank ) {
         thisBuf = nullptr;
-        std::cerr<<"Using IpcHandle rank "<<r<<" ";
-        for(int c=0;c<ZE_MAX_IPC_HANDLE_SIZE;c++){
-          std::cerr<<std::hex<<(uint32_t)((uint8_t)handle.data[c])<<std::dec;
-        }
-        std::cerr<<std::endl;
-        auto err = zeMemOpenIpcHandle(zeContext,zeDevice,handle,0,&thisBuf);
+        std::cout<<"mapping seeking remote pid/fd "
+                 <<handle.pid<<"/"
+                 <<handle.fd<<std::endl;
+
+        int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
+        std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
+        //      int myfd  = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
+        int myfd  = syscall(438,pidfd,handle.fd,0);
+
+        std::cout<<"Using IpcHandle myfd "<<myfd<<"\n";
+
+        memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));
+
+        auto err = zeMemOpenIpcHandle(zeContext,zeDevice,ihandle,0,&thisBuf);
         if ( err != ZE_RESULT_SUCCESS ) {
-          std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
-          std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
+          std::cout << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
+          std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
           exit(EXIT_FAILURE);
         } else {
-          std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
+          std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
+          std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle pointer is "<<std::hex<<thisBuf<<std::dec<<std::endl;
         }
         assert(thisBuf!=nullptr);
       }
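On the importing side the patch rebuilds a Level Zero IPC handle locally instead of reusing the exporter's: the remote rank's dma-buf fd is duplicated into the current process with pidfd_open followed by pidfd_getfd (invoked as raw syscall number 438 because older glibc provides no wrapper), and the resulting local fd is copied back into ihandle before zeMemOpenIpcHandle. A self-contained sketch of that fd-cloning step (not Grid code; needs Linux 5.6 or later and ptrace-level permission over the exporting process):

// Sketch: duplicate another process's file descriptor into this process.
#include <cstdio>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef SYS_pidfd_open
#define SYS_pidfd_open 434     // pidfd_open syscall number
#endif
#ifndef SYS_pidfd_getfd
#define SYS_pidfd_getfd 438    // pidfd_getfd syscall number (the literal 438 used in the patch)
#endif

int clone_remote_fd(pid_t remote_pid, int remote_fd)
{
  int pidfd = syscall(SYS_pidfd_open, remote_pid, 0);         // handle on the exporting process
  if (pidfd < 0) { std::perror("pidfd_open"); return -1; }
  int myfd = syscall(SYS_pidfd_getfd, pidfd, remote_fd, 0);   // local duplicate of the remote fd
  if (myfd < 0)  { std::perror("pidfd_getfd"); }
  close(pidfd);
  return myfd;                                                // feed this back into the IPC handle
}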
@@ -640,7 +643,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
       // Save a copy of the device buffers
       ///////////////////////////////////////////////////////////////
       }
       WorldShmCommBufs[r] = thisBuf;
 #else
       WorldShmCommBufs[r] = ShmCommBuf;
 #endif
@@ -276,7 +276,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
     if(nt < 8)nt=8;                                     \
     cl::sycl::range<3> local {nt,1,nsimd};              \
     cl::sycl::range<3> global{unum1,unum2,nsimd};       \
-    cgh.parallel_for<class dslash>(                     \
+    cgh.parallel_for(                                   \
       cl::sycl::nd_range<3>(global,local),              \
       [=] (cl::sycl::nd_item<3> item) /*mutable*/       \
       [[intel::reqd_sub_group_size(8)]]                 \
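Dropping the <class dslash> template argument relies on SYCL unnamed-lambda kernels (standard in SYCL 2020; older DPC++ needs -fsycl-unnamed-lambda) and removes the restriction of one fixed kernel name per macro expansion. A minimal sketch of an unnamed nd_range kernel in the same style as the macro (not Grid code; assumes a DPC++/oneAPI compiler where the cl::sycl namespace and [[intel::reqd_sub_group_size]] are available):

// Minimal unnamed-lambda nd_range kernel, mirroring the macro's launch shape.
#include <CL/sycl.hpp>

int main()
{
  cl::sycl::queue q;
  const size_t nt = 8, nsimd = 8;
  cl::sycl::range<3> local {nt, 1, nsimd};
  cl::sycl::range<3> global{64, 1, nsimd};              // must be divisible by local
  q.submit([&](cl::sycl::handler &cgh) {
    // No <class dslash> name: the lambda itself identifies the kernel.
    cgh.parallel_for(cl::sycl::nd_range<3>(global, local),
                     [=](cl::sycl::nd_item<3> item) [[intel::reqd_sub_group_size(8)]] {
                       /* kernel body elided */
                     });
  });
  q.wait();
  return 0;
}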
@@ -445,7 +445,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA spec
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);}
 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);}
-inline void acceleratorCopySynchronize(void) {};
+inline void acceleratorCopySynchronise(void) {};

 inline int acceleratorIsCommunicable(void *ptr){ return 1; }
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);}
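This last hunk only renames the host-fallback no-op from acceleratorCopySynchronize to acceleratorCopySynchronise, matching the spelling used elsewhere in the accelerator API so non-GPU builds keep compiling. A small usage sketch of the asynchronous-copy / synchronise pairing against the host fallback (not Grid code; the fallback definitions are reproduced so the example is self-contained):

// Host fall-back as in the hunk above: plain memcpy plus a no-op barrier.
#include <cstddef>
#include <cstring>
#include <vector>

inline void acceleratorCopyDeviceToDeviceAsynch(void *from, void *to, size_t bytes) { std::memcpy(to, from, bytes); }
inline void acceleratorCopySynchronise(void) {}

int main(void)
{
  std::vector<double> a(16, 1.0), b(16, 0.0);
  acceleratorCopyDeviceToDeviceAsynch(a.data(), b.data(), a.size()*sizeof(double));
  acceleratorCopySynchronise();   // on GPU back-ends this waits for the copy; here it is a no-op
  return b[0] == 1.0 ? 0 : 1;
}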