diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index ec6a5003..2642c0bd 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -42,6 +42,11 @@ Author: Christoph Lehner #ifdef ACCELERATOR_AWARE_MPI #define GRID_SYCL_LEVEL_ZERO_IPC #define SHM_SOCKETS +#else +#undef NUMA_PLACE_HOSTBUF +#ifdef NUMA_PLACE_HOSTBUF +#include +#endif #endif #include #endif @@ -537,7 +542,30 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) // Each MPI rank should allocate our own buffer /////////////////////////////////////////////////////////////////////////////////////////////////////////// #ifndef ACCELERATOR_AWARE_MPI + printf("Host buffer allocate for GPU non-aware MPI\n"); HostCommBuf= malloc(bytes); +#ifdef NUMA_PLACE_HOSTBUF + int numa; + char *numa_name=(char *)getenv("MPI_BUF_NUMA"); + if(numa_name) { + page_size = sysconf(_SC_PAGESIZE); + numa = atoi(numa_name); + unsigned long page_count = bytes/page_size; + std::vector pages(pcount); + std::vector nodes(pcount,numa); + std::vector status(pcount,-1); + for(unsigned long p=0;pStencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer // But the HaloGather had a barrier too. +#ifdef ACCELERATOR_AWARE_MPI for(int i=0;iStencilSendToRecvFromBegin(MpiReqs, Packets[i].send_buf, @@ -376,6 +377,23 @@ public: Packets[i].from_rank,Packets[i].do_recv, Packets[i].xbytes,Packets[i].rbytes,i); } +#else +#warning "Using COPY VIA HOST BUFFERS IN STENCIL" + for(int i=0;iHostBufferMalloc(Packets[i].xbytes); + Packets[i].host_recv_buf = _grid->HostBufferMalloc(Packets[i].rbytes); + if ( Packets[i].do_send ) { + acceleratorCopyFromDevice(Packets[i].send_buf, Packets[i].host_send_buf,Packets[i].xbytes); + } + _grid->StencilSendToRecvFromBegin(MpiReqs, + Packets[i].host_send_buf, + Packets[i].to_rank,Packets[i].do_send, + Packets[i].host_recv_buf, + Packets[i].from_rank,Packets[i].do_recv, + Packets[i].xbytes,Packets[i].rbytes,i); + } +#endif // Get comms started then run checksums // Having this PRIOR to the dslash seems to make Sunspot work... (!) for(int i=0;iHostBufferFreeAll(); +#endif // run any checksums _grid->StencilBarrier(); - // run any checksums for(int i=0;i