From 74a4f4394690dc872afb1f93e3d49c97a35f46f0 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 28 Jan 2025 15:22:46 +0000 Subject: [PATCH] Optional host buffer bounce for no CUDA aware MPI --- Grid/communicator/SharedMemoryMPI.cc | 28 +++++++++++++++ Grid/stencil/Stencil.h | 28 ++++++++++++++- configure.ac | 12 +++++++ systems/Aurora/benchmarks/bench1.pbs | 32 ++++++++--------- systems/Aurora/benchmarks/gpu_tile_compact.sh | 34 ------------------- systems/Aurora/config-command | 2 +- systems/Aurora/sourceme.sh | 1 + 7 files changed, 84 insertions(+), 53 deletions(-) delete mode 100755 systems/Aurora/benchmarks/gpu_tile_compact.sh diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index ec6a5003..2642c0bd 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -42,6 +42,11 @@ Author: Christoph Lehner #ifdef ACCELERATOR_AWARE_MPI #define GRID_SYCL_LEVEL_ZERO_IPC #define SHM_SOCKETS +#else +#undef NUMA_PLACE_HOSTBUF +#ifdef NUMA_PLACE_HOSTBUF +#include +#endif #endif #include #endif @@ -537,7 +542,30 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) // Each MPI rank should allocate our own buffer /////////////////////////////////////////////////////////////////////////////////////////////////////////// #ifndef ACCELERATOR_AWARE_MPI + printf("Host buffer allocate for GPU non-aware MPI\n"); HostCommBuf= malloc(bytes); +#ifdef NUMA_PLACE_HOSTBUF + int numa; + char *numa_name=(char *)getenv("MPI_BUF_NUMA"); + if(numa_name) { + page_size = sysconf(_SC_PAGESIZE); + numa = atoi(numa_name); + unsigned long page_count = bytes/page_size; + std::vector pages(pcount); + std::vector nodes(pcount,numa); + std::vector status(pcount,-1); + for(unsigned long p=0;pStencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer // But the HaloGather had a barrier too. +#ifdef ACCELERATOR_AWARE_MPI for(int i=0;iStencilSendToRecvFromBegin(MpiReqs, Packets[i].send_buf, @@ -376,6 +377,23 @@ public: Packets[i].from_rank,Packets[i].do_recv, Packets[i].xbytes,Packets[i].rbytes,i); } +#else +#warning "Using COPY VIA HOST BUFFERS IN STENCIL" + for(int i=0;iHostBufferMalloc(Packets[i].xbytes); + Packets[i].host_recv_buf = _grid->HostBufferMalloc(Packets[i].rbytes); + if ( Packets[i].do_send ) { + acceleratorCopyFromDevice(Packets[i].send_buf, Packets[i].host_send_buf,Packets[i].xbytes); + } + _grid->StencilSendToRecvFromBegin(MpiReqs, + Packets[i].host_send_buf, + Packets[i].to_rank,Packets[i].do_send, + Packets[i].host_recv_buf, + Packets[i].from_rank,Packets[i].do_recv, + Packets[i].xbytes,Packets[i].rbytes,i); + } +#endif // Get comms started then run checksums // Having this PRIOR to the dslash seems to make Sunspot work... (!) for(int i=0;iHostBufferFreeAll(); +#endif // run any checksums _grid->StencilBarrier(); - // run any checksums for(int i=0;i