From d6b2727f86872e995a03f6a774f6cfbfd8e55e41 Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Wed, 29 Jan 2025 09:22:21 +0000
Subject: [PATCH] Pipeline mode getting better -- 2 nodes @ 10TF/s per node on
 Aurora

---
 Grid/communicator/Communicator_base.h  |  6 +-
 Grid/communicator/Communicator_mpi3.cc | 79 ++++++++++++++++++++++++--
 Grid/communicator/SharedMemory.h       | 14 +++++
 Grid/communicator/SharedMemoryMPI.cc   |  2 +-
 Grid/lattice/PaddedCell.h              |  4 +-
 Grid/stencil/Stencil.h                 | 27 ---------
 systems/Aurora/benchmarks/bench1.pbs   |  2 +-
 systems/Aurora/benchmarks/bench2.pbs   | 52 +++++------------
 systems/Aurora/benchmarks/gpu_tile.sh  |  4 +-
 9 files changed, 112 insertions(+), 78 deletions(-)

diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h
index 3f38edd3..0da7dc22 100644
--- a/Grid/communicator/Communicator_base.h
+++ b/Grid/communicator/Communicator_base.h
@@ -136,7 +136,7 @@ public:
     for(int d=0;d<_ndimension;d++){
       column.resize(_processors[d]);
       column[0] = accum;
-      std::vector<CommsRequest_t> list;
+      std::vector<MpiCommsRequest_t> list;
       for(int p=1;p<_processors[d];p++){
         ShiftedRanks(d,p,source,dest);
         SendToRecvFromBegin(list,
@@ -166,8 +166,8 @@ public:
   ////////////////////////////////////////////////////////////
   // Face exchange, buffer swap in translational invariant way
   ////////////////////////////////////////////////////////////
-  void CommsComplete(std::vector<CommsRequest_t> &list);
-  void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+  void CommsComplete(std::vector<MpiCommsRequest_t> &list);
+  void SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
                            void *xmit,
                            int dest,
                            void *recv,
diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc
index 192bb339..d269f933 100644
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -317,7 +317,7 @@ void CartesianCommunicator::GlobalSumVector(double *d,int N)
   assert(ierr==0);
 }
 
-void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
                                                 void *xmit,
                                                 int dest,
                                                 void *recv,
@@ -342,7 +342,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
   assert(ierr==0);
   list.push_back(xrq);
 }
-void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list)
+void CartesianCommunicator::CommsComplete(std::vector<MpiCommsRequest_t> &list)
 {
   int nreq=list.size();
 
@@ -361,7 +361,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
                                            int from,
                                            int bytes)
 {
-  std::vector<CommsRequest_t> reqs(0);
+  std::vector<MpiCommsRequest_t> reqs(0);
   unsigned long  xcrc = crc32(0L, Z_NULL, 0);
   unsigned long  rcrc = crc32(0L, Z_NULL, 0);
 
@@ -404,6 +404,29 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                          int from,int dor,
                                                          int xbytes,int rbytes,int dir)
 {
+/*
+  std::vector<CommsRequest_t> list;
+  for(int i=0;i<Packets.size();i++){
+    Packets[i].host_send_buf = _grid->HostBufferMalloc(Packets[i].xbytes);
+    Packets[i].host_recv_buf = _grid->HostBufferMalloc(Packets[i].rbytes);
+    if ( Packets[i].do_send ) {
+      acceleratorCopyFromDevice(Packets[i].send_buf, Packets[i].host_send_buf,Packets[i].xbytes);
+    }
+    _grid->StencilSendToRecvFromBegin(MpiReqs,
+                                      Packets[i].host_send_buf,
+                                      Packets[i].to_rank,Packets[i].do_send,
+                                      Packets[i].host_recv_buf,
+                                      Packets[i].from_rank,Packets[i].do_recv,
+                                      Packets[i].xbytes,Packets[i].rbytes,i);
+  }
+
+  for(int i=0;i<Packets.size();i++){
+    if ( Packets[i].do_recv ) {
+      acceleratorCopyToDevice(Packets[i].host_recv_buf, Packets[i].recv_buf,Packets[i].rbytes);
+    }
+  }
+  _grid->HostBufferFreeAll();
+*/
   int ncomm  =communicator_halo.size();
   int commdir=dir%ncomm;
 
@@ -421,28 +444,60 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
   double off_node_bytes=0.0;
   int tag;
 
+  void * host_recv = NULL;
+  void * host_xmit = NULL;
+
   if ( dor ) {
     if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
       tag= dir+from*32;
+#ifdef ACCELERATOR_AWARE_MPI
       ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
       assert(ierr==0);
       list.push_back(rrq);
+#else
+      host_recv = this->HostBufferMalloc(rbytes);
+      ierr=MPI_Irecv(host_recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
+      assert(ierr==0);
+      CommsRequest_t srq;
+      srq.PacketType = InterNodeRecv;
+      srq.bytes      = rbytes;
+      srq.req        = rrq;
+      srq.host_buf   = host_recv;
+      srq.device_buf = recv;
+      list.push_back(srq);
+#endif
       off_node_bytes+=rbytes;
-    }
+    } else{
 #ifdef NVLINK_GET
       void *shm = (void *) this->ShmBufferTranslate(from,xmit);
       assert(shm!=NULL);
       acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
 #endif
+    }
   }
   if (dox) {
     //  rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
     if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
       tag= dir+_processor*32;
+#ifdef ACCELERATOR_AWARE_MPI
       ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
       assert(ierr==0);
       list.push_back(xrq);
+#else
+      std::cout << " send via host bounce "<<std::endl;
+      host_xmit = this->HostBufferMalloc(xbytes);
+      acceleratorCopyFromDevice(xmit, host_xmit,xbytes);
+      ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+      assert(ierr==0);
+      CommsRequest_t srq;
+      srq.PacketType = InterNodeXmit;
+      srq.bytes      = xbytes;
+      srq.req        = xrq;
+      srq.host_buf   = host_xmit;
+      srq.device_buf = xmit;
+      list.push_back(srq);
+#endif
       off_node_bytes+=xbytes;
     } else {
 #ifndef NVLINK_GET
@@ -463,11 +518,25 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
   int nreq=list.size();
 
   if (nreq==0) return;
+#ifdef ACCELERATOR_AWARE_MPI
   std::vector<MPI_Status> status(nreq);
   int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
   assert(ierr==0);
   list.resize(0);
+#else
+  // Wait individually and immediately copy receives to device
+  // Promotion to Asynch copy and single wait is easy
+  MPI_Status status;
+  for(int r=0;r<nreq;r++){
+    int ierr = MPI_Wait(&list[r].req,&status);
+    assert(ierr==0);
+    if ( list[r].PacketType==InterNodeRecv ) {
+      acceleratorCopyToDevice(list[r].host_buf,list[r].device_buf,list[r].bytes);
+    }
+  }
+  this->HostBufferFreeAll();
+#endif
 }
 void CartesianCommunicator::StencilBarrier(void)
 {
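
The hunks above are the heart of the patch: when MPI is not accelerator aware, an inter-node receive is posted into a HostBufferMalloc'd staging buffer, an inter-node send is copied device-to-host before MPI_Isend, and completion waits on each tagged request and copies receives back to the device. The stand-alone sketch below reproduces that bounce-buffer protocol against plain MPI so the control flow can be read in one place. It is a minimal sketch, not Grid code: memcpy stands in for acceleratorCopyFromDevice/acceleratorCopyToDevice, and Request, post_send, post_recv and complete_all are illustrative names.

// Bounce-buffer halo exchange sketch: stage device data through host memory
// so a non-GPU-aware MPI only ever sees host pointers.
#include <mpi.h>
#include <cassert>
#include <cstring>
#include <vector>

enum PacketType { Xmit, Recv };

struct Request {
  PacketType type;
  MPI_Request req;
  void *host_buf;    // the buffer MPI actually touches
  void *device_buf;  // final destination for receives
  int bytes;
};

static void post_recv(std::vector<Request> &list, void *device_recv,
                      void *host_recv, int bytes, int from, int tag)
{
  Request r{Recv, MPI_REQUEST_NULL, host_recv, device_recv, bytes};
  int ierr = MPI_Irecv(host_recv, bytes, MPI_CHAR, from, tag, MPI_COMM_WORLD, &r.req);
  assert(ierr == MPI_SUCCESS);
  list.push_back(r);
}

static void post_send(std::vector<Request> &list, void *device_send,
                      void *host_send, int bytes, int to, int tag)
{
  std::memcpy(host_send, device_send, bytes); // device->host staging copy
  Request r{Xmit, MPI_REQUEST_NULL, host_send, nullptr, bytes};
  int ierr = MPI_Isend(host_send, bytes, MPI_CHAR, to, tag, MPI_COMM_WORLD, &r.req);
  assert(ierr == MPI_SUCCESS);
  list.push_back(r);
}

static void complete_all(std::vector<Request> &list)
{
  // Wait individually; copy each receive to the device as soon as it lands.
  for (auto &r : list) {
    MPI_Status status;
    int ierr = MPI_Wait(&r.req, &status);
    assert(ierr == MPI_SUCCESS);
    if (r.type == Recv) std::memcpy(r.device_buf, r.host_buf, r.bytes);
  }
  list.clear();
}

int main(int argc, char **argv)
{
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  const int bytes = 1 << 20;
  std::vector<char> dev_send(bytes, char(rank)), dev_recv(bytes);
  std::vector<char> host_send(bytes), host_recv(bytes);

  int to = (rank + 1) % size, from = (rank + size - 1) % size;
  std::vector<Request> reqs;
  post_recv(reqs, dev_recv.data(), host_recv.data(), bytes, from, 0);
  post_send(reqs, dev_send.data(), host_send.data(), bytes, to, 0);
  complete_all(reqs);

  assert(dev_recv[0] == char(from));
  MPI_Finalize();
  return 0;
}

Waiting on each request individually, as the patch does, lets the host-to-device copy of an early-arriving face proceed while later messages are still in flight; a single MPI_Waitall would serialize every copy behind the slowest message. The in-code comment above ("Promotion to Asynch copy and single wait is easy") points at the same follow-up optimization.
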
diff --git a/Grid/communicator/SharedMemory.h b/Grid/communicator/SharedMemory.h
index 94e9741e..422be8aa 100644
--- a/Grid/communicator/SharedMemory.h
+++ b/Grid/communicator/SharedMemory.h
@@ -46,8 +46,22 @@ NAMESPACE_BEGIN(Grid);
 
 #if defined (GRID_COMMS_MPI3)
 typedef MPI_Comm    Grid_MPI_Comm;
+typedef MPI_Request MpiCommsRequest_t;
+#ifdef ACCELERATOR_AWARE_MPI
 typedef MPI_Request CommsRequest_t;
+#else
+enum PacketType_t { InterNodeXmit, InterNodeRecv, IntraNodeXmit, IntraNodeRecv };
+typedef struct {
+  PacketType_t PacketType;
+  void *host_buf;
+  void *device_buf;
+  unsigned long bytes;
+  MpiCommsRequest_t req;
+} CommsRequest_t;
+#endif
+
 #else
+typedef int MpiCommsRequest_t;
 typedef int CommsRequest_t;
 typedef int Grid_MPI_Comm;
 #endif
diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc
index 2642c0bd..c7668f8b 100644
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -543,7 +543,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 #ifndef ACCELERATOR_AWARE_MPI
   printf("Host buffer allocate for GPU non-aware MPI\n");
-  HostCommBuf= malloc(bytes);
+  HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
 #ifdef NUMA_PLACE_HOSTBUF
   int numa;
   char *numa_name=(char *)getenv("MPI_BUF_NUMA");
diff --git a/Grid/lattice/PaddedCell.h b/Grid/lattice/PaddedCell.h
index c7dcbac9..fb533212 100644
--- a/Grid/lattice/PaddedCell.h
+++ b/Grid/lattice/PaddedCell.h
@@ -467,8 +467,8 @@ public:
     send_buf.resize(buffer_size*2*depth);
     recv_buf.resize(buffer_size*2*depth);
 
-    std::vector<CommsRequest_t> fwd_req;
-    std::vector<CommsRequest_t> bwd_req;
+    std::vector<MpiCommsRequest_t> fwd_req;
+    std::vector<MpiCommsRequest_t> bwd_req;
 
     int words = buffer_size;
     int bytes = words * sizeof(vobj);
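
Both bounce directions lean on HostBufferMalloc/HostBufferFreeAll being nearly free: per-packet staging space is carved out of the single HostCommBuf slab allocated in SharedMemoryMPI.cc above, and the whole slab is recycled at once after each exchange. Below is a minimal sketch of that slab pattern, under the assumption of a bump-pointer design; HostSlab and its members are illustrative names, not Grid internals, and a production version would allocate pinned memory, per the "CHANGE THIS TO malloc_host" note in the diff.

// Bump-pointer slab: alloc() is pointer arithmetic, free_all() is one store.
#include <cassert>
#include <cstddef>
#include <cstdlib>

class HostSlab {
  char  *base;
  size_t capacity;
  size_t used = 0;
public:
  explicit HostSlab(size_t bytes)
    : base((char *)std::malloc(bytes)), capacity(bytes) { assert(base); }
  ~HostSlab() { std::free(base); }

  void *alloc(size_t bytes) {
    bytes = (bytes + 63) & ~size_t(63);  // keep 64-byte alignment
    assert(used + bytes <= capacity);    // slab sized for the worst-case halo
    void *p = base + used;
    used += bytes;
    return p;
  }
  void free_all() { used = 0; }          // O(1) wipe after the exchange
};

// Usage inside one halo exchange:
//   void *stage = slab.alloc(xbytes);   // per-packet HostBufferMalloc
//   ... MPI_Isend/MPI_Irecv on stage ...
//   slab.free_all();                    // HostBufferFreeAll once all complete
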
diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h
index a768f344..2a478d13 100644
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -368,7 +368,6 @@ public:
     //      accelerator_barrier(); // All kernels should ALREADY be complete
     //      _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
     //      But the HaloGather had a barrier too.
-#ifdef ACCELERATOR_AWARE_MPI
     for(int i=0;i<Packets.size();i++){
       _grid->StencilSendToRecvFromBegin(MpiReqs,
                                         Packets[i].send_buf,
@@ -377,23 +376,6 @@ public:
                                         Packets[i].from_rank,Packets[i].do_recv,
                                         Packets[i].xbytes,Packets[i].rbytes,i);
     }
-#else
-#warning "Using COPY VIA HOST BUFFERS IN STENCIL"
-    for(int i=0;i<Packets.size();i++){
-      Packets[i].host_send_buf = _grid->HostBufferMalloc(Packets[i].xbytes);
-      Packets[i].host_recv_buf = _grid->HostBufferMalloc(Packets[i].rbytes);
-      if ( Packets[i].do_send ) {
-        acceleratorCopyFromDevice(Packets[i].send_buf, Packets[i].host_send_buf,Packets[i].xbytes);
-      }
-      _grid->StencilSendToRecvFromBegin(MpiReqs,
-                                        Packets[i].host_send_buf,
-                                        Packets[i].to_rank,Packets[i].do_send,
-                                        Packets[i].host_recv_buf,
-                                        Packets[i].from_rank,Packets[i].do_recv,
-                                        Packets[i].xbytes,Packets[i].rbytes,i);
-    }
-#endif
     // Get comms started then run checksums
     // Having this PRIOR to the dslash seems to make Sunspot work... (!)
     for(int i=0;i<Packets.size();i++){
@@ -410,15 +392,6 @@ public:
     else DslashLogFull();
     //    acceleratorCopySynchronise() is in the StencilSendToRecvFromComplete
     //    accelerator_barrier();
-#ifndef ACCELERATOR_AWARE_MPI
-#warning "Using COPY VIA HOST BUFFERS IN STENCIL"
-    for(int i=0;i<Packets.size();i++){
-      if ( Packets[i].do_recv ) {
-        acceleratorCopyToDevice(Packets[i].host_recv_buf, Packets[i].recv_buf,Packets[i].rbytes);
-      }
-    }
-    _grid->HostBufferFreeAll();
-#endif
     // run any checksums
     _grid->StencilBarrier();
     for(int i=0;i<Packets.size();i++){
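
With the Stencil.h fallback deleted, the ACCELERATOR_AWARE_MPI switch lives in exactly two places: the CommsRequest_t definition in SharedMemory.h and the StencilSendToRecvFrom{Begin,Complete} bodies in Communicator_mpi3.cc. Stencil-level callers simply hold a std::vector<CommsRequest_t> and never see the bounce. The reduced sketch below shows the shape of that type-level dispatch; it mirrors the SharedMemory.h typedefs but wait_one is an illustrative helper, not a Grid function.

#include <mpi.h>

// One name, two shapes, chosen at compile time: a bare MPI_Request when MPI
// can read device memory, or a tagged record carrying bounce bookkeeping.
#ifdef ACCELERATOR_AWARE_MPI
typedef MPI_Request CommsRequest_t;     // device pointers go straight to MPI
#else
enum PacketType_t { InterNodeXmit, InterNodeRecv };
struct CommsRequest_t {                 // remember how to finish the bounce
  PacketType_t PacketType;
  void *host_buf;
  void *device_buf;
  unsigned long bytes;
  MPI_Request req;
};
#endif

// Completion carries the only branch, hidden from every caller:
inline void wait_one(CommsRequest_t &r)
{
#ifdef ACCELERATOR_AWARE_MPI
  MPI_Wait(&r, MPI_STATUS_IGNORE);
#else
  MPI_Wait(&r.req, MPI_STATUS_IGNORE);
  if (r.PacketType == InterNodeRecv) {
    // copy the staged receive to the device here
    // (acceleratorCopyToDevice in Grid's implementation)
  }
#endif
}

Because only the typedef changes shape, the same call sites compile in both build modes, and the cost of the non-aware build is carried entirely inside the communicator.
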