mirror of https://github.com/paboyle/Grid.git synced 2025-06-13 20:57:06 +01:00

Significantly better performance on Aurora without using pipeline mode

2025-01-30 16:36:46 +00:00
parent d6b2727f86
commit 94019a922e
12 changed files with 306 additions and 101 deletions

View File

@@ -186,6 +186,12 @@ public:
int recv_from_rank,int do_recv,
int bytes,int dir);
double StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int do_xmit,
void *recv,
int recv_from_rank,int do_recv,
int xbytes,int rbytes,int dir);
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int do_xmit,

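The Prepare/Begin split declared above tracks each in-flight transfer with a CommsRequest_t that carries both the host bounce buffer and the original device pointer. A minimal sketch of the fields this commit reads and writes, inferred from their usage in the hunks below (an assumption — the real definition lives in Grid's communicator headers, which this diff does not show):

// Sketch (assumption): CommsRequest_t as used in the non-ACCELERATOR_AWARE_MPI build.
enum PacketType_t { InterNodeXmit, InterNodeRecv };
struct CommsRequest_t {
  PacketType_t PacketType;  // which kind of transfer this entry tracks
  size_t       bytes;       // payload size
  MPI_Request  req;         // set by Prepare (recv) or Begin (send)
  void        *host_buf;    // host bounce buffer
  void        *device_buf;  // original device-side buffer
};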
View File

@@ -391,42 +391,131 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int bytes,int dir)
{
std::vector<CommsRequest_t> list;
double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
offbytes += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
StencilSendToRecvFromComplete(list,dir);
return offbytes;
}
#undef NVLINK_GET // Define to use get instead of put DMA
#ifdef ACCELERATOR_AWARE_MPI
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
void *recv,
int from,int dor,
int xbytes,int rbytes,int dir)
{
return 0.0; // Do nothing -- no preparation required
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
void *recv,
int from,int dor,
int xbytes,int rbytes,int dir)
{
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
MPI_Request xrq;
MPI_Request rrq;
int ierr;
int gdest = ShmRanks[dest];
int gfrom = ShmRanks[from];
int gme = ShmRanks[_processor];
assert(dest != _processor);
assert(from != _processor);
assert(gme == ShmRank);
double off_node_bytes=0.0;
int tag;
if ( dor ) {
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0);
list.push_back(rrq);
off_node_bytes+=rbytes;
}
}
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list.push_back(xrq);
off_node_bytes+=xbytes;
} else {
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
}
}
return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
int nreq=list.size();
acceleratorCopySynchronise();
if (nreq==0) return;
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
list.resize(0);
}
#else /* NOT ... ACCELERATOR_AWARE_MPI */
///////////////////////////////////////////
// Pipeline mode through host memory
///////////////////////////////////////////
/*
* In prepare (phase 1):
* PHASE 1: (prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
* - post device - device transfers
* PHASE 2: (Begin)
* - complete all copies
* - post MPI send asynch
* PHASE 3: (Complete)
* - MPI_waitall
* - host-device transfers
*
*********************************
* NB could split this further:
*--------------------------------
* PHASE 1: (Prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
* PHASE 2: (BeginInterNode)
* - complete all copies
* - post MPI send asynch
* PHASE 3: (BeginIntraNode)
* - post device - device transfers
* PHASE 4: (Complete)
* - MPI_waitall
* - host-device transfers asynch
* - (complete all copies)
*/
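/*
 * For orientation (illustrative sketch, not part of the commit): the blocking
 * wrapper StencilSendToRecvFrom above already drives exactly this sequence:
 *
 *   std::vector<CommsRequest_t> list;
 *   StencilSendToRecvFromPrepare (list,...); // post Irecv into host buf, enqueue D->H copy
 *   StencilSendToRecvFromBegin   (list,...); // fence copies, MPI_Isend from host buf
 *   StencilSendToRecvFromComplete(list,dir); // MPI_Waitall, H->D copies, free buffers
 */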
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
void *recv,
int from,int dor,
int xbytes,int rbytes,int dir)
{
/*
* Bring sequence from Stencil.h down to lower level.
* Assume using XeLink is ok
#warning "Using COPY VIA HOST BUFFERS IN STENCIL"
// Introduce a host buffer with a cheap slab allocator and zero cost wipe all
Packets[i].host_send_buf = _grid->HostBufferMalloc(Packets[i].xbytes);
Packets[i].host_recv_buf = _grid->HostBufferMalloc(Packets[i].rbytes);
if ( Packets[i].do_send ) {
acceleratorCopyFromDevice(Packets[i].send_buf, Packets[i].host_send_buf,Packets[i].xbytes);
}
_grid->StencilSendToRecvFromBegin(MpiReqs,
Packets[i].host_send_buf,
Packets[i].to_rank,Packets[i].do_send,
Packets[i].host_recv_buf,
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes,Packets[i].rbytes,i);
}
for(int i=0;i<Packets.size();i++){
if ( Packets[i].do_recv ) {
}
}
_grid->HostBufferFreeAll();
*/
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
@@ -447,14 +536,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
void * host_recv = NULL;
void * host_xmit = NULL;
/*
* PHASE 1: (Prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
*/
if ( dor ) {
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
host_recv = this->HostBufferMalloc(rbytes);
ierr=MPI_Irecv(host_recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0);
@@ -465,79 +555,137 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
srq.host_buf = host_recv;
srq.device_buf = recv;
list.push_back(srq);
off_node_bytes+=rbytes;
} else{
#ifdef NVLINK_GET
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
#endif
}
}
if (dox) {
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
host_xmit = this->HostBufferMalloc(xbytes);
acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Asynch D->H copy; completed in Begin before the MPI_Isend
// ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
// assert(ierr==0);
// off_node_bytes+=xbytes;
CommsRequest_t srq;
srq.PacketType = InterNodeXmit;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = host_xmit;
srq.device_buf = xmit;
list.push_back(srq);
off_node_bytes+=xbytes;
} else {
#ifndef NVLINK_GET
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
#endif
}
}
return off_node_bytes;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
void *recv,
int from,int dor,
int xbytes,int rbytes,int dir)
{
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
MPI_Request xrq;
MPI_Request rrq;
int ierr;
int gdest = ShmRanks[dest];
int gfrom = ShmRanks[from];
int gme = ShmRanks[_processor];
assert(dest != _processor);
assert(from != _processor);
assert(gme == ShmRank);
double off_node_bytes=0.0;
int tag;
void * host_xmit = NULL;
////////////////////////////////
// Receives already posted
// Copies already started
////////////////////////////////
/*
* PHASE 2: (Begin)
* - complete all copies
* - post MPI send asynch
*/
acceleratorCopySynchronise();
static int printed;
if(!printed && this->IsBoss() ) {
printf("dir %d doX %d doR %d Face size %ld %ld\n",dir,dox,dor,xbytes,rbytes);
printed=1;
}
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
// Find the send in the prepared list
int list_idx=-1;
for(int idx = 0; idx<list.size();idx++){
if ( (list[idx].device_buf==xmit)
&&(list[idx].PacketType==InterNodeXmit)
&&(list[idx].bytes==xbytes) ) {
list_idx = idx;
host_xmit = list[idx].host_buf;
}
}
assert(list_idx != -1); // found it
ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list[list_idx].req = xrq; // Update the MPI request in the list
off_node_bytes+=xbytes;
}
}
return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
int nreq=list.size();
acceleratorCopySynchronise();
if (nreq==0) return;
// Promotion to Asynch copy and single wait is easy
std::vector<MPI_Status> status(nreq);
std::vector<MPI_Request> MpiRequests(nreq);
for(int r=0;r<nreq;r++){
MpiRequests[r] = list[r].req;
}
int ierr = MPI_Waitall(nreq,&MpiRequests[0],&status[0]);
assert(ierr==0);
for(int r=0;r<nreq;r++){
int ierr = MPI_Wait(&list[r].req,&status);
assert(ierr==0);
if ( list[r].PacketType==InterNodeRecv ) {
acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
}
}
acceleratorCopySynchronise(); // Complete all pending copy transfers
list.resize(0); // Delete the list
this->HostBufferFreeAll(); // Clean up the buffer allocs
}
#endif
////////////////////////////////////////////
// END PIPELINE MODE / NO CUDA AWARE MPI
////////////////////////////////////////////
void CartesianCommunicator::StencilBarrier(void)
{
MPI_Barrier (ShmComm);

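To make the pipeline concrete, the following is a self-contained sketch of the same Prepare/Begin/Complete pattern against plain MPI, with synchronous memcpy standing in for the asynchronous accelerator copies. All names are local to the example and none of it is Grid code; it runs with any MPI on two ranks.

#include <mpi.h>
#include <cassert>
#include <cstdio>
#include <cstring>
#include <vector>

struct Req { MPI_Request req; bool is_recv; char *host; char *dev; size_t bytes; };

// PHASE 1: post the receive into a host bounce buffer and stage the send data
// host-side (memcpy stands in for acceleratorCopyFromDeviceAsynch).
void Prepare(std::vector<Req> &list, char *dev_send, char *host_send,
             char *dev_recv, char *host_recv, size_t bytes, int peer)
{
  Req r{MPI_REQUEST_NULL, true, host_recv, dev_recv, bytes};
  MPI_Irecv(host_recv, (int)bytes, MPI_CHAR, peer, 0, MPI_COMM_WORLD, &r.req);
  list.push_back(r);
  std::memcpy(host_send, dev_send, bytes);
  list.push_back(Req{MPI_REQUEST_NULL, false, host_send, dev_send, bytes});
}
// PHASE 2: the staging copies are complete (memcpy is synchronous); post the sends.
void Begin(std::vector<Req> &list, int peer)
{
  for (auto &r : list)
    if (!r.is_recv)
      MPI_Isend(r.host, (int)r.bytes, MPI_CHAR, peer, 0, MPI_COMM_WORLD, &r.req);
}
// PHASE 3: wait on everything, then drain receives out of the bounce buffers
// (stands in for acceleratorCopyToDeviceAsynch + acceleratorCopySynchronise).
void Complete(std::vector<Req> &list)
{
  std::vector<MPI_Request> reqs;
  for (auto &r : list) reqs.push_back(r.req);
  MPI_Waitall((int)reqs.size(), reqs.data(), MPI_STATUSES_IGNORE);
  for (auto &r : list)
    if (r.is_recv) std::memcpy(r.dev, r.host, r.bytes);
  list.clear();
}

int main(int argc, char **argv) // run with: mpirun -np 2 ./pipeline_demo
{
  MPI_Init(&argc, &argv);
  int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  int peer = rank ^ 1;
  const size_t N = 1 << 20;
  std::vector<char> dev_send(N, (char)('A' + rank)), dev_recv(N, 0);
  std::vector<char> host_send(N), host_recv(N); // the bounce buffers
  std::vector<Req> list;
  Prepare(list, dev_send.data(), host_send.data(),
          dev_recv.data(), host_recv.data(), N, peer);
  Begin(list, peer);
  Complete(list);
  assert(dev_recv[0] == (char)('A' + peer));
  if (rank == 0) std::printf("pipelined exchange OK\n");
  MPI_Finalize();
  return 0;
}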
View File

@@ -132,6 +132,15 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
{
return 2.0*bytes;
}
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int dox,
void *recv,
int recv_from_rank,int dor,
int xbytes,int rbytes, int dir)
{
return xbytes+rbytes;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int dox,

View File

@@ -43,8 +43,8 @@ Author: Christoph Lehner <christoph@lhnr.de>
#define GRID_SYCL_LEVEL_ZERO_IPC
#define SHM_SOCKETS
#else
#ifdef HAVE_NUMAIF_H
#warning " Using NUMAIF "
#include <numaif.h>
#endif
#endif
@@ -544,18 +544,19 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifndef ACCELERATOR_AWARE_MPI
printf("Host buffer allocate for GPU non-aware MPI\n");
HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
#ifdef HAVE_NUMAIF_H
#warning "Moving host buffers to specific NUMA domain"
int numa;
char *numa_name=(char *)getenv("MPI_BUF_NUMA");
if(numa_name) {
unsigned long page_size = sysconf(_SC_PAGESIZE);
numa = atoi(numa_name);
unsigned long page_count = bytes/page_size;
std::vector<void *> pages(page_count);
std::vector<int> nodes(page_count,numa);
std::vector<int> status(page_count,-1);
for(unsigned long p=0;p<page_count;p++){
pages[p] =(void *) ((uint64_t) HostCommBuf + p*page_size);
}
int ret = move_pages(0,
page_count,
@@ -565,7 +566,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
MPOL_MF_MOVE);
printf("Host buffer move to numa domain %d : move_pages returned %d\n",numa,ret);
if (ret) perror(" move_pages failed for reason:");
}
#endif
acceleratorPin(HostCommBuf,bytes);
#endif
ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) {

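For reference, the move_pages placement above can be exercised standalone. This sketch (not Grid code; an assumption-laden example — compile with -lnuma on a NUMA-enabled Linux machine) allocates a buffer, faults each page in so it actually exists, and migrates the pages to the node named in MPI_BUF_NUMA:

#include <numaif.h>  // move_pages, MPOL_MF_MOVE
#include <unistd.h>  // sysconf
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <vector>

int main()
{
  size_t bytes = 64UL * 1024 * 1024;
  void  *buf   = std::malloc(bytes);
  char  *numa_name = std::getenv("MPI_BUF_NUMA");
  if (buf && numa_name) {
    unsigned long page_size  = (unsigned long)sysconf(_SC_PAGESIZE);
    unsigned long page_count = bytes / page_size;
    int numa = std::atoi(numa_name);
    std::vector<void *> pages(page_count);
    std::vector<int>    nodes(page_count, numa); // target node, per page
    std::vector<int>    status(page_count, -1);  // per-page result code
    for (unsigned long p = 0; p < page_count; p++) {
      pages[p] = (void *)((uint64_t)buf + p * page_size);
      ((char *)pages[p])[0] = 0; // fault the page in before asking to move it
    }
    long ret = move_pages(0, page_count, pages.data(), nodes.data(),
                          status.data(), MPOL_MF_MOVE);
    std::printf("move to numa domain %d : move_pages returned %ld\n", numa, ret);
    if (ret) perror("move_pages failed for reason");
  }
  std::free(buf);
  return 0;
}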
View File

@@ -332,22 +332,17 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
// std::cout << " WilsonFermion5D Communicate Begin " <<std::endl;
std::vector<std::vector<CommsRequest_t> > requests;
auto id=traceStart("Communicate overlapped");
#ifndef GRID_ACCELERATED
/////////////////////////////
// Overlap with comms
/////////////////////////////
st.CommunicateBegin(requests);
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
#endif
/////////////////////////////
// do the compute interior
/////////////////////////////
// std::cout << " WilsonFermion5D Interior " <<std::endl;
int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know
if (dag == DaggerYes) {
GRID_TRACE("DhopDagInterior");
@ -356,13 +351,22 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
GRID_TRACE("DhopInterior");
Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
}
#ifdef GRID_ACCELERATED
/////////////////////////////
// Overlap with comms -- on GPU the interior kernel call is nonblocking
/////////////////////////////
st.CommunicateBegin(requests);
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
#endif
/////////////////////////////
// Complete comms
/////////////////////////////
// std::cout << " WilsonFermion5D Comms Complete " <<std::endl;
st.CommunicateComplete(requests);
// traceStop(id);
/////////////////////////////
// do the compute exterior

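The net effect of this hunk is to move the start of communications relative to the interior kernel, depending on whether kernel launches block the host. A sketch of the two schedules it selects between (the GPU ordering relies on the interior launch being non-blocking, as the comment above notes):

// Host build (GRID_ACCELERATED undefined):   GPU build (GRID_ACCELERATED defined):
//   CommunicateBegin + CommsMergeSHM           DhopInterior (asynchronous launch)
//   DhopInterior (blocks the host)             CommunicateBegin + CommsMergeSHM
//   CommunicateComplete                        CommunicateComplete
//   DhopExterior                               DhopExterior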
View File

@@ -368,6 +368,14 @@ public:
// accelerator_barrier(); // All kernels should ALREADY be complete
// _grid->StencilBarrier(); // Everyone is here, so no one is running slow and still using the receive buffer
// But the HaloGather had a barrier too.
for(int i=0;i<Packets.size();i++){
_grid->StencilSendToRecvFromPrepare(MpiReqs,
Packets[i].send_buf,
Packets[i].to_rank,Packets[i].do_send,
Packets[i].recv_buf,
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes,Packets[i].rbytes,i);
}
for(int i=0;i<Packets.size();i++){
_grid->StencilSendToRecvFromBegin(MpiReqs,
Packets[i].send_buf,

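Splitting the Packets loop in two means every packet's receive is posted and every device-to-host copy is enqueued before the first send goes out. An illustrative timeline for two packets:

// Prepare P0: Irecv(P0) posted, D->H copy(P0) enqueued
// Prepare P1: Irecv(P1) posted, D->H copy(P1) enqueued
// Begin   P0: fence copies, Isend(P0)
// Begin   P1: Isend(P1)  (copies already fenced)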
View File

@@ -327,7 +327,10 @@ inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); }
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes);}
inline void acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); }
inline void acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); }
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}
@@ -465,8 +468,7 @@ inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
@@ -483,6 +485,13 @@ inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize
#endif
inline void acceleratorPin(void *ptr,unsigned long bytes)
{
#ifdef GRID_SYCL
sycl::ext::oneapi::experimental::prepare_for_device_copy(ptr,bytes,theCopyAccelerator->get_context());
#endif
}
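// Usage note (sketch): the *Asynch variants above only enqueue work on
// theCopyAccelerator's queue; completion is guaranteed only after
// acceleratorCopySynchronise(). That ordering is exactly what the pipelined
// communicator relies on:
//
//   acceleratorCopyFromDeviceAsynch(xmit, host_xmit, xbytes); // Prepare: enqueue D->H
//   acceleratorCopySynchronise();                             // Begin: fence the queue
//   MPI_Isend(host_xmit, ...);                                // Begin: now safe to send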
//////////////////////////////////////////////
// Common on all GPU targets
//////////////////////////////////////////////