Significantly better performance on Aurora without using pipeline mode

2025-10-30 11:34:32 +00:00 · 2025-01-30 16:36:46 +00:00
parent d6b2727f86
commit 94019a922e
12 changed files with 306 additions and 101 deletions
--- a/Grid/communicator/Communicator_base.h
+++ b/Grid/communicator/Communicator_base.h
@@ -186,6 +186,12 @@ public:
 			       int recv_from_rank,int do_recv,
 			       int bytes,int dir);
  double StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
 				      void *xmit,
 				      int xmit_to_rank,int do_xmit,
 				      void *recv,
 				      int recv_from_rank,int do_recv,
 				      int xbytes,int rbytes,int dir);
  double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 				    void *xmit,
 				    int xmit_to_rank,int do_xmit,
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -391,13 +391,121 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 						     int bytes,int dir)
 {
  std::vector<CommsRequest_t> list;
-  double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
+  double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
  offbytes       += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
  StencilSendToRecvFromComplete(list,dir);
  return offbytes;
 }
-#undef NVLINK_GET // Define to use get instead of put DMA
+
-double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+#ifdef ACCELERATOR_AWARE_MPI
 double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
 							   void *xmit,
 							   int dest,int dox,
 							   void *recv,
 							   int from,int dor,
 							   int xbytes,int rbytes,int dir)
 {
  return 0.0; // Do nothing -- no preparation required
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(int list_idx,
 							 std::vector<CommsRequest_t> &list,
 							 void *xmit,
 							 int dest,int dox,
 							 void *recv,
 							 int from,int dor,
 							 int xbytes,int rbytes,int dir)
 {
  int ncomm  =communicator_halo.size();
  int commdir=dir%ncomm;
  MPI_Request xrq;
  MPI_Request rrq;
  int ierr;
  int gdest = ShmRanks[dest];
  int gfrom = ShmRanks[from];
  int gme   = ShmRanks[_processor];
  assert(dest != _processor);
  assert(from != _processor);
  assert(gme  == ShmRank);
  double off_node_bytes=0.0;
  int tag;
  if ( dor ) {
    if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
      tag= dir+from*32;
      ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
      assert(ierr==0);
      list.push_back(rrq);
      off_node_bytes+=rbytes;
    }
  }
  if (dox) {
    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
      tag= dir+_processor*32;
      ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
      assert(ierr==0);
      list.push_back(xrq);
      off_node_bytes+=xbytes;
    } else {
      void *shm = (void *) this->ShmBufferTranslate(dest,recv);
      assert(shm!=NULL);
      acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
    }
  }
  return off_node_bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
  int nreq=list.size();
  acceleratorCopySynchronise();
  if (nreq==0) return;
  std::vector<MPI_Status> status(nreq);
  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
  assert(ierr==0);
  list.resize(0);
 }
 #else /* NOT     ... ACCELERATOR_AWARE_MPI */
 ///////////////////////////////////////////
 // Pipeline mode through host memory
 ///////////////////////////////////////////
  /*
   * In prepare (phase 1):
   * PHASE 1: (prepare)
   * - post MPI receive buffers asynch
   * - post device - host send buffer transfer asynch
   * - post device - device transfers
   * PHASE 2: (Begin)
   * - complete all copies
   * - post MPI send asynch
   * PHASE 3: (Complete)
   * - MPI_waitall
   * - host-device transfers
   *
   *********************************
   * NB could split this further:
   *--------------------------------
   * PHASE 1: (Prepare)
   * - post MPI receive buffers asynch
   * - post device - host send buffer transfer asynch
   * PHASE 2: (BeginInterNode)
   * - complete all copies 
   * - post MPI send asynch
   * PHASE 3: (BeginIntraNode)
   * - post device - device transfers
   * PHASE 4: (Complete)
   * - MPI_waitall
   * - host-device transfers asynch
   * - (complete all copies) 
   */
 double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
 							   void *xmit,
 							   int dest,int dox,
 							   void *recv,
@@ -407,25 +515,6 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 /*
 * Bring sequence from Stencil.h down to lower level.
 * Assume using XeLink is ok
 #warning "Using COPY VIA HOST BUFFERS IN STENCIL"
      // Introduce a host buffer with a cheap slab allocator and zero cost wipe all
      Packets[i].host_send_buf = _grid->HostBufferMalloc(Packets[i].xbytes);
      Packets[i].host_recv_buf = _grid->HostBufferMalloc(Packets[i].rbytes);
      if ( Packets[i].do_send ) {
 	acceleratorCopyFromDevice(Packets[i].send_buf, Packets[i].host_send_buf,Packets[i].xbytes);
      }
      _grid->StencilSendToRecvFromBegin(MpiReqs,
 					Packets[i].host_send_buf,
 					Packets[i].to_rank,Packets[i].do_send,
 					Packets[i].host_recv_buf,
 					Packets[i].from_rank,Packets[i].do_recv,
 					Packets[i].xbytes,Packets[i].rbytes,i);
    }
    for(int i=0;i<Packets.size();i++){
      if ( Packets[i].do_recv ) {
      }
    }
    _grid->HostBufferFreeAll();
 */  
  int ncomm  =communicator_halo.size();
  int commdir=dir%ncomm;
@@ -447,14 +536,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
  void * host_recv = NULL;
  void * host_xmit = NULL;
  /*
   * PHASE 1: (Prepare)
   * - post MPI receive buffers asynch
   * - post device - host send buffer transfer asynch
   */
  if ( dor ) {
    if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
      tag= dir+from*32;
 #ifdef ACCELERATOR_AWARE_MPI
      ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
      assert(ierr==0);
      list.push_back(rrq);
 #else
      host_recv = this->HostBufferMalloc(rbytes);
      ierr=MPI_Irecv(host_recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
      assert(ierr==0);
@@ -465,79 +555,137 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
      srq.host_buf   = host_recv;
      srq.device_buf = recv;
      list.push_back(srq);
 #endif
      off_node_bytes+=rbytes;
    } else{ 
 #ifdef NVLINK_GET
      void *shm = (void *) this->ShmBufferTranslate(from,xmit);
      assert(shm!=NULL);
      acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
 #endif
    }
  }
  if (dox) {
    //  rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
      tag= dir+_processor*32;
-#ifdef ACCELERATOR_AWARE_MPI
+
      ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
      assert(ierr==0);
      list.push_back(xrq);
 #else
      std::cout << " send via host bounce "<<std::endl;
      host_xmit = this->HostBufferMalloc(xbytes);
-      acceleratorCopyFromDevice(xmit, host_xmit,xbytes);
+      acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
-      ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+      
-      assert(ierr==0);
+      //      ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
      //      assert(ierr==0);
      //      off_node_bytes+=xbytes;
      CommsRequest_t srq;
      srq.PacketType = InterNodeXmit;
      srq.bytes      = xbytes;
-      srq.req        = xrq;
+      //      srq.req        = xrq;
      srq.host_buf   = host_xmit;
      srq.device_buf = xmit;
      list.push_back(srq);
-#endif
+
      off_node_bytes+=xbytes;
    } else {
 #ifndef NVLINK_GET
      void *shm = (void *) this->ShmBufferTranslate(dest,recv);
      assert(shm!=NULL);
      acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
 #endif
    }
  }
  return off_node_bytes;
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
 							 int dest,int dox,
 							 void *recv,
 							 int from,int dor,
 							 int xbytes,int rbytes,int dir)
 {
  int ncomm  =communicator_halo.size();
  int commdir=dir%ncomm;
  MPI_Request xrq;
  MPI_Request rrq;
  int ierr;
  int gdest = ShmRanks[dest];
  int gfrom = ShmRanks[from];
  int gme   = ShmRanks[_processor];
  assert(dest != _processor);
  assert(from != _processor);
  assert(gme  == ShmRank);
  double off_node_bytes=0.0;
  int tag;
  void * host_xmit = NULL;
  ////////////////////////////////
  // Receives already posted
  // Copies already started
  ////////////////////////////////
  /*  
   * PHASE 2: (Begin)
   * - complete all copies
   * - post MPI send asynch
   */
  acceleratorCopySynchronise();
  static int printed;
  if(!printed && this->IsBoss() ) {
    printf("dir %d doX %d doR %d Face size %ld %ld\n",dir,dox,dor,xbytes,rbytes);
    printed=1;
  }
  if (dox) {
    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
      tag= dir+_processor*32;
      // Find the send in the prepared list
      int list_idx=-1;
      for(int idx = 0; idx<list.size();idx++){
 	if ( (list[idx].device_buf==xmit)
 	   &&(list[idx].PacketType==InterNodeXmit)
 	   &&(list[idx].bytes==xbytes) ) {
 	  list_idx = idx;
 	  host_xmit = list[idx].host_buf;
 	}
      }
      assert(list_idx != -1); // found it
      ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
      assert(ierr==0);
      list[list_idx].req        = xrq; // Update the MPI request in the list
      off_node_bytes+=xbytes;
    } 
  }
  return off_node_bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
  int nreq=list.size();
  acceleratorCopySynchronise();
  if (nreq==0) return;
 #ifdef ACCELERATOR_AWARE_MPI
  std::vector<MPI_Status> status(nreq);
-  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
+  std::vector<MPI_Request> MpiRequests(nreq);
-  assert(ierr==0);
+
  list.resize(0);
 #else
  // Wait individually and immediately copy receives to device
  // Promition to Asynch copy and single wait is easy
  MPI_Status status;
  for(int r=0;r<nreq;r++){
-    int ierr = MPI_Wait(&list[r].req,&status);
+    MpiRequests[r] = list[r].req;
  }
  int ierr = MPI_Waitall(nreq,&MpiRequests[0],&status[0]);
  assert(ierr==0);
  for(int r=0;r<nreq;r++){
    if ( list[r].PacketType==InterNodeRecv ) {
-      acceleratorCopyToDevice(list[r].host_buf,list[r].device_buf,list[r].bytes);
+      acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
    }
  }
-  list.resize(0);
+  
-  this->HostBufferFreeAll();
+  acceleratorCopySynchronise(); // Complete all pending copy transfers
  list.resize(0);               // Delete the list
  this->HostBufferFreeAll();    // Clean up the buffer allocs
 }
 #endif
-}
+////////////////////////////////////////////
 // END PIPELINE MODE / NO CUDA AWARE MPI
 ////////////////////////////////////////////
 void CartesianCommunicator::StencilBarrier(void)
 {
  MPI_Barrier  (ShmComm);
--- a/Grid/communicator/Communicator_none.cc
+++ b/Grid/communicator/Communicator_none.cc
@@ -132,6 +132,15 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 {
  return 2.0*bytes;
 }
 double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
 							   void *xmit,
 							   int xmit_to_rank,int dox,
 							   void *recv,
 							   int recv_from_rank,int dor,
 							   int xbytes,int rbytes, int dir)
 {
  return xbytes+rbytes;
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
 							 int xmit_to_rank,int dox,
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -43,8 +43,8 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #define GRID_SYCL_LEVEL_ZERO_IPC
 #define SHM_SOCKETS
 #else
-#undef NUMA_PLACE_HOSTBUF
+#ifdef HAVE_NUMAIF_H
-#ifdef NUMA_PLACE_HOSTBUF
+  #warning " Using NUMAIF "
 #include <numaif.h>
 #endif 
 #endif 
@@ -544,18 +544,19 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #ifndef ACCELERATOR_AWARE_MPI
  printf("Host buffer allocate for GPU non-aware MPI\n");
  HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
-#ifdef NUMA_PLACE_HOSTBUF
+#ifdef HAVE_NUMAIF_H
  #warning "Moving host buffers to specific NUMA domain"
  int numa;
  char *numa_name=(char *)getenv("MPI_BUF_NUMA");
  if(numa_name) {
-    page_size = sysconf(_SC_PAGESIZE);
+    unsigned long page_size = sysconf(_SC_PAGESIZE);
    numa = atoi(numa_name);
    unsigned long page_count = bytes/page_size;
-    std::vector<void *> pages(pcount);
+    std::vector<void *> pages(page_count);
-    std::vector<int>    nodes(pcount,numa);
+    std::vector<int>    nodes(page_count,numa);
-    std::vector<int>    status(pcount,-1);
+    std::vector<int>    status(page_count,-1);
    for(unsigned long p=0;p<page_count;p++){
-      pages[p] = HostCommBuf + p*page_size;
+      pages[p] =(void *) ((uint64_t) HostCommBuf + p*page_size);
    }
    int ret = move_pages(0,
 			 page_count,
@@ -565,7 +566,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 			 MPOL_MF_MOVE);
    printf("Host buffer move to numa domain %d : move_pages returned %d\n",numa,ret);
    if (ret) perror(" move_pages failed for reason:");
  }
 #endif  
  acceleratorPin(HostCommBuf,bytes);
 #endif  
  ShmCommBuf = acceleratorAllocDevice(bytes);
  if (ShmCommBuf == (void *)NULL ) {
--- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
@@ -332,22 +332,17 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
  //  std::cout << " WilsonFermion5D Communicate Begin " <<std::endl;
  std::vector<std::vector<CommsRequest_t> > requests;
-  auto id=traceStart("Communicate overlapped");
+#ifndef GRID_ACCELERATED
  st.CommunicateBegin(requests);
  /////////////////////////////
  // Overlap with comms
  /////////////////////////////
-  {
+  st.CommunicateBegin(requests);
    //  std::cout << " WilsonFermion5D Comms merge " <<std::endl;
    GRID_TRACE("MergeSHM");
  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms 
-  }
+#endif
  /////////////////////////////
  // do the compute interior
  /////////////////////////////
  //  std::cout << " WilsonFermion5D Interior " <<std::endl;
  int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know
  if (dag == DaggerYes) {
    GRID_TRACE("DhopDagInterior");
@@ -357,12 +352,21 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
    Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
  }
 #ifdef GRID_ACCELERATED
  /////////////////////////////
  // Overlap with comms -- on GPU the interior kernel call is nonblocking
  /////////////////////////////
  st.CommunicateBegin(requests);
  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
 #endif
  /////////////////////////////
  // Complete comms
  /////////////////////////////
  //  std::cout << " WilsonFermion5D Comms Complete " <<std::endl;
  st.CommunicateComplete(requests);
-  traceStop(id);
+  //  traceStop(id);
  /////////////////////////////
  // do the compute exterior
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -368,6 +368,14 @@ public:
    //    accelerator_barrier();     // All kernels should ALREADY be complete
    //    _grid->StencilBarrier();   // Everyone is here, so noone running slow and still using receive buffer
                               // But the HaloGather had a barrier too.
    for(int i=0;i<Packets.size();i++){
      _grid->StencilSendToRecvFromPrepare(MpiReqs,
 					  Packets[i].send_buf,
 					  Packets[i].to_rank,Packets[i].do_send,
 					  Packets[i].recv_buf,
 					  Packets[i].from_rank,Packets[i].do_recv,
 					  Packets[i].xbytes,Packets[i].rbytes,i);
    }
    for(int i=0;i<Packets.size();i++){
      _grid->StencilSendToRecvFromBegin(MpiReqs,
 					Packets[i].send_buf,
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -327,7 +327,10 @@ inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorCopySynchronise(void) {  theCopyAccelerator->wait(); }
 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  {  theCopyAccelerator->memcpy(to,from,bytes);}
 inline void acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes); }
 inline void acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); }
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}
@@ -465,8 +468,7 @@ inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
-//inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
+
 //inline void acceleratorCopySynchronise(void) {  }
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}
 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
@@ -483,6 +485,13 @@ inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize
 #endif
 inline void acceleratorPin(void *ptr,unsigned long bytes)
 {
 #ifdef GRID_SYCL
  sycl::ext::oneapi::experimental::prepare_for_device_copy(ptr,bytes,theCopyAccelerator->get_context());
 #endif
 }
 //////////////////////////////////////////////
 // Common on all GPU targets
 //////////////////////////////////////////////
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,5 +1,5 @@
 # additional include paths necessary to compile the C++ library
-SUBDIRS = Grid HMC benchmarks tests examples
+SUBDIRS = Grid  benchmarks tests examples HMC
 include $(top_srcdir)/doxygen.inc
--- a/configure.ac
+++ b/configure.ac
@@ -72,6 +72,7 @@ AC_CHECK_HEADERS(malloc/malloc.h)
 AC_CHECK_HEADERS(malloc.h)
 AC_CHECK_HEADERS(endian.h)
 AC_CHECK_HEADERS(execinfo.h)
 AC_CHECK_HEADERS(numaif.h)
 AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
 AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])
@@ -245,9 +246,11 @@ AC_ARG_ENABLE([accelerator-aware-mpi],
    [AS_HELP_STRING([--enable-accelerator-aware-mpi=yes|no],[run mpi transfers from device])],
    [ac_ACCELERATOR_AWARE_MPI=${enable_accelerator_aware_mpi}], [ac_ACCELERATOR_AWARE_MPI=yes])
 # Force accelerator CSHIFT now
 AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on device])
 case ${ac_ACCELERATOR_AWARE_MPI} in
    yes)
      AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on device])
      AC_DEFINE([ACCELERATOR_AWARE_MPI],[1],[ Stencil can use device pointers]);;
    *);;
 esac
--- a/systems/Aurora/benchmarks/bench2.pbs
+++ b/systems/Aurora/benchmarks/bench2.pbs
@@ -27,10 +27,22 @@ export MPICH_OFI_NIC_POLICY=GPU
 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 #
 # Local vol 16.16.16.32
 #
 #VOL=32.64.64.96
 for VOL in 32.32.32.96 32.64.64.96
 do
 for AT in 32
 do
 CMD="mpiexec -np 24 -ppn 12  -envall \
-	     ./gpu_tile.sh ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.64.64.96 \
+	     ./gpu_tile.sh ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid $VOL \
-		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 8 "
+		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads $AT --comms-overlap "
 echo $CMD
 $CMD
 done
 done
--- a/systems/Aurora/benchmarks/gpu_tile.sh
+++ b/systems/Aurora/benchmarks/gpu_tile.sh
@@ -5,11 +5,11 @@
 #export  GPU_MAP=(0.0 0.1 3.0 3.1 1.0 1.1 4.0 4.1 2.0 2.1 5.0 5.1)
 export NUMA_PMAP=(0 0 0 1 1 1 0 0 0 1 1 1 );
-export NUMA_MMAP=(2 2 2 3 3 3 3 2 2 2 2 3 3 3 );
+export NUMA_HMAP=(2 2 2 3 3 3 3 2 2 2 2 3 3 3 );
 export  GPU_MAP=(0.0 1.0 2.0 3.0 4.0 5.0 0.1 1.1 2.1 3.1 4.1 5.1 )
 export NUMAP=${NUMA_PMAP[$PALS_LOCAL_RANKID]}
-export NUMAM=${NUMA_PMAP[$PALS_LOCAL_RANKID]}
+export NUMAH=${NUMA_HMAP[$PALS_LOCAL_RANKID]}
 export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
 unset EnableWalkerPartition
@@ -19,17 +19,19 @@ export ONEAPI_DEVICE_FILTER=gpu,level_zero
 export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
-export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:5
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:7
 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
 #export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
 #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
 export MPI_BUF_NUMA=$NUMAH
 echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
 if [ $PALS_RANKID = "0" ]
 then
-    numactl -m $NUMAM -N $NUMAP unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@"
+    numactl -p $NUMAP -N $NUMAP unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@"
-#    numactl -m $NUMAM -N $NUMAP  "$@"
+#    numactl -p $NUMAP -N $NUMAP  "$@"
 else 
-    numactl -m $NUMAM -N $NUMAP  "$@"
+    numactl -p $NUMAP -N $NUMAP  "$@"
 fi
--- a/systems/Aurora/config-command
+++ b/systems/Aurora/config-command
@@ -1,6 +1,7 @@
 #Ahead of time compile for PVC
-export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl " 
+
-export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel  -fsycl -fno-exceptions "
+export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib" 
 export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel  -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/"
 #JIT compile 
 #export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel  -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl "