Best results on Aurora so far

Significantly better performance on Aurora without using pipeline mode
Pipeline mode getting better -- 2 nodes @ 10TF/s per node on Aurora
2026-01-10 11:59:34 +00:00 · 2025-01-31 16:14:45 +00:00 · 2025-01-30 16:36:46 +00:00 · 2025-01-29 09:22:21 +00:00 · 2025-01-28 15:22:46 +00:00 · 2025-01-28 15:22:37 +00:00
21 changed files with 807 additions and 117 deletions
--- a/Grid/communicator/Communicator_base.h
+++ b/Grid/communicator/Communicator_base.h
@@ -136,7 +136,7 @@ public:
    for(int d=0;d<_ndimension;d++){
      column.resize(_processors[d]);
      column[0] = accum;
-      std::vector<CommsRequest_t> list;
+      std::vector<MpiCommsRequest_t> list;
      for(int p=1;p<_processors[d];p++){
 	ShiftedRanks(d,p,source,dest);
 	SendToRecvFromBegin(list,
@@ -166,8 +166,8 @@ public:
  ////////////////////////////////////////////////////////////
  // Face exchange, buffer swap in translational invariant way
  ////////////////////////////////////////////////////////////
-  void CommsComplete(std::vector<CommsRequest_t> &list);
-  void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+  void CommsComplete(std::vector<MpiCommsRequest_t> &list);
+  void SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
 			   void *xmit,
 			   int dest,
 			   void *recv,
@@ -186,6 +186,12 @@ public:
 			       int recv_from_rank,int do_recv,
 			       int bytes,int dir);

+  double StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
+				      void *xmit,
+				      int xmit_to_rank,int do_xmit,
+				      void *recv,
+				      int recv_from_rank,int do_recv,
+				      int xbytes,int rbytes,int dir);
  double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 				    void *xmit,
 				    int xmit_to_rank,int do_xmit,
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -317,7 +317,7 @@ void CartesianCommunicator::GlobalSumVector(double *d,int N)
  assert(ierr==0);
 }

-void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
 						void *xmit,
 						int dest,
 						void *recv,
@@ -342,7 +342,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
  assert(ierr==0);
  list.push_back(xrq);
 }
-void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list)
+void CartesianCommunicator::CommsComplete(std::vector<MpiCommsRequest_t> &list)
 {
  int nreq=list.size();

@@ -361,7 +361,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 					   int from,
 					   int bytes)
 {
-  std::vector<CommsRequest_t> reqs(0);
+  std::vector<MpiCommsRequest_t> reqs(0);
  unsigned long  xcrc = crc32(0L, Z_NULL, 0);
  unsigned long  rcrc = crc32(0L, Z_NULL, 0);

@@ -391,12 +391,224 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 						     int bytes,int dir)
 {
  std::vector<CommsRequest_t> list;
-  double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
+  double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
+  offbytes       += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
  StencilSendToRecvFromComplete(list,dir);
  return offbytes;
 }

-#undef NVLINK_GET // Define to use get instead of put DMA
+
+#ifdef ACCELERATOR_AWARE_MPI
+double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
+							   void *xmit,
+							   int dest,int dox,
+							   void *recv,
+							   int from,int dor,
+							   int xbytes,int rbytes,int dir)
+{
+  return 0.0; // Do nothing -- no preparation required
+}
+double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+							 void *xmit,
+							 int dest,int dox,
+							 void *recv,
+							 int from,int dor,
+							 int xbytes,int rbytes,int dir)
+{
+  int ncomm  =communicator_halo.size();
+  int commdir=dir%ncomm;
+
+  MPI_Request xrq;
+  MPI_Request rrq;
+
+  int ierr;
+  int gdest = ShmRanks[dest];
+  int gfrom = ShmRanks[from];
+  int gme   = ShmRanks[_processor];
+
+  assert(dest != _processor);
+  assert(from != _processor);
+  assert(gme  == ShmRank);
+  double off_node_bytes=0.0;
+  int tag;
+  
+  if ( dor ) {
+    if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
+      tag= dir+from*32;
+      ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
+      assert(ierr==0);
+      list.push_back(rrq);
+      off_node_bytes+=rbytes;
+    }
+  }
+  
+  if (dox) {
+    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
+      tag= dir+_processor*32;
+      ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+      assert(ierr==0);
+      list.push_back(xrq);
+      off_node_bytes+=xbytes;
+    } else {
+      void *shm = (void *) this->ShmBufferTranslate(dest,recv);
+      assert(shm!=NULL);
+      acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
+    }
+  }
+  return off_node_bytes;
+}
+
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
+{ // Completes a halo exchange: local copies, then any MPI requests in list, then a node-local barrier
+  int nreq=list.size();
+  acceleratorCopySynchronise(); // finish the intra-node device-device copies posted in Begin
+
+  if (nreq!=0) {                // MPI requests are only present when Begin posted Isend/Irecv
+    std::vector<MPI_Status> status(nreq);
+    int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
+    assert(ierr==0);
+    list.resize(0);
+  }
+  this->StencilBarrier();       // always rendezvous: neighbours write into our buffer with their own copies
+}
+
+#else /* NOT     ... ACCELERATOR_AWARE_MPI */
+///////////////////////////////////////////
+// Pipeline mode through host memory
+///////////////////////////////////////////
+  /*
+   * Three-phase scheme:
+   * PHASE 1: (prepare)
+   * - post MPI receive buffers asynch
+   * - post device - host send buffer transfer asynch
+   * PHASE 2: (Begin)
+   * - complete all copies
+   * - post MPI send asynch
+   * - post device - device transfers
+   * PHASE 3: (Complete)
+   * - MPI_waitall
+   * - host-device transfers
+   *
+   *********************************
+   * NB could split this further:
+   *--------------------------------
+   * PHASE 1: (Prepare)
+   * - post MPI receive buffers asynch
+   * - post device - host send buffer transfer asynch
+   * PHASE 2: (BeginInterNode)
+   * - complete all copies 
+   * - post MPI send asynch
+   * PHASE 3: (BeginIntraNode)
+   * - post device - device transfers
+   * PHASE 4: (Complete)
+   * - MPI_waitall
+   * - host-device transfers asynch
+   * - (complete all copies) 
+   */
+double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
+							   void *xmit,
+							   int dest,int dox,
+							   void *recv,
+							   int from,int dor,
+							   int xbytes,int rbytes,int dir)
+{
+/*
+ * Bring sequence from Stencil.h down to lower level.
+ * Assume using XeLink is ok
+ */  
+  int ncomm  =communicator_halo.size();
+  int commdir=dir%ncomm;
+
+  MPI_Request xrq;
+  MPI_Request rrq;
+
+  int ierr;
+  int gdest = ShmRanks[dest];
+  int gfrom = ShmRanks[from];
+  int gme   = ShmRanks[_processor];
+
+  assert(dest != _processor);
+  assert(from != _processor);
+  assert(gme  == ShmRank);
+  double off_node_bytes=0.0;
+  int tag;
+
+  void * host_recv = NULL;
+  void * host_xmit = NULL;
+
+  /*
+   * PHASE 1: (Prepare)
+   * - post MPI receive buffers asynch
+   * - post device - host send buffer transfer asynch
+   */
+  
+  if ( dor ) {
+    if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
+      tag= dir+from*32;
+      host_recv = this->HostBufferMalloc(rbytes);
+      ierr=MPI_Irecv(host_recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
+      assert(ierr==0);
+      CommsRequest_t srq;
+      srq.PacketType = InterNodeRecv;
+      srq.bytes      = rbytes;
+      srq.req        = rrq;
+      srq.host_buf   = host_recv;
+      srq.device_buf = recv;
+      list.push_back(srq);
+      off_node_bytes+=rbytes;
+    }
+  }
+  
+  if (dox) {
+    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
+#undef DEVICE_TO_HOST_CONCURRENT // pipeline
+#ifdef DEVICE_TO_HOST_CONCURRENT
+      tag= dir+_processor*32;
+
+      host_xmit = this->HostBufferMalloc(xbytes);
+      acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
+      
+      //      ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+      //      assert(ierr==0);
+      //      off_node_bytes+=xbytes;
+
+      CommsRequest_t srq;
+      srq.PacketType = InterNodeXmit;
+      srq.bytes      = xbytes;
+      //      srq.req        = xrq;
+      srq.host_buf   = host_xmit;
+      srq.device_buf = xmit;
+      list.push_back(srq);
+#else
+      tag= dir+_processor*32;
+
+      host_xmit = this->HostBufferMalloc(xbytes);
+      const int chunks=1;
+      for(int n=0;n<chunks;n++){
+	void * host_xmitc = (void *)( (uint64_t) host_xmit + n*xbytes/chunks);
+	void * xmitc      = (void *)( (uint64_t) xmit      + n*xbytes/chunks);
+	acceleratorCopyFromDeviceAsynch(xmitc, host_xmitc,xbytes/chunks); // Make this Asynch
+      }
+      acceleratorCopySynchronise(); // Complete all pending copy transfers
+      
+      ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+      assert(ierr==0);
+      off_node_bytes+=xbytes;
+
+      CommsRequest_t srq;
+      srq.PacketType = InterNodeXmit;
+      srq.bytes      = xbytes;
+      srq.req        = xrq;
+      srq.host_buf   = host_xmit;
+      srq.device_buf = xmit;
+      list.push_back(srq);
+#endif
+    }
+  }
+
+  return off_node_bytes;
+}
+
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
 							 int dest,int dox,
@@ -421,54 +633,86 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
  double off_node_bytes=0.0;
  int tag;

-  if ( dor ) {
-    if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
-      tag= dir+from*32;
-      ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
-      assert(ierr==0);
-      list.push_back(rrq);
-      off_node_bytes+=rbytes;
-    }
-#ifdef NVLINK_GET
-      void *shm = (void *) this->ShmBufferTranslate(from,xmit);
-      assert(shm!=NULL);
-      acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
-#endif
-  }
+  void * host_xmit = NULL;
+
+  ////////////////////////////////
+  // Receives already posted
+  // Copies already started
+  ////////////////////////////////
+  /*  
+   * PHASE 2: (Begin)
+   * - complete all copies
+   * - post MPI send asynch
+   */
+
+  //  static int printed;
+  //  if((printed<8) && this->IsBoss() ) {
+  //    printf("dir %d doX %d doR %d Face size %ld %ld\n",dir,dox,dor,xbytes,rbytes);
+  //    printed++;
+  //  }
  
  if (dox) {
-    //  rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
+
    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
+#ifdef DEVICE_TO_HOST_CONCURRENT
      tag= dir+_processor*32;
-      ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+      // Find the send in the prepared list
+      int list_idx=-1;
+      for(int idx = 0; idx<list.size();idx++){
+
+	if ( (list[idx].device_buf==xmit)
+	   &&(list[idx].PacketType==InterNodeXmit)
+	   &&(list[idx].bytes==xbytes) ) {
+
+	  list_idx = idx;
+	  host_xmit = list[idx].host_buf;
+	}
+      }
+      assert(list_idx != -1); // found it
+      ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
      assert(ierr==0);
-      list.push_back(xrq);
+      list[list_idx].req        = xrq; // Update the MPI request in the list
      off_node_bytes+=xbytes;
+#endif      
    } else {
-#ifndef NVLINK_GET
      void *shm = (void *) this->ShmBufferTranslate(dest,recv);
      assert(shm!=NULL);
      acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
-#endif
-      
    }
  }
-
  return off_node_bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
   int nreq=list.size();

-  acceleratorCopySynchronise();
-
-  if (nreq==0) return;
-
+  if (nreq==0) {                  // no inter-node traffic, but intra-node puts may still be in flight
+    acceleratorCopySynchronise(); // complete the device-device copies posted in Begin
+    this->StencilBarrier();       // neighbours on this node must have finished their puts as well
+    return;
+  }
   std::vector<MPI_Status> status(nreq);
-  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
+  std::vector<MPI_Request> MpiRequests(nreq);
+  for(int r=0;r<nreq;r++){
+    MpiRequests[r] = list[r].req; // MPI_Waitall needs a contiguous array of bare requests
+  }
+  int ierr = MPI_Waitall(nreq,&MpiRequests[0],&status[0]);
   assert(ierr==0);
-  list.resize(0);
+  for(int r=0;r<nreq;r++){
+    if ( list[r].PacketType==InterNodeRecv ) {
+      acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
+    }
+  }
+  acceleratorCopySynchronise(); // Complete all pending copy transfers
+  list.resize(0);               // Delete the list
+  this->HostBufferFreeAll();    // Clean up the buffer allocs
+  this->StencilBarrier(); 
 }
+#endif
+////////////////////////////////////////////
+// END PIPELINE MODE / NO CUDA AWARE MPI
+////////////////////////////////////////////
+
 void CartesianCommunicator::StencilBarrier(void)
 {
  MPI_Barrier  (ShmComm);
--- a/Grid/communicator/Communicator_none.cc
+++ b/Grid/communicator/Communicator_none.cc
@@ -132,6 +132,15 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 {
  return 2.0*bytes;
 }
+double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
+							   void *xmit,
+							   int xmit_to_rank,int dox,
+							   void *recv,
+							   int recv_from_rank,int dor,
+							   int xbytes,int rbytes, int dir)
+{
+  return xbytes+rbytes;
+}
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
 							 int xmit_to_rank,int dox,
--- a/Grid/communicator/SharedMemory.h
+++ b/Grid/communicator/SharedMemory.h
@@ -46,8 +46,22 @@ NAMESPACE_BEGIN(Grid);

 #if defined (GRID_COMMS_MPI3) 
 typedef MPI_Comm    Grid_MPI_Comm;
+typedef MPI_Request MpiCommsRequest_t;
+#ifdef ACCELERATOR_AWARE_MPI
 typedef MPI_Request CommsRequest_t;
+#else
+enum PacketType_t { InterNodeXmit, InterNodeRecv, IntraNodeXmit, IntraNodeRecv };
+typedef struct {
+  PacketType_t PacketType;
+  void *host_buf;
+  void *device_buf;
+  unsigned long bytes;
+  MpiCommsRequest_t req;
+} CommsRequest_t;
+#endif
+
 #else 
+typedef int MpiCommsRequest_t;
 typedef int CommsRequest_t;
 typedef int Grid_MPI_Comm;
 #endif
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -42,6 +42,11 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #ifdef ACCELERATOR_AWARE_MPI
 #define GRID_SYCL_LEVEL_ZERO_IPC
 #define SHM_SOCKETS
+#else
+#ifdef HAVE_NUMAIF_H
+  #warning " Using NUMAIF "
+#include <numaif.h>
+#endif 
 #endif 
 #include <syscall.h>
 #endif
@@ -537,7 +542,38 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
  // Each MPI rank should allocate our own buffer
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 #ifndef ACCELERATOR_AWARE_MPI
-  HostCommBuf= malloc(bytes);
+  printf("Host buffer allocate for GPU non-aware MPI\n");
+#if 0
+  HostCommBuf= acceleratorAllocHost(bytes);
+#else 
+  HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
+#ifdef HAVE_NUMAIF_H
+  #warning "Moving host buffers to specific NUMA domain"
+  int numa;
+  char *numa_name=(char *)getenv("MPI_BUF_NUMA");
+  if(numa_name) {
+    unsigned long page_size = sysconf(_SC_PAGESIZE);
+    numa = atoi(numa_name);
+    unsigned long page_count = bytes/page_size;
+    std::vector<void *> pages(page_count);
+    std::vector<int>    nodes(page_count,numa);
+    std::vector<int>    status(page_count,-1);
+    for(unsigned long p=0;p<page_count;p++){
+      pages[p] =(void *) ((uint64_t) HostCommBuf + p*page_size);
+    }
+    int ret = move_pages(0,
+			 page_count,
+			 &pages[0],
+			 &nodes[0],
+			 &status[0],
+			 MPOL_MF_MOVE);
+    printf("Host buffer move to numa domain %d : move_pages returned %d\n",numa,ret);
+    if (ret) perror(" move_pages failed for reason:");
+  }
+#endif  
+  acceleratorPin(HostCommBuf,bytes);
+#endif  
+
 #endif  
  ShmCommBuf = acceleratorAllocDevice(bytes);
  if (ShmCommBuf == (void *)NULL ) {
--- a/Grid/lattice/PaddedCell.h
+++ b/Grid/lattice/PaddedCell.h
@@ -467,8 +467,8 @@ public:
    send_buf.resize(buffer_size*2*depth);    
    recv_buf.resize(buffer_size*2*depth);

-    std::vector<CommsRequest_t> fwd_req;   
-    std::vector<CommsRequest_t> bwd_req;   
+    std::vector<MpiCommsRequest_t> fwd_req;   
+    std::vector<MpiCommsRequest_t> bwd_req;   

    int words = buffer_size;
    int bytes = words * sizeof(vobj);
--- a/Grid/qcd/action/fermion/WilsonFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonFermion5D.h
@@ -119,6 +119,9 @@ public:
  void DhopOE(const FermionField &in, FermionField &out,int dag);
  void DhopEO(const FermionField &in, FermionField &out,int dag);

+  void DhopComms  (const FermionField &in, FermionField &out);
+  void DhopCalc   (const FermionField &in, FermionField &out,uint64_t *ids);
+  
  // add a DhopComm
  // -- suboptimal interface will presently trigger multiple comms.
  void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
--- a/Grid/qcd/action/fermion/WilsonKernels.h
+++ b/Grid/qcd/action/fermion/WilsonKernels.h
@@ -57,6 +57,10 @@ public:
 			 int Ls, int Nsite, const FermionField &in, FermionField &out,
 			 int interior=1,int exterior=1) ;

+  static void DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
+			 int Ls, int Nsite, const FermionField &in, FermionField &out,
+			 uint64_t *ids);
+  
  static void DhopDagKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
 			    int Ls, int Nsite, const FermionField &in, FermionField &out,
 			    int interior=1,int exterior=1) ;
--- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
@@ -332,22 +332,18 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
  
  //  std::cout << " WilsonFermion5D Communicate Begin " <<std::endl;
  std::vector<std::vector<CommsRequest_t> > requests;
-  auto id=traceStart("Communicate overlapped");
-  st.CommunicateBegin(requests);

+#if 1
  /////////////////////////////
  // Overlap with comms
  /////////////////////////////
-  {
-    //  std::cout << " WilsonFermion5D Comms merge " <<std::endl;
-    GRID_TRACE("MergeSHM");
-    st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
-  }
-      
+  st.CommunicateBegin(requests);
+  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms 
+#endif
+
  /////////////////////////////
  // do the compute interior
  /////////////////////////////
-  //  std::cout << " WilsonFermion5D Interior " <<std::endl;
  int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know
  if (dag == DaggerYes) {
    GRID_TRACE("DhopDagInterior");
@@ -356,13 +352,23 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
    GRID_TRACE("DhopInterior");
    Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
  }
-
+  
+  //ifdef GRID_ACCELERATED
+#if 0
+  /////////////////////////////
+  // Overlap with comms -- on GPU the interior kernel call is nonblocking
+  /////////////////////////////
+  st.CommunicateBegin(requests);
+  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
+#endif
+  
+  
  /////////////////////////////
  // Complete comms
  /////////////////////////////
  //  std::cout << " WilsonFermion5D Comms Complete " <<std::endl;
  st.CommunicateComplete(requests);
-  traceStop(id);
+  //  traceStop(id);

  /////////////////////////////
  // do the compute exterior
@@ -438,6 +444,29 @@ void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int

  DhopInternal(StencilOdd,UmuEven,in,out,dag);
 }
+template<class Impl>
+void WilsonFermion5D<Impl>::DhopComms(const FermionField &in, FermionField &out)
+{
+  int dag =0 ;
+  conformable(in.Grid(),FermionGrid()); // verifies full grid
+  conformable(in.Grid(),out.Grid());
+  out.Checkerboard() = in.Checkerboard();
+  Compressor compressor(dag);
+  Stencil.HaloExchangeOpt(in,compressor);
+}
+template<class Impl>
+void WilsonFermion5D<Impl>::DhopCalc(const FermionField &in, FermionField &out,uint64_t *ids)
+{
+  conformable(in.Grid(),FermionGrid()); // verifies full grid
+  conformable(in.Grid(),out.Grid());
+
+  out.Checkerboard() = in.Checkerboard();
+
+  int LLs = in.Grid()->_rdimensions[0];
+  int Opt = WilsonKernelsStatic::Opt;
+  Kernels::DhopKernel(Opt,Stencil,Umu,Stencil.CommBuf(),LLs,Umu.oSites(),in,out,ids);
+}
+
 template<class Impl>
 void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
 {
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@@ -411,6 +411,46 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
 #undef LoopBody
 }

+#ifdef GRID_SYCL
+extern "C" {
+    ulong SYCL_EXTERNAL __attribute__((overloadable)) intel_get_cycle_counter( void );
+    uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_active_channel_mask( void );
+    uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_grf_register( uint reg );
+    uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_flag_register( uint flag );
+    uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_control_register( uint reg );
+    uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_hw_thread_id( void );
+    uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_slice_id( void );
+    uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_subslice_id( void );
+    uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_eu_id( void );
+    uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_eu_thread_id( void );
+    void  SYCL_EXTERNAL __attribute__((overloadable)) intel_eu_thread_pause( uint value );
+}
+#ifdef GRID_SIMT
+#define MAKE_ID(A) ((intel_get_eu_id()<<16)|(intel_get_slice_id()<<8)|(intel_get_subslice_id())) /* outer parens keep the expansion a single operand */
+#else
+#define MAKE_ID(A) (0)
+#endif
+
+#else
+
+#define MAKE_ID(A) (0)
+
+#endif
+
+
+#define KERNEL_CALL_ID(A)						\
+  const uint64_t    NN = Nsite*Ls;					\
+  accelerator_forNB( ss, NN, Simd::Nsimd(), {				\
+      int sF = ss;							\
+      int sU = ss/Ls;							\
+      WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v);		\
+      const int Nsimd = SiteHalfSpinor::Nsimd();			\
+      const int lane=acceleratorSIMTlane(Nsimd);                        \
+      int idx=sF*Nsimd+lane;						\
+      uint64_t id = MAKE_ID();						\
+      ids[idx]=id;							\
+    });									\
+  accelerator_barrier();

 #define KERNEL_CALLNB(A)						\
  const uint64_t    NN = Nsite*Ls;					\
@@ -418,7 +458,7 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
      int sF = ss;							\
      int sU = ss/Ls;							\
      WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v);		\
-  });
+    });

 #define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier();

@@ -451,6 +491,8 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
    WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v);		\
    });}

+
+
 template <class Impl>
 void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
 				     int Ls, int Nsite, const FermionField &in, FermionField &out,
@@ -485,6 +527,18 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
   }
   assert(0 && " Kernel optimisation case not covered ");
  }
+
+template <class Impl>
+void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
+				     int Ls, int Nsite, const FermionField &in, FermionField &out,
+				     uint64_t *ids)
+{
+    autoView(U_v  ,  U,AcceleratorRead);
+    autoView(in_v , in,AcceleratorRead);
+    autoView(out_v,out,AcceleratorWrite);
+    autoView(st_v , st,AcceleratorRead);
+    KERNEL_CALL_ID(GenericDhopSite);
+}
  template <class Impl>
  void WilsonKernels<Impl>::DhopDagKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
 					  int Ls, int Nsite, const FermionField &in, FermionField &out,
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -368,6 +368,15 @@ public:
    //    accelerator_barrier();     // All kernels should ALREADY be complete
    //    _grid->StencilBarrier();   // Everyone is here, so noone running slow and still using receive buffer
                               // But the HaloGather had a barrier too.
+    for(int i=0;i<Packets.size();i++){
+      _grid->StencilSendToRecvFromPrepare(MpiReqs,
+					  Packets[i].send_buf,
+					  Packets[i].to_rank,Packets[i].do_send,
+					  Packets[i].recv_buf,
+					  Packets[i].from_rank,Packets[i].do_recv,
+					  Packets[i].xbytes,Packets[i].rbytes,i);
+    }
+    acceleratorCopySynchronise();
    for(int i=0;i<Packets.size();i++){
      _grid->StencilSendToRecvFromBegin(MpiReqs,
 					Packets[i].send_buf,
@@ -393,8 +402,6 @@ public:
    else DslashLogFull();
    //    acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
    //    accelerator_barrier(); 
-    _grid->StencilBarrier(); 
-    // run any checksums
    for(int i=0;i<Packets.size();i++){
      if ( Packets[i].do_recv )
 	FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -209,6 +209,17 @@ void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
    }									\
  }

+inline void *acceleratorAllocHost(size_t bytes) // host allocation through cudaMallocHost; asserts on failure
+{
+  void *ptr=NULL;
+  auto err = cudaMallocHost((void **)&ptr,bytes);
+  if( err != cudaSuccess ) {
+    ptr = (void *) NULL;
+    printf(" cudaMallocHost failed for %zu %s \n",bytes,cudaGetErrorString(err)); // %zu: bytes is size_t
+    assert(0);
+  }
+  return ptr;
+}
 inline void *acceleratorAllocShared(size_t bytes)
 {
  void *ptr=NULL;
@@ -230,8 +241,10 @@ inline void *acceleratorAllocDevice(size_t bytes)
  }
  return ptr;
 };
+
 inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
+inline void acceleratorFreeHost(void *ptr){ cudaFreeHost(ptr);}; // pairs with cudaMallocHost in acceleratorAllocHost; cudaFree is for device memory
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
 inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
@@ -322,12 +335,17 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #define accelerator_barrier(dummy) { theGridAccelerator->wait(); }

 inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
+inline void *acceleratorAllocHost(size_t bytes)  { return malloc_host(bytes,*theGridAccelerator);};
 inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
+inline void acceleratorFreeHost(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};

 inline void acceleratorCopySynchronise(void) {  theCopyAccelerator->wait(); }
+
 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  {  theCopyAccelerator->memcpy(to,from,bytes);}
+inline void acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes); }
+inline void acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); }
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}
@@ -438,6 +456,16 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
    }								\
  }

+inline void *acceleratorAllocHost(size_t bytes) // host allocation through hipMallocHost; returns NULL on failure
+{
+  void *ptr=NULL;
+  auto err = hipMallocHost((void **)&ptr,bytes);
+  if( err != hipSuccess ) {
+    ptr = (void *) NULL;
+    fprintf(stderr," hipMallocHost failed for %zu %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
+  }
+  return ptr;
+};
 inline void *acceleratorAllocShared(size_t bytes)
 {
  void *ptr=NULL;
@@ -461,12 +489,12 @@ inline void *acceleratorAllocDevice(size_t bytes)
  return ptr;
 };

+inline void acceleratorFreeHost(void *ptr){ auto discard=hipHostFree(ptr);}; // pairs with hipMallocHost in acceleratorAllocHost; hipFree is for device memory
 inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
-//inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
-//inline void acceleratorCopySynchronise(void) {  }
+
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}

 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
@@ -483,6 +511,13 @@ inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize

 #endif

+inline void acceleratorPin(void *ptr,unsigned long bytes)
+{
+#ifdef GRID_SYCL
+  sycl::ext::oneapi::experimental::prepare_for_device_copy(ptr,bytes,theCopyAccelerator->get_context());
+#endif
+}
+
 //////////////////////////////////////////////
 // Common on all GPU targets
 //////////////////////////////////////////////
@@ -537,8 +572,10 @@ inline void acceleratorCopySynchronise(void) {};
 inline int  acceleratorIsCommunicable(void *ptr){ return 1; }
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);}
 #ifdef HAVE_MM_MALLOC_H
+inline void *acceleratorAllocHost(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
 inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
 inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
+inline void acceleratorFreeHost(void *ptr){_mm_free(ptr);};
 inline void acceleratorFreeShared(void *ptr){_mm_free(ptr);};
 inline void acceleratorFreeDevice(void *ptr){_mm_free(ptr);};
 #else
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,5 +1,5 @@
 # additional include paths necessary to compile the C++ library
-SUBDIRS = Grid HMC benchmarks tests examples
+SUBDIRS = Grid  benchmarks tests examples HMC

 include $(top_srcdir)/doxygen.inc

--- a/benchmarks/Benchmark_dwf_fp32.cc
+++ b/benchmarks/Benchmark_dwf_fp32.cc
@@ -52,7 +52,7 @@ int main (int argc, char ** argv)

  int threads = GridThread::GetThreads();

-  int Ls=16;
+  int Ls=8;
  for(int i=0;i<argc;i++) {
    if(std::string(argv[i]) == "-Ls"){
      std::stringstream ss(argv[i+1]); ss >> Ls;
--- a/configure.ac
+++ b/configure.ac
@@ -72,6 +72,7 @@ AC_CHECK_HEADERS(malloc/malloc.h)
 AC_CHECK_HEADERS(malloc.h)
 AC_CHECK_HEADERS(endian.h)
 AC_CHECK_HEADERS(execinfo.h)
+AC_CHECK_HEADERS(numaif.h)
 AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
 AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])

@@ -240,6 +241,20 @@ case ${ac_SFW_FP16} in
 esac


+############### MPI BOUNCE TO HOST
+AC_ARG_ENABLE([accelerator-aware-mpi],
+    [AS_HELP_STRING([--enable-accelerator-aware-mpi=yes|no],[run mpi transfers from device])],
+    [ac_ACCELERATOR_AWARE_MPI=${enable_accelerator_aware_mpi}], [ac_ACCELERATOR_AWARE_MPI=yes])
+
+# Force accelerator CSHIFT now
+AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on device])
+
+case ${ac_ACCELERATOR_AWARE_MPI} in
+    yes)
+      AC_DEFINE([ACCELERATOR_AWARE_MPI],[1],[ Stencil can use device pointers]);;
+    *);;
+esac
+
 ############### SYCL/CUDA/HIP/none
 AC_ARG_ENABLE([accelerator],
    [AS_HELP_STRING([--enable-accelerator=cuda|sycl|hip|none],[enable none,cuda,sycl,hip acceleration])],
--- a/systems/Aurora/benchmarks/bench1.pbs
+++ b/systems/Aurora/benchmarks/bench1.pbs
@@ -1,6 +1,7 @@
 #!/bin/bash

-#PBS -q EarlyAppAccess
+##PBS -q EarlyAppAccess
+#PBS -q debug
 #PBS -l select=1
 #PBS -l walltime=00:20:00
 #PBS -A LatticeQCD_aesp_CNDA
@@ -12,27 +13,24 @@ source ../sourceme.sh
 cp $PBS_NODEFILE nodefile

 export OMP_NUM_THREADS=4
-export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
-unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
-unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
-unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
+export MPICH_OFI_NIC_POLICY=GPU
+
+#export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
+#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
 #export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
-export MPICH_OFI_NIC_POLICY=GPU
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16

 CMD="mpiexec -np 12 -ppn 12  -envall \
-	     ./Benchmark_dwf_fp32 --mpi 2.1.2.3 --grid 32.32.64.48 \
-		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --debug-signals"
+	     ./gpu_tile.sh ./Benchmark_dwf_fp32 --mpi 2.1.2.3 --grid 32.32.64.96 \
+		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 8 "

-#for f in 1 2 3 4 5 6 7 8
-for f in 1
-do
 echo $CMD
-$CMD | tee 1node.32.32.64.48.dwf.hbm.$f
-done
+$CMD

--- a/systems/Aurora/benchmarks/bench2.pbs
+++ b/systems/Aurora/benchmarks/bench2.pbs
@@ -1,58 +1,48 @@
 #!/bin/bash

-#PBS -q EarlyAppAccess
+##PBS -q EarlyAppAccess
+#PBS -q debug
 #PBS -l select=2
 #PBS -l walltime=00:20:00
 #PBS -A LatticeQCD_aesp_CNDA

-#export OMP_PROC_BIND=spread
-#unset OMP_PLACES
-
 cd $PBS_O_WORKDIR

 source ../sourceme.sh
-#module load pti-gpu
-

 cp $PBS_NODEFILE nodefile

 export OMP_NUM_THREADS=4
-export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+export MPICH_OFI_NIC_POLICY=GPU
+
+#export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
-export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
-export MPICH_OFI_NIC_POLICY=GPU
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16

-# 12 ppn, 2 nodes, 24 ranks
 #
-CMD="mpiexec -np 24 -ppn 12  -envall \
-	     ./gpu_tile.sh \
-	     ./Benchmark_comms_host_device --mpi 2.2.2.3 --grid 24.32.32.24 \
-		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32" 
-#$CMD | tee 2node.comms.hbm
+# Local vol 16.16.16.32
+#

+#VOL=32.64.64.96

-CMD="mpiexec -np 24 -ppn 12  -envall \
-	     ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.32.64.48 \
-		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap --debug-signals"
-
-#for f in 1 2 3 4 5 6 7 8
-for f in 1
+for VOL in 32.32.32.96 32.64.64.96
 do
+for AT in 32
+do
+CMD="mpiexec -np 24 -ppn 12  -envall \
+	     ./gpu_tile.sh ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid $VOL \
+		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads $AT --comms-overlap "
+
 echo $CMD
-$CMD | tee 2node.32.32.64.48.dwf.hbm.$f
+$CMD
+done
 done

-CMD="mpiexec -np 24 -ppn 12  -envall \
-	     ./gpu_tile.sh \
-	     ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 64.64.64.96 \
-		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
-#$CMD | tee 2node.64.64.64.96.dwf.hbm
-
--- a/systems/Aurora/benchmarks/gpu_tile_compact.sh
+++ b/systems/Aurora/benchmarks/gpu_tile_compact.sh
@@ -4,10 +4,12 @@
 #export NUMA_MAP=(0 0 1 1 0 0 1 1 0 0 1 1);
 #export  GPU_MAP=(0.0 0.1 3.0 3.1 1.0 1.1 4.0 4.1 2.0 2.1 5.0 5.1)

-export NUMA_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 );
+export NUMA_PMAP=(0 0 0 1 1 1 0 0 0 1 1 1 );
+export NUMA_HMAP=(2 2 2 3 3 3 3 2 2 2 2 3 3 3 );
 export  GPU_MAP=(0.0 1.0 2.0 3.0 4.0 5.0 0.1 1.1 2.1 3.1 4.1 5.1 )

-export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
+export NUMAP=${NUMA_PMAP[$PALS_LOCAL_RANKID]}
+export NUMAH=${NUMA_HMAP[$PALS_LOCAL_RANKID]}
 export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
  
 unset EnableWalkerPartition
@@ -17,18 +19,19 @@ export ONEAPI_DEVICE_FILTER=gpu,level_zero

 export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
-export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:5
-#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:3
 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
+#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
 #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1

+#export MPI_BUF_NUMA=$NUMAH
+
 echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "

 if [ $PALS_RANKID = "0" ]
 then
-#    numactl -m $NUMA -N $NUMA onetrace --chrome-device-timeline  "$@"
-#    numactl -m $NUMA -N $NUMA unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@"
-    numactl -m $NUMA -N $NUMA  "$@"
+    numactl -p $NUMAP -N $NUMAP unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@"
+#    numactl -p $NUMAP -N $NUMAP  "$@"
 else 
-    numactl -m $NUMA -N $NUMA  "$@"
+    numactl -p $NUMAP -N $NUMAP  "$@"
 fi
--- a/systems/Aurora/config-command
+++ b/systems/Aurora/config-command
@@ -1,6 +1,7 @@
 #Ahead of time compile for PVC
-export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl " 
-export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel  -fsycl -fno-exceptions "
+
+export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib" 
+export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel  -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/"

 #JIT compile 
 #export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel  -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl " 
@@ -17,7 +18,7 @@ export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-co
 	--with-lime=$CLIME \
 	--enable-shm=nvlink \
 	--enable-accelerator=sycl \
-	--enable-accelerator-aware-mpi=yes\
+	--enable-accelerator-aware-mpi=no\
 	--enable-unified=no \
 	MPICXX=mpicxx \
 	CXX=icpx 
--- a/systems/Aurora/sourceme.sh
+++ b/systems/Aurora/sourceme.sh
@@ -2,6 +2,7 @@
 #module load mpich/icc-all-debug-pmix-gpu/52.2
 #module load mpich-config/mode/deterministic
 #module load intel_compute_runtime/release/821.35
+module load pti-gpu

 source ~/spack/share/spack/setup-env.sh 
 spack load c-lime
--- a/tests/Test_dwf_dslash_repro.cc
+++ b/tests/Test_dwf_dslash_repro.cc
@@ -0,0 +1,239 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/Test_dwf_dslash_repro.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+
+#ifndef HOST_NAME_MAX
+#define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
+#endif
+
+typedef LatticeFermionD FermionField;
+
+int VerifyOnDevice(const FermionField &res, FermionField &ref)
+{
+  // Compares res with ref word by word inside an accelerator_for; returns 1 if any word differs, else 0.
+  deviceVector<int> Fails(1);
+  int * Fail = &Fails[0];
+  int FailHost=0;
+  
+  typedef typename FermionField::vector_object vobj;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+  const uint64_t NN = res.Grid()->oSites();
+
+  acceleratorPut(*Fail,FailHost);
+
+  accelerator_barrier();
+  // Optional error injection: only when GRID_ERROR_INJECT is set, only on rank 0,
+  // and only on calls where the low four bits of random() are all zero.
+  int injection=0;
+  if(getenv("GRID_ERROR_INJECT")) injection=1;
+  autoView(res_v,res,AcceleratorWrite);
+  autoView(ref_v,ref,AcceleratorRead);
+  if ( res.Grid()->ThisRank()== 0 )
+  {
+    if (((random()&0xF)==0)&&injection) {
+      uint64_t sF = random()%(NN);
+      // Overwrite the first double of the chosen site with M_PI and write the site back
+      printf("Error injection site %lu on rank %d\n",(unsigned long)sF,res.Grid()->ThisRank());
+      auto vv = acceleratorGet(res_v[sF]);
+      double *dd = (double *)&vv;
+      *dd=M_PI;
+      acceleratorPut(res_v[sF],vv);
+    }
+  }
+  // GRID_SIMT builds check the single lane given by acceleratorSIMTlane; other builds loop over all lanes from 0
+  accelerator_for( sF, NN, vobj::Nsimd(), {
+#ifdef GRID_SIMT
+      {
+        int blane = acceleratorSIMTlane(vobj::Nsimd());
+#else
+      for(int blane=0;blane<vobj::Nsimd();blane++){
+#endif
+        vector_type *vtrr = (vector_type *)&res_v[sF];
+        vector_type *vtrf = (vector_type *)&ref_v[sF];
+        int words = sizeof(vobj)/sizeof(vector_type);
+
+        for(int w=0;w<words;w++){
+          scalar_type rrtmp = getlane(vtrr[w], blane);
+          scalar_type rftmp = getlane(vtrf[w], blane);
+          if ( rrtmp != rftmp) {
+              *Fail=1;
+          }
+        }
+      }
+  });
+
+  FailHost = acceleratorGet(*Fail);
+
+  return FailHost;
+}
+void PrintFails(const FermionField &res, FermionField &ref,uint64_t *ids)
+{
+  // Host-side diagnostic: re-compares res against ref component by component and
+  // prints one line for every (s,c) component, s<4 and c<3, that differs.
+  //   res, ref : fields to compare, opened below with CpuRead views
+  //   ids      : buffer of NN*Nsimd 64-bit words, fetched with acceleratorCopyFromDevice;
+  //              each word is unpacked below into subslice / slice / EU bytes
+
+  typedef typename FermionField::vector_object vobj;
+
+  const int Nsimd=vobj::Nsimd();
+  const uint64_t NN = res.Grid()->oSites();
+
+  // Open both fields for reading on the CPU
+  autoView(res_v,res,CpuRead);
+  autoView(ref_v,ref,CpuRead);
+
+  // Fetch the id words into a host-side vector
+  std::vector<uint64_t> ids_host(NN*Nsimd);
+  acceleratorCopyFromDevice(ids,&ids_host[0],NN*Nsimd*sizeof(uint64_t));
+
+  // Walk every outer site and every SIMD lane within it
+  for(int site=0;site< NN; site++){
+    for(int lane=0;lane<Nsimd;lane++){
+      auto got  = extractLane(lane,res_v[site]);
+      auto want = extractLane(lane,ref_v[site]);
+      const uint64_t id = ids_host[lane+Nsimd*site];
+
+      // Three byte-wide fields taken from the low 24 bits of the id word
+      const int subslice=(id>>0 )&0xFF;
+      const int slice   =(id>>8 )&0xFF;
+      const int eu      =(id>>16)&0xFF;
+
+      for(int s=0;s<4;s++){
+        for(int c=0;c<3;c++){
+          if ( got()(s)(c)!=want()(s)(c) ) {
+            std::cout << GridHostname()<<" miscompare site "<<site<<" "<<got()(s)(c)<<" "<<want()(s)(c)<<" EU "<<eu<<" slice "<<slice<<" subslice "<<subslice<<std::endl;
+          }
+        }
+      }
+    }
+  }
+}
+
+
+
+int main (int argc, char ** argv)
+{
+  char hostname[HOST_NAME_MAX+1];
+  gethostname(hostname, HOST_NAME_MAX+1);
+  std::string host(hostname); // NOTE(review): host is not referenced again in main
+  // Purpose: build a reference with DhopCalc once, then recompute it in a loop for about nsecs seconds and report any iteration whose result differs from the reference.
+  Grid_init(&argc,&argv);
+
+  const int Ls=12;
+  // Four-dimensional grid and its red-black companion, then the five-dimensional (Ls) versions built from UGrid
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+  // Fixed integer seeds for the RNGs living on FGrid (RNG5) and UGrid (RNG4)
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
+  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+
+  LatticeGaugeField Umu(UGrid);
+  LatticeFermionD    src(FGrid); random(RNG5,src);
+  LatticeFermionD   junk(FGrid); random(RNG5,junk);
+
+  LatticeFermionD result(FGrid); result=Zero();
+  LatticeFermionD ref(FGrid); ref=Zero();
+  
+  SU<Nc>::HotConfiguration(RNG4,Umu);
+
+  RealD mass=0.1;
+  RealD M5=1.8;
+
+  DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+  // Run length in seconds: 600 unless --seconds is supplied on the command line
+  int nsecs=600;
+  if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){
+    std::string arg = GridCmdOptionPayload(argv,argv+argc,"--seconds");
+    GridCmdOptionInt(arg,nsecs);
+  }
+  
+  std::cout << GridLogMessage << "::::::::::::: Job startup Barrier " << std::endl;
+  UGrid->Barrier();
+  std::cout << GridLogMessage << "::::::::::::: Job startup Barrier complete" << std::endl;
+
+  std::cout << GridLogMessage << "::::::::::::: Starting DWF repro for "<<nsecs <<" seconds" << std::endl;
+  // start is read locally and then passed through Broadcast(0,...) - presumably so all ranks adopt rank 0's value; TODO confirm
+  time_t now;
+  time_t start = time(NULL);
+  UGrid->Broadcast(0,(void *)&start,sizeof(start));
+  // FlightRecorder switches are set to 0 here; three of them can be overridden from the environment (parsed with atoi)
+  FlightRecorder::ContinueOnFail = 0;
+  FlightRecorder::PrintEntireLog = 0;
+  FlightRecorder::ChecksumComms  = 0;
+  FlightRecorder::ChecksumCommsSend=0;
+
+  if(char *s=getenv("GRID_PRINT_ENTIRE_LOG"))  FlightRecorder::PrintEntireLog     = atoi(s);
+  if(char *s=getenv("GRID_CHECKSUM_RECV_BUF")) FlightRecorder::ChecksumComms      = atoi(s);
+  if(char *s=getenv("GRID_CHECKSUM_SEND_BUF")) FlightRecorder::ChecksumCommsSend  = atoi(s);
+
+  const uint64_t NN = FGrid->oSites()*vComplexD::Nsimd();
+  // One uint64_t per outer site and SIMD lane; the raw pointer is handed to DhopCalc and PrintFails
+  deviceVector<uint64_t> ids_device(NN);
+  uint64_t *ids = &ids_device[0];
+  
+  // Reference: one DhopComms followed by one DhopCalc into ref
+  Ddwf.DhopComms(src,ref);
+  Ddwf.DhopCalc(src,ref,ids);
+  // NOTE(review): DhopComms for result is called once here and not repeated inside the loop - confirm that is intended
+  Ddwf.DhopComms(src,result);
+  
+  int iter=0;
+  do {
+    // Overwrite result with junk before each recompute - presumably so a stale correct answer cannot mask a fault
+    result=junk;
+
+    Ddwf.DhopCalc(src,result,ids);
+
+    if ( VerifyOnDevice(result, ref) ) {
+      printf("Node %s Iter %d detected fails\n",GridHostname(),iter);
+      PrintFails(result,ref,ids);
+      //      std::cout << " Dslash "<<iter<<" is WRONG! "<<std::endl;
+    }
+    //else {
+    //      printf("Node %s Iter %d detected NO fails\n",GridHostname(),iter);
+    //      PrintFails(result,ref,ids);
+    //      std::cout << " Dslash "<<iter<<" is OK! "<<std::endl;
+    //}
+
+    // Count the iteration and refresh the clock (passed through Broadcast) that the loop condition tests
+    iter ++;
+    now = time(NULL); UGrid->Broadcast(0,(void *)&now,sizeof(now));
+  } while (now < (start + nsecs) );
+
+  
+  Grid_finalize();
+}
Author	SHA1	Message	Date
Peter Boyle	8cf809e231	Best results on Aurora so far	2025-01-31 16:14:45 +00:00
Peter Boyle	94019a922e	Significantly better performance on Aurora without using pipeline mode	2025-01-30 16:36:46 +00:00
Peter Boyle	d6b2727f86	Pipeline mode getting better -- 2 nodes @ 10TF/s per node on Aurora	2025-01-29 09:22:21 +00:00
Peter Boyle	74a4f43946	Optional host buffer bounce for no CUDA aware MPI	2025-01-28 15:22:46 +00:00
Peter Boyle	1caf8b0f86	Rename	2025-01-28 15:22:37 +00:00
Peter Boyle	8fe429346f	Dslash testing for reproduce	2024-11-11 23:11:11 +00:00