From 5bfa88be85b454e09ed82a8361f4cdf6846e7823 Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Tue, 6 Feb 2024 16:28:40 +0000
Subject: [PATCH] Aurora MPI standalone benchmark and options that work well

---
 MPI_benchmark/bench2.pbs                      |  22 ++
 MPI_benchmark/compile-command                 |   1 +
 MPI_benchmark/gpu_tile_compact.sh             |  30 ++
 MPI_benchmark/halo_mpi.cc                     | 333 ++++++++++++++++++
 systems/Aurora/benchmarks/bench.pbs           |   7 +-
 systems/Aurora/benchmarks/bench2.pbs          |  44 +--
 systems/Aurora/benchmarks/gpu_tile_compact.sh |  62 +---
 .../Aurora/benchmarks/gpu_tile_compact4.sh    |  43 +--
 systems/Aurora/sourceme.sh                    |   2 +-
 9 files changed, 426 insertions(+), 118 deletions(-)
 create mode 100644 MPI_benchmark/bench2.pbs
 create mode 100644 MPI_benchmark/compile-command
 create mode 100755 MPI_benchmark/gpu_tile_compact.sh
 create mode 100644 MPI_benchmark/halo_mpi.cc

diff --git a/MPI_benchmark/bench2.pbs b/MPI_benchmark/bench2.pbs
new file mode 100644
index 00000000..2c069a20
--- /dev/null
+++ b/MPI_benchmark/bench2.pbs
@@ -0,0 +1,22 @@
+#!/bin/bash
+#PBS -q EarlyAppAccess
+#PBS -l select=2
+#PBS -l walltime=01:00:00
+#PBS -A LatticeQCD_aesp_CNDA
+
+export TZ='/usr/share/zoneinfo/US/Central'
+export OMP_PROC_BIND=spread
+export OMP_NUM_THREADS=3
+unset OMP_PLACES
+
+cd $PBS_O_WORKDIR
+
+NNODES=`wc -l < $PBS_NODEFILE`
+NRANKS=12                  # Number of MPI ranks per node
+NDEPTH=4                   # Number of hardware threads per rank, spacing between MPI ranks on a node
+NTHREADS=$OMP_NUM_THREADS  # Number of OMP threads per rank, given to OMP_NUM_THREADS
+
+NTOTRANKS=$(( NNODES * NRANKS ))
+
+CMD="mpiexec -np 2 -ppn 1 -envall ./gpu_tile_compact.sh ./halo_mpi --mpi 2.1.1.1"
+$CMD
diff --git a/MPI_benchmark/compile-command b/MPI_benchmark/compile-command
new file mode 100644
index 00000000..20f26a3c
--- /dev/null
+++ b/MPI_benchmark/compile-command
@@ -0,0 +1 @@
+mpicxx -fsycl halo_mpi.cc -o halo_mpi
\ No newline at end of file
diff --git a/MPI_benchmark/gpu_tile_compact.sh b/MPI_benchmark/gpu_tile_compact.sh
new file mode 100755
index 00000000..28fdb341
--- /dev/null
+++ b/MPI_benchmark/gpu_tile_compact.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+export NUMA_PMAP=(2 2 2 3 3 3 2 2 2 3 3 3 )
+export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
+export GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )
+export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
+
+export PNUMA=${NUMA_PMAP[$PALS_LOCAL_RANKID]}
+export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
+export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
+export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}
+
+
+export ZE_AFFINITY_MASK=$gpu_id.$tile_id
+export ONEAPI_DEVICE_FILTER=gpu,level_zero
+
+#unset EnableWalkerPartition
+#export EnableImplicitScaling=0
+#export GRID_MPICH_NIC_BIND=$NIC
+#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id
+#export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
+#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
+#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
+#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
+#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
+
+echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
+
+numactl -m $PNUMA -N $NUMA "$@"
diff --git a/MPI_benchmark/halo_mpi.cc b/MPI_benchmark/halo_mpi.cc
new file mode 100644
index 00000000..9e11c473
--- /dev/null
+++ b/MPI_benchmark/halo_mpi.cc
@@ -0,0 +1,333 @@
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <cstdint>
+#include <cstring>
+#include <cctype>
+#include <vector>
+#include <string>
+#include <sstream>
+#include <algorithm>
+#include <memory>
+#include <sys/time.h>
+
+#include <mpi.h>
+
+/**************************************************************
+ * GPU - GPU memory cartesian halo exchange benchmark
+ * Config: what is the target
+ **************************************************************
+ */
+#undef ACC_CUDA
+#undef ACC_HIP
+#define ACC_SYCL
+#undef ACC_NONE
+
+/**************************************************************
+ * Some MPI globals
+ **************************************************************
+ */
+MPI_Comm WorldComm;
+MPI_Comm WorldShmComm;
+
+int WorldSize;
+int WorldRank;
+
+int WorldShmSize;
+int WorldShmRank;
+
+/**************************************************************
+ * Allocate buffers on the GPU, SYCL needs an init call and context
+ **************************************************************
+ */
+#ifdef ACC_CUDA
+#include <cuda_runtime.h>
+void acceleratorInit(void){}
+void *acceleratorAllocDevice(size_t bytes)
+{
+  void *ptr=NULL;
+  auto err = cudaMalloc((void **)&ptr,bytes);
+  assert(err==cudaSuccess);
+  return ptr;
+}
+void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);}
+#endif
+#ifdef ACC_HIP
+#include <hip/hip_runtime.h>
+void acceleratorInit(void){}
+inline void *acceleratorAllocDevice(size_t bytes)
+{
+  void *ptr=NULL;
+  auto err = hipMalloc((void **)&ptr,bytes);
+  if( err != hipSuccess ) {
+    ptr = (void *) NULL;
+    printf(" hipMalloc failed for %ld %s \n",bytes,hipGetErrorString(err));
+  }
+  return ptr;
+};
+inline void acceleratorFreeDevice(void *ptr){ auto r=hipFree(ptr);};
+#endif
+#ifdef ACC_SYCL
+#include <sycl/CL/sycl.hpp>
+#include <sycl/usm.hpp>
+cl::sycl::queue *theAccelerator;
+void acceleratorInit(void)
+{
+  int nDevices = 1;
+#if 1
+  cl::sycl::gpu_selector selector;
+  cl::sycl::device selectedDevice { selector };
+  theAccelerator = new sycl::queue (selectedDevice);
+#else
+  cl::sycl::device selectedDevice {cl::sycl::gpu_selector_v };
+  theAccelerator = new sycl::queue (selectedDevice);
+#endif
+  auto name = theAccelerator->get_device().get_info<cl::sycl::info::device::name>();
+  printf("AcceleratorSyclInit: Selected device is %s\n",name.c_str()); fflush(stdout);
+}
+inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theAccelerator);};
+inline void acceleratorFreeDevice(void *ptr){free(ptr,*theAccelerator);};
+#endif
+#ifdef ACC_NONE
+void acceleratorInit(void){}
+inline void *acceleratorAllocDevice(size_t bytes){ return malloc(bytes);};
+inline void acceleratorFreeDevice(void *ptr){free(ptr);};
+#endif
+
+
+/**************************************************************
+ * Microsecond timer
+ **************************************************************
+ */
+inline double usecond(void) {
+  struct timeval tv;
+  gettimeofday(&tv,NULL);
+  return 1.0e6*tv.tv_sec + 1.0*tv.tv_usec;
+}
+/**************************************************************
+ * Main benchmark routine
+ **************************************************************
+ */
+void Benchmark(int64_t L,std::vector<int> cart_geom,bool use_device,int ncall)
+{
+  int64_t words = 3*4*2;
+  int64_t face,vol;
+  int Nd=cart_geom.size();
+
+  /**************************************************************
+   * L^Nd volume, L^(Nd-1) faces, 12 complex per site
+   * Allocate memory for these
+   **************************************************************
+   */
+  face=1; for( int d=0;d<Nd-1;d++) face = face*L;
+  vol =1; for( int d=0;d<Nd  ;d++) vol  = vol*L;
+
+  std::vector<void *> send_bufs;
+  std::vector<void *> recv_bufs;
+  size_t vw    = face*words;
+  size_t bytes = face*words*sizeof(double);
+
+  if ( use_device ) {
+    for(int d=0;d<2*Nd;d++){
+      send_bufs.push_back(acceleratorAllocDevice(bytes));
+      recv_bufs.push_back(acceleratorAllocDevice(bytes));
+    }
+  } else {
+    for(int d=0;d<2*Nd;d++){
+      send_bufs.push_back(malloc(bytes));
+      recv_bufs.push_back(malloc(bytes));
+    }
+  }
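+  /*
+   * 2*Nd buffers: one face of L^(Nd-1) sites x 24 doubles per direction.
+   * With use_device these buffers are device resident, so the exchange below
+   * exercises GPU-aware MPI; otherwise ordinary host memory is moved.
+   */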
+  /*********************************************************
+   * Build cartesian communicator
+   *********************************************************
+   */
+  int ierr;
+  int rank;
+  std::vector<int> coor(Nd);
+  MPI_Comm communicator;
+  std::vector<int> periodic(Nd,1);
+  MPI_Cart_create(WorldComm,Nd,&cart_geom[0],&periodic[0],0,&communicator);
+  MPI_Comm_rank(communicator,&rank);
+  MPI_Cart_coords(communicator,rank,Nd,&coor[0]);
+
+  static int reported;
+  if ( ! reported ) {
+    printf("World Rank %d Shm Rank %d CartCoor %d %d %d %d\n",WorldRank,WorldShmRank,
+           coor[0],coor[1],coor[2],coor[3]); fflush(stdout);
+    reported = 1;
+  }
+  /*********************************************************
+   * Perform halo exchanges
+   *********************************************************
+   */
+  for(int d=0;d<Nd;d++){
+    if ( cart_geom[d]>1 ) {
+      double t0=usecond();
+
+      int from,to;
+
+      MPI_Barrier(communicator);
+      for(int n=0;n<ncall;n++){
+
+        ierr = MPI_Cart_shift(communicator,d,1,&from,&to);
+        assert(ierr==0);
+
+        /* forward face: send to the +d neighbour, receive from the -d neighbour */
+        ierr = MPI_Sendrecv(send_bufs[d]   ,bytes,MPI_CHAR,to  ,0,
+                            recv_bufs[d]   ,bytes,MPI_CHAR,from,0,
+                            communicator,MPI_STATUS_IGNORE);
+        assert(ierr==0);
+
+        /* backward face: send to the -d neighbour, receive from the +d neighbour */
+        ierr = MPI_Sendrecv(send_bufs[Nd+d],bytes,MPI_CHAR,from,1,
+                            recv_bufs[Nd+d],bytes,MPI_CHAR,to  ,1,
+                            communicator,MPI_STATUS_IGNORE);
+        assert(ierr==0);
+      }
+      MPI_Barrier(communicator);
+      double t1=usecond();
+
+      double dbytes = 4.0*bytes*ncall;  /* two faces, each sent and received */
+      if ( ! WorldRank ) {
+        printf("Dim %d %ld bytes %s %.2f GB/s per rank\n",d,(long)bytes,
+               use_device ? "device memory" : "host memory  ",
+               dbytes/(t1-t0)/1000.);
+        fflush(stdout);
+      }
+    }
+  }
+
+  for(int d=0;d<2*Nd;d++){
+    if ( use_device ) { acceleratorFreeDevice(send_bufs[d]); acceleratorFreeDevice(recv_bufs[d]); }
+    else              { free(send_bufs[d]);                  free(recv_bufs[d]);                  }
+  }
+  MPI_Comm_free(&communicator);
+}
+
+std::string CmdOptionPayload(char ** begin, char ** end, const std::string & option)
+{
+  char ** itr = std::find(begin, end, option);
+  if (itr != end && ++itr != end) {
+    std::string payload(*itr);
+    return payload;
+  }
+  return std::string("");
+}
+bool CmdOptionExists(char** begin, char** end, const std::string& option)
+{
+  return std::find(begin, end, option) != end;
+}
+void CmdOptionIntVector(const std::string &str,std::vector<int> & vec)
+{
+  vec.resize(0);
+  std::stringstream ss(str);
+  int i;
+  while (ss >> i){
+    vec.push_back(i);
+    if(std::ispunct(ss.peek()))
+      ss.ignore();
+  }
+  return;
+}
+/**************************************
+ * Command line junk
+ **************************************/
+int main(int argc, char **argv)
+{
+  std::string arg;
+
+  acceleratorInit();
+
+  MPI_Init(&argc,&argv);
+
+  WorldComm = MPI_COMM_WORLD;
+
+  MPI_Comm_split_type(WorldComm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
+
+  MPI_Comm_rank(WorldComm ,&WorldRank);
+  MPI_Comm_size(WorldComm ,&WorldSize);
+
+  MPI_Comm_rank(WorldShmComm ,&WorldShmRank);
+  MPI_Comm_size(WorldShmComm ,&WorldShmSize);
+
+  if ( WorldSize/WorldShmSize > 2) {
+    printf("This benchmark is meant to run on at most two nodes\n");
+  }
+
+  auto mpi = std::vector<int>({1,1,1,1});
+
+  if( CmdOptionExists(argv,argv+argc,"--mpi") ){
+    arg = CmdOptionPayload(argv,argv+argc,"--mpi");
+    CmdOptionIntVector(arg,mpi);
+  } else {
+    printf("Must specify --mpi command line argument\n");
+    exit(0);
+  }
+
+  if( !WorldRank ) {
+    printf("***********************************\n");
+    printf("%d ranks\n",WorldSize);
+    printf("%d ranks-per-node\n",WorldShmSize);
+    printf("%d nodes\n",WorldSize/WorldShmSize);fflush(stdout);
+    printf("Cartesian layout: ");
+    for(int d=0;d
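
A minimal build-and-run recipe, following MPI_benchmark/compile-command and
MPI_benchmark/bench2.pbs above (it assumes the binary and the wrapper script
sit in the current working directory):

  # build with the SYCL-enabled MPI wrapper
  mpicxx -fsycl halo_mpi.cc -o halo_mpi

  # two nodes, one rank per node; gpu_tile_compact.sh pins each rank to a
  # GPU tile and NUMA domain before exec'ing the benchmark
  mpiexec -np 2 -ppn 1 -envall ./gpu_tile_compact.sh ./halo_mpi --mpi 2.1.1.1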