Grid (mirror of https://github.com/paboyle/Grid.git)

Aurora MPI standalone benchmark and options that work well

Peter Boyle 2024-02-06 16:28:40 +00:00
parent 2a0d75bac2
commit 5bfa88be85
9 changed files with 426 additions and 118 deletions
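For reference, a minimal sketch of building and launching the new standalone benchmark, using only commands that appear in this commit; it assumes the working directory is MPI_benchmark/ and that mpicxx wraps a SYCL-capable compiler:

cd MPI_benchmark
mpicxx -fsycl halo_mpi.cc -o halo_mpi     # the compile one-liner added below
qsub bench2.pbs                           # 2-node job; runs ./halo_mpi --mpi 2.1.1.1 via gpu_tile_compact.sh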

MPI_benchmark/bench2.pbs (new file, 22 lines)

@@ -0,0 +1,22 @@
#!/bin/bash
#PBS -q EarlyAppAccess
#PBS -l select=2
#PBS -l walltime=01:00:00
#PBS -A LatticeQCD_aesp_CNDA
export TZ='/usr/share/zoneinfo/US/Central'
export OMP_PROC_BIND=spread
export OMP_NUM_THREADS=3
unset OMP_PLACES
cd $PBS_O_WORKDIR
NNODES=`wc -l < $PBS_NODEFILE`
NRANKS=12 # Number of MPI ranks per node
NDEPTH=4 # Number of hardware threads per rank, spacing between MPI ranks on a node
NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS
NTOTRANKS=$(( NNODES * NRANKS ))
CMD="mpiexec -np 2 -ppn 1 -envall ./gpu_tile_compact.sh ./halo_mpi --mpi 2.1.1.1"
$CMD
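Note that with --mpi 2.1.1.1 only the first dimension of the process grid has extent greater than one, so halo_mpi (below) exchanges faces across the node boundary only; NRANKS, NDEPTH and NTOTRANKS are computed but unused by this particular mpiexec line. A denser run under the same conventions might look like the following sketch (hypothetical rank counts, not part of the commit):

mpiexec -np 12 -ppn 6 -envall ./gpu_tile_compact.sh ./halo_mpi --mpi 2.2.3.1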

New file (1 line):

@@ -0,0 +1 @@
mpicxx -fsycl halo_mpi.cc -o halo_mpi

New file (30 lines):

@@ -0,0 +1,30 @@
#!/bin/bash
export NUMA_PMAP=(2 2 2 3 3 3 2 2 2 3 3 3 )
export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
export GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )
export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
export PNUMA=${NUMA_PMAP[$PALS_LOCAL_RANKID]}
export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}
export ZE_AFFINITY_MASK=$gpu_id.$tile_id
export ONEAPI_DEVICE_FILTER=gpu,level_zero
#unset EnableWalkerPartition
#export EnableImplicitScaling=0
#export GRID_MPICH_NIC_BIND=$NIC
#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id
#export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
numactl -m $PNUMA -N $NUMA "$@"
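The wrapper's only job is to turn the node-local rank into NUMA and GPU-tile bindings before running the wrapped program. A worked lookup, illustrative only, for PALS_LOCAL_RANKID=7 with the maps above:

# local rank 7 -> GPU_MAP[7]=1, TILE_MAP[7]=1, NUMA_PMAP[7]=2, NUMA_MAP[7]=0
# so the rank runs as:
#   ZE_AFFINITY_MASK=1.1               # second tile of GPU 1
#   numactl -m 2 -N 0 ./halo_mpi ...   # memory from domain 2, cpus from domain 0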

MPI_benchmark/halo_mpi.cc (new file, 333 lines)

@@ -0,0 +1,333 @@
#include <cassert>
#include <complex>
#include <memory>
#include <vector>
#include <algorithm>
#include <array>
#include <string>
#include <sstream>
#include <cctype>
#include <cstdint>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <ctime>
#include <sys/time.h>
#include <mpi.h>
/**************************************************************
* GPU - GPU memory cartesian halo exchange benchmark
* Config: what is the target
**************************************************************
*/
#undef ACC_CUDA
#undef ACC_HIP
#define ACC_SYCL
#undef ACC_NONE
/**************************************************************
* Some MPI globals
**************************************************************
*/
MPI_Comm WorldComm;
MPI_Comm WorldShmComm;
int WorldSize;
int WorldRank;
int WorldShmSize;
int WorldShmRank;
/**************************************************************
* Allocate buffers on the GPU, SYCL needs an init call and context
**************************************************************
*/
#ifdef ACC_CUDA
#include <cuda.h>
void acceleratorInit(void){}
void *acceleratorAllocDevice(size_t bytes)
{
void *ptr=NULL;
auto err = cudaMalloc((void **)&ptr,bytes);
assert(err==cudaSuccess);
return ptr;
}
void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);}
#endif
#ifdef ACC_HIP
#include <hip/hip_runtime.h>
void acceleratorInit(void){}
inline void *acceleratorAllocDevice(size_t bytes)
{
void *ptr=NULL;
auto err = hipMalloc((void **)&ptr,bytes);
if( err != hipSuccess ) {
ptr = (void *) NULL;
printf(" hipMalloc failed for %ld %s \n",bytes,hipGetErrorString(err));
}
return ptr;
};
inline void acceleratorFreeDevice(void *ptr){ auto r=hipFree(ptr);};
#endif
#ifdef ACC_SYCL
#include <sycl/CL/sycl.hpp>
#include <sycl/usm.hpp>
cl::sycl::queue *theAccelerator;
void acceleratorInit(void)
{
int nDevices = 1;
#if 1
cl::sycl::gpu_selector selector;
cl::sycl::device selectedDevice { selector };
theAccelerator = new sycl::queue (selectedDevice);
#else
cl::sycl::device selectedDevice {cl::sycl::gpu_selector_v };
theAccelerator = new sycl::queue (selectedDevice);
#endif
auto name = theAccelerator->get_device().get_info<sycl::info::device::name>();
printf("AcceleratorSyclInit: Selected device is %s\n",name.c_str()); fflush(stdout);
}
inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theAccelerator);};
inline void acceleratorFreeDevice(void *ptr){free(ptr,*theAccelerator);};
#endif
#ifdef ACC_NONE
void acceleratorInit(void){}
inline void *acceleratorAllocDevice(size_t bytes){ return malloc(bytes);};
inline void acceleratorFreeDevice(void *ptr){free(ptr);};
#endif
/**************************************************************
* Microsecond timer
**************************************************************
*/
inline double usecond(void) {
struct timeval tv;
gettimeofday(&tv,NULL);
return 1.0e6*tv.tv_sec + 1.0*tv.tv_usec;
}
/**************************************************************
* Main benchmark routine
**************************************************************
*/
void Benchmark(int64_t L,std::vector<int> cart_geom,bool use_device,int ncall)
{
int64_t words = 3*4*2;
int64_t face,vol;
int Nd=cart_geom.size();
/**************************************************************
* L^Nd volume, L^(Nd-1) faces, 12 complex per site
* Allocate memory for these
**************************************************************
*/
face=1; for( int d=0;d<Nd-1;d++) face = face*L;
vol=1; for( int d=0;d<Nd;d++) vol = vol*L;
std::vector<void *> send_bufs;
std::vector<void *> recv_bufs;
size_t vw = face*words;
size_t bytes = face*words*sizeof(double);
if ( use_device ) {
for(int d=0;d<2*Nd;d++){
send_bufs.push_back(acceleratorAllocDevice(bytes));
recv_bufs.push_back(acceleratorAllocDevice(bytes));
}
} else {
for(int d=0;d<2*Nd;d++){
send_bufs.push_back(malloc(bytes));
recv_bufs.push_back(malloc(bytes));
}
}
/*********************************************************
* Build cartesian communicator
*********************************************************
*/
int ierr;
int rank;
std::vector<int> coor(Nd);
MPI_Comm communicator;
std::vector<int> periodic(Nd,1);
MPI_Cart_create(WorldComm,Nd,&cart_geom[0],&periodic[0],0,&communicator);
MPI_Comm_rank(communicator,&rank);
MPI_Cart_coords(communicator,rank,Nd,&coor[0]);
static int reported;
if ( ! reported ) {
printf("World Rank %d Shm Rank %d CartCoor %d %d %d %d\n",WorldRank,WorldShmRank,
coor[0],coor[1],coor[2],coor[3]); fflush(stdout);
reported =1 ;
}
/*********************************************************
* Perform halo exchanges
*********************************************************
*/
for(int d=0;d<Nd;d++){
if ( cart_geom[d]>1 ) {
double t0=usecond();
int from,to;
MPI_Barrier(communicator);
for(int n=0;n<ncall;n++){
void *xmit = (void *)send_bufs[d];
void *recv = (void *)recv_bufs[d];
ierr=MPI_Cart_shift(communicator,d,1,&from,&to);
assert(ierr==0);
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,to,rank,
recv,bytes,MPI_CHAR,from, from,
communicator,MPI_STATUS_IGNORE);
assert(ierr==0);
xmit = (void *)send_bufs[Nd+d];
recv = (void *)recv_bufs[Nd+d];
ierr=MPI_Cart_shift(communicator,d,-1,&from,&to);
assert(ierr==0);
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,to,rank,
recv,bytes,MPI_CHAR,from, from,
communicator,MPI_STATUS_IGNORE);
assert(ierr==0);
}
MPI_Barrier(communicator);
double t1=usecond();
double dbytes = bytes*WorldShmSize;
double xbytes = dbytes*2.0*ncall;
double rbytes = xbytes;
double bidibytes = xbytes+rbytes;
if ( ! WorldRank ) {
printf("\t%12ld\t %12ld %16.0lf\n",L,bytes,bidibytes/(t1-t0)); fflush(stdout);
}
}
}
/*********************************************************
* Free memory
*********************************************************
*/
if ( use_device ) {
for(int d=0;d<2*Nd;d++){
acceleratorFreeDevice(send_bufs[d]);
acceleratorFreeDevice(recv_bufs[d]);
}
} else {
for(int d=0;d<2*Nd;d++){
free(send_bufs[d]);
free(recv_bufs[d]);
}
}
}
/**************************************
* Command line junk
**************************************/
std::string CmdOptionPayload(char ** begin, char ** end, const std::string & option)
{
char ** itr = std::find(begin, end, option);
if (itr != end && ++itr != end) {
std::string payload(*itr);
return payload;
}
return std::string("");
}
bool CmdOptionExists(char** begin, char** end, const std::string& option)
{
return std::find(begin, end, option) != end;
}
void CmdOptionIntVector(const std::string &str,std::vector<int> & vec)
{
vec.resize(0);
std::stringstream ss(str);
int i;
while (ss >> i){
vec.push_back(i);
if(std::ispunct(ss.peek()))
ss.ignore();
}
return;
}
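// Illustrative example (not in the original source): CmdOptionIntVector("2.1.1.1", mpi)
// extracts integers and skips the single punctuation character between them,
// yielding mpi = {2,1,1,1}; this is how the --mpi argument from bench2.pbs is parsed.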
/**************************************
* Main
**************************************/
int main(int argc, char **argv)
{
std::string arg;
acceleratorInit();
MPI_Init(&argc,&argv);
WorldComm = MPI_COMM_WORLD;
MPI_Comm_split_type(WorldComm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
MPI_Comm_rank(WorldComm ,&WorldRank);
MPI_Comm_size(WorldComm ,&WorldSize);
MPI_Comm_rank(WorldShmComm ,&WorldShmRank);
MPI_Comm_size(WorldShmComm ,&WorldShmSize);
if ( WorldSize/WorldShmSize > 2) {
printf("This benchmark is meant to run on at most two nodes only\n");
}
auto mpi =std::vector<int>({1,1,1,1});
if( CmdOptionExists(argv,argv+argc,"--mpi") ){
arg = CmdOptionPayload(argv,argv+argc,"--mpi");
CmdOptionIntVector(arg,mpi);
} else {
printf("Must specify --mpi <n1.n2.n3.n4> command line argument\n");
exit(0);
}
if( !WorldRank ) {
printf("***********************************\n");
printf("%d ranks\n",WorldSize);
printf("%d ranks-per-node\n",WorldShmSize);
printf("%d nodes\n",WorldSize/WorldShmSize);fflush(stdout);
printf("Cartesian layout: ");
for(int d=0;d<mpi.size();d++){
printf("%d ",mpi[d]);
}
printf("\n");fflush(stdout);
printf("***********************************\n");
}
if( !WorldRank ) {
printf("=========================================================\n");
printf("= Benchmarking HOST memory MPI performance \n");
printf("=========================================================\n");fflush(stdout);
printf("= L\t pkt bytes\t MB/s \n");
printf("=========================================================\n");fflush(stdout);
}
for(int L=16;L<=64;L+=4){
Benchmark(L,mpi,false,100);
}
if( !WorldRank ) {
printf("=========================================================\n");
printf("= Benchmarking DEVICE memory MPI performance \n");
printf("=========================================================\n");fflush(stdout);
}
for(int L=16;L<=64;L+=4){
Benchmark(L,mpi,true,100);
}
if( !WorldRank ) {
printf("=========================================================\n");
printf("= DONE \n");
printf("=========================================================\n");
}
MPI_Finalize();
}
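For orientation, the arithmetic behind each printed row, restating Benchmark() rather than adding anything new: with Nd=4 and L=32, a face holds face = L^(Nd-1) = 32768 sites of words = 3*4*2 = 24 doubles, so one packet is 32768 * 24 * 8 = 6291456 bytes (about 6.3 MB); the reported rate is bidibytes/(t1-t0), i.e. send plus receive volume over both directional shifts, scaled by the ranks per node, divided by elapsed microseconds, which lands directly in MB/s.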

Modified file:

@@ -1,12 +1,10 @@
 #!/bin/bash
 ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
 #PBS -q EarlyAppAccess
 #PBS -l select=1
 #PBS -l walltime=01:00:00
-##PBS -A Aurora_Deployment
-#PBS -A LatticeQCD_aesp
+#PBS -A LatticeQCD_aesp_CNDA
 HDIR=/home/paboyle/
 #module use /soft/testing/modulefiles/
@@ -48,7 +46,6 @@ echo "OMP_PROC_BIND=$OMP_PROC_BIND OMP_PLACES=$OMP_PLACES"
 CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -envall \
 ./gpu_tile_compact.sh \
-./Benchmark_comms_host_device --mpi 1.1.2.6 --grid 32.24.32.192 \
+./Benchmark_memory_bandwidth --mpi 1.1.2.6 --grid 32.24.32.192 \
 --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
 $CMD

Modified file:

@@ -6,40 +6,27 @@
 #PBS -l select=2
 #PBS -l walltime=01:00:00
 #PBS -A LatticeQCD_aesp_CNDA
-HDIR=/home/paboyle/
-#module use /soft/testing/modulefiles/
-#module load intel-UMD23.05.25593.11/23.05.25593.11
-#module load tools/pti-gpu
-#export LD_LIBRARY_PATH=$HDIR/tools/lib64:$LD_LIBRARY_PATH
-#export PATH=$HDIR/tools/bin:$PATH
-export TZ='/usr/share/zoneinfo/US/Central'
-export OMP_PROC_BIND=spread
-export OMP_NUM_THREADS=3
-unset OMP_PLACES
+#export OMP_PROC_BIND=spread
+#export OMP_NUM_THREADS=3
+#unset OMP_PLACES
 cd $PBS_O_WORKDIR
 source ../sourceme.sh
-echo Jobid: $PBS_JOBID
-echo Running on host `hostname`
-echo Running on nodes `cat $PBS_NODEFILE`
-echo NODES
-cat $PBS_NODEFILE
+#echo Jobid: $PBS_JOBID
+#echo Running on host `hostname`
+#echo Running on nodes `cat $PBS_NODEFILE`
+#echo NODES
+#cat $PBS_NODEFILE
 NNODES=`wc -l < $PBS_NODEFILE`
 NRANKS=12           # Number of MPI ranks per node
-NDEPTH=4           # Number of hardware threads per rank, spacing between MPI ranks on a node
+NDEPTH=3           # Number of hardware threads per rank, spacing between MPI ranks on a node
 NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS
 NTOTRANKS=$(( NNODES * NRANKS ))
-echo "NUM_NODES=${NNODES} TOTAL_RANKS=${NTOTRANKS} RANKS_PER_NODE=${NRANKS} THREADS_PER_RANK=${OMP_NUM_THREADS}"
-echo "OMP_PROC_BIND=$OMP_PROC_BIND OMP_PLACES=$OMP_PLACES"
 CMD="mpiexec -np 2 -ppn 1 -d ${NDEPTH} -envall \
 ./gpu_tile_compact.sh \
@@ -60,7 +47,7 @@ export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
 #$CMD | tee 2-to-2.comms.hmem1
 export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
-#$CMD | tee 2-to-2.comms.hmem0
+$CMD | tee 2-to-2.comms.hmem0
 CMD="mpiexec -np 6 -ppn 3 -d ${NDEPTH} --cpu-bind=depth -envall \
 ./gpu_tile_compact.sh \
@@ -74,15 +61,16 @@ export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
 CMD="mpiexec -np 8 -ppn 4 -d ${NDEPTH} --cpu-bind=depth -envall \
-./gpu_tile_compact4.sh \
+./gpu_tile_compact4a.sh \
 ./Benchmark_comms_host_device --mpi 2.2.2.1 --grid 32.24.32.96 \
 --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
 export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
-$CMD | tee 4-to-4.comms.hmem1.nic-affinity
+#$CMD | tee 4-to-4.comms.hmem1.nic-affinity
 export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
-$CMD | tee 4-to-4.comms.hmem0.nic-affinity
+$CMD | tee 4-to-4.comms.hmem0
+#mpiexec -np 1 --ppn 1 -d 1 numactl -H | tee numa.log
 CMD="mpiexec -np 12 -ppn 6 -d ${NDEPTH} --cpu-bind=depth -envall \
 ./gpu_tile_compact.sh \
@@ -92,7 +80,7 @@ export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
 #$CMD | tee 6-to-6.comms.hmem1
 export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
-#$CMD | tee 6-to-6.comms.hmem0
+$CMD | tee 6-to-6.comms.hmem0
 CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -envall \
@@ -104,4 +92,4 @@ export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
 #$CMD | tee 12-to-12.comms.hmem1
 export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
-#$CMD | tee 12-to-12.comms.hmem0
+$CMD | tee 12-to-12.comms.hmem0
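The recurring MPIR_CVAR_CH4_OFI_ENABLE_HMEM toggle switches MPICH's CH4/OFI support for GPU-resident (heterogeneous) memory on and off; as rewritten, each rank count now runs and logs only the hmem=0 case. That reading of the CVAR comes from MPICH's documentation, not from this commit.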

Modified file:

@@ -1,65 +1,33 @@
 #!/bin/bash
-display_help() {
-echo " Will map gpu tile to rank in compact and then round-robin fashion"
-echo " Usage (only work for one node of ATS/PVC):"
-echo " mpiexec --np N gpu_tile_compact.sh ./a.out"
-echo
-echo " Example 3 GPU of 2 Tiles with 7 Ranks:"
-echo " 0 Rank 0.0"
-echo " 1 Rank 0.1"
-echo " 2 Rank 1.0"
-echo " 3 Rank 1.1"
-echo " 4 Rank 2.0"
-echo " 5 Rank 2.1"
-echo " 6 Rank 0.0"
-echo
-echo " Hacked together by apl@anl.gov, please contact if bug found"
-exit 1
-}
-#This give the exact GPU count i915 knows about and I use udev to only enumerate the devices with physical presence.
-#works? num_gpu=$(/usr/bin/udevadm info /sys/module/i915/drivers/pci\:i915/* |& grep -v Unknown | grep -c "P: /devices")
-num_gpu=6
-num_tile=2
-if [ "$#" -eq 0 ] || [ "$1" == "--help" ] || [ "$1" == "-h" ] || [ "$num_gpu" = 0 ]; then
-display_help
-fi
-gpu_id=$(( (PALS_LOCAL_RANKID / num_tile ) % num_gpu ))
-tile_id=$((PALS_LOCAL_RANKID % num_tile))
-export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
+export NUMA_MAP=(2 2 2 3 3 3 2 2 2 3 3 3 )
+#export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
+export NUMA_PMAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
 export NIC_MAP=(0 1 2 4 5 6 0 1 2 4 5 6 )
 export GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )
 export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
 export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
+export NUMAP=${NUMA_PMAP[$PALS_LOCAL_RANKID]}
 export NIC=${NIC_MAP[$PALS_LOCAL_RANKID]}
 export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
 export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}
-export GRID_MPICH_NIC_BIND=$NIC
+#export GRID_MPICH_NIC_BIND=$NIC
+#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id
 unset EnableWalkerPartition
 export EnableImplicitScaling=0
-export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
 export ZE_AFFINITY_MASK=$gpu_id.$tile_id
-#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id
 export ONEAPI_DEVICE_FILTER=gpu,level_zero
-export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
-export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
-export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
-export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
+#export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
+#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
+#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
+#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
 #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
-echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NIC $GRID_MPICH_NIC_BIND"
-if [ $PALS_LOCAL_RANKID = 0 ]
-then
-numactl -m $NUMA -N $NUMA "$@"
-else
-numactl -m $NUMA -N $NUMA "$@"
-fi
+echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
+numactl -m $NUMA -N $NUMAP "$@"

Modified file:

@@ -1,39 +1,8 @@
 #!/bin/bash
-display_help() {
-echo " Will map gpu tile to rank in compact and then round-robin fashion"
-echo " Usage (only work for one node of ATS/PVC):"
-echo " mpiexec --np N gpu_tile_compact.sh ./a.out"
-echo
-echo " Example 3 GPU of 2 Tiles with 7 Ranks:"
-echo " 0 Rank 0.0"
-echo " 1 Rank 0.1"
-echo " 2 Rank 1.0"
-echo " 3 Rank 1.1"
-echo " 4 Rank 2.0"
-echo " 5 Rank 2.1"
-echo " 6 Rank 0.0"
-echo
-echo " Hacked together by apl@anl.gov, please contact if bug found"
-exit 1
-}
-#This give the exact GPU count i915 knows about and I use udev to only enumerate the devices with physical presence.
-#works? num_gpu=$(/usr/bin/udevadm info /sys/module/i915/drivers/pci\:i915/* |& grep -v Unknown | grep -c "P: /devices")
-num_gpu=6
-num_tile=2
-if [ "$#" -eq 0 ] || [ "$1" == "--help" ] || [ "$1" == "-h" ] || [ "$num_gpu" = 0 ]; then
-display_help
-fi
-gpu_id=$(( (PALS_LOCAL_RANKID / num_tile ) % num_gpu ))
-tile_id=$((PALS_LOCAL_RANKID % num_tile))
-export NUMA_MAP=(0 0 1 1 0 0 1 1 )
-export NIC_MAP=(0 1 4 5 0 1 4 5 )
+export NUMA_MAP=(2 2 3 3 2 2 3 3 )
+export PROC_MAP=(0 0 1 1 0 0 1 1 )
+export NIC_MAP=(0 0 4 4 1 1 5 5 )
 export GPU_MAP=(0 1 3 4 0 1 3 4 )
 export TILE_MAP=(0 0 0 0 1 1 1 1 )
 export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
@@ -41,7 +10,7 @@ export NIC=${NIC_MAP[$PALS_LOCAL_RANKID]}
 export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
 export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}
-export GRID_MPICH_NIC_BIND=$NIC
+#export GRID_MPICH_NIC_BIND=$NIC
 unset EnableWalkerPartition
 export EnableImplicitScaling=0
@@ -55,6 +24,6 @@ export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
 #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
-echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NIC $GRID_MPICH_NIC_BIND"
+echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NIC $GRID_MPICH_NIC_BIND ; NUMA domain $NUMA"
-numactl -m $NUMA -N $NUMA "$@"
+numactl -m $NUMA -N $PROC_MAP "$@"
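Net effect of the new maps: memory and execution bindings are now split, with numactl -m placed on NUMA domains 2/3 while -N keeps the processes on domains 0/1. On Aurora-class Sapphire Rapids nodes these higher-numbered domains are plausibly the HBM regions, which would account for the "options that work well" in the commit message; that identification is an assumption, not something the diff states.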

Modified file:

@@ -7,6 +7,6 @@ export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
 export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
 export http_proxy=http://proxy.alcf.anl.gov:3128
 export https_proxy=http://proxy.alcf.anl.gov:3128
-export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
+#export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
 git config --global http.proxy http://proxy.alcf.anl.gov:3128