Mirror of https://github.com/paboyle/Grid.git
Synced 2025-06-21 01:02:02 +01:00

Compare commits: 27a5508ea1 ... a09aa352a9 (1 commit)

Commit a09aa352a9
@@ -152,7 +152,6 @@ public:
 #ifdef RNG_FAST_DISCARD
 static void Skip(RngEngine &eng,uint64_t site)
 {
-#if 0
 /////////////////////////////////////////////////////////////////////////////////////
 // Skip by 2^40 elements between successive lattice sites
 // This goes by 10^12.
@@ -163,9 +162,9 @@ public:
 // tens of seconds per trajectory so this is clean in all reasonable cases,
 // and margin of safety is orders of magnitude.
 // We could hack Sitmo to skip in the higher order words of state if necessary
 //
 // Replace with 2^30 ; avoid problem on large volumes
 //
 /////////////////////////////////////////////////////////////////////////////////////
 // uint64_t skip = site+1; // Old init Skipped then drew. Checked compat with faster init
 const int shift = 30;
@@ -180,9 +179,6 @@ public:
 assert((skip >> shift)==site); // check for overflow

 eng.discard(skip);
-#else
-eng.discardhi(site);
-#endif
 // std::cout << " Engine " <<site << " state " <<eng<<std::endl;
 }
 #endif
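A minimal sketch of the site decorrelation this hunk keeps: each lattice site is offset a multiple of 2^30 draws into the stream, and the assert guards the 64-bit shift against overflow, which would silently alias two sites' streams. The skip computation itself sits in lines the diff elides, so the shift expression below is an assumption consistent with the overflow check, and the helper name site_offset is hypothetical.

    #include <cassert>
    #include <cstdint>

    uint64_t site_offset(uint64_t site) {
      const int shift = 30;            // 2^30 draws between successive sites
      uint64_t skip = site << shift;   // assumed form, consistent with the check below
      assert((skip >> shift) == site); // trips once site >= 2^34, i.e. the shift overflowed
      return skip;                     // the value handed to eng.discard(skip)
    }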
@@ -218,10 +218,6 @@ public:
 // -------------------------------------------------
 // misc
 // -------------------------------------------------
-void discardhi(uint64_t z) {
-    _s[3] += z;
-    encrypt_counter();
-}

 // req: 26.5.1.4 Random number engine requirements, p.908 table 117, row 9
 // Advances e’s state ei to ei+z by any means equivalent to z
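For context on the removed member: on a counter-based engine of this family, discardhi advanced the highest word of the counter, so one unit of z strides past the whole span of the lower words rather than a single draw. A sketch under that assumption (the 4x64-bit counter _s mirrors the deleted lines; this is illustrative, not the real engine class):

    #include <cstdint>

    struct CounterEngineSketch {
      uint64_t _s[4] = {0, 0, 0, 0};  // counter words; _s[3] is the highest
      void encrypt_counter() { /* regenerate the cached output block from _s */ }
      void discardhi(uint64_t z) {
        _s[3] += z;                   // advance the high word: a huge stride per unit of z
        encrypt_counter();            // refresh output for the new counter value
      }
    };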
@@ -391,4 +387,4 @@ private:
 #undef MIXK
 #undef MIX2

 #endif
@@ -1,22 +0,0 @@
-#!/bin/bash
-#PBS -q EarlyAppAccess
-#PBS -l select=2
-#PBS -l walltime=01:00:00
-#PBS -A LatticeQCD_aesp_CNDA
-
-export TZ='/usr/share/zoneinfo/US/Central'
-export OMP_PROC_BIND=spread
-export OMP_NUM_THREADS=3
-unset OMP_PLACES
-
-cd $PBS_O_WORKDIR
-
-NNODES=`wc -l < $PBS_NODEFILE`
-NRANKS=12           # Number of MPI ranks per node
-NDEPTH=4            # Number of hardware threads per rank, spacing between MPI ranks on a node
-NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS
-
-NTOTRANKS=$(( NNODES * NRANKS ))
-
-CMD="mpiexec -np 2 -ppn 1 -envall ./gpu_tile_compact.sh ./halo_mpi --mpi 2.1.1.1"
-$CMD
@@ -1 +0,0 @@
-mpicxx -fsycl halo_mpi.cc -o halo_mpi
@@ -1,30 +0,0 @@
-#!/bin/bash
-
-export NUMA_PMAP=(2 2 2 3 3 3 2 2 2 3 3 3 )
-export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
-export GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )
-export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
-
-export PNUMA=${NUMA_PMAP[$PALS_LOCAL_RANKID]}
-export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
-export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
-export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}
-
-
-export ZE_AFFINITY_MASK=$gpu_id.$tile_id
-export ONEAPI_DEVICE_FILTER=gpu,level_zero
-
-#unset EnableWalkerPartition
-#export EnableImplicitScaling=0
-#export GRID_MPICH_NIC_BIND=$NIC
-#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id
-#export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
-#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
-#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
-#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
-#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
-#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
-
-echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
-
-numactl -m $PNUMA -N $NUMA "$@"
@@ -1,333 +0,0 @@
-#include <cassert>
-#include <complex>
-#include <memory>
-#include <vector>
-#include <algorithm>
-#include <array>
-#include <string>
-#include <stdio.h>
-#include <stdlib.h>
-#include <strings.h>
-#include <ctime>
-#include <sys/time.h>
-
-#include <mpi.h>
-
-/**************************************************************
- * GPU - GPU memory cartesian halo exchange benchmark
- * Config: what is the target
- **************************************************************
- */
-#undef ACC_CUDA
-#undef ACC_HIP
-#define ACC_SYCL
-#undef ACC_NONE
-
-/**************************************************************
- * Some MPI globals
- **************************************************************
- */
-MPI_Comm WorldComm;
-MPI_Comm WorldShmComm;
-
-int WorldSize;
-int WorldRank;
-
-int WorldShmSize;
-int WorldShmRank;
-
-/**************************************************************
- * Allocate buffers on the GPU, SYCL needs an init call and context
- **************************************************************
- */
-#ifdef ACC_CUDA
-#include <cuda.h>
-void acceleratorInit(void){}
-void *acceleratorAllocDevice(size_t bytes)
-{
-  void *ptr=NULL;
-  auto err = cudaMalloc((void **)&ptr,bytes);
-  assert(err==cudaSuccess);
-  return ptr;
-}
-void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);}
-#endif
-#ifdef ACC_HIP
-#include <hip/hip_runtime.h>
-void acceleratorInit(void){}
-inline void *acceleratorAllocDevice(size_t bytes)
-{
-  void *ptr=NULL;
-  auto err = hipMalloc((void **)&ptr,bytes);
-  if( err != hipSuccess ) {
-    ptr = (void *) NULL;
-    printf(" hipMalloc failed for %ld %s \n",bytes,hipGetErrorString(err));
-  }
-  return ptr;
-};
-inline void acceleratorFreeDevice(void *ptr){ auto r=hipFree(ptr);};
-#endif
-#ifdef ACC_SYCL
-#include <sycl/CL/sycl.hpp>
-#include <sycl/usm.hpp>
-cl::sycl::queue *theAccelerator;
-void acceleratorInit(void)
-{
-  int nDevices = 1;
-#if 1
-  cl::sycl::gpu_selector selector;
-  cl::sycl::device selectedDevice { selector };
-  theAccelerator = new sycl::queue (selectedDevice);
-#else
-  cl::sycl::device selectedDevice {cl::sycl::gpu_selector_v };
-  theAccelerator = new sycl::queue (selectedDevice);
-#endif
-  auto name = theAccelerator->get_device().get_info<sycl::info::device::name>();
-  printf("AcceleratorSyclInit: Selected device is %s\n",name.c_str()); fflush(stdout);
-}
-inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theAccelerator);};
-inline void acceleratorFreeDevice(void *ptr){free(ptr,*theAccelerator);};
-#endif
-#ifdef ACC_NONE
-void acceleratorInit(void){}
-inline void *acceleratorAllocDevice(size_t bytes){ return malloc(bytes);};
-inline void acceleratorFreeDevice(void *ptr){free(ptr);};
-#endif
-
-
-/**************************************************************
- * Microsecond timer
- **************************************************************
- */
-inline double usecond(void) {
-  struct timeval tv;
-  gettimeofday(&tv,NULL);
-  return 1.0e6*tv.tv_sec + 1.0*tv.tv_usec;
-}
-/**************************************************************
- * Main benchmark routine
- **************************************************************
- */
-void Benchmark(int64_t L,std::vector<int> cart_geom,bool use_device,int ncall)
-{
-  int64_t words = 3*4*2;
-  int64_t face,vol;
-  int Nd=cart_geom.size();
-
-  /**************************************************************
-   * L^Nd volume, L^(Nd-1) faces, 12 complex per site
-   * Allocate memory for these
-   **************************************************************
-   */
-  face=1; for( int d=0;d<Nd-1;d++) face = face*L;
-  vol=1;  for( int d=0;d<Nd;d++)   vol = vol*L;
-
-  std::vector<void *> send_bufs;
-  std::vector<void *> recv_bufs;
-  size_t vw = face*words;
-  size_t bytes = face*words*sizeof(double);
-
-  if ( use_device ) {
-    for(int d=0;d<2*Nd;d++){
-      send_bufs.push_back(acceleratorAllocDevice(bytes));
-      recv_bufs.push_back(acceleratorAllocDevice(bytes));
-    }
-  } else {
-    for(int d=0;d<2*Nd;d++){
-      send_bufs.push_back(malloc(bytes));
-      recv_bufs.push_back(malloc(bytes));
-    }
-  }
-  /*********************************************************
-   * Build cartesian communicator
-   *********************************************************
-   */
-  int ierr;
-  int rank;
-  std::vector<int> coor(Nd);
-  MPI_Comm communicator;
-  std::vector<int> periodic(Nd,1);
-  MPI_Cart_create(WorldComm,Nd,&cart_geom[0],&periodic[0],0,&communicator);
-  MPI_Comm_rank(communicator,&rank);
-  MPI_Cart_coords(communicator,rank,Nd,&coor[0]);
-
-  static int reported;
-  if ( ! reported ) {
-    printf("World Rank %d Shm Rank %d CartCoor %d %d %d %d\n",WorldRank,WorldShmRank,
-           coor[0],coor[1],coor[2],coor[3]); fflush(stdout);
-    reported =1 ;
-  }
-  /*********************************************************
-   * Perform halo exchanges
-   *********************************************************
-   */
-  for(int d=0;d<Nd;d++){
-    if ( cart_geom[d]>1 ) {
-      double t0=usecond();
-
-      int from,to;
-
-      MPI_Barrier(communicator);
-      for(int n=0;n<ncall;n++){
-
-        void *xmit = (void *)send_bufs[d];
-        void *recv = (void *)recv_bufs[d];
-
-        ierr=MPI_Cart_shift(communicator,d,1,&from,&to);
-        assert(ierr==0);
-
-        ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,to,rank,
-                          recv,bytes,MPI_CHAR,from, from,
-                          communicator,MPI_STATUS_IGNORE);
-        assert(ierr==0);
-
-        xmit = (void *)send_bufs[Nd+d];
-        recv = (void *)recv_bufs[Nd+d];
-
-        ierr=MPI_Cart_shift(communicator,d,-1,&from,&to);
-        assert(ierr==0);
-
-        ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,to,rank,
-                          recv,bytes,MPI_CHAR,from, from,
-                          communicator,MPI_STATUS_IGNORE);
-        assert(ierr==0);
-      }
-      MPI_Barrier(communicator);
-
-      double t1=usecond();
-
-      double dbytes = bytes*WorldShmSize;
-      double xbytes = dbytes*2.0*ncall;
-      double rbytes = xbytes;
-      double bidibytes = xbytes+rbytes;
-
-      if ( ! WorldRank ) {
-        printf("\t%12ld\t %12ld %16.0lf\n",L,bytes,bidibytes/(t1-t0)); fflush(stdout);
-      }
-    }
-  }
-  /*********************************************************
-   * Free memory
-   *********************************************************
-   */
-  if ( use_device ) {
-    for(int d=0;d<2*Nd;d++){
-      acceleratorFreeDevice(send_bufs[d]);
-      acceleratorFreeDevice(recv_bufs[d]);
-    }
-  } else {
-    for(int d=0;d<2*Nd;d++){
-      free(send_bufs[d]);
-      free(recv_bufs[d]);
-    }
-  }
-
-}
-
-/**************************************
- * Command line junk
- **************************************/
-
-std::string CmdOptionPayload(char ** begin, char ** end, const std::string & option)
-{
-  char ** itr = std::find(begin, end, option);
-  if (itr != end && ++itr != end) {
-    std::string payload(*itr);
-    return payload;
-  }
-  return std::string("");
-}
-bool CmdOptionExists(char** begin, char** end, const std::string& option)
-{
-  return std::find(begin, end, option) != end;
-}
-void CmdOptionIntVector(const std::string &str,std::vector<int> & vec)
-{
-  vec.resize(0);
-  std::stringstream ss(str);
-  int i;
-  while (ss >> i){
-    vec.push_back(i);
-    if(std::ispunct(ss.peek()))
-      ss.ignore();
-  }
-  return;
-}
-/**************************************
- * Command line junk
- **************************************/
-int main(int argc, char **argv)
-{
-  std::string arg;
-
-  acceleratorInit();
-
-  MPI_Init(&argc,&argv);
-
-  WorldComm = MPI_COMM_WORLD;
-
-  MPI_Comm_split_type(WorldComm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
-
-  MPI_Comm_rank(WorldComm ,&WorldRank);
-  MPI_Comm_size(WorldComm ,&WorldSize);
-
-  MPI_Comm_rank(WorldShmComm ,&WorldShmRank);
-  MPI_Comm_size(WorldShmComm ,&WorldShmSize);
-
-  if ( WorldSize/WorldShmSize > 2) {
-    printf("This benchmark is meant to run on at most two nodes only\n");
-  }
-
-  auto mpi =std::vector<int>({1,1,1,1});
-
-  if( CmdOptionExists(argv,argv+argc,"--mpi") ){
-    arg = CmdOptionPayload(argv,argv+argc,"--mpi");
-    CmdOptionIntVector(arg,mpi);
-  } else {
-    printf("Must specify --mpi <n1.n2.n3.n4> command line argument\n");
-    exit(0);
-  }
-
-  if( !WorldRank ) {
-    printf("***********************************\n");
-    printf("%d ranks\n",WorldSize);
-    printf("%d ranks-per-node\n",WorldShmSize);
-    printf("%d nodes\n",WorldSize/WorldShmSize);fflush(stdout);
-    printf("Cartesian layout: ");
-    for(int d=0;d<mpi.size();d++){
-      printf("%d ",mpi[d]);
-    }
-    printf("\n");fflush(stdout);
-    printf("***********************************\n");
-  }
-
-
-  if( !WorldRank ) {
-    printf("=========================================================\n");
-    printf("= Benchmarking HOST memory MPI performance \n");
-    printf("=========================================================\n");fflush(stdout);
-    printf("= L\t pkt bytes\t MB/s \n");
-    printf("=========================================================\n");fflush(stdout);
-  }
-
-  for(int L=16;L<=64;L+=4){
-    Benchmark(L,mpi,false,100);
-  }
-
-  if( !WorldRank ) {
-    printf("=========================================================\n");
-    printf("= Benchmarking DEVICE memory MPI performance \n");
-    printf("=========================================================\n");fflush(stdout);
-  }
-  for(int L=16;L<=64;L+=4){
-    Benchmark(L,mpi,true,100);
-  }
-
-  if( !WorldRank ) {
-    printf("=========================================================\n");
-    printf("= DONE \n");
-    printf("=========================================================\n");
-  }
-  MPI_Finalize();
-}
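The deleted benchmark prints one row per partitioned dimension: L, the face packet size in bytes, and bidirectional traffic divided by elapsed microseconds, i.e. MB/s. A worked sketch of that accounting under the file's own assumptions (words = 3*4*2 doubles per site, Nd = 4) and a hypothetical elapsed time:

    #include <cstdint>
    #include <cstdio>

    int main() {
      int64_t L = 32, words = 3 * 4 * 2;             // 12 complex = 24 doubles per site
      int64_t face = L * L * L;                      // L^(Nd-1) with Nd = 4
      int64_t bytes = face * words * sizeof(double); // one face packet: 6291456 bytes
      double  t_us = 1.0e5;                          // hypothetical elapsed time in microseconds
      int     ncall = 100, ranks_per_node = 12;      // ncall as in the file; ranks assumed
      double  xbytes = double(bytes) * ranks_per_node * 2.0 * ncall; // sent, both directions
      double  bidibytes = 2.0 * xbytes;              // send + receive, as the file sums them
      printf("packet %ld bytes, rate %.0f MB/s\n", bytes, bidibytes / t_us);
    }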
systems/Aurora/benchmarks/bench.pbs (new file, 54 lines)
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
+
+#PBS -q EarlyAppAccess
+#PBS -l select=1
+#PBS -l walltime=01:00:00
+##PBS -A Aurora_Deployment
+#PBS -A LatticeQCD_aesp
+
+HDIR=/home/paboyle/
+#module use /soft/testing/modulefiles/
+#module load intel-UMD23.05.25593.11/23.05.25593.11
+#module load tools/pti-gpu
+#export LD_LIBRARY_PATH=$HDIR/tools/lib64:$LD_LIBRARY_PATH
+#export PATH=$HDIR/tools/bin:$PATH
+
+export TZ='/usr/share/zoneinfo/US/Central'
+export OMP_PROC_BIND=spread
+export OMP_NUM_THREADS=3
+unset OMP_PLACES
+
+cd $PBS_O_WORKDIR
+
+source ../sourceme.sh
+
+echo Jobid: $PBS_JOBID
+echo Running on host `hostname`
+echo Running on nodes `cat $PBS_NODEFILE`
+
+echo NODES
+cat $PBS_NODEFILE
+NNODES=`wc -l < $PBS_NODEFILE`
+NRANKS=12           # Number of MPI ranks per node
+NDEPTH=4            # Number of hardware threads per rank, spacing between MPI ranks on a node
+NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS
+
+NTOTRANKS=$(( NNODES * NRANKS ))
+
+echo "NUM_NODES=${NNODES} TOTAL_RANKS=${NTOTRANKS} RANKS_PER_NODE=${NRANKS} THREADS_PER_RANK=${OMP_NUM_THREADS}"
+echo "OMP_PROC_BIND=$OMP_PROC_BIND OMP_PLACES=$OMP_PLACES"
+
+
+#CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -envall \
+#     ./gpu_tile_compact.sh \
+#     ./Benchmark_dwf_fp32 --mpi 1.1.2.6 --grid 16.32.64.192 --comms-overlap \
+#     --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+
+CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -envall \
+     ./gpu_tile_compact.sh \
+     ./Benchmark_comms_host_device --mpi 1.1.2.6 --grid 32.24.32.192 \
+     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+
+$CMD
@@ -1,45 +0,0 @@
-#!/bin/bash
-
-## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
-
-#PBS -q EarlyAppAccess
-#PBS -l select=2
-#PBS -l walltime=01:00:00
-#PBS -A LatticeQCD_aesp_CNDA
-
-#export OMP_PROC_BIND=spread
-#unset OMP_PLACES
-
-cd $PBS_O_WORKDIR
-
-source ../sourceme.sh
-
-export OMP_NUM_THREADS=3
-export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
-
-#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
-#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
-#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
-
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
-export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
-export MPICH_OFI_NIC_POLICY=GPU
-
-CMD="mpiexec -np 24 -ppn 12 -envall \
-     ./gpu_tile_compact.sh \
-     ./Benchmark_comms_host_device --mpi 2.3.2.2 --grid 32.24.32.192 \
-     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
-
-$CMD
-
-CMD="mpiexec -np 24 -ppn 12 -envall \
-     ./gpu_tile_compact.sh \
-     ./Benchmark_dwf_fp32 --mpi 2.3.2.2 --grid 64.96.64.64 --comms-overlap \
-     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
-
-$CMD
systems/Aurora/benchmarks/bench2.pbs (new file, 107 lines)
@@ -0,0 +1,107 @@
+#!/bin/bash
+
+## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
+
+#PBS -q EarlyAppAccess
+#PBS -l select=2
+#PBS -l walltime=01:00:00
+#PBS -A LatticeQCD_aesp_CNDA
+
+HDIR=/home/paboyle/
+#module use /soft/testing/modulefiles/
+#module load intel-UMD23.05.25593.11/23.05.25593.11
+#module load tools/pti-gpu
+#export LD_LIBRARY_PATH=$HDIR/tools/lib64:$LD_LIBRARY_PATH
+#export PATH=$HDIR/tools/bin:$PATH
+
+export TZ='/usr/share/zoneinfo/US/Central'
+export OMP_PROC_BIND=spread
+export OMP_NUM_THREADS=3
+unset OMP_PLACES
+
+cd $PBS_O_WORKDIR
+
+source ../sourceme.sh
+
+
+echo Jobid: $PBS_JOBID
+echo Running on host `hostname`
+echo Running on nodes `cat $PBS_NODEFILE`
+
+echo NODES
+cat $PBS_NODEFILE
+NNODES=`wc -l < $PBS_NODEFILE`
+NRANKS=12           # Number of MPI ranks per node
+NDEPTH=4            # Number of hardware threads per rank, spacing between MPI ranks on a node
+NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS
+
+NTOTRANKS=$(( NNODES * NRANKS ))
+
+echo "NUM_NODES=${NNODES} TOTAL_RANKS=${NTOTRANKS} RANKS_PER_NODE=${NRANKS} THREADS_PER_RANK=${OMP_NUM_THREADS}"
+echo "OMP_PROC_BIND=$OMP_PROC_BIND OMP_PLACES=$OMP_PLACES"
+
+
+CMD="mpiexec -np 2 -ppn 1 -d ${NDEPTH} -envall \
+     ./gpu_tile_compact.sh \
+     ./Benchmark_comms_host_device --mpi 1.1.1.2 --grid 32.24.32.192 \
+     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
+#$CMD | tee 1-to-1.comms.hmem0
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
+#$CMD | tee 1-to-1.comms.hmem1
+
+
+CMD="mpiexec -np 4 -ppn 2 -d ${NDEPTH} --cpu-bind=depth -envall \
+     ./gpu_tile_compact.sh \
+     ./Benchmark_comms_host_device --mpi 2.2.1.1 --grid 32.24.32.96 \
+     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
+#$CMD | tee 2-to-2.comms.hmem1
+
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
+#$CMD | tee 2-to-2.comms.hmem0
+
+CMD="mpiexec -np 6 -ppn 3 -d ${NDEPTH} --cpu-bind=depth -envall \
+     ./gpu_tile_compact.sh \
+     ./Benchmark_comms_host_device --mpi 3.2.1.1 --grid 32.24.32.96 \
+     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
+#$CMD | tee 3-to-3.comms.hmem1
+
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
+#$CMD | tee 3-to-3.comms.hmem0
+
+
+CMD="mpiexec -np 8 -ppn 4 -d ${NDEPTH} --cpu-bind=depth -envall \
+     ./gpu_tile_compact4.sh \
+     ./Benchmark_comms_host_device --mpi 2.2.2.1 --grid 32.24.32.96 \
+     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
+$CMD | tee 4-to-4.comms.hmem1.nic-affinity
+
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
+$CMD | tee 4-to-4.comms.hmem0.nic-affinity
+
+
+CMD="mpiexec -np 12 -ppn 6 -d ${NDEPTH} --cpu-bind=depth -envall \
+     ./gpu_tile_compact.sh \
+     ./Benchmark_comms_host_device --mpi 3.2.2.1 --grid 32.24.32.96 \
+     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
+#$CMD | tee 6-to-6.comms.hmem1
+
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
+#$CMD | tee 6-to-6.comms.hmem0
+
+
+CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -envall \
+     ./gpu_tile_compact.sh \
+     ./Benchmark_comms_host_device --mpi 3.2.2.2 --grid 32.24.32.192 \
+     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
+#$CMD | tee 12-to-12.comms.hmem1
+
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
+#$CMD | tee 12-to-12.comms.hmem0
@@ -1,33 +1,65 @@
 #!/bin/bash
 
-export NUMA_MAP=(2 2 2 3 3 3 2 2 2 3 3 3 )
-#export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
-export NUMA_PMAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
+display_help() {
+  echo " Will map gpu tile to rank in compact and then round-robin fashion"
+  echo " Usage (only work for one node of ATS/PVC):"
+  echo "   mpiexec --np N gpu_tile_compact.sh ./a.out"
+  echo
+  echo " Example 3 GPU of 2 Tiles with 7 Ranks:"
+  echo "   0 Rank 0.0"
+  echo "   1 Rank 0.1"
+  echo "   2 Rank 1.0"
+  echo "   3 Rank 1.1"
+  echo "   4 Rank 2.0"
+  echo "   5 Rank 2.1"
+  echo "   6 Rank 0.0"
+  echo
+  echo " Hacked together by apl@anl.gov, please contact if bug found"
+  exit 1
+}
+
+#This give the exact GPU count i915 knows about and I use udev to only enumerate the devices with physical presence.
+#works? num_gpu=$(/usr/bin/udevadm info /sys/module/i915/drivers/pci\:i915/* |& grep -v Unknown | grep -c "P: /devices")
+num_gpu=6
+num_tile=2
+
+if [ "$#" -eq 0 ] || [ "$1" == "--help" ] || [ "$1" == "-h" ] || [ "$num_gpu" = 0 ]; then
+  display_help
+fi
+
+gpu_id=$(( (PALS_LOCAL_RANKID / num_tile ) % num_gpu ))
+tile_id=$((PALS_LOCAL_RANKID % num_tile))
+
+export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
 export NIC_MAP=(0 1 2 4 5 6 0 1 2 4 5 6 )
 export GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )
 export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
 
 export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
-export NUMAP=${NUMA_PMAP[$PALS_LOCAL_RANKID]}
 export NIC=${NIC_MAP[$PALS_LOCAL_RANKID]}
 export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
 export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}
 
-#export GRID_MPICH_NIC_BIND=$NIC
-#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id
+export GRID_MPICH_NIC_BIND=$NIC
 
 unset EnableWalkerPartition
 export EnableImplicitScaling=0
+export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
 export ZE_AFFINITY_MASK=$gpu_id.$tile_id
+#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id
 export ONEAPI_DEVICE_FILTER=gpu,level_zero
-#export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
-#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
-#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
-#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
-#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
+export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
+export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
 #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
 
-echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
+echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NIC $GRID_MPICH_NIC_BIND"
 
-numactl -m $NUMA -N $NUMAP "$@"
+if [ $PALS_LOCAL_RANKID = 0 ]
+then
+  numactl -m $NUMA -N $NUMA "$@"
+else
+  numactl -m $NUMA -N $NUMA "$@"
+fi
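The rewritten script derives its GPU and tile assignment from PALS_LOCAL_RANKID rather than fixed lookup tables. A small illustrative C++ loop (hypothetical, purely to demonstrate the arithmetic) reproduces the 7-rank example from the script's help text, using that example's 3 GPUs of 2 tiles; the script itself sets num_gpu=6.

    #include <cstdio>

    int main() {
      const int num_gpu = 3, num_tile = 2;           // the help text's example sizes
      for (int rank = 0; rank < 7; rank++) {
        int gpu_id  = (rank / num_tile) % num_gpu;   // fill both tiles of a GPU first
        int tile_id = rank % num_tile;               // then wrap round-robin to the next GPU
        printf("%d Rank %d.%d\n", rank, gpu_id, tile_id);
      }
    }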
@@ -1,8 +1,39 @@
 #!/bin/bash
 
-export NUMA_MAP=(2 2 3 3 2 2 3 3 )
-export PROC_MAP=(0 0 1 1 0 0 1 1 )
-export NIC_MAP=(0 0 4 4 1 1 5 5 )
+display_help() {
+  echo " Will map gpu tile to rank in compact and then round-robin fashion"
+  echo " Usage (only work for one node of ATS/PVC):"
+  echo "   mpiexec --np N gpu_tile_compact.sh ./a.out"
+  echo
+  echo " Example 3 GPU of 2 Tiles with 7 Ranks:"
+  echo "   0 Rank 0.0"
+  echo "   1 Rank 0.1"
+  echo "   2 Rank 1.0"
+  echo "   3 Rank 1.1"
+  echo "   4 Rank 2.0"
+  echo "   5 Rank 2.1"
+  echo "   6 Rank 0.0"
+  echo
+  echo " Hacked together by apl@anl.gov, please contact if bug found"
+  exit 1
+}
+
+#This give the exact GPU count i915 knows about and I use udev to only enumerate the devices with physical presence.
+#works? num_gpu=$(/usr/bin/udevadm info /sys/module/i915/drivers/pci\:i915/* |& grep -v Unknown | grep -c "P: /devices")
+num_gpu=6
+num_tile=2
+
+if [ "$#" -eq 0 ] || [ "$1" == "--help" ] || [ "$1" == "-h" ] || [ "$num_gpu" = 0 ]; then
+  display_help
+fi
+
+gpu_id=$(( (PALS_LOCAL_RANKID / num_tile ) % num_gpu ))
+tile_id=$((PALS_LOCAL_RANKID % num_tile))
+
+export NUMA_MAP=(0 0 1 1 0 0 1 1 )
+export NIC_MAP=(0 1 4 5 0 1 4 5 )
 export GPU_MAP=(0 1 3 4 0 1 3 4 )
 export TILE_MAP=(0 0 0 0 1 1 1 1 )
 export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
@@ -10,7 +41,7 @@ export NIC=${NIC_MAP[$PALS_LOCAL_RANKID]}
 export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
 export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}
 
-#export GRID_MPICH_NIC_BIND=$NIC
+export GRID_MPICH_NIC_BIND=$NIC
 
 unset EnableWalkerPartition
 export EnableImplicitScaling=0
@@ -24,6 +55,6 @@ export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
 #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
 
-echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NIC $GRID_MPICH_NIC_BIND ; NUMA domain $NUMA"
+echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NIC $GRID_MPICH_NIC_BIND"
 
-numactl -m $NUMA -N $PROC_MAP "$@"
+numactl -m $NUMA -N $NUMA "$@"
@@ -7,6 +7,6 @@ export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
 export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
 export http_proxy=http://proxy.alcf.anl.gov:3128
 export https_proxy=http://proxy.alcf.anl.gov:3128
-#export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
 git config --global http.proxy http://proxy.alcf.anl.gov:3128