Mirror of https://github.com/paboyle/Grid.git (synced 2024-11-09 23:45:36 +00:00).
Commit 2a0d75bac2 ("Aurora files"), parent f48298ad4e.
54
systems/Aurora/benchmarks/bench.pbs
Normal file
54
systems/Aurora/benchmarks/bench.pbs
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
|
||||||
|
|
||||||
|
#PBS -q EarlyAppAccess
|
||||||
|
#PBS -l select=1
|
||||||
|
#PBS -l walltime=01:00:00
|
||||||
|
##PBS -A Aurora_Deployment
|
||||||
|
#PBS -A LatticeQCD_aesp
|
||||||
|
|
||||||
|
HDIR=/home/paboyle/
|
||||||
|
#module use /soft/testing/modulefiles/
|
||||||
|
#module load intel-UMD23.05.25593.11/23.05.25593.11
|
||||||
|
#module load tools/pti-gpu
|
||||||
|
#export LD_LIBRARY_PATH=$HDIR/tools/lib64:$LD_LIBRARY_PATH
|
||||||
|
#export PATH=$HDIR/tools/bin:$PATH
|
||||||
|
|
||||||
|
export TZ='/usr/share/zoneinfo/US/Central'
|
||||||
|
export OMP_PROC_BIND=spread
|
||||||
|
export OMP_NUM_THREADS=3
|
||||||
|
unset OMP_PLACES
|
||||||
|
|
||||||
|
cd $PBS_O_WORKDIR
|
||||||
|
|
||||||
|
source ../sourceme.sh
|
||||||
|
|
||||||
|
echo Jobid: $PBS_JOBID
|
||||||
|
echo Running on host `hostname`
|
||||||
|
echo Running on nodes `cat $PBS_NODEFILE`
|
||||||
|
|
||||||
|
echo NODES
|
||||||
|
cat $PBS_NODEFILE
|
||||||
|
NNODES=`wc -l < $PBS_NODEFILE`
|
||||||
|
NRANKS=12 # Number of MPI ranks per node
|
||||||
|
NDEPTH=4 # Number of hardware threads per rank, spacing between MPI ranks on a node
|
||||||
|
NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS
|
||||||
|
|
||||||
|
NTOTRANKS=$(( NNODES * NRANKS ))
|
||||||
|
|
||||||
|
echo "NUM_NODES=${NNODES} TOTAL_RANKS=${NTOTRANKS} RANKS_PER_NODE=${NRANKS} THREADS_PER_RANK=${OMP_NUM_THREADS}"
|
||||||
|
echo "OMP_PROC_BIND=$OMP_PROC_BIND OMP_PLACES=$OMP_PLACES"
|
||||||
|
|
||||||
|
|
||||||
|
#CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -envall \
|
||||||
|
# ./gpu_tile_compact.sh \
|
||||||
|
# ./Benchmark_dwf_fp32 --mpi 1.1.2.6 --grid 16.32.64.192 --comms-overlap \
|
||||||
|
# --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"
|
||||||
|
|
||||||
|
CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -envall \
|
||||||
|
./gpu_tile_compact.sh \
|
||||||
|
./Benchmark_comms_host_device --mpi 1.1.2.6 --grid 32.24.32.192 \
|
||||||
|
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
|
||||||
|
|
||||||
|
$CMD
|
107
systems/Aurora/benchmarks/bench2.pbs
Normal file
107
systems/Aurora/benchmarks/bench2.pbs
Normal file
@ -0,0 +1,107 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
|
||||||
|
|
||||||
|
#PBS -q EarlyAppAccess
|
||||||
|
#PBS -l select=2
|
||||||
|
#PBS -l walltime=01:00:00
|
||||||
|
#PBS -A LatticeQCD_aesp_CNDA
|
||||||
|
|
||||||
|
HDIR=/home/paboyle/
|
||||||
|
#module use /soft/testing/modulefiles/
|
||||||
|
#module load intel-UMD23.05.25593.11/23.05.25593.11
|
||||||
|
#module load tools/pti-gpu
|
||||||
|
#export LD_LIBRARY_PATH=$HDIR/tools/lib64:$LD_LIBRARY_PATH
|
||||||
|
#export PATH=$HDIR/tools/bin:$PATH
|
||||||
|
|
||||||
|
export TZ='/usr/share/zoneinfo/US/Central'
|
||||||
|
export OMP_PROC_BIND=spread
|
||||||
|
export OMP_NUM_THREADS=3
|
||||||
|
unset OMP_PLACES
|
||||||
|
|
||||||
|
cd $PBS_O_WORKDIR
|
||||||
|
|
||||||
|
source ../sourceme.sh
|
||||||
|
|
||||||
|
|
||||||
|
echo Jobid: $PBS_JOBID
|
||||||
|
echo Running on host `hostname`
|
||||||
|
echo Running on nodes `cat $PBS_NODEFILE`
|
||||||
|
|
||||||
|
echo NODES
|
||||||
|
cat $PBS_NODEFILE
|
||||||
|
NNODES=`wc -l < $PBS_NODEFILE`
|
||||||
|
NRANKS=12 # Number of MPI ranks per node
|
||||||
|
NDEPTH=4 # Number of hardware threads per rank, spacing between MPI ranks on a node
|
||||||
|
NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS
|
||||||
|
|
||||||
|
NTOTRANKS=$(( NNODES * NRANKS ))
|
||||||
|
|
||||||
|
echo "NUM_NODES=${NNODES} TOTAL_RANKS=${NTOTRANKS} RANKS_PER_NODE=${NRANKS} THREADS_PER_RANK=${OMP_NUM_THREADS}"
|
||||||
|
echo "OMP_PROC_BIND=$OMP_PROC_BIND OMP_PLACES=$OMP_PLACES"
|
||||||
|
|
||||||
|
|
||||||
|
CMD="mpiexec -np 2 -ppn 1 -d ${NDEPTH} -envall \
|
||||||
|
./gpu_tile_compact.sh \
|
||||||
|
./Benchmark_comms_host_device --mpi 1.1.1.2 --grid 32.24.32.192 \
|
||||||
|
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
|
||||||
|
|
||||||
|
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
|
||||||
|
#$CMD | tee 1-to-1.comms.hmem0
|
||||||
|
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
|
||||||
|
#$CMD | tee 1-to-1.comms.hmem1
|
||||||
|
|
||||||
|
|
||||||
|
CMD="mpiexec -np 4 -ppn 2 -d ${NDEPTH} --cpu-bind=depth -envall \
|
||||||
|
./gpu_tile_compact.sh \
|
||||||
|
./Benchmark_comms_host_device --mpi 2.2.1.1 --grid 32.24.32.96 \
|
||||||
|
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
|
||||||
|
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
|
||||||
|
#$CMD | tee 2-to-2.comms.hmem1
|
||||||
|
|
||||||
|
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
|
||||||
|
#$CMD | tee 2-to-2.comms.hmem0
|
||||||
|
|
||||||
|
CMD="mpiexec -np 6 -ppn 3 -d ${NDEPTH} --cpu-bind=depth -envall \
|
||||||
|
./gpu_tile_compact.sh \
|
||||||
|
./Benchmark_comms_host_device --mpi 3.2.1.1 --grid 32.24.32.96 \
|
||||||
|
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
|
||||||
|
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
|
||||||
|
#$CMD | tee 3-to-3.comms.hmem1
|
||||||
|
|
||||||
|
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
|
||||||
|
#$CMD | tee 3-to-3.comms.hmem0
|
||||||
|
|
||||||
|
|
||||||
|
CMD="mpiexec -np 8 -ppn 4 -d ${NDEPTH} --cpu-bind=depth -envall \
|
||||||
|
./gpu_tile_compact4.sh \
|
||||||
|
./Benchmark_comms_host_device --mpi 2.2.2.1 --grid 32.24.32.96 \
|
||||||
|
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
|
||||||
|
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
|
||||||
|
$CMD | tee 4-to-4.comms.hmem1.nic-affinity
|
||||||
|
|
||||||
|
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
|
||||||
|
$CMD | tee 4-to-4.comms.hmem0.nic-affinity
|
||||||
|
|
||||||
|
|
||||||
|
CMD="mpiexec -np 12 -ppn 6 -d ${NDEPTH} --cpu-bind=depth -envall \
|
||||||
|
./gpu_tile_compact.sh \
|
||||||
|
./Benchmark_comms_host_device --mpi 3.2.2.1 --grid 32.24.32.96 \
|
||||||
|
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
|
||||||
|
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
|
||||||
|
#$CMD | tee 6-to-6.comms.hmem1
|
||||||
|
|
||||||
|
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
|
||||||
|
#$CMD | tee 6-to-6.comms.hmem0
|
||||||
|
|
||||||
|
|
||||||
|
CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -envall \
|
||||||
|
./gpu_tile_compact.sh \
|
||||||
|
./Benchmark_comms_host_device --mpi 3.2.2.2 --grid 32.24.32.192 \
|
||||||
|
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
|
||||||
|
|
||||||
|
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
|
||||||
|
#$CMD | tee 12-to-12.comms.hmem1
|
||||||
|
|
||||||
|
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
|
||||||
|
#$CMD | tee 12-to-12.comms.hmem0
|
65
systems/Aurora/benchmarks/gpu_tile_compact.sh
Executable file
65
systems/Aurora/benchmarks/gpu_tile_compact.sh
Executable file
@ -0,0 +1,65 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
display_help() {
|
||||||
|
echo " Will map gpu tile to rank in compact and then round-robin fashion"
|
||||||
|
echo " Usage (only work for one node of ATS/PVC):"
|
||||||
|
echo " mpiexec --np N gpu_tile_compact.sh ./a.out"
|
||||||
|
echo
|
||||||
|
echo " Example 3 GPU of 2 Tiles with 7 Ranks:"
|
||||||
|
echo " 0 Rank 0.0"
|
||||||
|
echo " 1 Rank 0.1"
|
||||||
|
echo " 2 Rank 1.0"
|
||||||
|
echo " 3 Rank 1.1"
|
||||||
|
echo " 4 Rank 2.0"
|
||||||
|
echo " 5 Rank 2.1"
|
||||||
|
echo " 6 Rank 0.0"
|
||||||
|
echo
|
||||||
|
echo " Hacked together by apl@anl.gov, please contact if bug found"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
#This give the exact GPU count i915 knows about and I use udev to only enumerate the devices with physical presence.
|
||||||
|
#works? num_gpu=$(/usr/bin/udevadm info /sys/module/i915/drivers/pci\:i915/* |& grep -v Unknown | grep -c "P: /devices")
|
||||||
|
num_gpu=6
|
||||||
|
num_tile=2
|
||||||
|
|
||||||
|
if [ "$#" -eq 0 ] || [ "$1" == "--help" ] || [ "$1" == "-h" ] || [ "$num_gpu" = 0 ]; then
|
||||||
|
display_help
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
gpu_id=$(( (PALS_LOCAL_RANKID / num_tile ) % num_gpu ))
|
||||||
|
tile_id=$((PALS_LOCAL_RANKID % num_tile))
|
||||||
|
|
||||||
|
export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
|
||||||
|
export NIC_MAP=(0 1 2 4 5 6 0 1 2 4 5 6 )
|
||||||
|
export GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )
|
||||||
|
export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
|
||||||
|
export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
|
||||||
|
export NIC=${NIC_MAP[$PALS_LOCAL_RANKID]}
|
||||||
|
export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
|
||||||
|
export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}
|
||||||
|
|
||||||
|
export GRID_MPICH_NIC_BIND=$NIC
|
||||||
|
|
||||||
|
unset EnableWalkerPartition
|
||||||
|
export EnableImplicitScaling=0
|
||||||
|
export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
|
||||||
|
export ZE_AFFINITY_MASK=$gpu_id.$tile_id
|
||||||
|
#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id
|
||||||
|
export ONEAPI_DEVICE_FILTER=gpu,level_zero
|
||||||
|
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
|
||||||
|
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
|
||||||
|
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
|
||||||
|
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
|
||||||
|
#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
|
||||||
|
|
||||||
|
echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NIC $GRID_MPICH_NIC_BIND"
|
||||||
|
|
||||||
|
if [ $PALS_LOCAL_RANKID = 0 ]
|
||||||
|
then
|
||||||
|
numactl -m $NUMA -N $NUMA "$@"
|
||||||
|
else
|
||||||
|
numactl -m $NUMA -N $NUMA "$@"
|
||||||
|
fi
|
60
systems/Aurora/benchmarks/gpu_tile_compact4.sh
Executable file
60
systems/Aurora/benchmarks/gpu_tile_compact4.sh
Executable file
@ -0,0 +1,60 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
display_help() {
|
||||||
|
echo " Will map gpu tile to rank in compact and then round-robin fashion"
|
||||||
|
echo " Usage (only work for one node of ATS/PVC):"
|
||||||
|
echo " mpiexec --np N gpu_tile_compact.sh ./a.out"
|
||||||
|
echo
|
||||||
|
echo " Example 3 GPU of 2 Tiles with 7 Ranks:"
|
||||||
|
echo " 0 Rank 0.0"
|
||||||
|
echo " 1 Rank 0.1"
|
||||||
|
echo " 2 Rank 1.0"
|
||||||
|
echo " 3 Rank 1.1"
|
||||||
|
echo " 4 Rank 2.0"
|
||||||
|
echo " 5 Rank 2.1"
|
||||||
|
echo " 6 Rank 0.0"
|
||||||
|
echo
|
||||||
|
echo " Hacked together by apl@anl.gov, please contact if bug found"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
#This give the exact GPU count i915 knows about and I use udev to only enumerate the devices with physical presence.
|
||||||
|
#works? num_gpu=$(/usr/bin/udevadm info /sys/module/i915/drivers/pci\:i915/* |& grep -v Unknown | grep -c "P: /devices")
|
||||||
|
num_gpu=6
|
||||||
|
num_tile=2
|
||||||
|
|
||||||
|
if [ "$#" -eq 0 ] || [ "$1" == "--help" ] || [ "$1" == "-h" ] || [ "$num_gpu" = 0 ]; then
|
||||||
|
display_help
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
gpu_id=$(( (PALS_LOCAL_RANKID / num_tile ) % num_gpu ))
|
||||||
|
tile_id=$((PALS_LOCAL_RANKID % num_tile))
|
||||||
|
|
||||||
|
export NUMA_MAP=(0 0 1 1 0 0 1 1 )
|
||||||
|
export NIC_MAP=(0 1 4 5 0 1 4 5 )
|
||||||
|
export GPU_MAP=(0 1 3 4 0 1 3 4 )
|
||||||
|
export TILE_MAP=(0 0 0 0 1 1 1 1 )
|
||||||
|
export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
|
||||||
|
export NIC=${NIC_MAP[$PALS_LOCAL_RANKID]}
|
||||||
|
export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
|
||||||
|
export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}
|
||||||
|
|
||||||
|
export GRID_MPICH_NIC_BIND=$NIC
|
||||||
|
|
||||||
|
unset EnableWalkerPartition
|
||||||
|
export EnableImplicitScaling=0
|
||||||
|
export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
|
||||||
|
export ZE_AFFINITY_MASK=$gpu_id.$tile_id
|
||||||
|
#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id
|
||||||
|
export ONEAPI_DEVICE_FILTER=gpu,level_zero
|
||||||
|
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
|
||||||
|
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
|
||||||
|
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
|
||||||
|
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
|
||||||
|
#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
|
||||||
|
|
||||||
|
echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NIC $GRID_MPICH_NIC_BIND"
|
||||||
|
|
||||||
|
numactl -m $NUMA -N $NUMA "$@"
|
16
systems/Aurora/config-command
Normal file
16
systems/Aurora/config-command
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
TOOLS=$HOME/tools
|
||||||
|
../../configure \
|
||||||
|
--enable-simd=GPU \
|
||||||
|
--enable-gen-simd-width=64 \
|
||||||
|
--enable-comms=mpi-auto \
|
||||||
|
--enable-accelerator-cshift \
|
||||||
|
--disable-gparity \
|
||||||
|
--disable-fermion-reps \
|
||||||
|
--enable-shm=nvlink \
|
||||||
|
--enable-accelerator=sycl \
|
||||||
|
--enable-unified=no \
|
||||||
|
MPICXX=mpicxx \
|
||||||
|
CXX=icpx \
|
||||||
|
LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/" \
|
||||||
|
CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include"
|
||||||
|
|
9
systems/Aurora/proxies.sh
Normal file
9
systems/Aurora/proxies.sh
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
|
||||||
|
export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
|
||||||
|
export http_proxy=http://proxy.alcf.anl.gov:3128
|
||||||
|
export https_proxy=http://proxy.alcf.anl.gov:3128
|
||||||
|
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
|
||||||
|
git config --global http.proxy http://proxy.alcf.anl.gov:3128
|
||||||
|
module use /soft/modulefiles
|
||||||
|
module load intel_compute_runtime/release/agama-devel-682.22
|
||||||
|
|
12
systems/Aurora/sourceme.sh
Normal file
12
systems/Aurora/sourceme.sh
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
#export ONEAPI_DEVICE_SELECTOR=level_zero:0.0
|
||||||
|
|
||||||
|
module use /soft/modulefiles
|
||||||
|
module load intel_compute_runtime/release/agama-devel-682.22
|
||||||
|
|
||||||
|
export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
|
||||||
|
export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
|
||||||
|
export http_proxy=http://proxy.alcf.anl.gov:3128
|
||||||
|
export https_proxy=http://proxy.alcf.anl.gov:3128
|
||||||
|
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
|
||||||
|
git config --global http.proxy http://proxy.alcf.anl.gov:3128
|
||||||
|
|
Loading…
Reference in New Issue
Block a user