From cd52e3cbc2567d8df96a165efb1a2967759163eb Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Tue, 7 May 2024 18:38:15 +0000
Subject: [PATCH] Jobs on Sunspot

---
 systems/Aurora/benchmarks/bench1.pbs | 67 ++++++++++++++++++++++++++++
 systems/Aurora/benchmarks/bench2.pbs | 55 +++++++++++++++++++++++
 2 files changed, 122 insertions(+)
 create mode 100644 systems/Aurora/benchmarks/bench1.pbs
 create mode 100644 systems/Aurora/benchmarks/bench2.pbs

diff --git a/systems/Aurora/benchmarks/bench1.pbs b/systems/Aurora/benchmarks/bench1.pbs
new file mode 100644
index 00000000..49bc0b24
--- /dev/null
+++ b/systems/Aurora/benchmarks/bench1.pbs
@@ -0,0 +1,67 @@
+#!/bin/bash
+
+#PBS -q debug
+#PBS -l select=1
+#PBS -l walltime=00:20:00
+#PBS -A LatticeQCD_aesp_CNDA
+
+#export OMP_PROC_BIND=spread
+#unset OMP_PLACES
+
+cd $PBS_O_WORKDIR
+
+source ../sourceme.sh
+module load pti-gpu
+
+#cat $PBS_NODEFILE
+
+export OMP_NUM_THREADS=4
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
+#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
+
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+export MPICH_OFI_NIC_POLICY=GPU
+
+# 12 ppn, 1 node, 12 ranks
+#
+CMD="mpiexec -np 12 -ppn 12 -envall \
+     ./gpu_tile_compact.sh \
+     ./Benchmark_comms_host_device --mpi 2.2.1.3 --grid 24.32.32.24 \
+     --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+#$CMD | tee 1node.comms
+
+
+CMD="mpiexec -np 1 -ppn 1 -envall \
+     ./gpu_tile_compact.sh \
+     ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 \
+     --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 "
+#$CMD | tee 1tile.dwf
+
+CMD="mpiexec -np 12 -ppn 12 -envall \
+     ./gpu_tile_compact.sh \
+     ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 32.32.32.48 \
+     --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
+$CMD | tee 1node.32.32.32.48.dwf
+
+
+CMD="mpiexec -np 12 -ppn 12 -envall \
+     ./gpu_tile_compact.sh \
+     ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.64.32.96 \
+     --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
+#$CMD | tee 1node.64.64.32.96.dwf
+
+CMD="mpiexec -np 12 -ppn 12 -envall \
+     ./gpu_tile_compact.sh \
+     ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.32.32.48 \
+     --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
+#$CMD | tee 1node.64.32.32.48.dwf
+
diff --git a/systems/Aurora/benchmarks/bench2.pbs b/systems/Aurora/benchmarks/bench2.pbs
new file mode 100644
index 00000000..ea469cda
--- /dev/null
+++ b/systems/Aurora/benchmarks/bench2.pbs
@@ -0,0 +1,55 @@
+#!/bin/bash
+
+#PBS -q workq
+#PBS -l select=2
+#PBS -l walltime=00:20:00
+#PBS -A LatticeQCD_aesp_CNDA
+
+#export OMP_PROC_BIND=spread
+#unset OMP_PLACES
+
+cd $PBS_O_WORKDIR
+
+source ../sourceme.sh
+module load pti-gpu
+
+#cat $PBS_NODEFILE
+
+export OMP_NUM_THREADS=4
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
+#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
+
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +# 12 ppn, 2 nodes, 24 ranks +# +CMD="mpiexec -np 24 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_comms_host_device --mpi 2.2.2.3 --grid 24.32.32.24 \ + --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32" +$CMD | tee 2node.comms + + +CMD="mpiexec -np 24 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.32.64.48 \ + --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 2node.32.32.64.48.dwf + + +CMD="mpiexec -np 24 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 64.64.64.96 \ + --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 2node.64.64.64.96.dwf +
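
Usage sketch (everything here is an assumption beyond the patch itself: a Grid build configured for Aurora/Sunspot, with the Benchmark_* binaries and the ALCF gpu_tile_compact.sh rank-to-GPU-tile binding wrapper available next to these scripts, and PBS Pro as the scheduler, as the #PBS directives imply):

    cd systems/Aurora/benchmarks
    qsub bench1.pbs     # 1-node job on the "debug" queue
    qsub bench2.pbs     # 2-node job on the "workq" queue
    qstat -u $USER      # poll until the jobs run; tee leaves results in the *.comms / *.dwf files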
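
Geometry check (a sketch using only numbers already in the scripts): Grid splits the global --grid lattice dimension-by-dimension across the --mpi rank layout, so the per-rank local volume is the element-wise quotient of the two:

    # bench1: --mpi 2.2.1.3 (12 ranks), --grid 32.32.32.48 -> local volume 16.16.32.16
    # bench2: --mpi 2.2.2.3 (24 ranks), --grid 32.32.64.48 -> local volume 16.16.32.16
    # bench1: --mpi 2.2.1.3 (12 ranks), --grid 64.64.32.96 -> local volume 32.32.32.32
    # bench2: --mpi 2.2.2.3 (24 ranks), --grid 64.64.64.96 -> local volume 32.32.32.32
    # i.e. the 2-node grids double the third dimension of the matching 1-node grids,
    # holding the local volume per rank fixed: a weak-scaling comparison.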