1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-05 19:55:56 +01:00
Grid/systems/Aurora/benchmarks/bench_scaling.pbs
Peter Boyle 303b83cdb8 Scaling benchmarks, verbosity and MPICH aware in acceleratorInit()
For some reason Dirichlet benchmark fails on several nodes; need to
debug this.
2024-02-13 19:48:03 +00:00

81 lines
2.6 KiB
Bash
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
#PBS -q EarlyAppAccess
#PBS -l select=32
#PBS -l walltime=01:00:00
#PBS -A LatticeQCD_aesp_CNDA
#export OMP_PROC_BIND=spread
#unset OMP_PLACES
cd $PBS_O_WORKDIR
source ../sourceme.sh
cat $PBS_NODEFILE
export OMP_NUM_THREADS=3
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPICH_OFI_NIC_POLICY=GPU
# 12 ppn, 32 nodes, 384 ranks
#
CMD="mpiexec -np 384 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_comms_host_device --mpi 4.6.4.4 --grid 32.24.32.192 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
$CMD
CMD="mpiexec -np 12 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 1.2.2.3 --grid 16.64.64.96 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 1node.dwf
CMD="mpiexec -np 24 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.64.64.96 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 2node.dwf
CMD="mpiexec -np 48 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 2.2.2.6 --grid 32.64.64.192 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 4node.dwf
CMD="mpiexec -np 96 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 2.2.4.6 --grid 32.64.128.192 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 8node.dwf
CMD="mpiexec -np 192 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 2.4.4.6 --grid 32.128.128.192 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 16node.dwf
CMD="mpiexec -np 384 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 4.4.4.6 --grid 64.128.128.192 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 32node.dwf