From 303b83cdb80ad4e440785854976b34b8d2381d8e Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Tue, 13 Feb 2024 19:48:03 +0000
Subject: [PATCH] Scaling benchmarks, verbosity, and MPICH awareness in
 acceleratorInit()

For some reason the Dirichlet benchmark fails on several nodes; need to
debug this.
---
 Grid/threads/Accelerator.cc                 | 19 ++++-
 benchmarks/Benchmark_dwf_fp32.cc            | 20 +++---
 systems/Aurora/benchmarks/bench_scaling.pbs | 80 +++++++++++++++++++++
 3 files changed, 106 insertions(+), 13 deletions(-)
 create mode 100644 systems/Aurora/benchmarks/bench_scaling.pbs

diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc
index 3769b2aa..19411b62 100644
--- a/Grid/threads/Accelerator.cc
+++ b/Grid/threads/Accelerator.cc
@@ -7,6 +7,8 @@ uint32_t accelerator_threads=2;
 uint32_t acceleratorThreads(void)     {return accelerator_threads;};
 void     acceleratorThreads(uint32_t t) {accelerator_threads = t;};
 
+#define ENV_LOCAL_RANK_PALS    "PALS_LOCAL_RANKID"
+#define ENV_RANK_PALS          "PALS_RANKID"
 #define ENV_LOCAL_RANK_OMPI    "OMPI_COMM_WORLD_LOCAL_RANK"
 #define ENV_RANK_OMPI          "OMPI_COMM_WORLD_RANK"
 #define ENV_LOCAL_RANK_SLURM   "SLURM_LOCALID"
@@ -228,8 +230,17 @@ void acceleratorInit(void)
   {
     rank = atoi(localRankStr);
   }
+  if ((localRankStr = getenv(ENV_LOCAL_RANK_PALS)) != NULL)
+  {
+    rank = atoi(localRankStr);
+  }
   if ((localRankStr = getenv(ENV_RANK_OMPI   )) != NULL) { world_rank = atoi(localRankStr);}
   if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
+  if ((localRankStr = getenv(ENV_RANK_PALS   )) != NULL) { world_rank = atoi(localRankStr);}
+
+  char hostname[HOST_NAME_MAX+1];
+  gethostname(hostname, HOST_NAME_MAX+1);
+  if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname);
 
   auto devices = cl::sycl::device::get_devices();
   for(int d = 0;d<devices.size();d++){
@@ -241,9 +252,10 @@ void acceleratorInit(void)
     printf("AcceleratorSyclInit:   " #prop ": " FMT" \n",devices[d].get_info<cl::sycl::info::device::prop>());
 
 #define GPU_PROP(prop) GPU_PROP_FMT(prop,"%ld");
+    if ( world_rank == 0) {
 
-    GPU_PROP_STR(vendor);
-    GPU_PROP_STR(version);
+      GPU_PROP_STR(vendor);
+      GPU_PROP_STR(version);
 //    GPU_PROP_STR(device_type);
 /*
     GPU_PROP(max_compute_units);
@@ -259,7 +271,8 @@ void acceleratorInit(void)
     GPU_PROP(single_fp_config);
 */
     //    GPU_PROP(double_fp_config);
-    GPU_PROP(global_mem_size);
+      GPU_PROP(global_mem_size);
+    }
 
   }
   if ( world_rank == 0 ) {
diff --git a/benchmarks/Benchmark_dwf_fp32.cc b/benchmarks/Benchmark_dwf_fp32.cc
index 37287595..ce4fcfab 100644
--- a/benchmarks/Benchmark_dwf_fp32.cc
+++ b/benchmarks/Benchmark_dwf_fp32.cc
@@ -90,11 +90,11 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
 
   for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]/shm[d] >1 ? 1 : 0;
-  Dirichlet[0] = 0;
-  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
-  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
-  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
-  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
+  // Dirichlet[0] = 0;
+  // Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
+  // Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
+  // Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
+  // Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
 
   Benchmark(Ls,Dirichlet);
 
@@ -105,11 +105,11 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
 
   for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
-  Dirichlet[0] = 0;
-  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
-  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
-  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
-  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
+  // Dirichlet[0] = 0;
+  // Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
+  // Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
+  // Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
+  // Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
 
   Benchmark(Ls,Dirichlet);
 
diff --git a/systems/Aurora/benchmarks/bench_scaling.pbs b/systems/Aurora/benchmarks/bench_scaling.pbs
new file mode 100644
index 00000000..504fd3e9
--- /dev/null
+++ b/systems/Aurora/benchmarks/bench_scaling.pbs
@@ -0,0 +1,80 @@
+#!/bin/bash
+
+## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
+
+#PBS -q EarlyAppAccess
+#PBS -l select=32
+#PBS -l walltime=01:00:00
+#PBS -A LatticeQCD_aesp_CNDA
+
+#export OMP_PROC_BIND=spread
+#unset OMP_PLACES
+
+cd $PBS_O_WORKDIR
+
+source ../sourceme.sh
+
+cat $PBS_NODEFILE
+
+export OMP_NUM_THREADS=3
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
+#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
+
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+export MPICH_OFI_NIC_POLICY=GPU
+
+# 12 ppn, 32 nodes, 384 ranks
+#
+CMD="mpiexec -np 384 -ppn 12 -envall \
+     ./gpu_tile_compact.sh \
+     ./Benchmark_comms_host_device --mpi 4.6.4.4 --grid 32.24.32.192 \
+     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+
+$CMD
+
+CMD="mpiexec -np 12 -ppn 12 -envall \
+     ./gpu_tile_compact.sh \
+     ./Benchmark_dwf_fp32 --mpi 1.2.2.3 --grid 16.64.64.96 \
+     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
+$CMD | tee 1node.dwf
+
+
+CMD="mpiexec -np 24 -ppn 12 -envall \
+     ./gpu_tile_compact.sh \
+     ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.64.64.96 \
+     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
+$CMD | tee 2node.dwf
+
+CMD="mpiexec -np 48 -ppn 12 -envall \
+     ./gpu_tile_compact.sh \
+     ./Benchmark_dwf_fp32 --mpi 2.2.2.6 --grid 32.64.64.192 \
+     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
+$CMD | tee 4node.dwf
+
+CMD="mpiexec -np 96 -ppn 12 -envall \
+     ./gpu_tile_compact.sh \
+     ./Benchmark_dwf_fp32 --mpi 2.2.4.6 --grid 32.64.128.192 \
+     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
+$CMD | tee 8node.dwf
+
+CMD="mpiexec -np 192 -ppn 12 -envall \
+     ./gpu_tile_compact.sh \
+     ./Benchmark_dwf_fp32 --mpi 2.4.4.6 --grid 32.128.128.192 \
+     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 16node.dwf
+
+
+CMD="mpiexec -np 384 -ppn 12 -envall \
+     ./gpu_tile_compact.sh \
+     ./Benchmark_dwf_fp32 --mpi 4.4.4.6 --grid 64.128.128.192 \
+     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
+$CMD | tee 32node.dwf
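
Note for reviewers: the launcher detection above can be exercised outside of
Grid with the minimal standalone sketch below. It shows the environment-variable
probing that acceleratorInit() now performs, including the PALS variables this
patch adds for Aurora's MPICH/PALS launcher. The env_int() helper and the
probe_ranks.cc file name are illustrative only, not Grid code, and the probing
is simplified relative to the chain of if-statements in Accelerator.cc.

// probe_ranks.cc: standalone sketch of environment-variable rank detection.
// Build: g++ probe_ranks.cc -o probe_ranks   (POSIX, for gethostname()).
#include <cstdio>
#include <cstdlib>
#include <climits>
#include <unistd.h>

// Hypothetical helper (not Grid code): parse an env var, or keep a fallback.
static int env_int(const char *name, int fallback)
{
  const char *s = getenv(name);
  return s ? atoi(s) : fallback;
}

int main(void)
{
  int rank = 0, world_rank = 0;

  // Probe launchers in turn; a later hit overrides an earlier one,
  // mirroring the sequence of if-statements in acceleratorInit().
  rank = env_int("OMPI_COMM_WORLD_LOCAL_RANK", rank);  // OpenMPI
  rank = env_int("MV2_COMM_WORLD_LOCAL_RANK",  rank);  // MVAPICH
  rank = env_int("PALS_LOCAL_RANKID",          rank);  // PALS (this patch)

  world_rank = env_int("OMPI_COMM_WORLD_RANK", world_rank);
  world_rank = env_int("MV2_COMM_WORLD_RANK",  world_rank);
  world_rank = env_int("PALS_RANKID",          world_rank); // PALS (this patch)

  // Same verbosity the patch adds: local rank 0 reports its host, so
  // per-node failures (like the Dirichlet one noted above) can be tied
  // to specific machines.
  char hostname[HOST_NAME_MAX+1];
  gethostname(hostname, sizeof(hostname));
  if ( rank==0 ) printf("world_rank %d is host %s\n", world_rank, hostname);
  return 0;
}

Run under the same launcher as the benchmarks, e.g. "mpiexec -np 24 -ppn 12
./probe_ranks" on two nodes, exactly one line should print per node; that is
the per-host breadcrumb the new printf in acceleratorInit() provides.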