From 5bfa88be85b454e09ed82a8361f4cdf6846e7823 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 6 Feb 2024 16:28:40 +0000 Subject: [PATCH 1/8] Aurora MPI standalone benchmake and options that work well --- MPI_benchmark/bench2.pbs | 22 ++ MPI_benchmark/compile-command | 1 + MPI_benchmark/gpu_tile_compact.sh | 30 ++ MPI_benchmark/halo_mpi.cc | 333 ++++++++++++++++++ systems/Aurora/benchmarks/bench.pbs | 7 +- systems/Aurora/benchmarks/bench2.pbs | 44 +-- systems/Aurora/benchmarks/gpu_tile_compact.sh | 62 +--- .../Aurora/benchmarks/gpu_tile_compact4.sh | 43 +-- systems/Aurora/sourceme.sh | 2 +- 9 files changed, 426 insertions(+), 118 deletions(-) create mode 100644 MPI_benchmark/bench2.pbs create mode 100644 MPI_benchmark/compile-command create mode 100755 MPI_benchmark/gpu_tile_compact.sh create mode 100644 MPI_benchmark/halo_mpi.cc diff --git a/MPI_benchmark/bench2.pbs b/MPI_benchmark/bench2.pbs new file mode 100644 index 00000000..2c069a20 --- /dev/null +++ b/MPI_benchmark/bench2.pbs @@ -0,0 +1,22 @@ +#!/bin/bash +#PBS -q EarlyAppAccess +#PBS -l select=2 +#PBS -l walltime=01:00:00 +#PBS -A LatticeQCD_aesp_CNDA + +export TZ='/usr/share/zoneinfo/US/Central' +export OMP_PROC_BIND=spread +export OMP_NUM_THREADS=3 +unset OMP_PLACES + +cd $PBS_O_WORKDIR + +NNODES=`wc -l < $PBS_NODEFILE` +NRANKS=12 # Number of MPI ranks per node +NDEPTH=4 # Number of hardware threads per rank, spacing between MPI ranks on a node +NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS + +NTOTRANKS=$(( NNODES * NRANKS )) + +CMD="mpiexec -np 2 -ppn 1 -envall ./gpu_tile_compact.sh ./halo_mpi --mpi 2.1.1.1" +$CMD diff --git a/MPI_benchmark/compile-command b/MPI_benchmark/compile-command new file mode 100644 index 00000000..20f26a3c --- /dev/null +++ b/MPI_benchmark/compile-command @@ -0,0 +1 @@ +mpicxx -fsycl halo_mpi.cc -o halo_mpi \ No newline at end of file diff --git a/MPI_benchmark/gpu_tile_compact.sh b/MPI_benchmark/gpu_tile_compact.sh new file mode 100755 index 00000000..28fdb341 --- /dev/null +++ b/MPI_benchmark/gpu_tile_compact.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +export NUMA_PMAP=(2 2 2 3 3 3 2 2 2 3 3 3 ) +export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 ) +export GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 ) +export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 ) + +export PNUMA=${NUMA_PMAP[$PALS_LOCAL_RANKID]} +export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]} +export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]} +export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]} + + +export ZE_AFFINITY_MASK=$gpu_id.$tile_id +export ONEAPI_DEVICE_FILTER=gpu,level_zero + +#unset EnableWalkerPartition +#export EnableImplicitScaling=0 +#export GRID_MPICH_NIC_BIND=$NIC +#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id +#export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1 +#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0 +#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2 +#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 +#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1 + +echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA " + +numactl -m $PNUMA -N $NUMA "$@" diff --git a/MPI_benchmark/halo_mpi.cc b/MPI_benchmark/halo_mpi.cc new file mode 100644 index 00000000..9e11c473 --- /dev/null +++ b/MPI_benchmark/halo_mpi.cc @@ -0,0 +1,333 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/************************************************************** + * GPU 
- GPU memory cartesian halo exchange benchmark + * Config: what is the target + ************************************************************** + */ +#undef ACC_CUDA +#undef ACC_HIP +#define ACC_SYCL +#undef ACC_NONE + +/************************************************************** + * Some MPI globals + ************************************************************** + */ +MPI_Comm WorldComm; +MPI_Comm WorldShmComm; + +int WorldSize; +int WorldRank; + +int WorldShmSize; +int WorldShmRank; + +/************************************************************** + * Allocate buffers on the GPU, SYCL needs an init call and context + ************************************************************** + */ +#ifdef ACC_CUDA +#include +void acceleratorInit(void){} +void *acceleratorAllocDevice(size_t bytes) +{ + void *ptr=NULL; + auto err = cudaMalloc((void **)&ptr,bytes); + assert(err==cudaSuccess); + return ptr; +} +void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);} +#endif +#ifdef ACC_HIP +#include +void acceleratorInit(void){} +inline void *acceleratorAllocDevice(size_t bytes) +{ + void *ptr=NULL; + auto err = hipMalloc((void **)&ptr,bytes); + if( err != hipSuccess ) { + ptr = (void *) NULL; + printf(" hipMalloc failed for %ld %s \n",bytes,hipGetErrorString(err)); + } + return ptr; +}; +inline void acceleratorFreeDevice(void *ptr){ auto r=hipFree(ptr);}; +#endif +#ifdef ACC_SYCL +#include +#include +cl::sycl::queue *theAccelerator; +void acceleratorInit(void) +{ + int nDevices = 1; +#if 1 + cl::sycl::gpu_selector selector; + cl::sycl::device selectedDevice { selector }; + theAccelerator = new sycl::queue (selectedDevice); +#else + cl::sycl::device selectedDevice {cl::sycl::gpu_selector_v }; + theAccelerator = new sycl::queue (selectedDevice); +#endif + auto name = theAccelerator->get_device().get_info(); + printf("AcceleratorSyclInit: Selected device is %s\n",name.c_str()); fflush(stdout); +} +inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theAccelerator);}; +inline void acceleratorFreeDevice(void *ptr){free(ptr,*theAccelerator);}; +#endif +#ifdef ACC_NONE +void acceleratorInit(void){} +inline void *acceleratorAllocDevice(size_t bytes){ return malloc(bytes);}; +inline void acceleratorFreeDevice(void *ptr){free(ptr);}; +#endif + + +/************************************************************** + * Microsecond timer + ************************************************************** + */ +inline double usecond(void) { + struct timeval tv; + gettimeofday(&tv,NULL); + return 1.0e6*tv.tv_sec + 1.0*tv.tv_usec; +} +/************************************************************** + * Main benchmark routine + ************************************************************** + */ +void Benchmark(int64_t L,std::vector cart_geom,bool use_device,int ncall) +{ + int64_t words = 3*4*2; + int64_t face,vol; + int Nd=cart_geom.size(); + + /************************************************************** + * L^Nd volume, L^(Nd-1) faces, 12 complex per site + * Allocate memory for these + ************************************************************** + */ + face=1; for( int d=0;d send_bufs; + std::vector recv_bufs; + size_t vw = face*words; + size_t bytes = face*words*sizeof(double); + + if ( use_device ) { + for(int d=0;d<2*Nd;d++){ + send_bufs.push_back(acceleratorAllocDevice(bytes)); + recv_bufs.push_back(acceleratorAllocDevice(bytes)); + } + } else { + for(int d=0;d<2*Nd;d++){ + send_bufs.push_back(malloc(bytes)); + recv_bufs.push_back(malloc(bytes)); + } + } + 
/********************************************************* + * Build cartesian communicator + ********************************************************* + */ + int ierr; + int rank; + std::vector coor(Nd); + MPI_Comm communicator; + std::vector periodic(Nd,1); + MPI_Cart_create(WorldComm,Nd,&cart_geom[0],&periodic[0],0,&communicator); + MPI_Comm_rank(communicator,&rank); + MPI_Cart_coords(communicator,rank,Nd,&coor[0]); + + static int reported; + if ( ! reported ) { + printf("World Rank %d Shm Rank %d CartCoor %d %d %d %d\n",WorldRank,WorldShmRank, + coor[0],coor[1],coor[2],coor[3]); fflush(stdout); + reported =1 ; + } + /********************************************************* + * Perform halo exchanges + ********************************************************* + */ + for(int d=0;d1 ) { + double t0=usecond(); + + int from,to; + + MPI_Barrier(communicator); + for(int n=0;n & vec) +{ + vec.resize(0); + std::stringstream ss(str); + int i; + while (ss >> i){ + vec.push_back(i); + if(std::ispunct(ss.peek())) + ss.ignore(); + } + return; +} +/************************************** + * Command line junk + **************************************/ +int main(int argc, char **argv) +{ + std::string arg; + + acceleratorInit(); + + MPI_Init(&argc,&argv); + + WorldComm = MPI_COMM_WORLD; + + MPI_Comm_split_type(WorldComm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm); + + MPI_Comm_rank(WorldComm ,&WorldRank); + MPI_Comm_size(WorldComm ,&WorldSize); + + MPI_Comm_rank(WorldShmComm ,&WorldShmRank); + MPI_Comm_size(WorldShmComm ,&WorldShmSize); + + if ( WorldSize/WorldShmSize > 2) { + printf("This benchmark is meant to run on at most two nodes only\n"); + } + + auto mpi =std::vector({1,1,1,1}); + + if( CmdOptionExists(argv,argv+argc,"--mpi") ){ + arg = CmdOptionPayload(argv,argv+argc,"--mpi"); + CmdOptionIntVector(arg,mpi); + } else { + printf("Must specify --mpi command line argument\n"); + exit(0); + } + + if( !WorldRank ) { + printf("***********************************\n"); + printf("%d ranks\n",WorldSize); + printf("%d ranks-per-node\n",WorldShmSize); + printf("%d nodes\n",WorldSize/WorldShmSize);fflush(stdout); + printf("Cartesian layout: "); + for(int d=0;d Date: Tue, 6 Feb 2024 23:45:10 +0000 Subject: [PATCH 2/8] Updated bench script --- systems/Aurora/benchmarks/bench12.pbs | 45 +++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 systems/Aurora/benchmarks/bench12.pbs diff --git a/systems/Aurora/benchmarks/bench12.pbs b/systems/Aurora/benchmarks/bench12.pbs new file mode 100644 index 00000000..96f6143f --- /dev/null +++ b/systems/Aurora/benchmarks/bench12.pbs @@ -0,0 +1,45 @@ +#!/bin/bash + +## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 + +#PBS -q EarlyAppAccess +#PBS -l select=2 +#PBS -l walltime=01:00:00 +#PBS -A LatticeQCD_aesp_CNDA + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +cd $PBS_O_WORKDIR + +source ../sourceme.sh + +export OMP_NUM_THREADS=3 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 + +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE +#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST + +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export 
MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +CMD="mpiexec -np 24 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_comms_host_device --mpi 2.3.2.2 --grid 32.24.32.192 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +$CMD + +CMD="mpiexec -np 24 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 2.3.2.2 --grid 64.96.64.64 --comms-overlap \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +$CMD From 701991629430c341e623b4d1a174067c1766a201 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 7 Feb 2024 00:56:39 +0000 Subject: [PATCH 3/8] RNG seed change safer for large volumes; this is a long term solution --- Grid/lattice/Lattice_rng.h | 10 ++- Grid/sitmo_rng/sitmo_prng_engine.hpp | 6 +- systems/Aurora/benchmarks/bench.pbs | 51 --------------- systems/Aurora/benchmarks/bench2.pbs | 95 ---------------------------- 4 files changed, 12 insertions(+), 150 deletions(-) delete mode 100644 systems/Aurora/benchmarks/bench.pbs delete mode 100644 systems/Aurora/benchmarks/bench2.pbs diff --git a/Grid/lattice/Lattice_rng.h b/Grid/lattice/Lattice_rng.h index b7ef0e82..2212abbe 100644 --- a/Grid/lattice/Lattice_rng.h +++ b/Grid/lattice/Lattice_rng.h @@ -152,6 +152,7 @@ public: #ifdef RNG_FAST_DISCARD static void Skip(RngEngine &eng,uint64_t site) { +#if 0 ///////////////////////////////////////////////////////////////////////////////////// // Skip by 2^40 elements between successive lattice sites // This goes by 10^12. @@ -162,9 +163,9 @@ public: // tens of seconds per trajectory so this is clean in all reasonable cases, // and margin of safety is orders of magnitude. // We could hack Sitmo to skip in the higher order words of state if necessary - // - // Replace with 2^30 ; avoid problem on large volumes - // + // + // Replace with 2^30 ; avoid problem on large volumes + // ///////////////////////////////////////////////////////////////////////////////////// // uint64_t skip = site+1; // Old init Skipped then drew. Checked compat with faster init const int shift = 30; @@ -179,6 +180,9 @@ public: assert((skip >> shift)==site); // check for overflow eng.discard(skip); +#else + eng.discardhi(site); +#endif // std::cout << " Engine " < Date: Tue, 13 Feb 2024 18:18:27 +0100 Subject: [PATCH 4/8] missing semicolon generates error with some compilers --- Grid/qcd/smearing/StoutSmearing.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/qcd/smearing/StoutSmearing.h b/Grid/qcd/smearing/StoutSmearing.h index 641331dc..787ef104 100644 --- a/Grid/qcd/smearing/StoutSmearing.h +++ b/Grid/qcd/smearing/StoutSmearing.h @@ -69,7 +69,7 @@ public: /*! 
Construct stout smearing object from explicitly specified rho matrix */ Smear_Stout(const std::vector& rho_) : OwnedBase{new Smear_APE(rho_)}, SmearBase{OwnedBase.get()} { - std::cout << GridLogDebug << "Stout smearing constructor : Smear_Stout(const std::vector& " << rho_ << " )" << std::endl + std::cout << GridLogDebug << "Stout smearing constructor : Smear_Stout(const std::vector& " << rho_ << " )" << std::endl; assert(Nc == 3 && "Stout smearing currently implemented only for Nc==3"); } From 585efc6f3fce63f9766b2b66d3ae279ff0944a56 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 13 Feb 2024 19:40:49 +0000 Subject: [PATCH 5/8] More benchmark scripts --- systems/Aurora/benchmarks/bench256.pbs | 48 ++++++++++++++++++++++++++ systems/Aurora/benchmarks/bench512.pbs | 48 ++++++++++++++++++++++++++ 2 files changed, 96 insertions(+) create mode 100644 systems/Aurora/benchmarks/bench256.pbs create mode 100644 systems/Aurora/benchmarks/bench512.pbs diff --git a/systems/Aurora/benchmarks/bench256.pbs b/systems/Aurora/benchmarks/bench256.pbs new file mode 100644 index 00000000..405d9ed4 --- /dev/null +++ b/systems/Aurora/benchmarks/bench256.pbs @@ -0,0 +1,48 @@ +#!/bin/bash + +## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 + +#PBS -q EarlyAppAccess +#PBS -l select=256 +#PBS -l walltime=01:00:00 +#PBS -A LatticeQCD_aesp_CNDA + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +cd $PBS_O_WORKDIR + +source ../sourceme.sh + +cat $PBS_NODEFILE + +export OMP_NUM_THREADS=3 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 + +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE +#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST + +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +# 12 ppn, 32 nodes, 384 ranks +# +CMD="mpiexec -np 3072 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_comms_host_device --mpi 8.6.8.8 --grid 32.24.32.192 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +$CMD + +CMD="mpiexec -np 3072 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 8.8.4.12 --grid 128.128.128.768 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 256node.dwf.large diff --git a/systems/Aurora/benchmarks/bench512.pbs b/systems/Aurora/benchmarks/bench512.pbs new file mode 100644 index 00000000..0d8708d3 --- /dev/null +++ b/systems/Aurora/benchmarks/bench512.pbs @@ -0,0 +1,48 @@ +#!/bin/bash + +## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 + +#PBS -q EarlyAppAccess +#PBS -l select=512 +#PBS -l walltime=01:00:00 +#PBS -A LatticeQCD_aesp_CNDA + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +cd $PBS_O_WORKDIR + +source ../sourceme.sh + +cat $PBS_NODEFILE + +export OMP_NUM_THREADS=3 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 + +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE +#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST + +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 
+export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +# 12 ppn, 32 nodes, 384 ranks +# +CMD="mpiexec -np 6144 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_comms_host_device --mpi 8.6.8.16 --grid 32.24.32.192 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +$CMD + +CMD="mpiexec -np 6144 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 8.8.8.12 --grid 256.128.128.768 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 512node.dwf.large From 1502860004f953d02e2cf8b6d892e1109d940e04 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 13 Feb 2024 19:47:02 +0000 Subject: [PATCH 6/8] Benchmark scripts --- systems/Aurora/benchmarks/bench1024.pbs | 56 +++++++++++++++++++++++++ systems/Aurora/benchmarks/bench2048.pbs | 56 +++++++++++++++++++++++++ 2 files changed, 112 insertions(+) create mode 100644 systems/Aurora/benchmarks/bench1024.pbs create mode 100644 systems/Aurora/benchmarks/bench2048.pbs diff --git a/systems/Aurora/benchmarks/bench1024.pbs b/systems/Aurora/benchmarks/bench1024.pbs new file mode 100644 index 00000000..88f0100a --- /dev/null +++ b/systems/Aurora/benchmarks/bench1024.pbs @@ -0,0 +1,56 @@ +#!/bin/bash + +## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 + +#PBS -q EarlyAppAccess +#PBS -l select=1024 +#PBS -l walltime=01:00:00 +#PBS -A LatticeQCD_aesp_CNDA + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +cd $PBS_O_WORKDIR + +source ../sourceme.sh + +cat $PBS_NODEFILE + +export OMP_NUM_THREADS=3 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 + +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE +#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST + +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +# 12 ppn, 32 nodes, 384 ranks +# +CMD="mpiexec -np 12288 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_comms_host_device --mpi 8.6.16.16 --grid 64.48.64.284 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +$CMD + +CMD="mpiexec -np 12288 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 8.8.8.24 --grid 128.128.128.384 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 1024node.dwf.small + +CMD="mpiexec -np 12288 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 16.8.8.12 --grid 256.256.256.384 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 1024node.dwf + + diff --git a/systems/Aurora/benchmarks/bench2048.pbs b/systems/Aurora/benchmarks/bench2048.pbs new file mode 100644 index 00000000..b79081a2 --- /dev/null +++ b/systems/Aurora/benchmarks/bench2048.pbs @@ -0,0 +1,56 @@ +#!/bin/bash + +## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 
+ +#PBS -q EarlyAppAccess +#PBS -l select=2048 +#PBS -l walltime=01:00:00 +#PBS -A LatticeQCD_aesp_CNDA + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +cd $PBS_O_WORKDIR + +source ../sourceme.sh + +cat $PBS_NODEFILE + +export OMP_NUM_THREADS=3 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 + +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE +#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST + +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +# 12 ppn, 32 nodes, 384 ranks +# +CMD="mpiexec -np 24576 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_comms_host_device --mpi 8.12.16.16 --grid 64.48.64.284 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +$CMD + +CMD="mpiexec -np 24576 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 16.8.8.24 --grid 128.128.128.384 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 2048node.dwf.small + +CMD="mpiexec -np 24576 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 16.8.8.24 --grid 256.256.256.768 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 2048node.dwf + + From 5ef4da3f29f95b843bf97bce603ba43f1c54029d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 13 Feb 2024 19:47:36 +0000 Subject: [PATCH 7/8] Silence verbose --- systems/Aurora/benchmarks/gpu_tile_compact.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/systems/Aurora/benchmarks/gpu_tile_compact.sh b/systems/Aurora/benchmarks/gpu_tile_compact.sh index 69ba5107..5cab1ee3 100755 --- a/systems/Aurora/benchmarks/gpu_tile_compact.sh +++ b/systems/Aurora/benchmarks/gpu_tile_compact.sh @@ -28,6 +28,6 @@ export ONEAPI_DEVICE_FILTER=gpu,level_zero #export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1 -echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA " +#echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA " numactl -m $NUMA -N $NUMAP "$@" From 303b83cdb80ad4e440785854976b34b8d2381d8e Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 13 Feb 2024 19:48:03 +0000 Subject: [PATCH 8/8] Scaling benchmarks, verbosity and MPICH aware in acceleratorInit() For some reason Dirichlet benchmark fails on several nodes; need to debug this. 
--- Grid/threads/Accelerator.cc | 19 ++++- benchmarks/Benchmark_dwf_fp32.cc | 20 +++--- systems/Aurora/benchmarks/bench_scaling.pbs | 80 +++++++++++++++++++++ 3 files changed, 106 insertions(+), 13 deletions(-) create mode 100644 systems/Aurora/benchmarks/bench_scaling.pbs diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc index 3769b2aa..19411b62 100644 --- a/Grid/threads/Accelerator.cc +++ b/Grid/threads/Accelerator.cc @@ -7,6 +7,8 @@ uint32_t accelerator_threads=2; uint32_t acceleratorThreads(void) {return accelerator_threads;}; void acceleratorThreads(uint32_t t) {accelerator_threads = t;}; +#define ENV_LOCAL_RANK_PALS "PALS_LOCAL_RANKID" +#define ENV_RANK_PALS "PALS_RANKID" #define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK" #define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK" #define ENV_LOCAL_RANK_SLURM "SLURM_LOCALID" @@ -228,8 +230,17 @@ void acceleratorInit(void) { rank = atoi(localRankStr); } + if ((localRankStr = getenv(ENV_LOCAL_RANK_PALS)) != NULL) + { + rank = atoi(localRankStr); + } if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);} if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);} + if ((localRankStr = getenv(ENV_RANK_PALS )) != NULL) { world_rank = atoi(localRankStr);} + + char hostname[HOST_NAME_MAX+1]; + gethostname(hostname, HOST_NAME_MAX+1); + if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname); auto devices = cl::sycl::device::get_devices(); for(int d = 0;d()); #define GPU_PROP(prop) GPU_PROP_FMT(prop,"%ld"); + if ( world_rank == 0) { - GPU_PROP_STR(vendor); - GPU_PROP_STR(version); + GPU_PROP_STR(vendor); + GPU_PROP_STR(version); // GPU_PROP_STR(device_type); /* GPU_PROP(max_compute_units); @@ -259,7 +271,8 @@ void acceleratorInit(void) GPU_PROP(single_fp_config); */ // GPU_PROP(double_fp_config); - GPU_PROP(global_mem_size); + GPU_PROP(global_mem_size); + } } if ( world_rank == 0 ) { diff --git a/benchmarks/Benchmark_dwf_fp32.cc b/benchmarks/Benchmark_dwf_fp32.cc index 37287595..ce4fcfab 100644 --- a/benchmarks/Benchmark_dwf_fp32.cc +++ b/benchmarks/Benchmark_dwf_fp32.cc @@ -90,11 +90,11 @@ int main (int argc, char ** argv) std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <1 ? 1 : 0; - Dirichlet[0] = 0; - Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0]; - Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1]; - Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2]; - Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3]; + // Dirichlet[0] = 0; + // Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0]; + // Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1]; + // Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2]; + // Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3]; Benchmark(Ls,Dirichlet); @@ -105,11 +105,11 @@ int main (int argc, char ** argv) std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <1 ? 
1 : 0; - Dirichlet[0] = 0; - Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0]; - Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1]; - Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2]; - Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3]; + // Dirichlet[0] = 0; + // Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0]; + // Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1]; + // Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2]; + // Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3]; Benchmark(Ls,Dirichlet); diff --git a/systems/Aurora/benchmarks/bench_scaling.pbs b/systems/Aurora/benchmarks/bench_scaling.pbs new file mode 100644 index 00000000..504fd3e9 --- /dev/null +++ b/systems/Aurora/benchmarks/bench_scaling.pbs @@ -0,0 +1,80 @@ +#!/bin/bash + +## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 + +#PBS -q EarlyAppAccess +#PBS -l select=32 +#PBS -l walltime=01:00:00 +#PBS -A LatticeQCD_aesp_CNDA + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +cd $PBS_O_WORKDIR + +source ../sourceme.sh + +cat $PBS_NODEFILE + +export OMP_NUM_THREADS=3 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 + +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE +#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST + +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +# 12 ppn, 32 nodes, 384 ranks +# +CMD="mpiexec -np 384 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_comms_host_device --mpi 4.6.4.4 --grid 32.24.32.192 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +$CMD + +CMD="mpiexec -np 12 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 1.2.2.3 --grid 16.64.64.96 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 1node.dwf + + +CMD="mpiexec -np 24 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.64.64.96 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 2node.dwf + +CMD="mpiexec -np 48 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 2.2.2.6 --grid 32.64.64.192 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 4node.dwf + +CMD="mpiexec -np 96 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 2.2.4.6 --grid 32.64.128.192 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 8node.dwf + +CMD="mpiexec -np 192 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 2.4.4.6 --grid 32.128.128.192 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 16node.dwf + + +CMD="mpiexec -np 384 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 4.4.4.6 --grid 64.128.128.192 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 32node.dwf
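
Quick start for the standalone halo benchmark added in PATCH 1/8 — a minimal sketch that simply restates MPI_benchmark/compile-command and the mpiexec line in MPI_benchmark/bench2.pbs. It assumes halo_mpi.cc and gpu_tile_compact.sh sit in the working directory, and the --mpi 2.1.1.1 layout is just the two-node, one-rank-per-node geometry used in that script, not a requirement.

# build with the SYCL-enabled MPI compiler wrapper (MPI_benchmark/compile-command)
mpicxx -fsycl halo_mpi.cc -o halo_mpi

# two nodes, one rank per node; gpu_tile_compact.sh pins each rank to a GPU tile
# and NUMA domain before launching the benchmark (MPI_benchmark/bench2.pbs)
mpiexec -np 2 -ppn 1 -envall ./gpu_tile_compact.sh ./halo_mpi --mpi 2.1.1.1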