From 14643c0aab28c0b78f2cff1718bb454ceacd95f6 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 4 Dec 2023 15:45:57 -0500 Subject: [PATCH] SDCC benchmarking scripts for A100 nodes and IceLake nodes (AVX512) --- benchmarks/Benchmark_dwf_fp32.cc | 9 +++++++ systems/SDCC-A100/bench.slurm | 42 ++++++++++++++++++++++++++++++++ systems/SDCC-A100/config-command | 2 +- systems/SDCC-ICE/bench.slurm | 31 +++++++++++++++++++++++ systems/SDCC-ICE/config-command | 11 ++++++--- systems/SDCC-ICE/sourceme.sh | 1 + 6 files changed, 92 insertions(+), 4 deletions(-) create mode 100644 systems/SDCC-A100/bench.slurm create mode 100644 systems/SDCC-ICE/bench.slurm diff --git a/benchmarks/Benchmark_dwf_fp32.cc b/benchmarks/Benchmark_dwf_fp32.cc index ae7cabec..37287595 100644 --- a/benchmarks/Benchmark_dwf_fp32.cc +++ b/benchmarks/Benchmark_dwf_fp32.cc @@ -185,6 +185,7 @@ void Benchmark(int Ls, Coordinate Dirichlet) GaugeField Umu(UGrid); GaugeField UmuCopy(UGrid); SU::HotConfiguration(RNG4,Umu); + // SU::ColdConfiguration(Umu); UmuCopy=Umu; std::cout << GridLogMessage << "Random gauge initialised " << std::endl; @@ -307,6 +308,14 @@ void Benchmark(int Ls, Coordinate Dirichlet) if(( n2e>1.0e-4) ) { std::cout<Barrier(); + std::cout<Barrier(); exit(-1); } assert (n2e< 1.0e-4 ); diff --git a/systems/SDCC-A100/bench.slurm b/systems/SDCC-A100/bench.slurm new file mode 100644 index 00000000..04d1e1e2 --- /dev/null +++ b/systems/SDCC-A100/bench.slurm @@ -0,0 +1,42 @@ +#!/bin/bash +#SBATCH --partition csi +#SBATCH --time=00:10:00 +#SBATCH -A csigeneral +#SBATCH --exclusive +#SBATCH --nodes=1 +#SBATCH --ntasks=4 +#SBATCH --qos csi +#SBATCH --gres=gpu:4 + +source sourceme.sh + +cat << EOF > select_gpu +#!/bin/bash +export GPU_MAP=(0 1 2 3) +export GPU=\${GPU_MAP[\$SLURM_LOCALID]} +export CUDA_VISIBLE_DEVICES=\$GPU +unset ROCR_VISIBLE_DEVICES +echo RANK \$SLURM_LOCALID using GPU \$GPU +exec \$* +EOF +chmod +x ./select_gpu + + +export OMP_NUM_THREADS=4 +export OMPI_MCA_btl=^uct,openib +export UCX_TLS=cuda,gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc +export UCX_RNDV_SCHEME=put_zcopy +export UCX_RNDV_THRESH=16384 +export UCX_IB_GPU_DIRECT_RDMA=no +export UCX_MEMTYPE_CACHE=n + +export OMP_NUM_THREAD=8 +#srun -N1 -n1 nvidia-smi +#srun -N1 -n1 numactl -H > numa.txt +srun -N1 -n1 lstopo A100-topo.pdf + +# 4.35 TF/s +#srun -N1 -n1 ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 --shm 2048 --shm-mpi 0 --accelerator-threads 16 + +srun -N1 -n4 ./select_gpu ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.2.2 --grid 32.32.64.64 --shm 2048 --shm-mpi 0 --accelerator-threads 16 + diff --git a/systems/SDCC-A100/config-command b/systems/SDCC-A100/config-command index cb773e7a..26ad5377 100644 --- a/systems/SDCC-A100/config-command +++ b/systems/SDCC-A100/config-command @@ -5,7 +5,7 @@ --enable-accelerator=cuda \ --enable-gen-simd-width=64 \ --enable-simd=GPU \ ---enable-accelerator-cshift \ +--disable-accelerator-cshift \ --disable-fermion-reps \ --disable-gparity \ CXX=nvcc \ diff --git a/systems/SDCC-ICE/bench.slurm b/systems/SDCC-ICE/bench.slurm new file mode 100644 index 00000000..76beb828 --- /dev/null +++ b/systems/SDCC-ICE/bench.slurm @@ -0,0 +1,31 @@ +#!/bin/bash +#SBATCH --partition lqcd +#SBATCH --time=00:20:00 +#SBATCH -A lqcdtest +#SBATCH --exclusive +#SBATCH --nodes=1 +#SBATCH --ntasks=2 +#SBATCH --qos lqcd + +source sourceme.sh + +export OMP_NUM_THREAD=24 +#srun -N1 -n1 numactl -H > numa.txt +#srun -N1 -n1 lstopo ice-topo.pdf + +cat << EOF > select_socket +#!/bin/bash +export NUM_MAP=(0 1) +export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]} +exec \$* +EOF +chmod +x ./select_socket + +#for vol in 8.8.8.16 8.8.8.32 8.8.8.64 +#for vol in 8.8.16.16 8.8.16.32 8.8.16.64 +for vol in 8.16.16.16 8.16.16.32 8.16.16.64 16.16.16.32 16.16.16.64 24.24.24.64 32.32.32.32 +do +srun --cpu-bind=ldoms -N1 -n2 ./select_socket ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid $vol --dslash-asm > $vol.2socket.out +srun --cpu-bind=ldoms -N1 -n1 ./select_socket ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid $vol --dslash-asm > $vol.1socket.out +done + diff --git a/systems/SDCC-ICE/config-command b/systems/SDCC-ICE/config-command index 28e560e3..bc28c96d 100644 --- a/systems/SDCC-ICE/config-command +++ b/systems/SDCC-ICE/config-command @@ -1,13 +1,18 @@ ../../configure \ ---enable-comms=mpi \ +--enable-debug \ +--enable-comms=mpi-auto \ --enable-unified=yes \ --enable-shm=shmopen \ +--enable-shm-fast-path=shmopen \ --enable-accelerator=none \ ---enable-simd=AVX2 \ +--enable-simd=AVX512 \ --disable-accelerator-cshift \ --disable-fermion-reps \ --disable-gparity \ -CXX=mpicxx \ +CXX=clang++ \ +MPICXX=mpicxx \ +LDFLAGS=-L/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/hwloc-2.9.1-hgkscnt5pferhtde4ahctlupb6qf3vtl/lib/ \ +LIBS=-lhwloc \ CXXFLAGS="-std=c++17" diff --git a/systems/SDCC-ICE/sourceme.sh b/systems/SDCC-ICE/sourceme.sh index a620dea5..6263063c 100644 --- a/systems/SDCC-ICE/sourceme.sh +++ b/systems/SDCC-ICE/sourceme.sh @@ -1 +1,2 @@ +export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-12.0.1-agey6vtuw3e375rewhhobvkznjh5ltz4/lib/:$LD_LIBRARY_PATH module load openmpi