diff --git a/Grid/Benchmark_Grid.cpp b/Grid/Benchmark_Grid.cpp
index 3878c1a..a4db975 100644
--- a/Grid/Benchmark_Grid.cpp
+++ b/Grid/Benchmark_Grid.cpp
@@ -1,6 +1,7 @@
 /*
 Copyright © 2015 Peter Boyle
 Copyright © 2022 Antonin Portelli
+Copyright © 2022 Simon Buerger
 
 This is a fork of Benchmark_ITT.cpp from Grid
 
@@ -24,13 +25,6 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
 using namespace Grid;
 
-std::vector<int> L_list;
-std::vector<int> Ls_list;
-std::vector<double> mflop_list;
-
-double mflop_ref;
-double mflop_ref_err;
-
 int NN_global;
 
 nlohmann::json json_results;
@@ -58,18 +52,6 @@ struct time_statistics
   }
 };
 
-void comms_header()
-{
-  std::cout << GridLogMessage << " L "
-            << "\t"
-            << " Ls "
-            << "\t"
-            << "bytes\t MB/s uni (err/min/max) \t\t MB/s bidi (err/min/max)" << std::endl;
-};
-
-Gamma::Algebra Gmu[] = {Gamma::Algebra::GammaX, Gamma::Algebra::GammaY,
-                        Gamma::Algebra::GammaZ, Gamma::Algebra::GammaT};
-
 struct controls
 {
   int Opt;
@@ -133,10 +115,9 @@ class Benchmark
     std::vector<double> t_time(Nloop);
     time_statistics timestat;
 
-    grid_big_sep();
-    std::cout << GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "
+    std::cout << GridLogMessage << "Benchmarking threaded STENCIL halo exchange in "
               << nmu << " dimensions" << std::endl;
-    grid_big_sep();
+    grid_small_sep();
     grid_printf("%5s %5s %15s %15s %15s %15s %15s\n", "L", "dir", "payload (B)",
                 "time (usec)", "rate (GB/s)", "std dev", "max");
@@ -368,10 +349,10 @@ class Benchmark
     RealD mass = 0.1;
     RealD M5 = 1.8;
 
-    double mflops;
-    double mflops_best = 0;
-    double mflops_worst = 0;
-    std::vector<double> mflops_all;
+    double gflops;
+    double gflops_best = 0;
+    double gflops_worst = 0;
+    std::vector<double> gflops_all;
 
     ///////////////////////////////////////////////////////
     // Set/Get the layout & grid size
@@ -486,8 +467,6 @@ class Benchmark
     FGrid->Broadcast(0, &ncall, sizeof(ncall));
 
-    // std::cout << GridLogMessage << " Estimate " << ncall << " calls per
-    // second"<<std::endl;
 
-      mflops = flops / timestat.mean;
-      mflops_all.push_back(mflops);
-      if (mflops_best == 0)
-        mflops_best = mflops;
-      if (mflops_worst == 0)
-        mflops_worst = mflops;
-      if (mflops > mflops_best)
-        mflops_best = mflops;
-      if (mflops < mflops_worst)
-        mflops_worst = mflops;
+      gflops = flops / timestat.mean / 1000.;
+      gflops_all.push_back(gflops);
+      if (gflops_best == 0)
+        gflops_best = gflops;
+      if (gflops_worst == 0)
+        gflops_worst = gflops;
+      if (gflops > gflops_best)
+        gflops_best = gflops;
+      if (gflops < gflops_worst)
+        gflops_worst = gflops;
 
       std::cout << GridLogMessage << "Deo FlopsPerSite is " << fps << std::endl;
       std::cout << GridLogMessage << std::fixed << std::setprecision(1)
-                << "Deo mflop/s = " << mflops << " (" << mf_err << ") " << mf_lo
-                << "-" << mf_hi << std::endl;
+                << "Deo Gflop/s = " << gflops << " (" << gf_err << ") " << gf_lo
+                << "-" << gf_hi << std::endl;
       std::cout << GridLogMessage << std::fixed << std::setprecision(1)
-                << "Deo mflop/s per rank " << mflops / NP << std::endl;
+                << "Deo Gflop/s per rank " << gflops / NP << std::endl;
       std::cout << GridLogMessage << std::fixed << std::setprecision(1)
-                << "Deo mflop/s per node " << mflops / NN << std::endl;
+                << "Deo Gflop/s per node " << gflops / NN << std::endl;
     }
 
     grid_small_sep();
     std::cout << GridLogMessage << L << "^4 x " << Ls
-              << " Deo Best mflop/s = " << mflops_best << " ; "
-              << mflops_best / NN << " per node " << std::endl;
+              << " Deo Best Gflop/s = " << gflops_best << " ; "
+              << gflops_best / NN << " per node " << std::endl;
     std::cout << GridLogMessage << L << "^4 x " << Ls
-              << " Deo Worst mflop/s = " << mflops_worst << " ; "
-              << mflops_worst / NN << " per node " << std::endl;
+              << " Deo Worst Gflop/s = " << gflops_worst << " ; "
+              << gflops_worst / NN << " per node " << std::endl;
     std::cout << GridLogMessage << fmt << std::endl;
     std::cout << GridLogMessage;
-    for (int i = 0; i < mflops_all.size(); i++)
+    for (int i = 0; i < gflops_all.size(); i++)
     {
-      std::cout << mflops_all[i] / NN << " ; ";
+      std::cout << gflops_all[i] / NN << " ; ";
     }
     std::cout << std::endl;
   }
 
-    return mflops_best;
+    return gflops_best;
   }
 
   static double Staggered(int L)
   {
-    double mflops;
-    double mflops_best = 0;
-    double mflops_worst = 0;
-    std::vector<double> mflops_all;
+    double gflops;
+    double gflops_best = 0;
+    double gflops_worst = 0;
+    std::vector<double> gflops_all;
 
     ///////////////////////////////////////////////////////
     // Set/Get the layout & grid size
@@ -700,51 +679,51 @@ class Benchmark
       double volume = 1;
       for (int mu = 0; mu < Nd; mu++)
        volume = volume * latt4[mu];
-      double flops = (1146.0 * volume) / 2;
-      double mf_hi, mf_lo, mf_err;
+      double flops = (1146.0 * volume) / 2.;
+      double gf_hi, gf_lo, gf_err;
 
       timestat.statistics(t_time);
-      mf_hi = flops / timestat.min;
-      mf_lo = flops / timestat.max;
-      mf_err = flops / timestat.min * timestat.err / timestat.mean;
+      gf_hi = flops / timestat.min / 1000.;
+      gf_lo = flops / timestat.max / 1000.;
+      gf_err = flops / timestat.min * timestat.err / timestat.mean / 1000.;
 
-      mflops = flops / timestat.mean;
-      mflops_all.push_back(mflops);
-      if (mflops_best == 0)
-        mflops_best = mflops;
-      if (mflops_worst == 0)
-        mflops_worst = mflops;
-      if (mflops > mflops_best)
-        mflops_best = mflops;
-      if (mflops < mflops_worst)
-        mflops_worst = mflops;
+      gflops = flops / timestat.mean / 1000.;
+      gflops_all.push_back(gflops);
+      if (gflops_best == 0)
+        gflops_best = gflops;
+      if (gflops_worst == 0)
+        gflops_worst = gflops;
+      if (gflops > gflops_best)
+        gflops_best = gflops;
+      if (gflops < gflops_worst)
+        gflops_worst = gflops;
 
       std::cout << GridLogMessage << std::fixed << std::setprecision(1)
-                << "Deo mflop/s = " << mflops << " (" << mf_err << ") " << mf_lo
-                << "-" << mf_hi << std::endl;
+                << "Deo Gflop/s = " << gflops << " (" << gf_err << ") " << gf_lo
+                << "-" << gf_hi << std::endl;
       std::cout << GridLogMessage << std::fixed << std::setprecision(1)
-                << "Deo mflop/s per rank " << mflops / NP << std::endl;
+                << "Deo Gflop/s per rank " << gflops / NP << std::endl;
       std::cout << GridLogMessage << std::fixed << std::setprecision(1)
-                << "Deo mflop/s per node " << mflops / NN << std::endl;
+                << "Deo Gflop/s per node " << gflops / NN << std::endl;
     }
 
     grid_small_sep();
     std::cout << GridLogMessage << L
-              << "^4 Deo Best mflop/s = " << mflops_best << " ; "
-              << mflops_best / NN << " per node " << std::endl;
+              << "^4 Deo Best Gflop/s = " << gflops_best << " ; "
+              << gflops_best / NN << " per node " << std::endl;
     std::cout << GridLogMessage << L
-              << "^4 Deo Worst mflop/s = " << mflops_worst << " ; "
-              << mflops_worst / NN << " per node " << std::endl;
+              << "^4 Deo Worst Gflop/s = " << gflops_worst << " ; "
+              << gflops_worst / NN << " per node " << std::endl;
     std::cout << GridLogMessage << fmt << std::endl;
     std::cout << GridLogMessage;
-    for (int i = 0; i < mflops_all.size(); i++)
+    for (int i = 0; i < gflops_all.size(); i++)
     {
-      std::cout << mflops_all[i] / NN << " ; ";
+      std::cout << gflops_all[i] / NN << " ; ";
     }
     std::cout << std::endl;
   }
 
-    return mflops_best;
+    return gflops_best;
   }
 };
 
@@ -782,6 +761,30 @@ int main(int argc, char **argv)
   std::vector<double> dwf4;
   std::vector<double> staggered;
 
+  if (do_memory)
+  {
+    grid_big_sep();
+    std::cout << GridLogMessage << " Memory benchmark " << std::endl;
+    grid_big_sep();
+    Benchmark::Memory();
+  }
+
+  if (do_su4)
+  {
+    grid_big_sep();
+    std::cout << GridLogMessage << " SU(4) benchmark " << std::endl;
+    grid_big_sep();
+    Benchmark::SU4();
+  }
+
+  if (do_comms)
+  {
+    grid_big_sep();
+    std::cout << GridLogMessage << " Communications benchmark " << std::endl;
+    grid_big_sep();
+    Benchmark::Comms();
+  }
+
   if (do_flops)
   {
     Ls = 1;
@@ -810,68 +813,35 @@ int main(int argc, char **argv)
       staggered.push_back(result);
     }
 
+    int NN = NN_global;
+
     grid_big_sep();
-    std::cout << GridLogMessage << " Summary table Ls=" << Ls << std::endl;
+    std::cout << GridLogMessage << "Gflop/s/node Summary table Ls=" << Ls << std::endl;
     grid_big_sep();
-    std::cout << GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\t Staggered" << std::endl;
+    grid_printf("%5s %12s %12s %12s\n", "L", "Wilson", "DWF", "Staggered");
+    nlohmann::json tmp_flops;
     for (int l = 0; l < L_list.size(); l++)
     {
-      std::cout << GridLogMessage << L_list[l] << " \t\t " << wilson[l] << " \t\t "
-                << dwf4[l] << " \t\t " << staggered[l] << std::endl;
+      grid_printf("%5d %12.2f %12.2f %12.2f\n", L_list[l], wilson[l] / NN, dwf4[l] / NN,
+                  staggered[l] / NN);
+
       nlohmann::json tmp;
       tmp["L"] = L_list[l];
-      tmp["Mflops_wilson"] = wilson[l];
-      tmp["Mflops_dwf4"] = dwf4[l];
-      tmp["Mflops_staggered"] = staggered[l];
-      json_results["flops"].push_back(tmp);
-    }
-  }
-
-  int NN = NN_global;
-  if (do_memory)
-  {
-    grid_big_sep();
-    std::cout << GridLogMessage << " Memory benchmark " << std::endl;
-    grid_big_sep();
-    Benchmark::Memory();
-  }
-
-  if (do_su4)
-  {
-    grid_big_sep();
-    std::cout << GridLogMessage << " SU(4) benchmark " << std::endl;
-    grid_big_sep();
-    Benchmark::SU4();
-  }
-
-  if (do_comms)
-  {
-    grid_big_sep();
-    std::cout << GridLogMessage << " Communications benchmark " << std::endl;
-    grid_big_sep();
-    Benchmark::Comms();
-  }
-
-  if (do_flops)
-  {
-    grid_big_sep();
-    std::cout << GridLogMessage << " Per Node Summary table Ls=" << Ls << std::endl;
-    grid_big_sep();
-    std::cout << GridLogMessage << " L \t\t Wilson\t\t DWF4\t\t Staggered " << std::endl;
-    for (int l = 0; l < L_list.size(); l++)
-    {
-      std::cout << GridLogMessage << L_list[l] << " \t\t " << wilson[l] / NN << " \t "
-                << dwf4[l] / NN << " \t " << staggered[l] / NN << std::endl;
+      tmp["Gflops_wilson"] = wilson[l] / NN;
+      tmp["Gflops_dwf4"] = dwf4[l] / NN;
+      tmp["Gflops_staggered"] = staggered[l] / NN;
+      tmp_flops["results"].push_back(tmp);
     }
 
     grid_big_sep();
     std::cout << GridLogMessage
              << " Comparison point result: " << 0.5 * (dwf4[sel] + dwf4[selm1]) / NN
-              << " Mflop/s per node" << std::endl;
+              << " Gflop/s per node" << std::endl;
    std::cout << GridLogMessage << " Comparison point is 0.5*(" << dwf4[sel] / NN << "+"
              << dwf4[selm1] / NN << ") " << std::endl;
    std::cout << std::setprecision(3);
    grid_big_sep();
-    json_results["comp_point_Mflops"] = 0.5 * (dwf4[sel] + dwf4[selm1]) / NN;
+    tmp_flops["comparison_point_Gflops"] = 0.5 * (dwf4[sel] + dwf4[selm1]) / NN;
+    json_results["flops"] = tmp_flops;
   }
 
   if (!json_filename.empty())
diff --git a/Grid/systems/tursa/files/run.cpu.template.sh b/Grid/systems/tursa/files/run.cpu.template.sh
new file mode 100644
index 0000000..6e339ac
--- /dev/null
+++ b/Grid/systems/tursa/files/run.cpu.template.sh
@@ -0,0 +1,86 @@
+#!/usr/bin/env bash
+# shellcheck disable=SC1091,SC2050,SC2170
+
+## This set of Slurm settings assumes that the AMD chips are using the BIOS setting NPS4 (4 MPI tasks per socket).
+
+#SBATCH -J @job-name@
+#SBATCH -A @budget@
+#SBATCH -t 48:00:00
+#SBATCH --nodes=@nnodes@
+#SBATCH --ntasks=@ntasks@
+#SBATCH --ntasks-per-node=8
+#SBATCH --cpus-per-task=32
+#SBATCH --partition=@partition@
+#SBATCH --output=%x.%j.out
+#SBATCH --error=%x.%j.err
+#SBATCH --qos=standard
+#SBATCH --no-requeue
+
+set -e
+
+# OpenMP/OpenMPI/UCX environment ###############################################
+export OMP_NUM_THREADS=16
+export OMP_DISPLAY_AFFINITY=true
+export OMPI_MCA_btl=^uct,openib
+export OMPI_MCA_pml=ucx
+export UCX_TLS=rc,sm,self
+export UCX_RNDV_THRESH=16384
+export UCX_MEMTYPE_CACHE=n
+export UCX_NET_DEVICES=mlx5_0:1
+
+export OMPI_MCA_BTL_SM_USE_KNEM=1
+export OMPI_MCA_coll_hcoll_enable=1
+export OMPI_MCA_coll_hcoll_np=0
+
+# IO environment ###############################################################
+if [ @nnodes@ -eq 1 ]; then
+    export OMPI_MCA_io=ompio
+else
+    export OMPI_MCA_io=romio321
+fi
+
+export OMPI_MCA_btl_openib_allow_ib=true
+export OMPI_MCA_btl_openib_device_type=infiniband
+export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3 # are these needed here?
+
+# load environment #############################################################
+env_dir="$(readlink -f @env-dir@)"
+source "${env_dir}/env-base.sh"
+if [ "${SLURM_JOB_PARTITION}" = 'cpu' ]; then
+    source "${env_dir}/env-cpu.sh"
+else
+    echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
+    exit 1
+fi
+
+# application and parameters ###################################################
+app='@application@'
+opt=('--comms-overlap' '--comms-concurrent')
+par='@par@'
+
+# collect job information ######################################################
+job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
+mkdir -p "${job_info_dir}"
+
+date                     > "${job_info_dir}/start-date"
+set                      > "${job_info_dir}/env"
+ldd ${app}               > "${job_info_dir}/ldd"
+md5sum ${app}            > "${job_info_dir}/app-hash"
+readelf -a ${app}        > "${job_info_dir}/elf"
+echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
+cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
+if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
+
+# run! #########################################################################
+mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
+    ./cpu-mpi-wrapper.sh \
+    ${app} "${par}" "${opt[@]}" \
+    --mpi @mpi-geom@ \
+    --grid @grid-geom@ \
+    --shm 2048 &> "${job_info_dir}/log"
+
+# if we reach this point the application exited successfully ###################
+touch "${job_info_dir}/success"
+date > "${job_info_dir}/end-date"
+
+################################################################################
diff --git a/Grid/systems/tursa/files/run.gpu.template.sh b/Grid/systems/tursa/files/run.gpu.template.sh
new file mode 100644
index 0000000..860c856
--- /dev/null
+++ b/Grid/systems/tursa/files/run.gpu.template.sh
@@ -0,0 +1,84 @@
+#!/usr/bin/env bash
+# shellcheck disable=SC1091,SC2050,SC2170
+
+# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
+
+#SBATCH -J @job-name@
+#SBATCH -A @budget@
+#SBATCH -t 48:00:00
+#SBATCH --nodes=@nnodes@
+#SBATCH --ntasks=@ntasks@
+#SBATCH --ntasks-per-node=4
+#SBATCH --cpus-per-task=8
+#SBATCH --partition=@partition@
+#SBATCH --gres=gpu:4
+#SBATCH --output=%x.%j.out
+#SBATCH --error=%x.%j.err
+#SBATCH --qos=standard
+#SBATCH --no-requeue
+
+set -e
+
+# OpenMP/OpenMPI/UCX environment ###############################################
+export OMP_NUM_THREADS=8
+export OMPI_MCA_btl=^uct,openib
+export OMPI_MCA_pml=ucx
+export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
+export UCX_RNDV_SCHEME=put_zcopy
+export UCX_RNDV_THRESH=16384
+export UCX_IB_GPU_DIRECT_RDMA=yes
+export UCX_MEMTYPE_CACHE=n
+
+# IO environment ###############################################################
+
+if [ @nnodes@ -eq 1 ]; then
+    export OMPI_MCA_io=ompio
+else
+    export OMPI_MCA_io=romio321
+fi
+export OMPI_MCA_btl_openib_allow_ib=true
+export OMPI_MCA_btl_openib_device_type=infiniband
+export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
+
+# load environment #############################################################
+env_dir="$(readlink -f @env-dir@)"
+source "${env_dir}/env-base.sh"
+if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
+    source "${env_dir}/env-gpu.sh"
+else
+    echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
+    exit 1
+fi
+
+# application and parameters ###################################################
+app='@application@'
+opt=('--comms-overlap' '--comms-concurrent')
+par='@par@'
+
+# collect job information ######################################################
+job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
+mkdir -p "${job_info_dir}"
+
+date                     > "${job_info_dir}/start-date"
+set                      > "${job_info_dir}/env"
+ldd ${app}               > "${job_info_dir}/ldd"
+md5sum ${app}            > "${job_info_dir}/app-hash"
+readelf -a ${app}        > "${job_info_dir}/elf"
+echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
+cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
+if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
+
+# run! #########################################################################
+mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
+    ./gpu-mpi-wrapper.sh \
+    ${app} "${par}" "${opt[@]}" \
+    --mpi @mpi-geom@ \
+    --accelerator-threads 8 \
+    --grid @grid-geom@ \
+    --shm 2048 &> "${job_info_dir}/log"
+
+# if we reach this point the application exited successfully ###################
+touch "${job_info_dir}/success"
+date > "${job_info_dir}/end-date"
+
+################################################################################
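
Note on the JSON output after this change: the flop/s summary is now collected under a single "flops" object (per-node Gflop/s, one entry per lattice size, plus the comparison point) instead of top-level "flops" entries and "comp_point_Mflops". A minimal sketch of the resulting layout, assuming two lattice sizes in L_list; the numeric values are placeholders, not measurements:

{
  "flops": {
    "results": [
      { "L": 16, "Gflops_wilson": 0.0, "Gflops_dwf4": 0.0, "Gflops_staggered": 0.0 },
      { "L": 24, "Gflops_wilson": 0.0, "Gflops_dwf4": 0.0, "Gflops_staggered": 0.0 }
    ],
    "comparison_point_Gflops": 0.0
  }
}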