diff --git a/Grid/systems/tursa/env.sh b/Grid/systems/tursa/env.sh
index 5e6aa7d..8f2e725 100644
--- a/Grid/systems/tursa/env.sh
+++ b/Grid/systems/tursa/env.sh
@@ -2,5 +2,7 @@ # shellcheck disable=SC1091
 
 env_dir="$(dirname "$(readlink -f "${BASH_SOURCE:-$0}")")"
+mkdir -p ~/.config/lattice-benchmarks
+echo "${env_dir}" > ~/.config/lattice-benchmarks/grid-env
 
 source "${env_dir}/spack/share/spack/setup-env.sh"
 spack load jq git
diff --git a/Grid/systems/tursa/files/ompi-gpu.sh b/Grid/systems/tursa/files/ompi-gpu.sh
new file mode 100644
index 0000000..e7ffdd3
--- /dev/null
+++ b/Grid/systems/tursa/files/ompi-gpu.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+# OpenMP/OpenMPI/UCX environment ###############################################
+export OMP_NUM_THREADS=8
+export OMPI_MCA_btl=^uct,openib
+export OMPI_MCA_pml=ucx
+export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
+export UCX_RNDV_SCHEME=put_zcopy
+export UCX_RNDV_THRESH=16384
+export UCX_IB_GPU_DIRECT_RDMA=yes
+export UCX_MEMTYPE_CACHE=n
+
+# IO environment ###############################################################
+export OMPI_MCA_io=romio321
+export OMPI_MCA_btl_openib_allow_ib=true
+export OMPI_MCA_btl_openib_device_type=infiniband
+export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
diff --git a/Grid/systems/tursa/files/run.cpu.template.sh b/Grid/systems/tursa/files/run.cpu.template.sh
deleted file mode 100644
index 6e339ac..0000000
--- a/Grid/systems/tursa/files/run.cpu.template.sh
+++ /dev/null
@@ -1,86 +0,0 @@
-#!/usr/bin/env bash
-# shellcheck disable=SC1091,SC2050,SC2170
-
-## This set of slurm settings assumes that the AMD chips are using bios setting NPS4 (4 mpi taks per socket).
-
-#SBATCH -J @job-name@
-#SBATCH -A @budget@
-#SBATCH -t 48:00:00
-#SBATCH --nodes=@nnodes@
-#SBATCH --ntasks=@ntasks@
-#SBATCH --ntasks-per-node=8
-#SBATCH --cpus-per-task=32
-#SBATCH --partition=@partition@
-#SBATCH --output=%x.%j.out
-#SBATCH --error=%x.%j.err
-#SBATCH --qos=standard
-#SBATCH --no-requeue
-
-set -e
-
-# OpenMP/OpenMPI/UCX environment ###############################################
-export OMP_NUM_THREADS=16
-export OMP_DISPLAY_AFFINITY=true
-export OMPI_MCA_btl=^uct,openib
-export OMPI_MCA_pml=ucx
-export UCX_TLS=rc,sm,self
-export UCX_RNDV_THRESH=16384
-export UCX_MEMTYPE_CACHE=n
-export UCX_NET_DEVICES=mlx5_0:1
-
-export OMPI_MCA_BTL_SM_USE_KNEM=1
-export OMPI_MCA_coll_hcoll_enable=1
-export OMPI_MCA_coll_hcoll_np=0
-
-# IO environment ###############################################################
-if [ @nnodes@ -eq 1 ]; then
-    export OMPI_MCA_io=ompio
-else
-    export OMPI_MCA_io=romio321
-fi
-
-export OMPI_MCA_btl_openib_allow_ib=true
-export OMPI_MCA_btl_openib_device_type=infiniband
-export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3 # are these needed here?
-
-# load environment #############################################################
-env_dir="$(readlink -f @env-dir@)"
-source "${env_dir}/env-base.sh"
-if [ "${SLURM_JOB_PARTITION}" = 'cpu' ]; then
-    source "${env_dir}/env-cpu.sh"
-else
-    echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
-    exit 1
-fi
-
-# application and parameters ###################################################
-app='@application@'
-opt='--comms-overlap --comms-concurrent'
-par='@par@'
-
-# collect job information ######################################################
-job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
-mkdir -p "${job_info_dir}"
-
-date > "${job_info_dir}/start-date"
-set > "${job_info_dir}/env"
-ldd ${app} > "${job_info_dir}/ldd"
-md5sum ${app} > "${job_info_dir}/app-hash"
-readelf -a ${app} > "${job_info_dir}/elf"
-echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
-cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
-if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
-
-# run! #########################################################################
-mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
-    ./cpu-mpi-wrapper.sh \
-    ${app} "${par}" "${opt[@]}" \
-    --mpi @mpi-geom@ \
-    --grid @grid-geom@ \
-    --shm 2048 &> "${job_info_dir}/log"
-
-# if we reach that point the application exited successfully ###################
-touch "${job_info_dir}/success"
-date > "${job_info_dir}/end-date"
-
-################################################################################
diff --git a/Grid/systems/tursa/files/run.gpu.16nodes.sh b/Grid/systems/tursa/files/run.gpu.16nodes.sh
new file mode 100644
index 0000000..e6e2792
--- /dev/null
+++ b/Grid/systems/tursa/files/run.gpu.16nodes.sh
@@ -0,0 +1,60 @@
+#!/usr/bin/env bash
+# shellcheck disable=SC1091,SC2050,SC2170
+
+#SBATCH -J benchmark-grid-16
+#SBATCH -t 1:00:00
+#SBATCH --nodes=16
+#SBATCH --ntasks=64
+#SBATCH --ntasks-per-node=4
+#SBATCH --cpus-per-task=8
+#SBATCH --partition=gpu
+#SBATCH --gres=gpu:4
+#SBATCH --output=%x.%j.out
+#SBATCH --error=%x.%j.err
+#SBATCH --qos=standard
+#SBATCH --no-requeue
+#SBATCH --gpu-freq=1410
+
+set -euo pipefail
+
+# load environment #############################################################
+env_cfg="${HOME}/.config/lattice-benchmarks/grid-env"
+if [ ! -f "${env_cfg}" ]; then
+    echo "error: ${env_cfg} does not exist, did you execute 'source env.sh' with your user account?"
+    exit 1
+fi
+env_dir="$(readlink -f "$(cat "${env_cfg}")")"
+source "${env_dir}/env.sh"      # load base Spack environment
+source "${env_dir}/env-gpu.sh"  # load GPU-specific packages
+source "${env_dir}/ompi-gpu.sh" # set GPU-specific OpenMPI variables
+
+# application and parameters ###################################################
+app="${env_dir}/prefix/gridbench_gpu/bin/Benchmark_Grid"
+
+# collect job information ######################################################
+job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
+mkdir -p "${job_info_dir}"
+
+date > "${job_info_dir}/start-date"
+set > "${job_info_dir}/env"
+ldd "${app}" > "${job_info_dir}/ldd"
+md5sum "${app}" > "${job_info_dir}/app-hash"
+readelf -a "${app}" > "${job_info_dir}/elf"
+echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
+cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
+
+# run! #########################################################################
+mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
+    "${env_dir}/gpu-mpi-wrapper.sh" \
+    "${app}" \
+    --json-out "${job_info_dir}/result.json" \
+    --mpi 1.4.4.4 \
+    --accelerator-threads 8 \
+    --threads 8 \
+    --shm 2048 &> "${job_info_dir}/log"
+
+# if we reach that point the application exited successfully ###################
+touch "${job_info_dir}/success"
+date > "${job_info_dir}/end-date"
+
+################################################################################
diff --git a/Grid/systems/tursa/files/run.gpu.1nodes.sh b/Grid/systems/tursa/files/run.gpu.1nodes.sh
new file mode 100644
index 0000000..8f7a92f
--- /dev/null
+++ b/Grid/systems/tursa/files/run.gpu.1nodes.sh
@@ -0,0 +1,60 @@
+#!/usr/bin/env bash
+# shellcheck disable=SC1091,SC2050,SC2170
+
+#SBATCH -J benchmark-grid-1
+#SBATCH -t 1:00:00
+#SBATCH --nodes=1
+#SBATCH --ntasks=4
+#SBATCH --ntasks-per-node=4
+#SBATCH --cpus-per-task=8
+#SBATCH --partition=gpu
+#SBATCH --gres=gpu:4
+#SBATCH --output=%x.%j.out
+#SBATCH --error=%x.%j.err
+#SBATCH --qos=standard
+#SBATCH --no-requeue
+#SBATCH --gpu-freq=1410
+
+set -euo pipefail
+
+# load environment #############################################################
+env_cfg="${HOME}/.config/lattice-benchmarks/grid-env"
+if [ ! -f "${env_cfg}" ]; then
+    echo "error: ${env_cfg} does not exist, did you execute 'source env.sh' with your user account?"
+    exit 1
+fi
+env_dir="$(readlink -f "$(cat "${env_cfg}")")"
+source "${env_dir}/env.sh"      # load base Spack environment
+source "${env_dir}/env-gpu.sh"  # load GPU-specific packages
+source "${env_dir}/ompi-gpu.sh" # set GPU-specific OpenMPI variables
+
+# application and parameters ###################################################
+app="${env_dir}/prefix/gridbench_gpu/bin/Benchmark_Grid"
+
+# collect job information ######################################################
+job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
+mkdir -p "${job_info_dir}"
+
+date > "${job_info_dir}/start-date"
+set > "${job_info_dir}/env"
+ldd "${app}" > "${job_info_dir}/ldd"
+md5sum "${app}" > "${job_info_dir}/app-hash"
+readelf -a "${app}" > "${job_info_dir}/elf"
+echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
+cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
+
+# run! #########################################################################
+mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
+    "${env_dir}/gpu-mpi-wrapper.sh" \
+    "${app}" \
+    --json-out "${job_info_dir}/result.json" \
+    --mpi 1.1.1.4 \
+    --accelerator-threads 8 \
+    --threads 8 \
+    --shm 2048 &> "${job_info_dir}/log"
+
+# if we reach that point the application exited successfully ###################
+touch "${job_info_dir}/success"
+date > "${job_info_dir}/end-date"
+
+################################################################################
diff --git a/Grid/systems/tursa/files/run.gpu.template.sh b/Grid/systems/tursa/files/run.gpu.template.sh
deleted file mode 100644
index 860c856..0000000
--- a/Grid/systems/tursa/files/run.gpu.template.sh
+++ /dev/null
@@ -1,84 +0,0 @@
-#!/usr/bin/env bash
-# shellcheck disable=SC1091,SC2050,SC2170
-
-# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
-
-#SBATCH -J @job-name@
-#SBATCH -A @budget@
-#SBATCH -t 48:00:00
-#SBATCH --nodes=@nnodes@
-#SBATCH --ntasks=@ntasks@
-#SBATCH --ntasks-per-node=4
-#SBATCH --cpus-per-task=8
-#SBATCH --partition=@partition@
-#SBATCH --gres=gpu:4
-#SBATCH --output=%x.%j.out
-#SBATCH --error=%x.%j.err
-#SBATCH --qos=standard
-#SBATCH --no-requeue
-
-set -e
-
-# OpenMP/OpenMPI/UCX environment ###############################################
-export OMP_NUM_THREADS=8
-export OMPI_MCA_btl=^uct,openib
-export OMPI_MCA_pml=ucx
-export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
-export UCX_RNDV_SCHEME=put_zcopy
-export UCX_RNDV_THRESH=16384
-export UCX_IB_GPU_DIRECT_RDMA=yes
-export UCX_MEMTYPE_CACHE=n
-
-# IO environment ###############################################################
-
-if [ @nnodes@ -eq 1 ]; then
-    export OMPI_MCA_io=ompio
-else
-    export OMPI_MCA_io=romio321
-fi
-export OMPI_MCA_btl_openib_allow_ib=true
-export OMPI_MCA_btl_openib_device_type=infiniband
-export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
-
-# load environment #############################################################
-env_dir="$(readlink -f @env-dir@)"
-source "${env_dir}/env-base.sh"
-if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
-    source "${env_dir}/env-gpu.sh"
-else
-    echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
-    exit 1
-fi
-
-# application and parameters ###################################################
-app='@application@'
-opt=('--comms-overlap' '--comms-concurrent')
-par='@par@'
-
-# collect job information ######################################################
-job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
-mkdir -p "${job_info_dir}"
-
-date > "${job_info_dir}/start-date"
-set > "${job_info_dir}/env"
-ldd ${app} > "${job_info_dir}/ldd"
-md5sum ${app} > "${job_info_dir}/app-hash"
-readelf -a ${app} > "${job_info_dir}/elf"
-echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
-cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
-if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
-
-# run! #########################################################################
-mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
-    ./gpu-mpi-wrapper.sh \
-    ${app} "${par}" "${opt[@]}" \
-    --mpi @mpi-geom@ \
-    --accelerator-threads 8 \
-    --grid @grid-geom@ \
-    --shm 2048 &> "${job_info_dir}/log"
-
-# if we reach that point the application exited successfully ###################
-touch "${job_info_dir}/success"
-date > "${job_info_dir}/end-date"
-
-################################################################################
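
For reference, a minimal submission sketch for the two new job scripts (the checkout path and budget code below are placeholders, not taken from this patch; the scripts themselves only assume that env.sh was sourced once so that ~/.config/lattice-benchmarks/grid-env exists):

# one-time setup with your user account: records the environment location
# in ~/.config/lattice-benchmarks/grid-env (done by the new lines in env.sh)
source /path/to/checkout/Grid/systems/tursa/env.sh

# submit the fixed-geometry GPU benchmarks from a directory with write access;
# the account is given on the command line, since unlike the old templates the
# new scripts no longer carry an '#SBATCH -A' line (<budget> is a placeholder)
sbatch -A <budget> /path/to/checkout/Grid/systems/tursa/files/run.gpu.1nodes.sh
sbatch -A <budget> /path/to/checkout/Grid/systems/tursa/files/run.gpu.16nodes.sh

# each job writes its log, result.json, and metadata under job/<job-name>.<job-id>/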