Proper runscript example for Tursa

parent 51eae5723e
commit 43e264d29b
@@ -2,5 +2,7 @@
 # shellcheck disable=SC1091

 env_dir="$(dirname "$(readlink -f "${BASH_SOURCE:-$0}")")"
+mkdir -p ~/.config/lattice-benchmarks
+echo "${env_dir}" > ~/.config/lattice-benchmarks/grid-env
 source "${env_dir}/spack/share/spack/setup-env.sh"
 spack load jq git
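The two added lines record the environment location under ~/.config so that batch jobs can find it later without hard-coded paths. For context, this is how the GPU run scripts added below recover it (a minimal sketch of that lookup, using only names that appear in this commit):

# sketch: recover the environment directory recorded by the hunk above
env_cfg="${HOME}/.config/lattice-benchmarks/grid-env"
env_dir="$(readlink -f "$(cat "${env_cfg}")")"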
Grid/systems/tursa/files/ompi-gpu.sh (new file, 17 lines)
@@ -0,0 +1,17 @@
#!/usr/bin/env bash

# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=8
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n

# IO environment ###############################################################
export OMPI_MCA_io=romio321
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
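This file only exports variables and is meant to be sourced, not executed on its own; the run scripts below pull it in after the Spack environment. A minimal sketch of the intended use (env_dir as resolved in those scripts):

source "${env_dir}/ompi-gpu.sh"              # exports the OMP/UCX/OpenMPI settings above
env | grep -E 'UCX|OMPI_MCA|OMP_NUM_THREADS' # quick check that the values took effect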
@@ -1,86 +0,0 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170

## This set of slurm settings assumes that the AMD chips are using bios setting NPS4 (4 mpi taks per socket).

#SBATCH -J @job-name@
#SBATCH -A @budget@
#SBATCH -t 48:00:00
#SBATCH --nodes=@nnodes@
#SBATCH --ntasks=@ntasks@
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=32
#SBATCH --partition=@partition@
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --qos=standard
#SBATCH --no-requeue

set -e

# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=16
export OMP_DISPLAY_AFFINITY=true
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=rc,sm,self
export UCX_RNDV_THRESH=16384
export UCX_MEMTYPE_CACHE=n
export UCX_NET_DEVICES=mlx5_0:1

export OMPI_MCA_BTL_SM_USE_KNEM=1
export OMPI_MCA_coll_hcoll_enable=1
export OMPI_MCA_coll_hcoll_np=0

# IO environment ###############################################################
if [ @nnodes@ -eq 1 ]; then
    export OMPI_MCA_io=ompio
else
    export OMPI_MCA_io=romio321
fi

export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3 # are these needed here?

# load environment #############################################################
env_dir="$(readlink -f @env-dir@)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'cpu' ]; then
    source "${env_dir}/env-cpu.sh"
else
    echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
    exit 1
fi

# application and parameters ###################################################
app='@application@'
opt='--comms-overlap --comms-concurrent'
par='@par@'

# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"

date > "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi

# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
    ./cpu-mpi-wrapper.sh \
    ${app} "${par}" "${opt[@]}" \
    --mpi @mpi-geom@ \
    --grid @grid-geom@ \
    --shm 2048 &> "${job_info_dir}/log"

# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"

################################################################################
Grid/systems/tursa/files/run.gpu.16nodes.sh (new file, 60 lines)
@@ -0,0 +1,60 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170

#SBATCH -J benchmark-grid-16
#SBATCH -t 1:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --qos=standard
#SBATCH --no-requeue
#SBATCH --gpu-freq=1410

set -euo pipefail

# load environment #############################################################
env_cfg="${HOME}/.config/lattice-benchmarks/grid-env"
if [ ! -f "${env_cfg}" ]; then
    echo "error: ${env_cfg} does not exist; did you execute 'source env.sh' with your user account?"
    exit 1
fi
env_dir="$(readlink -f "$(cat "${env_cfg}")")"
source "${env_dir}/env.sh"      # load base Spack environment
source "${env_dir}/env-gpu.sh"  # load GPU-specific packages
source "${env_dir}/ompi-gpu.sh" # set GPU-specific OpenMPI variables

# application and parameters ###################################################
app="${env_dir}/prefix/gridbench_gpu/bin/Benchmark_Grid"

# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"

date > "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd "${app}" > "${job_info_dir}/ldd"
md5sum "${app}" > "${job_info_dir}/app-hash"
readelf -a "${app}" > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"

# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
    "${env_dir}/gpu-mpi-wrapper.sh" \
    "${app}" \
    --json-out "${job_info_dir}/result.json" \
    --mpi 1.4.4.4 \
    --accelerator-threads 8 \
    --threads 8 \
    --shm 2048 &> "${job_info_dir}/log"

# if we reach this point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"

################################################################################
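The MPI geometry 1.4.4.4 decomposes the lattice over 1x4x4x4 = 64 ranks, matching --ntasks=64 (16 nodes, 4 GPUs and 4 ranks per node). A hedged usage sketch, assuming the script is submitted from a directory where the job/ tree can be created:

sbatch Grid/systems/tursa/files/run.gpu.16nodes.sh
# on success, artefacts land in job/benchmark-grid-16.<jobid>/:
#   log, result.json, start-date, end-date, env, ldd, app-hash, elf, nodes, script, success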
Grid/systems/tursa/files/run.gpu.1nodes.sh (new file, 60 lines)
@@ -0,0 +1,60 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170

#SBATCH -J benchmark-grid-1
#SBATCH -t 1:00:00
#SBATCH --nodes=1
#SBATCH --ntasks=4
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --qos=standard
#SBATCH --no-requeue
#SBATCH --gpu-freq=1410

set -euo pipefail

# load environment #############################################################
env_cfg="${HOME}/.config/lattice-benchmarks/grid-env"
if [ ! -f "${env_cfg}" ]; then
    echo "error: ${env_cfg} does not exist; did you execute 'source env.sh' with your user account?"
    exit 1
fi
env_dir="$(readlink -f "$(cat "${env_cfg}")")"
source "${env_dir}/env.sh"      # load base Spack environment
source "${env_dir}/env-gpu.sh"  # load GPU-specific packages
source "${env_dir}/ompi-gpu.sh" # set GPU-specific OpenMPI variables

# application and parameters ###################################################
app="${env_dir}/prefix/gridbench_gpu/bin/Benchmark_Grid"

# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"

date > "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd "${app}" > "${job_info_dir}/ldd"
md5sum "${app}" > "${job_info_dir}/app-hash"
readelf -a "${app}" > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"

# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
    "${env_dir}/gpu-mpi-wrapper.sh" \
    "${app}" \
    --json-out "${job_info_dir}/result.json" \
    --mpi 1.1.1.4 \
    --accelerator-threads 8 \
    --threads 8 \
    --shm 2048 &> "${job_info_dir}/log"

# if we reach this point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"

################################################################################
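This is the same template with single-node numbers: --mpi 1.1.1.4 gives 1x1x1x4 = 4 ranks, matching --ntasks=4 and --gres=gpu:4 on one node. A small sanity-check sketch for anyone adapting the geometry (the rank product must equal the Slurm task count):

geom=(1 1 1 4)                                      # from --mpi 1.1.1.4
ranks=$(( geom[0] * geom[1] * geom[2] * geom[3] ))  # = 4
[ "${ranks}" -eq "${SLURM_NTASKS:-4}" ] || echo "warning: MPI geometry does not match --ntasks" 1>&2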
@@ -1,84 +0,0 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170

# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa

#SBATCH -J @job-name@
#SBATCH -A @budget@
#SBATCH -t 48:00:00
#SBATCH --nodes=@nnodes@
#SBATCH --ntasks=@ntasks@
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=@partition@
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --qos=standard
#SBATCH --no-requeue

set -e

# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=8
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n

# IO environment ###############################################################

if [ @nnodes@ -eq 1 ]; then
    export OMPI_MCA_io=ompio
else
    export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3

# load environment #############################################################
env_dir="$(readlink -f @env-dir@)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
    source "${env_dir}/env-gpu.sh"
else
    echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
    exit 1
fi

# application and parameters ###################################################
app='@application@'
opt=('--comms-overlap' '--comms-concurrent')
par='@par@'

# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"

date > "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi

# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
    ./gpu-mpi-wrapper.sh \
    ${app} "${par}" "${opt[@]}" \
    --mpi @mpi-geom@ \
    --accelerator-threads 8 \
    --grid @grid-geom@ \
    --shm 2048 &> "${job_info_dir}/log"

# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"

################################################################################