#!/bin/sh
# Slurm batch job: runs ./benchmarks/Benchmark_dwf_fp32 on 16 nodes with
# 4 tasks per node (64 tasks total) and writes the benchmark's stdout to
# dwf.16node.perf in the submission directory.
#
# Usage: sbatch <this-script>
#
# NOTE: every #SBATCH directive must sit on its own line and precede the
# first executable command, otherwise sbatch does not see it.
#SBATCH --account=gm2dwf
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=12
#SBATCH --time=0:30:00
#SBATCH --partition=booster
#SBATCH --gres=gpu:4

# Fail early with a clear message when run outside a Slurm allocation;
# an empty SLURM_NTASKS would otherwise make `srun -n` swallow the
# benchmark path as its argument.
: "${SLURM_NTASKS:?must be set by Slurm (submit this script with sbatch)}"

# OpenMP threads per task.
# NOTE(review): this is lower than --cpus-per-task=12 -- confirm intended.
export OMP_NUM_THREADS=4

# Open MPI: the leading '^' excludes the listed BTL components.
export OMPI_MCA_btl=^uct,openib

# UCX transport selection and rendezvous-protocol tuning.
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n

# Extra benchmark flags, kept in one string so they are easy to toggle.
OPT="--comms-overlap --comms-concurrent"

# -N must agree with "#SBATCH --nodes" above.
# The --mpi factors (2*2*2*8 = 64) match "#SBATCH --ntasks=64";
# presumably this is the per-dimension rank decomposition -- confirm
# against the benchmark's documentation before changing either value.
#
# $OPT is deliberately unquoted: it must word-split into separate flags
# (POSIX sh has no arrays).
# shellcheck disable=SC2086
srun -N 16 -n "$SLURM_NTASKS" \
  ./benchmarks/Benchmark_dwf_fp32 \
  $OPT \
  --mpi 2.2.2.8 \
  --accelerator-threads 8 \
  --grid 64.64.64.256 \
  --shm 2048 > dwf.16node.perf