Grid/systems/SDCC-A100/bench.slurm

#!/bin/bash
#SBATCH --partition csi
#SBATCH --time=00:10:00
#SBATCH -A csigeneral
#SBATCH --exclusive
#SBATCH --nodes=1
#SBATCH --ntasks=4
#SBATCH --qos csi
#SBATCH --gres=gpu:4

# Slurm batch job: runs Benchmark_dwf_fp32 on a single node with 4 tasks and
# 4 GPUs, binding each task to one GPU through a generated ./select_gpu
# wrapper. All paths (sourceme.sh, ./benchmarks/, select_gpu, A100-topo.pdf)
# are relative to the job's working directory.

# Pull in environment settings for the run (contents of sourceme.sh are not
# visible from this script).
source sourceme.sh

# Generate the per-task launcher. The delimiter is quoted so the body is
# written verbatim and every expansion happens when the wrapper runs inside
# a task, not when this batch script runs.
cat << 'EOF' > select_gpu
#!/bin/bash
# Map this task's node-local rank to a GPU index, expose only that GPU to
# the wrapped command, then replace this shell with the command.
GPU_MAP=(0 1 2 3)   # plain assignment: bash cannot export arrays
export GPU=${GPU_MAP[$SLURM_LOCALID]}
export CUDA_VISIBLE_DEVICES=$GPU
unset ROCR_VISIBLE_DEVICES
echo "RANK $SLURM_LOCALID using GPU $GPU"
# "$@" forwards every argument exactly as received (no re-splitting/globbing).
exec "$@"
EOF
chmod +x ./select_gpu

# OpenMP threads per task.
# NOTE(review): the original additionally exported the misspelled
# OMP_NUM_THREAD=8, which OpenMP does not read, so runs effectively used 4.
# If 8 threads were intended, change the value here.
export OMP_NUM_THREADS=4

# Open MPI: the leading ^ excludes the listed BTL components.
export OMPI_MCA_btl=^uct,openib
# UCX transport selection and rendezvous-protocol tuning.
export UCX_TLS=cuda,gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=no
export UCX_MEMTYPE_CACHE=n

# Optional node diagnostics; only the topology dump is enabled.
#srun -N1 -n1 nvidia-smi
#srun -N1 -n1 numactl -H > numa.txt
srun -N1 -n1 lstopo A100-topo.pdf

# Single-GPU reference run (disabled).
# 4.35 TF/s
#srun -N1 -n1 ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 --shm 2048 --shm-mpi 0  --accelerator-threads 16

# Four-task run, one GPU per task via the wrapper.
srun -N1 -n4 ./select_gpu ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.2.2 --grid 32.32.64.64 --shm 2048 --shm-mpi 0  --accelerator-threads 16
SDCC benchmarking scripts for A100 nodes and IceLake nodes (AVX512) — 2023-12-04 20:45:57 +00:00

			`#!/bin/bash`
			`#SBATCH --partition csi`
			`#SBATCH --time=00:10:00`
			`#SBATCH -A csigeneral`
			`#SBATCH --exclusive`
			`#SBATCH --nodes=1`
			`#SBATCH --ntasks=4`
			`#SBATCH --qos csi`
			`#SBATCH --gres=gpu:4`

			`source sourceme.sh`

			`cat << EOF > select_gpu`
			`#!/bin/bash`
			`export GPU_MAP=(0 1 2 3)`
			`export GPU=\${GPU_MAP[\$SLURM_LOCALID]}`
			`export CUDA_VISIBLE_DEVICES=\$GPU`
			`unset ROCR_VISIBLE_DEVICES`
			`echo RANK \$SLURM_LOCALID using GPU \$GPU`
			`exec \$*`
			`EOF`
			`chmod +x ./select_gpu`


			`export OMP_NUM_THREADS=4`
			`export OMPI_MCA_btl=^uct,openib`
			`export UCX_TLS=cuda,gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc`
			`export UCX_RNDV_SCHEME=put_zcopy`
			`export UCX_RNDV_THRESH=16384`
			`export UCX_IB_GPU_DIRECT_RDMA=no`
			`export UCX_MEMTYPE_CACHE=n`

			`export OMP_NUM_THREAD=8`
			`#srun -N1 -n1 nvidia-smi`
			`#srun -N1 -n1 numactl -H > numa.txt`
			`srun -N1 -n1 lstopo A100-topo.pdf`

			`# 4.35 TF/s`
			`#srun -N1 -n1 ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 --shm 2048 --shm-mpi 0 --accelerator-threads 16`

			`srun -N1 -n4 ./select_gpu ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.2.2 --grid 32.32.64.64 --shm 2048 --shm-mpi 0 --accelerator-threads 16`