#!/bin/bash
##SBATCH -A m5294_g
##SBATCH -A m3886_g
#SBATCH -A mp13_g
#SBATCH -C gpu
#SBATCH -q premium
#SBATCH -t 00:10:00
#SBATCH -c 32
#SBATCH -N 32
#SBATCH -n 128
#SBATCH --ntasks-per-node=4
#SBATCH --gpus-per-task=1
#SBATCH --exclusive
#SBATCH --gpu-bind=none

# Cray MPICH GPU-aware communication settings
export SLURM_CPU_BIND="cores"
export MPICH_GPU_SUPPORT_ENABLED=1
export MPICH_RDMA_ENABLED_CUDA=1
export MPICH_GPU_IPC_ENABLED=1
export MPICH_GPU_EAGER_REGISTER_HOST_MEM=0
export MPICH_GPU_NO_ASYNC_MEMCPY=0
#export MPICH_SMP_SINGLE_COPY_MODE=CMA

# Per-rank wrapper: bind each local rank to one GPU and the matching NUMA domain
cat << EOF > select_gpu
#!/bin/bash
export GPU_MAP=(0 1 2 3)
export NUMA_MAP=(0 1 2 3)
export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]}
export CUDA_VISIBLE_DEVICES=\$GPU
exec numactl -m \$NUMA -N \$NUMA "\$@"
EOF
chmod +x ./select_gpu

OPT="--comms-overlap --shm-mpi 0"

# Local volume per MPI rank is 32.16.32.24 in both decompositions below.

# 384 nodes
#srun ./select_gpu ./Test_dwf_mixedcg_prec --seconds 300 --grid 128.128.128.288 --mpi 4.8.4.12 --device-mem 16000 --accelerator-threads 8 --shm 2048 $OPT > job.log

# 32 nodes, same local volume per rank
srun ./select_gpu ./Test_dwf_mixedcg_prec --seconds 300 --grid 64.32.64.96 --mpi 2.2.2.4 --device-mem 16000 --accelerator-threads 8 --shm 2048 $OPT > job.log
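
# Optional sanity check (a sketch, left commented out, not part of the benchmark):
# run the wrapper around a trivial command first; each local rank should report a
# distinct GPU/NUMA pair per node. SLURM_PROCID and SLURM_LOCALID are set by
# Slurm; NUMA and CUDA_VISIBLE_DEVICES are set by the select_gpu wrapper above.
#srun ./select_gpu bash -c 'echo "rank=$SLURM_PROCID local=$SLURM_LOCALID gpu=$CUDA_VISIBLE_DEVICES numa=$NUMA host=$(hostname)"'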