# Patch summary: git diff touching four PBS job scripts in systems/Aurora/tests/.
#
# NOTE(review): the patch text below appears to have had its original line
# breaks collapsed into spaces. Restore one-record-per-line layout before
# feeding it to `git apply`/`patch`; the hunk line counts cannot be checked
# against the text in its current form.
#
# repro16.pbs
#   - `#PBS -l select=16:system=sunspot,place=scatter` becomes
#     `#PBS -l select=16` plus `#PBS -q EarlyAppAccess`.
#   - `source ../sourceme.sh` is uncommented.
#   - Removes the commented MPICH_COLL_SYNC / MPICH_ENV_DISPLAY exports and the
#     dangling `export MPICH_` line.
#   - Removes `module load oneapi/eng-compiler/2023.05.15.003` and the commented
#     LD_LIBRARY_PATH export; the mpich `module load` is left commented out.
#
# repro1gpu.pbs
#   - Same select/queue directive change as repro16.pbs.
#   - Both `module load` lines are commented out.
#   - `source ../sourceme.sh` is added after `cd $PBS_O_WORKDIR`.
#
# reproBigJob.pbs (new file, 63 lines per its hunk header)
#   - Requests select=16 with a 02:00:00 walltime on queue EarlyAppAccess.
#   - Exports OMP/MPIR/SYCL/GRID settings and unsets three
#     *_COLL_SELECTION_TUNING_JSON_FILE variables.
#   - Creates and enters reproBigJob.$PBS_JOBID, copies $PBS_NODEFILE to
#     `nodefile`, and records the command line and `env` output to files.
#   - Runs `mpiexec -np 192 -ppn 12` (192 = 16 * 12, matching --mpi 4.4.4.3) on
#     ../Test_dwf_mixedcg_prec through ../gpu_tile_compact.sh.
#   - Afterwards greps "Oops" from Grid.stderr.* into failures.$PBS_JOBID and
#     removes core.*.
#   NOTE(review): `mkdir -p $DIR` / `cd $DIR` are unchecked. If the cd fails,
#     the later `rm core.*` runs in whatever directory the script is then in.
#     Consider `cd "$DIR" || exit 1`.
#   NOTE(review): `rm core.*` prints an error when no core files exist;
#     `rm -f` may be what is intended - confirm.
#   NOTE(review): unlike repro16.pbs and repro1gpu.pbs, no
#     `source ../sourceme.sh` is visible in this script, and the module loads
#     are commented out. Confirm the environment is set up some other way.
#
# reproN.pbs
#   - `select=32:system=sunspot,place=scatter` becomes `select=16` plus
#     `#PBS -q EarlyAppAccess`.
#   - Both `module load` lines are commented out.
#   NOTE(review): the node count drops from 32 to 16 here. The rest of the
#     script is outside this hunk; verify that any rank counts or MPI
#     decomposition it uses still match.
diff --git a/systems/Aurora/tests/repro16.pbs b/systems/Aurora/tests/repro16.pbs index fa37ae09..5d5314c1 100644 --- a/systems/Aurora/tests/repro16.pbs +++ b/systems/Aurora/tests/repro16.pbs @@ -2,7 +2,8 @@ ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 -#PBS -l select=16:system=sunspot,place=scatter +#PBS -l select=16 +#PBS -q EarlyAppAccess #PBS -A LatticeQCD_aesp_CNDA #PBS -l walltime=01:00:00 #PBS -N dwf @@ -13,19 +14,14 @@ cd $PBS_O_WORKDIR -#source ../sourceme.sh +source ../sourceme.sh cat $PBS_NODEFILE -#export MPICH_COLL_SYNC=1 -#export MPICH_ENV_DISPLAY=1 -export MPICH_ export OMP_NUM_THREADS=3 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 -module load oneapi/eng-compiler/2023.05.15.003 -module load mpich/51.2/icc-all-deterministic-pmix-gpu -#export LD_LIBRARY_PATH=/soft/restricted/CNDA/updates/2023.05.15.001/oneapi/compiler/eng-20230512/compiler/linux/lib/:$LD_LIBRARY_PATH +#module load mpich/51.2/icc-all-deterministic-pmix-gpu #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST diff --git a/systems/Aurora/tests/repro1gpu.pbs b/systems/Aurora/tests/repro1gpu.pbs index 283a9343..f8e52705 100644 --- a/systems/Aurora/tests/repro1gpu.pbs +++ b/systems/Aurora/tests/repro1gpu.pbs @@ -1,6 +1,7 @@ #!/bin/bash -#PBS -l select=16:system=sunspot,place=scatter +#PBS -l select=16 +#PBS -q EarlyAppAccess #PBS -A LatticeQCD_aesp_CNDA #PBS -l walltime=02:00:00 #PBS -N repro1gpu @@ -9,8 +10,9 @@ #export OMP_PROC_BIND=spread #unset OMP_PLACES -module load oneapi/eng-compiler/2023.05.15.003 -module load mpich/51.2/icc-all-deterministic-pmix-gpu + +#module load oneapi/eng-compiler/2023.05.15.003 +#module load mpich/51.2/icc-all-deterministic-pmix-gpu # 56 cores / 6 threads ~9 export OMP_NUM_THREADS=6 @@ -34,6 +36,8 @@ export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" cd $PBS_O_WORKDIR +source ../sourceme.sh + NN=`cat 
$PBS_NODEFILE | wc -l` echo $PBS_NODEFILE cat $PBS_NODEFILE diff --git a/systems/Aurora/tests/reproBigJob.pbs b/systems/Aurora/tests/reproBigJob.pbs new file mode 100644 index 00000000..205fefce --- /dev/null +++ b/systems/Aurora/tests/reproBigJob.pbs @@ -0,0 +1,63 @@ +#!/bin/bash + +#PBS -l select=16 +#PBS -q EarlyAppAccess +#PBS -A LatticeQCD_aesp_CNDA +#PBS -l walltime=02:00:00 +#PBS -N reproBigJob +#PBS -k doe + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +#module load oneapi/eng-compiler/2023.05.15.003 +#module load mpich/51.2/icc-all-deterministic-pmix-gpu + +# 56 cores / 6 threads ~9 +export OMP_NUM_THREADS=6 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 + +export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1 +export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 +export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" + +export GRID_PRINT_ENTIRE_LOG=0 +export GRID_CHECKSUM_RECV_BUF=0 +export GRID_CHECKSUM_SEND_BUF=0 + +export MPICH_OFI_NIC_POLICY=GPU + +export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 +export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0 +export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling +unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE +unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE +unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE + +cd $PBS_O_WORKDIR + +DIR=reproBigJob.$PBS_JOBID + +mkdir -p $DIR +cd $DIR + +cp $PBS_NODEFILE nodefile + +CMD="mpiexec -np 192 -ppn 12 -envall --hostfile nodefile \ + ../gpu_tile_compact.sh \ + ../Test_dwf_mixedcg_prec --mpi 
4.4.4.3 --grid 128.128.128.96 \ + --shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap" + +echo $CMD > command-line +env > environment +$CMD +grep Oops Grid.stderr.* > failures.$PBS_JOBID +rm core.* diff --git a/systems/Aurora/tests/reproN.pbs b/systems/Aurora/tests/reproN.pbs index 293e7ade..be10558b 100644 --- a/systems/Aurora/tests/reproN.pbs +++ b/systems/Aurora/tests/reproN.pbs @@ -1,6 +1,7 @@ #!/bin/bash -#PBS -l select=32:system=sunspot,place=scatter +#PBS -l select=16 +#PBS -q EarlyAppAccess #PBS -A LatticeQCD_aesp_CNDA #PBS -l walltime=02:00:00 #PBS -N reproN @@ -9,8 +10,8 @@ #export OMP_PROC_BIND=spread #unset OMP_PLACES -module load oneapi/eng-compiler/2023.05.15.003 -module load mpich/51.2/icc-all-deterministic-pmix-gpu +#module load oneapi/eng-compiler/2023.05.15.003 +#module load mpich/51.2/icc-all-deterministic-pmix-gpu # 56 cores / 6 threads ~9 export OMP_NUM_THREADS=6