# Patch summary (text before the first "diff --git" header is ignored by patch tools).
#
# 1. Grid/util/FlightRecorder.h: declares one additional static member,
#    uint64_t CommsErrorCount(void), in class FlightRecorder, next to the
#    existing ErrorCount(void) declaration. Only the declaration is in this patch.
#
# 2. systems/Aurora/tests/reproBigJob256.pbs: new bash job script with PBS
#    directives (select=256, queue run_next, walltime 06:00:00, job name
#    reproBigJob). It exports OpenMP / SYCL / MPICH settings, sets
#    GRID_CHECKSUM_RECV_BUF=1 and GRID_CHECKSUM_SEND_BUF=1, changes into a
#    per-job directory reproBigJob.$PBS_JOBID, records the environment, runs
#    $CMD, and greps "Oops" from */Grid.stderr.* into failures.$PBS_JOBID.
#
# NOTE(review): the second hunk looks truncated. Its header declares 62 added
# lines but fewer are present, the line "echo > pingjob < command-line" reads
# like two statements fused together, and $CMD is executed without ever being
# assigned in the visible text. Presumably a here-document and the CMD=
# assignment were lost in extraction -- confirm against the original commit
# before applying.
# NOTE(review): member indentation in the first hunk was reconstructed; the
# original leading whitespace is not recoverable from this copy -- verify.
diff --git a/Grid/util/FlightRecorder.h b/Grid/util/FlightRecorder.h
index cd3f1c45..8bd8f0b9 100644
--- a/Grid/util/FlightRecorder.h
+++ b/Grid/util/FlightRecorder.h
@@ -39,6 +39,7 @@ class FlightRecorder {
   static void Truncate(void);
   static void ResetCounters(void);
   static uint64_t ErrorCount(void);
+  static uint64_t CommsErrorCount(void);
   static void xmitLog(void *,uint64_t bytes);
   static void recvLog(void *,uint64_t bytes,int rank);
 };
diff --git a/systems/Aurora/tests/reproBigJob256.pbs b/systems/Aurora/tests/reproBigJob256.pbs
new file mode 100644
index 00000000..8cf75d75
--- /dev/null
+++ b/systems/Aurora/tests/reproBigJob256.pbs
@@ -0,0 +1,62 @@
+#!/bin/bash
+
+#PBS -l select=256
+#PBS -q run_next
+##PBS -A LatticeQCD_aesp_CNDA
+#PBS -A LatticeFlavor
+#PBS -l walltime=06:00:00
+#PBS -N reproBigJob
+#PBS -k doe
+#PBS -l filesystems=flare
+#PBS -l filesystems=home
+
+#export OMP_PROC_BIND=spread
+#unset OMP_PLACES
+
+
+# 56 cores / 6 threads ~9
+export OMP_NUM_THREADS=6
+
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
+export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
+
+export GRID_PRINT_ENTIRE_LOG=0
+export GRID_CHECKSUM_RECV_BUF=1
+export GRID_CHECKSUM_SEND_BUF=1
+
+export MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE=generic
+export MPIR_CVAR_NOLOCAL=1
+
+cd $PBS_O_WORKDIR
+
+source ../sourceme.sh
+
+cp $PBS_NODEFILE nodefile
+
+DIR=reproBigJob.$PBS_JOBID
+
+mkdir -p $DIR
+cd $DIR
+
+cp $PBS_NODEFILE nodefile
+
+BINARY=../Test_dwf_mixedcg_prec
+
+echo > pingjob < command-line
+env > environment
+$CMD
+grep Oops */Grid.stderr.* > failures.$PBS_JOBID
+