From fab1efb48c97fd5ce4e9b611500e6e7b32718c8a Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Tue, 19 Mar 2024 14:36:21 +0000
Subject: [PATCH] More britney logging improvements

---
 Grid/lattice/Lattice_reduction.h   |  4 +-
 Grid/util/Init.cc                  | 22 ++++++--
 Grid/util/Init.h                   |  8 +--
 systems/Aurora/tests/repro16.pbs   | 36 ++++++++++---
 systems/Aurora/tests/repro1gpu.pbs | 81 ++++++++++++++++++++++++++++++
 systems/Aurora/tests/reproN.pbs    | 78 ++++++++++++++++++++++++++++
 tests/Test_dwf_mixedcg_prec.cc     | 45 +++++++++++++++--
 7 files changed, 253 insertions(+), 21 deletions(-)
 create mode 100644 systems/Aurora/tests/repro1gpu.pbs
 create mode 100644 systems/Aurora/tests/reproN.pbs

diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h
index ad9d9942..969a4a10 100644
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -285,9 +285,11 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
 template<class vobj>
 inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
   GridBase *grid = left.Grid();
+  uint32_t csum=0;
+  //  Uint32Checksum(left,csum);
   ComplexD nrm = rankInnerProduct(left,right);
   RealD local = real(nrm);
-  GridNormLog(real(nrm)); // Could log before and after global sum to distinguish local and MPI
+  GridNormLog(real(nrm),csum); // Could log before and after global sum to distinguish local and MPI
   grid->GlobalSum(nrm);
   GridMPINormLog(local,real(nrm));
   return nrm;
diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc
index ccc47cc9..18a3d5fe 100644
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -99,6 +99,7 @@ int32_t GridNormLoggingCounter;
 int32_t GridMPINormLoggingCounter;
 std::vector<double> GridNormLogVector;
 std::vector<double> GridMPINormLogVector;
+std::vector<uint32_t> GridCsumLogVector;
 
 void SetGridNormLoggingMode(GridNormLoggingMode_t mode)
 {
@@ -117,6 +118,8 @@ void SetGridNormLoggingMode(GridNormLoggingMode_t mode)
     GridNormLoggingCounter=0;
     GridMPINormLoggingCounter=0;
     GridNormLogVector.resize(0);
+    GridCsumLogVector.resize(0);
+    GridMPINormLogVector.resize(0);
     break;
   default:
     assert(0);
@@ -129,6 +132,8 @@ void SetGridNormLoggingModePrint(void)
   GridNormLoggingCounter = 0;
   GridMPINormLoggingCounter=0;
   GridNormLogVector.resize(0);
+  GridCsumLogVector.resize(0);
+  GridMPINormLogVector.resize(0);
   GridNormLoggingMode = GridNormLoggingModePrint;
 }
 void SetGridNormLoggingModeRecord(void)
@@ -137,6 +142,8 @@ void SetGridNormLoggingModeRecord(void)
   GridNormLoggingCounter = 0;
   GridMPINormLoggingCounter=0;
   GridNormLogVector.resize(0);
+  GridCsumLogVector.resize(0);
+  GridMPINormLogVector.resize(0);
   GridNormLoggingMode = GridNormLoggingModeRecord;
 }
 void SetGridNormLoggingModeVerify(void)
@@ -146,24 +153,29 @@ void SetGridNormLoggingModeVerify(void)
   GridMPINormLoggingCounter=0;
   GridNormLoggingMode = GridNormLoggingModeVerify;
 }
-void GridNormLog(double value)
+void GridNormLog(double value,uint32_t csum)
 {
   if(GridNormLoggingMode == GridNormLoggingModePrint) {
-    std::cerr<<"GridNormLog : "<< GridNormLoggingCounter <<" " << std::hexfloat << value <
diff --git a/Grid/util/Init.h b/Grid/util/Init.h
--- a/Grid/util/Init.h
+++ b/Grid/util/Init.h
-extern int GridNormLoggingMode;
-extern int32_t GridNormLoggingCounter;
-extern std::vector<double> GridNormLogVector;
+//extern int GridNormLoggingMode;
+//extern int32_t GridNormLoggingCounter;
+//extern std::vector<double> GridNormLogVector;
 void SetGridNormLoggingModePrint(void);
 void SetGridNormLoggingModeRecord(void);
 void SetGridNormLoggingModeVerify(void);
 void SetGridNormLoggingMode(GridNormLoggingMode_t mode);
-void GridNormLog(double value);
+void GridNormLog(double value,uint32_t csum);
 void GridMPINormLog(double lcl, double glbl);
 
 NAMESPACE_END(Grid);
diff --git a/systems/Aurora/tests/repro16.pbs b/systems/Aurora/tests/repro16.pbs
index c15ced99..fa37ae09 100644
--- a/systems/Aurora/tests/repro16.pbs
+++ b/systems/Aurora/tests/repro16.pbs
@@ -2,26 +2,39 @@
 
 ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
 
-#PBS -q EarlyAppAccess
-#PBS -l select=16
-#PBS -l walltime=02:00:00
+#PBS -l select=16:system=sunspot,place=scatter
 #PBS -A LatticeQCD_aesp_CNDA
+#PBS -l walltime=01:00:00
+#PBS -N dwf
+#PBS -k doe
 
 #export OMP_PROC_BIND=spread
 #unset OMP_PLACES
 
 cd $PBS_O_WORKDIR
 
-source ../sourceme.sh
+#source ../sourceme.sh
 
 cat $PBS_NODEFILE
 
+#export MPICH_COLL_SYNC=1
+#export MPICH_ENV_DISPLAY=1
+export MPICH_
 export OMP_NUM_THREADS=3
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 
+module load oneapi/eng-compiler/2023.05.15.003
+module load mpich/51.2/icc-all-deterministic-pmix-gpu
+#export LD_LIBRARY_PATH=/soft/restricted/CNDA/updates/2023.05.15.001/oneapi/compiler/eng-20230512/compiler/linux/lib/:$LD_LIBRARY_PATH
 
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
 
+export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
+export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
+export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
+unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
+unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
+unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
@@ -32,10 +45,17 @@ export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 export MPICH_OFI_NIC_POLICY=GPU
 
-# 12 ppn, 16 nodes, 192 ranks
+DIR=repro.$PBS_JOBID
+mkdir $DIR
+cd $DIR
+
 CMD="mpiexec -np 192 -ppn 12 -envall \
-	     ./gpu_tile_compact.sh \
-	     ./Test_dwf_mixedcg_prec --mpi 2.4.4.6 --grid 64.128.128.192 \
-	     --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 "
+	     ../gpu_tile_compact.sh \
+	     ../Test_dwf_mixedcg_prec --mpi 2.4.4.6 --grid 64.128.128.192 \
+	     --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000 --debug-stdout --log Message,Iterative"
 #--comms-overlap
 $CMD
+
+grep Oops Grid.stderr.* > failures.$PBS_JOBID
+rm core.*
+
diff --git a/systems/Aurora/tests/repro1gpu.pbs b/systems/Aurora/tests/repro1gpu.pbs
new file mode 100644
index 00000000..3b95b404
--- /dev/null
+++ b/systems/Aurora/tests/repro1gpu.pbs
@@ -0,0 +1,81 @@
+#!/bin/bash
+
+#PBS -l select=16:system=sunspot,place=scatter
+#PBS -A LatticeQCD_aesp_CNDA
+#PBS -l walltime=02:00:00
+#PBS -N repro1gpu
+#PBS -k doe
+
+#export OMP_PROC_BIND=spread
+#unset OMP_PLACES
+
+module load oneapi/eng-compiler/2023.05.15.003
+module load mpich/51.2/icc-all-deterministic-pmix-gpu
+
+# 56 cores / 6 threads ~9
+export OMP_NUM_THREADS=6
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+export MPICH_OFI_NIC_POLICY=GPU
+
+export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
+export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
+export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
+unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
+unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
+unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
+
+cd $PBS_O_WORKDIR
+
+NN=`cat $PBS_NODEFILE | wc -l`
+echo $PBS_NODEFILE
+cat $PBS_NODEFILE
+
+echo $NN nodes in node file
+for n in `eval echo {1..$NN}`
+do
+
+THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
+echo Node $n is $THIS_NODE
+
+
+for g in {0..11}
+do
+export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
+export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
+export GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )
+
+export numa=${NUMA_MAP[$g]}
+export gpu_id=${GPU_MAP[$g]}
+export tile_id=${TILE_MAP[$g]}
+export gpu=$gpu_id.$tile_id
+
+cd $PBS_O_WORKDIR
+
+DIR=repro.1gpu.$PBS_JOBID/node-$n-$THIS_NODE-GPU-$gpu
+mkdir -p $DIR
+cd $DIR
+
+echo $THIS_NODE > nodefile
+echo $gpu > gpu
+
+export ZE_AFFINITY_MASK=$gpu
+export ONEAPI_DEVICE_FILTER=gpu,level_zero
+
+CMD="mpiexec -np 1 -ppn 1 -envall --hostfile nodefile \
+	     numactl -N $numa -m $numa ../../Test_dwf_mixedcg_prec --mpi 1.1.1.1 --grid 16.16.32.32 \
+	     --shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message"
+echo $CMD
+$CMD &
+
+done
+done
+
+wait
+
diff --git a/systems/Aurora/tests/reproN.pbs b/systems/Aurora/tests/reproN.pbs
new file mode 100644
index 00000000..9008a362
--- /dev/null
+++ b/systems/Aurora/tests/reproN.pbs
@@ -0,0 +1,78 @@
+#!/bin/bash
+
+#PBS -l select=16:system=sunspot,place=scatter
+#PBS -A LatticeQCD_aesp_CNDA
+#PBS -l walltime=02:00:00
+#PBS -N reproN
+#PBS -k doe
+
+#export OMP_PROC_BIND=spread
+#unset OMP_PLACES
+
+module load oneapi/eng-compiler/2023.05.15.003
+module load mpich/51.2/icc-all-deterministic-pmix-gpu
+
+# 56 cores / 6 threads ~9
+export OMP_NUM_THREADS=6
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+export MPICH_OFI_NIC_POLICY=GPU
+
+export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
+export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
+export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
+unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
+unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
+unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
+
+cd $PBS_O_WORKDIR
+
+NN=`cat $PBS_NODEFILE | wc -l`
+echo $PBS_NODEFILE
+cat $PBS_NODEFILE
+
+echo $NN nodes in node file
+for n in `eval echo {1..$NN}`
+do
+
+cd $PBS_O_WORKDIR
+
+THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
+echo Node $n is $THIS_NODE
+
+DIR=repro.$PBS_JOBID/node-$n-$THIS_NODE
+
+mkdir -p $DIR
+cd $DIR
+
+echo $THIS_NODE > nodefile
+
+CMD="mpiexec -np 12 -ppn 12 -envall --hostfile nodefile \
+	     ../../gpu_tile_compact.sh \
+	     ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \
+	     --shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap"
+
+$CMD &
+
+done
+
+wait
+
+for n in ` eval echo {1..$NN} `
+do
+
+THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
+DIR=repro.$PBS_JOBID/node-$n-$THIS_NODE
+
+cd $DIR
+
+grep Oops Grid.stderr.* > failures.$PBS_JOBID
+rm core.*
+
+done
diff --git a/tests/Test_dwf_mixedcg_prec.cc b/tests/Test_dwf_mixedcg_prec.cc
index e5f32ab5..ea37b29e 100644
--- a/tests/Test_dwf_mixedcg_prec.cc
+++ b/tests/Test_dwf_mixedcg_prec.cc
@@ -34,6 +34,45 @@ using namespace Grid;
 #define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
 #endif
 
+NAMESPACE_BEGIN(Grid);
+template<class Matrix,class Field>
+ class SchurDiagMooeeOperatorParanoid : public SchurOperatorBase<Field> {
+ public:
+  Matrix &_Mat;
+  SchurDiagMooeeOperatorParanoid (Matrix &Mat): _Mat(Mat){};
+  virtual void Mpc (const Field &in, Field &out) {
+    Field tmp(in.Grid());
+    tmp.Checkerboard() = !in.Checkerboard();
+    // std::cout <<" Mpc starting"<
-  SchurDiagMooeeOperator<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
-  SchurDiagMooeeOperator<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f);
+  SchurDiagMooeeOperatorParanoid<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
+  SchurDiagMooeeOperatorParanoid<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f);
 
   int nsecs=600;
   if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){
@@ -144,7 +183,7 @@ int main (int argc, char ** argv)
   csumref=0;
   int i=0;
   do {
-    if ( iter == 0 ) {
+    if ( i == 0 ) {
       SetGridNormLoggingMode(GridNormLoggingModeRecord);
     } else {
       SetGridNormLoggingMode(GridNormLoggingModeVerify);