From f17b8de907258274c0942568c414810410b970f6 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Thu, 7 Mar 2024 15:22:08 +0900 Subject: [PATCH 01/33] fallback to _POSIX_HOST_NAME_MAX if HOST_NAME_MAX is not defined --- Grid/util/Init.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index 9a0b4376..363d9ef4 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -77,6 +77,10 @@ feenableexcept (unsigned int excepts) } #endif +#ifndef HOST_NAME_MAX +#define HOST_NAME_MAX _POSIX_HOST_NAME_MAX +#endif + NAMESPACE_BEGIN(Grid); ////////////////////////////////////////////////////// From 2b4399f8b1a76ea38702b7c95276328b0a1a785d Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Thu, 7 Mar 2024 15:26:01 +0900 Subject: [PATCH 02/33] more HOST_NAME_MAX fix --- tests/Test_dwf_mixedcg_prec.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/Test_dwf_mixedcg_prec.cc b/tests/Test_dwf_mixedcg_prec.cc index 13cc0bb6..e8d36b7f 100644 --- a/tests/Test_dwf_mixedcg_prec.cc +++ b/tests/Test_dwf_mixedcg_prec.cc @@ -30,6 +30,10 @@ Author: Peter Boyle using namespace std; using namespace Grid; +#ifndef HOST_NAME_MAX +#define HOST_NAME_MAX _POSIX_HOST_NAME_MAX +#endif + int main (int argc, char ** argv) { char hostname[HOST_NAME_MAX+1]; From d2242979726d5607f461c95ad2d79de8700c6338 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 12 Mar 2024 15:15:16 +0000 Subject: [PATCH 03/33] PBS scripts --- systems/Aurora/tests/repro16.pbs | 5 +++-- systems/Aurora/tests/solver/stag16.pbs | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/systems/Aurora/tests/repro16.pbs b/systems/Aurora/tests/repro16.pbs index 28030a3d..c15ced99 100644 --- a/systems/Aurora/tests/repro16.pbs +++ b/systems/Aurora/tests/repro16.pbs @@ -4,7 +4,7 @@ #PBS -q EarlyAppAccess #PBS -l select=16 -#PBS -l walltime=01:00:00 +#PBS -l walltime=02:00:00 #PBS -A LatticeQCD_aesp_CNDA #export OMP_PROC_BIND=spread @@ -36,5 +36,6 @@ export MPICH_OFI_NIC_POLICY=GPU CMD="mpiexec -np 192 -ppn 12 -envall \ ./gpu_tile_compact.sh \ ./Test_dwf_mixedcg_prec --mpi 2.4.4.6 --grid 64.128.128.192 \ - --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000" + --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 " +#--comms-overlap $CMD diff --git a/systems/Aurora/tests/solver/stag16.pbs b/systems/Aurora/tests/solver/stag16.pbs index 5bfe04a6..ec38fe89 100644 --- a/systems/Aurora/tests/solver/stag16.pbs +++ b/systems/Aurora/tests/solver/stag16.pbs @@ -36,5 +36,5 @@ export MPICH_OFI_NIC_POLICY=GPU CMD="mpiexec -np 192 -ppn 12 -envall \ ./gpu_tile_compact.sh \ ./Test_staggered_cg_prec --mpi 2.4.4.6 --grid 128.128.128.192 \ - --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000" + --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000 --comms-overlap" $CMD From cf8632bbac1520444c75652d3582c4e3d0d13808 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 12 Mar 2024 15:15:35 +0000 Subject: [PATCH 04/33] Britney test option --- Grid/lattice/Lattice_reduction.h | 4 +- Grid/lattice/Lattice_rng.h | 26 ++---------- Grid/util/Init.cc | 73 +++++++++++++++++++++++++++++++- Grid/util/Init.h | 17 ++++++++ tests/Test_dwf_mixedcg_prec.cc | 10 +++++ 5 files changed, 106 insertions(+), 24 deletions(-) diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index 3d4c4b03..1e03fad6 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -281,12 +281,14 @@ inline ComplexD rankInnerProduct(const Lattice &left,const Lattice & return nrm; } + template inline ComplexD innerProduct(const Lattice &left,const Lattice &right) { GridBase *grid = left.Grid(); ComplexD nrm = rankInnerProduct(left,right); - // std::cerr<<"flight log " << std::hexfloat << nrm <<" "<GlobalSum(nrm); + // GridNormLog(real(nrm)); return nrm; } diff --git a/Grid/lattice/Lattice_rng.h b/Grid/lattice/Lattice_rng.h index 2212abbe..7c6c97de 100644 --- a/Grid/lattice/Lattice_rng.h +++ b/Grid/lattice/Lattice_rng.h @@ -411,7 +411,7 @@ public: std::cout << GridLogMessage << "Seed SHA256: " << GridChecksum::sha256_string(seeds) << std::endl; SeedFixedIntegers(seeds); } - void SeedFixedIntegers(const std::vector &seeds){ + void SeedFixedIntegers(const std::vector &seeds, int britney=0){ // Everyone generates the same seed_seq based on input seeds CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size()); @@ -428,7 +428,6 @@ public: // MT implementation does not implement fast discard even though // in principle this is possible //////////////////////////////////////////////// -#if 1 thread_for( lidx, _grid->lSites(), { int gidx; @@ -449,29 +448,12 @@ public: int l_idx=generator_idx(o_idx,i_idx); _generators[l_idx] = master_engine; - Skip(_generators[l_idx],gidx); // Skip to next RNG sequence - }); -#else - // Everybody loops over global volume. - thread_for( gidx, _grid->_gsites, { - - // Where is it? - int rank; - int o_idx; - int i_idx; - - Coordinate gcoor; - _grid->GlobalIndexToGlobalCoor(gidx,gcoor); - _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); - - // If this is one of mine we take it - if( rank == _grid->ThisRank() ){ - int l_idx=generator_idx(o_idx,i_idx); - _generators[l_idx] = master_engine; + if ( britney ) { + Skip(_generators[l_idx],l_idx); // Skip to next RNG sequence + } else { Skip(_generators[l_idx],gidx); // Skip to next RNG sequence } }); -#endif #else //////////////////////////////////////////////////////////////// // Machine and thread decomposition dependent seeding is efficient diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index 9a0b4376..b47c240c 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -86,11 +86,83 @@ NAMESPACE_BEGIN(Grid); static Coordinate Grid_default_latt; static Coordinate Grid_default_mpi; + +/////////////////////////////////////////////////////// +// Grid Norm logging for repro testing +/////////////////////////////////////////////////////// +int GridNormLoggingMode; +int32_t GridNormLoggingCounter; +std::vector GridNormLogVector; + +void SetGridNormLoggingMode(GridNormLoggingMode_t mode) +{ + switch ( mode ) { + case GridNormLoggingModePrint: + SetGridNormLoggingModePrint(); + break; + case GridNormLoggingModeRecord: + SetGridNormLoggingModeRecord(); + break; + case GridNormLoggingModeVerify: + SetGridNormLoggingModeVerify(); + break; + case GridNormLoggingModeNone: + GridNormLoggingMode = mode; + GridNormLoggingCounter=0; + GridNormLogVector.resize(0); + break; + default: + assert(0); + } +} + +void SetGridNormLoggingModePrint(void) +{ + GridNormLoggingCounter = 0; + GridNormLogVector.resize(0); + GridNormLoggingMode = GridNormLoggingModePrint; +} +void SetGridNormLoggingModeRecord(void) +{ + GridNormLoggingCounter = 0; + GridNormLogVector.resize(0); + GridNormLoggingMode = GridNormLoggingModeRecord; +} +void SetGridNormLoggingModeVerify(void) +{ + GridNormLoggingCounter = 0; + GridNormLoggingMode = GridNormLoggingModeVerify; +} +void GridNormLog(double value) +{ + if(GridNormLoggingMode == GridNormLoggingModePrint) { + std::cerr<<"GridNormLog : "<< GridNormLoggingCounter <<" " << std::hexfloat << value < GridNormLogVector; +void SetGridNormLoggingModePrint(void); +void SetGridNormLoggingModeRecord(void); +void SetGridNormLoggingModeVerify(void); +void SetGridNormLoggingMode(GridNormLoggingMode_t mode); +void GridNormLog(double value); + NAMESPACE_END(Grid); diff --git a/tests/Test_dwf_mixedcg_prec.cc b/tests/Test_dwf_mixedcg_prec.cc index 13cc0bb6..fb1fa59a 100644 --- a/tests/Test_dwf_mixedcg_prec.cc +++ b/tests/Test_dwf_mixedcg_prec.cc @@ -104,6 +104,11 @@ int main (int argc, char ** argv) csumref=0; int iter=0; do { + if ( iter == 0 ) { + SetGridNormLoggingMode(GridNormLoggingModeRecord); + } else { + SetGridNormLoggingMode(GridNormLoggingModeVerify); + } std::cerr << "******************* SINGLE PRECISION SOLVE "< Date: Tue, 12 Mar 2024 16:11:33 +0000 Subject: [PATCH 05/33] Repro test --- systems/Aurora/tests/repro128.pbs | 41 +++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 systems/Aurora/tests/repro128.pbs diff --git a/systems/Aurora/tests/repro128.pbs b/systems/Aurora/tests/repro128.pbs new file mode 100644 index 00000000..34e2edc5 --- /dev/null +++ b/systems/Aurora/tests/repro128.pbs @@ -0,0 +1,41 @@ +#!/bin/bash + +## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 + +#PBS -q EarlyAppAccess +#PBS -l select=128 +#PBS -l walltime=02:00:00 +#PBS -A LatticeQCD_aesp_CNDA + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +cd $PBS_O_WORKDIR + +source ../sourceme.sh + +cat $PBS_NODEFILE + +export OMP_NUM_THREADS=3 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 + +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE +#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST + +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +# 12 ppn, 16 nodes, 192 ranks +# 12 ppn, 128 nodes, 1536 ranks +CMD="mpiexec -np 1536 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Test_dwf_mixedcg_prec --mpi 4.4.4.24 --grid 128.128.128.384 \ + --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 7000 --comms-overlap " +$CMD From 95f3d69cf9e03b804ab0bceb6c1a45df602ebf4e Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 12 Mar 2024 20:09:37 +0000 Subject: [PATCH 06/33] Extra hardware test hook --- Grid/lattice/Lattice_reduction.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index 1e03fad6..ad9d9942 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -286,9 +286,10 @@ template inline ComplexD innerProduct(const Lattice &left,const Lattice &right) { GridBase *grid = left.Grid(); ComplexD nrm = rankInnerProduct(left,right); - // GridNormLog(real(nrm)); // Could log before and after global sum to distinguish local and MPI + RealD local = real(nrm); + GridNormLog(real(nrm)); // Could log before and after global sum to distinguish local and MPI grid->GlobalSum(nrm); - // GridNormLog(real(nrm)); + GridMPINormLog(local,real(nrm)); return nrm; } From 62e7bf024a95df85adb42481523cb393f0beb833 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 12 Mar 2024 20:10:04 +0000 Subject: [PATCH 07/33] Updated flight logging for Britney test --- Grid/util/Init.cc | 46 +++++++++++++++++++++++++++++++++++++++++++++- Grid/util/Init.h | 1 + 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index c1466b45..ccc47cc9 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -96,7 +96,9 @@ static Coordinate Grid_default_mpi; /////////////////////////////////////////////////////// int GridNormLoggingMode; int32_t GridNormLoggingCounter; +int32_t GridMPINormLoggingCounter; std::vector GridNormLogVector; +std::vector GridMPINormLogVector; void SetGridNormLoggingMode(GridNormLoggingMode_t mode) { @@ -113,6 +115,7 @@ void SetGridNormLoggingMode(GridNormLoggingMode_t mode) case GridNormLoggingModeNone: GridNormLoggingMode = mode; GridNormLoggingCounter=0; + GridMPINormLoggingCounter=0; GridNormLogVector.resize(0); break; default: @@ -122,19 +125,25 @@ void SetGridNormLoggingMode(GridNormLoggingMode_t mode) void SetGridNormLoggingModePrint(void) { + std::cout << " GridNormLogging Reproducibility logging set to print output " < " < "<< result < "<< result < Date: Tue, 19 Mar 2024 14:28:33 +0000 Subject: [PATCH 08/33] FFTW from OneAPI --- Grid/algorithms/FFT.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/algorithms/FFT.h b/Grid/algorithms/FFT.h index 29f0ec4b..2cbc895c 100644 --- a/Grid/algorithms/FFT.h +++ b/Grid/algorithms/FFT.h @@ -29,7 +29,7 @@ Author: Peter Boyle #define _GRID_FFT_H_ #ifdef HAVE_FFTW -#ifdef USE_MKL +#if defined(USE_MKL) || defined(GRID_SYCL) #include #else #include From fab1efb48c97fd5ce4e9b611500e6e7b32718c8a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 19 Mar 2024 14:36:21 +0000 Subject: [PATCH 09/33] More britney logging improvements --- Grid/lattice/Lattice_reduction.h | 4 +- Grid/util/Init.cc | 22 ++++++-- Grid/util/Init.h | 8 +-- systems/Aurora/tests/repro16.pbs | 36 ++++++++++--- systems/Aurora/tests/repro1gpu.pbs | 81 ++++++++++++++++++++++++++++++ systems/Aurora/tests/reproN.pbs | 78 ++++++++++++++++++++++++++++ tests/Test_dwf_mixedcg_prec.cc | 45 +++++++++++++++-- 7 files changed, 253 insertions(+), 21 deletions(-) create mode 100644 systems/Aurora/tests/repro1gpu.pbs create mode 100644 systems/Aurora/tests/reproN.pbs diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index ad9d9942..969a4a10 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -285,9 +285,11 @@ inline ComplexD rankInnerProduct(const Lattice &left,const Lattice & template inline ComplexD innerProduct(const Lattice &left,const Lattice &right) { GridBase *grid = left.Grid(); + uint32_t csum=0; + // Uint32Checksum(left,csum); ComplexD nrm = rankInnerProduct(left,right); RealD local = real(nrm); - GridNormLog(real(nrm)); // Could log before and after global sum to distinguish local and MPI + GridNormLog(real(nrm),csum); // Could log before and after global sum to distinguish local and MPI grid->GlobalSum(nrm); GridMPINormLog(local,real(nrm)); return nrm; diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index ccc47cc9..18a3d5fe 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -99,6 +99,7 @@ int32_t GridNormLoggingCounter; int32_t GridMPINormLoggingCounter; std::vector GridNormLogVector; std::vector GridMPINormLogVector; +std::vector GridCsumLogVector; void SetGridNormLoggingMode(GridNormLoggingMode_t mode) { @@ -117,6 +118,8 @@ void SetGridNormLoggingMode(GridNormLoggingMode_t mode) GridNormLoggingCounter=0; GridMPINormLoggingCounter=0; GridNormLogVector.resize(0); + GridCsumLogVector.resize(0); + GridMPINormLogVector.resize(0); break; default: assert(0); @@ -129,6 +132,8 @@ void SetGridNormLoggingModePrint(void) GridNormLoggingCounter = 0; GridMPINormLoggingCounter=0; GridNormLogVector.resize(0); + GridCsumLogVector.resize(0); + GridMPINormLogVector.resize(0); GridNormLoggingMode = GridNormLoggingModePrint; } void SetGridNormLoggingModeRecord(void) @@ -137,6 +142,8 @@ void SetGridNormLoggingModeRecord(void) GridNormLoggingCounter = 0; GridMPINormLoggingCounter=0; GridNormLogVector.resize(0); + GridCsumLogVector.resize(0); + GridMPINormLogVector.resize(0); GridNormLoggingMode = GridNormLoggingModeRecord; } void SetGridNormLoggingModeVerify(void) @@ -146,24 +153,29 @@ void SetGridNormLoggingModeVerify(void) GridMPINormLoggingCounter=0; GridNormLoggingMode = GridNormLoggingModeVerify; } -void GridNormLog(double value) +void GridNormLog(double value,uint32_t csum) { if(GridNormLoggingMode == GridNormLoggingModePrint) { - std::cerr<<"GridNormLog : "<< GridNormLoggingCounter <<" " << std::hexfloat << value < GridNormLogVector; +//extern int GridNormLoggingMode; +//extern int32_t GridNormLoggingCounter; +//extern std::vector GridNormLogVector; void SetGridNormLoggingModePrint(void); void SetGridNormLoggingModeRecord(void); void SetGridNormLoggingModeVerify(void); void SetGridNormLoggingMode(GridNormLoggingMode_t mode); -void GridNormLog(double value); +void GridNormLog(double value,uint32_t csum); void GridMPINormLog(double lcl, double glbl); NAMESPACE_END(Grid); diff --git a/systems/Aurora/tests/repro16.pbs b/systems/Aurora/tests/repro16.pbs index c15ced99..fa37ae09 100644 --- a/systems/Aurora/tests/repro16.pbs +++ b/systems/Aurora/tests/repro16.pbs @@ -2,26 +2,39 @@ ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 -#PBS -q EarlyAppAccess -#PBS -l select=16 -#PBS -l walltime=02:00:00 +#PBS -l select=16:system=sunspot,place=scatter #PBS -A LatticeQCD_aesp_CNDA +#PBS -l walltime=01:00:00 +#PBS -N dwf +#PBS -k doe #export OMP_PROC_BIND=spread #unset OMP_PLACES cd $PBS_O_WORKDIR -source ../sourceme.sh +#source ../sourceme.sh cat $PBS_NODEFILE +#export MPICH_COLL_SYNC=1 +#export MPICH_ENV_DISPLAY=1 +export MPICH_ export OMP_NUM_THREADS=3 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 +module load oneapi/eng-compiler/2023.05.15.003 +module load mpich/51.2/icc-all-deterministic-pmix-gpu +#export LD_LIBRARY_PATH=/soft/restricted/CNDA/updates/2023.05.15.001/oneapi/compiler/eng-20230512/compiler/linux/lib/:$LD_LIBRARY_PATH #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST +export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 +export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0 +export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling +unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE +unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE +unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 @@ -32,10 +45,17 @@ export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 export MPICH_OFI_NIC_POLICY=GPU -# 12 ppn, 16 nodes, 192 ranks +DIR=repro.$PBS_JOBID +mkdir $DIR +cd $DIR + CMD="mpiexec -np 192 -ppn 12 -envall \ - ./gpu_tile_compact.sh \ - ./Test_dwf_mixedcg_prec --mpi 2.4.4.6 --grid 64.128.128.192 \ - --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 " + ../gpu_tile_compact.sh \ + ../Test_dwf_mixedcg_prec --mpi 2.4.4.6 --grid 64.128.128.192 \ + --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000 --debug-stdout --log Message,Iterative" #--comms-overlap $CMD + +grep Oops Grid.stderr.* > failures.$PBS_JOBID +rm core.* + diff --git a/systems/Aurora/tests/repro1gpu.pbs b/systems/Aurora/tests/repro1gpu.pbs new file mode 100644 index 00000000..3b95b404 --- /dev/null +++ b/systems/Aurora/tests/repro1gpu.pbs @@ -0,0 +1,81 @@ +#!/bin/bash + +#PBS -l select=16:system=sunspot,place=scatter +#PBS -A LatticeQCD_aesp_CNDA +#PBS -l walltime=02:00:00 +#PBS -N repro1gpu +#PBS -k doe + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +module load oneapi/eng-compiler/2023.05.15.003 +module load mpich/51.2/icc-all-deterministic-pmix-gpu + +# 56 cores / 6 threads ~9 +export OMP_NUM_THREADS=6 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 +export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0 +export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling +unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE +unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE +unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE + +cd $PBS_O_WORKDIR + +NN=`cat $PBS_NODEFILE | wc -l` +echo $PBS_NODEFILE +cat $PBS_NODEFILE + +echo $NN nodes in node file +for n in `eval echo {1..$NN}` +do + +THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 ` +echo Node $n is $THIS_NODE + + +for g in {0..11} +do +export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 ) +export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 ) +export GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 ) + +export numa=${NUMA_MAP[$g]} +export gpu_id=${GPU_MAP[$g]} +export tile_id=${TILE_MAP[$g]} +export gpu=$gpu_id.$tile_id + +cd $PBS_O_WORKDIR + +DIR=repro.1gpu.$PBS_JOBID/node-$n-$THIS_NODE-GPU-$gpu +mkdir -p $DIR +cd $DIR + +echo $THIS_NODE > nodefile +echo $gpu > gpu + +export ZE_AFFINITY_MASK=$gpu +export ONEAPI_DEVICE_FILTER=gpu,level_zero + +CMD="mpiexec -np 1 -ppn 1 -envall --hostfile nodefile \ + numactl -N $numa -m $numa ../../Test_dwf_mixedcg_prec --mpi 1.1.1.1 --grid 16.16.32.32 \ + --shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message" +echo $CMD +$CMD & + +done +done + +wait + diff --git a/systems/Aurora/tests/reproN.pbs b/systems/Aurora/tests/reproN.pbs new file mode 100644 index 00000000..9008a362 --- /dev/null +++ b/systems/Aurora/tests/reproN.pbs @@ -0,0 +1,78 @@ +#!/bin/bash + +#PBS -l select=16:system=sunspot,place=scatter +#PBS -A LatticeQCD_aesp_CNDA +#PBS -l walltime=02:00:00 +#PBS -N reproN +#PBS -k doe + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +module load oneapi/eng-compiler/2023.05.15.003 +module load mpich/51.2/icc-all-deterministic-pmix-gpu + +# 56 cores / 6 threads ~9 +export OMP_NUM_THREADS=6 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 +export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0 +export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling +unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE +unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE +unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE + +cd $PBS_O_WORKDIR + +NN=`cat $PBS_NODEFILE | wc -l` +echo $PBS_NODEFILE +cat $PBS_NODEFILE + +echo $NN nodes in node file +for n in `eval echo {1..$NN}` +do + +cd $PBS_O_WORKDIR + +THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 ` +echo Node $n is $THIS_NODE + +DIR=repro.$PBS_JOBID/node-$n-$THIS_NODE + +mkdir -p $DIR +cd $DIR + +echo $THIS_NODE > nodefile + +CMD="mpiexec -np 12 -ppn 12 -envall --hostfile nodefile \ + ../../gpu_tile_compact.sh \ + ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \ + --shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap" + +$CMD & + +done + +wait + +for n in ` eval echo {1..$NN} ` +do + +THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 ` +DIR=repro.$PBS_JOBID/node-$n-$THIS_NODE + +cd $DIR + +grep Oops Grid.stderr.* > failures.$PBS_JOBID +rm core.* + +done diff --git a/tests/Test_dwf_mixedcg_prec.cc b/tests/Test_dwf_mixedcg_prec.cc index e5f32ab5..ea37b29e 100644 --- a/tests/Test_dwf_mixedcg_prec.cc +++ b/tests/Test_dwf_mixedcg_prec.cc @@ -34,6 +34,45 @@ using namespace Grid; #define HOST_NAME_MAX _POSIX_HOST_NAME_MAX #endif +NAMESPACE_BEGIN(Grid); +template + class SchurDiagMooeeOperatorParanoid : public SchurOperatorBase { + public: + Matrix &_Mat; + SchurDiagMooeeOperatorParanoid (Matrix &Mat): _Mat(Mat){}; + virtual void Mpc (const Field &in, Field &out) { + Field tmp(in.Grid()); + tmp.Checkerboard() = !in.Checkerboard(); + // std::cout <<" Mpc starting"< HermOpEO(Ddwf); - SchurDiagMooeeOperator HermOpEO_f(Ddwf_f); + SchurDiagMooeeOperatorParanoid HermOpEO(Ddwf); + SchurDiagMooeeOperatorParanoid HermOpEO_f(Ddwf_f); int nsecs=600; if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){ @@ -144,7 +183,7 @@ int main (int argc, char ** argv) csumref=0; int i=0; do { - if ( iter == 0 ) { + if ( i == 0 ) { SetGridNormLoggingMode(GridNormLoggingModeRecord); } else { SetGridNormLoggingMode(GridNormLoggingModeVerify); From f6fd6dd053b8663a219a8c740ed01fe02665bb76 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 22 Mar 2024 15:30:01 +0000 Subject: [PATCH 10/33] Flight recorder, resurrecting the "world famous" Britney test --- Grid/util/FlightRecorder.h | 43 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 Grid/util/FlightRecorder.h diff --git a/Grid/util/FlightRecorder.h b/Grid/util/FlightRecorder.h new file mode 100644 index 00000000..d9f6250a --- /dev/null +++ b/Grid/util/FlightRecorder.h @@ -0,0 +1,43 @@ +#pragma once + +NAMESPACE_BEGIN(Grid); +class FlightRecorder { + public: + enum LoggingMode_t { + LoggingModeNone, + LoggingModePrint, + LoggingModeRecord, + LoggingModeVerify + }; + + static int LoggingMode; + static uint64_t ErrorCounter; + static int32_t XmitLoggingCounter; + static int32_t RecvLoggingCounter; + static int32_t CsumLoggingCounter; + static int32_t NormLoggingCounter; + static int32_t ReductionLoggingCounter; + static std::vector XmitLogVector; + static std::vector RecvLogVector; + static std::vector CsumLogVector; + static std::vector NormLogVector; + static std::vector ReductionLogVector; + static int ContinueOnFail; + static int PrintEntireLog; + static int ChecksumComms; + static int ChecksumCommsSend; + static void SetLoggingModePrint(void); + static void SetLoggingModeRecord(void); + static void SetLoggingModeVerify(void); + static void SetLoggingMode(LoggingMode_t mode); + static void NormLog(double value); + static void CsumLog(uint64_t csum); + static void ReductionLog(double lcl, double glbl); + static void Truncate(void); + static void ResetCounters(void); + static uint64_t ErrorCount(void); + static void xmitLog(void *,uint64_t bytes); + static void recvLog(void *,uint64_t bytes,int rank); +}; +NAMESPACE_END(Grid); + From b92dfcc8d3c33e6b3eab6131e2a066acc50f2b90 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 22 Mar 2024 15:30:27 +0000 Subject: [PATCH 11/33] Flight recorder, resurrecting the "world famous" Britney test --- Grid/util/FlightRecorder.cc | 339 ++++++++++++++++++++++++++++++++++++ 1 file changed, 339 insertions(+) create mode 100644 Grid/util/FlightRecorder.cc diff --git a/Grid/util/FlightRecorder.cc b/Grid/util/FlightRecorder.cc new file mode 100644 index 00000000..4b8e0346 --- /dev/null +++ b/Grid/util/FlightRecorder.cc @@ -0,0 +1,339 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/Init.cc + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#include + +NAMESPACE_BEGIN(Grid); +/////////////////////////////////////////////////////// +// Grid Norm logging for repro testing +/////////////////////////////////////////////////////// +int FlightRecorder::PrintEntireLog; +int FlightRecorder::ContinueOnFail; +int FlightRecorder::LoggingMode; +int FlightRecorder::ChecksumComms; +int FlightRecorder::ChecksumCommsSend; +int32_t FlightRecorder::XmitLoggingCounter; +int32_t FlightRecorder::RecvLoggingCounter; +int32_t FlightRecorder::CsumLoggingCounter; +int32_t FlightRecorder::NormLoggingCounter; +int32_t FlightRecorder::ReductionLoggingCounter; +uint64_t FlightRecorder::ErrorCounter; +std::vector FlightRecorder::NormLogVector; +std::vector FlightRecorder::ReductionLogVector; +std::vector FlightRecorder::CsumLogVector; +std::vector FlightRecorder::XmitLogVector; +std::vector FlightRecorder::RecvLogVector; + +void FlightRecorder::ResetCounters(void) +{ + XmitLoggingCounter=0; + RecvLoggingCounter=0; + CsumLoggingCounter=0; + NormLoggingCounter=0; + ReductionLoggingCounter=0; +} +void FlightRecorder::Truncate(void) +{ + ResetCounters(); + XmitLogVector.resize(0); + RecvLogVector.resize(0); + NormLogVector.resize(0); + CsumLogVector.resize(0); + ReductionLogVector.resize(0); +} +void FlightRecorder::SetLoggingMode(FlightRecorder::LoggingMode_t mode) +{ + switch ( mode ) { + case LoggingModePrint: + SetLoggingModePrint(); + break; + case LoggingModeRecord: + SetLoggingModeRecord(); + break; + case LoggingModeVerify: + SetLoggingModeVerify(); + break; + case LoggingModeNone: + LoggingMode = mode; + Truncate(); + break; + default: + assert(0); + } +} + +void FlightRecorder::SetLoggingModePrint(void) +{ + std::cout << " FlightRecorder: set to print output " < " < " < "<< global < dev(1); + acceleratorCopyToDevice(&word,&dev[0],sizeof(uint64_t)); + acceleratorCopySynchronise(); + MPI_Barrier(MPI_COMM_WORLD); + } +} +void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank) +{ + if ( ChecksumComms ){ + uint64_t *ubuf = (uint64_t *)buf; + if(LoggingMode == LoggingModeNone) return; +#ifdef GRID_SYCL + uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t)); + if(LoggingMode == LoggingModePrint) { + std::cerr<<"FlightRecorder::recvLog : "<< RecvLoggingCounter <<" "<< std::hex << _xor < Date: Fri, 22 Mar 2024 15:32:26 +0000 Subject: [PATCH 12/33] Flight recorder, resurrecting the "world famous" Britney test --- Grid/util/Init.cc | 123 ---------------------------------------------- 1 file changed, 123 deletions(-) diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index 18a3d5fe..62ee670c 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -90,129 +90,6 @@ NAMESPACE_BEGIN(Grid); static Coordinate Grid_default_latt; static Coordinate Grid_default_mpi; - -/////////////////////////////////////////////////////// -// Grid Norm logging for repro testing -/////////////////////////////////////////////////////// -int GridNormLoggingMode; -int32_t GridNormLoggingCounter; -int32_t GridMPINormLoggingCounter; -std::vector GridNormLogVector; -std::vector GridMPINormLogVector; -std::vector GridCsumLogVector; - -void SetGridNormLoggingMode(GridNormLoggingMode_t mode) -{ - switch ( mode ) { - case GridNormLoggingModePrint: - SetGridNormLoggingModePrint(); - break; - case GridNormLoggingModeRecord: - SetGridNormLoggingModeRecord(); - break; - case GridNormLoggingModeVerify: - SetGridNormLoggingModeVerify(); - break; - case GridNormLoggingModeNone: - GridNormLoggingMode = mode; - GridNormLoggingCounter=0; - GridMPINormLoggingCounter=0; - GridNormLogVector.resize(0); - GridCsumLogVector.resize(0); - GridMPINormLogVector.resize(0); - break; - default: - assert(0); - } -} - -void SetGridNormLoggingModePrint(void) -{ - std::cout << " GridNormLogging Reproducibility logging set to print output " < " < "<< result < "<< result < Date: Fri, 22 Mar 2024 15:32:32 +0000 Subject: [PATCH 13/33] Flight recorder, resurrecting the "world famous" Britney test --- Grid/util/Init.h | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/Grid/util/Init.h b/Grid/util/Init.h index 5d5ecd2f..ac929525 100644 --- a/Grid/util/Init.h +++ b/Grid/util/Init.h @@ -70,21 +70,6 @@ void GridParseLayout(char **argv,int argc, void printHash(void); -enum GridNormLoggingMode_t { - GridNormLoggingModeNone, - GridNormLoggingModePrint, - GridNormLoggingModeRecord, - GridNormLoggingModeVerify -}; -//extern int GridNormLoggingMode; -//extern int32_t GridNormLoggingCounter; -//extern std::vector GridNormLogVector; -void SetGridNormLoggingModePrint(void); -void SetGridNormLoggingModeRecord(void); -void SetGridNormLoggingModeVerify(void); -void SetGridNormLoggingMode(GridNormLoggingMode_t mode); -void GridNormLog(double value,uint32_t csum); -void GridMPINormLog(double lcl, double glbl); NAMESPACE_END(Grid); From e49e95b037a9dd499b23669d0b3ca50f98304347 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 22 Mar 2024 15:39:27 +0000 Subject: [PATCH 14/33] Upgrade of the Britney test with flight recorder and fast xor checksum --- Grid/lattice/Lattice_reduction.h | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index 969a4a10..4e11378d 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -285,13 +285,25 @@ inline ComplexD rankInnerProduct(const Lattice &left,const Lattice & template inline ComplexD innerProduct(const Lattice &left,const Lattice &right) { GridBase *grid = left.Grid(); - uint32_t csum=0; - // Uint32Checksum(left,csum); + +#ifdef GRID_SYCL + uint64_t csum=0; + if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone) + { + // Hack + // Fast integer xor checksum. Can also be used in comms now. + autoView(l_v,left,AcceleratorRead); + Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t); + uint64_t *base= (uint64_t *)&l_v[0]; + csum=svm_xor(base,words); + } + FlightRecorder::CsumLog(csum); +#endif ComplexD nrm = rankInnerProduct(left,right); RealD local = real(nrm); - GridNormLog(real(nrm),csum); // Could log before and after global sum to distinguish local and MPI + FlightRecorder::NormLog(real(nrm)); grid->GlobalSum(nrm); - GridMPINormLog(local,real(nrm)); + FlightRecorder::ReductionLog(local,real(nrm)); return nrm; } From 1bd20cd9e8555314390c69876420e4de1f1fda1e Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 22 Mar 2024 15:40:01 +0000 Subject: [PATCH 15/33] FlightRecorder --- Grid/util/Util.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Grid/util/Util.h b/Grid/util/Util.h index 85c7596f..906c46a1 100644 --- a/Grid/util/Util.h +++ b/Grid/util/Util.h @@ -1,6 +1,6 @@ -#ifndef GRID_UTIL_H -#define GRID_UTIL_H +#pragma once #include #include #include -#endif +#include + From a477c25e8c1b3efb3a646ee47cf70cbcc99a9e12 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 22 Mar 2024 15:42:11 +0000 Subject: [PATCH 16/33] Sunspot repro tests --- systems/Sunspot/tests/repro1gpu.pbs | 81 ++++++++++++++++++++++++ systems/Sunspot/tests/reproN.pbs | 97 +++++++++++++++++++++++++++++ 2 files changed, 178 insertions(+) create mode 100644 systems/Sunspot/tests/repro1gpu.pbs create mode 100644 systems/Sunspot/tests/reproN.pbs diff --git a/systems/Sunspot/tests/repro1gpu.pbs b/systems/Sunspot/tests/repro1gpu.pbs new file mode 100644 index 00000000..3b95b404 --- /dev/null +++ b/systems/Sunspot/tests/repro1gpu.pbs @@ -0,0 +1,81 @@ +#!/bin/bash + +#PBS -l select=16:system=sunspot,place=scatter +#PBS -A LatticeQCD_aesp_CNDA +#PBS -l walltime=02:00:00 +#PBS -N repro1gpu +#PBS -k doe + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +module load oneapi/eng-compiler/2023.05.15.003 +module load mpich/51.2/icc-all-deterministic-pmix-gpu + +# 56 cores / 6 threads ~9 +export OMP_NUM_THREADS=6 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 +export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0 +export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling +unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE +unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE +unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE + +cd $PBS_O_WORKDIR + +NN=`cat $PBS_NODEFILE | wc -l` +echo $PBS_NODEFILE +cat $PBS_NODEFILE + +echo $NN nodes in node file +for n in `eval echo {1..$NN}` +do + +THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 ` +echo Node $n is $THIS_NODE + + +for g in {0..11} +do +export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 ) +export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 ) +export GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 ) + +export numa=${NUMA_MAP[$g]} +export gpu_id=${GPU_MAP[$g]} +export tile_id=${TILE_MAP[$g]} +export gpu=$gpu_id.$tile_id + +cd $PBS_O_WORKDIR + +DIR=repro.1gpu.$PBS_JOBID/node-$n-$THIS_NODE-GPU-$gpu +mkdir -p $DIR +cd $DIR + +echo $THIS_NODE > nodefile +echo $gpu > gpu + +export ZE_AFFINITY_MASK=$gpu +export ONEAPI_DEVICE_FILTER=gpu,level_zero + +CMD="mpiexec -np 1 -ppn 1 -envall --hostfile nodefile \ + numactl -N $numa -m $numa ../../Test_dwf_mixedcg_prec --mpi 1.1.1.1 --grid 16.16.32.32 \ + --shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message" +echo $CMD +$CMD & + +done +done + +wait + diff --git a/systems/Sunspot/tests/reproN.pbs b/systems/Sunspot/tests/reproN.pbs new file mode 100644 index 00000000..fde4f3a9 --- /dev/null +++ b/systems/Sunspot/tests/reproN.pbs @@ -0,0 +1,97 @@ +#!/bin/bash + +#PBS -l select=32:system=sunspot,place=scatter +#PBS -A LatticeQCD_aesp_CNDA +#PBS -l walltime=02:00:00 +#PBS -N reproN +#PBS -k doe + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +module load oneapi/eng-compiler/2023.05.15.003 +module load mpich/51.2/icc-all-deterministic-pmix-gpu + +# 56 cores / 6 threads ~9 +export OMP_NUM_THREADS=6 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 + +export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1 +export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 + +export GRID_PRINT_ENTIRE_LOG=0 +export GRID_CHECKSUM_RECV_BUF=1 +export GRID_CHECKSUM_SEND_BUF=0 + +export MPICH_OFI_NIC_POLICY=GPU + +export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 +export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0 +export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling +unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE +unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE +unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE + +cd $PBS_O_WORKDIR + +NN=`cat $PBS_NODEFILE | wc -l` +echo $PBS_NODEFILE +cat $PBS_NODEFILE + +echo $NN nodes in node file +for n in `eval echo {1..$NN}` +do + +cd $PBS_O_WORKDIR + +THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 ` +echo Node $n is $THIS_NODE + +DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE + +mkdir -p $DIR +cd $DIR + +echo $THIS_NODE > nodefile + +#CMD="mpiexec -np 12 -ppn 12 -envall --hostfile nodefile \ +# ../../gpu_tile_compact.sh \ +# ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \ +# --shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap" + +CMD="mpiexec -np 12 -ppn 12 -envall --hostfile nodefile \ + ../../gpu_tile_compact.sh \ + ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \ + --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap" + +echo $CMD > command-line +env > environment +$CMD & + +done + +# Suspicious wait is allowing jobs to collide and knock out +#wait + +sleep 6500 + +for n in ` eval echo {1..$NN} ` +do + +THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 ` +DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE + +cd $DIR + +grep Oops Grid.stderr.* > failures.$PBS_JOBID +rm core.* + +done From d01e5fa83887d5fbf5268b6f67d8499debb2b519 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 22 Mar 2024 15:42:32 +0000 Subject: [PATCH 17/33] Improved FlightRecorder --- tests/Test_dwf_mixedcg_prec.cc | 51 +++++++++++++--------------------- 1 file changed, 20 insertions(+), 31 deletions(-) diff --git a/tests/Test_dwf_mixedcg_prec.cc b/tests/Test_dwf_mixedcg_prec.cc index ea37b29e..f7df05b9 100644 --- a/tests/Test_dwf_mixedcg_prec.cc +++ b/tests/Test_dwf_mixedcg_prec.cc @@ -34,6 +34,7 @@ using namespace Grid; #define HOST_NAME_MAX _POSIX_HOST_NAME_MAX #endif + NAMESPACE_BEGIN(Grid); template class SchurDiagMooeeOperatorParanoid : public SchurOperatorBase { @@ -143,14 +144,21 @@ int main (int argc, char ** argv) time_t start = time(NULL); - uint32_t csum, csumref; - csumref=0; + FlightRecorder::ContinueOnFail = 0; + FlightRecorder::PrintEntireLog = 0; + FlightRecorder::ChecksumComms = 1; + FlightRecorder::ChecksumCommsSend=0; + + if(char *s=getenv("GRID_PRINT_ENTIRE_LOG")) FlightRecorder::PrintEntireLog = atoi(s); + if(char *s=getenv("GRID_CHECKSUM_RECV_BUF")) FlightRecorder::ChecksumComms = atoi(s); + if(char *s=getenv("GRID_CHECKSUM_SEND_BUF")) FlightRecorder::ChecksumCommsSend = atoi(s); + int iter=0; do { if ( iter == 0 ) { - SetGridNormLoggingMode(GridNormLoggingModeRecord); + FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord); } else { - SetGridNormLoggingMode(GridNormLoggingModeVerify); + FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeVerify); } std::cerr << "******************* SINGLE PRECISION SOLVE "<gSites()*iters; std::cout << " SinglePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.< CG(1.0e-8,10000); - csumref=0; int i=0; do { if ( i == 0 ) { - SetGridNormLoggingMode(GridNormLoggingModeRecord); + FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord); } else { - SetGridNormLoggingMode(GridNormLoggingModeVerify); + FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeVerify); } std::cerr << "******************* DOUBLE PRECISION SOLVE "< Date: Fri, 22 Mar 2024 15:42:57 +0000 Subject: [PATCH 18/33] Xor csum for repro testing --- Grid/lattice/Lattice_reduction_sycl.h | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/Grid/lattice/Lattice_reduction_sycl.h b/Grid/lattice/Lattice_reduction_sycl.h index 90980c4c..8395eb7c 100644 --- a/Grid/lattice/Lattice_reduction_sycl.h +++ b/Grid/lattice/Lattice_reduction_sycl.h @@ -69,29 +69,30 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite return result; } -NAMESPACE_END(Grid); -/* -template Double svm_reduce(Double *vec,uint64_t L) +template Word svm_xor(Word *vec,uint64_t L) { - Double sumResult; zeroit(sumResult); - Double *d_sum =(Double *)cl::sycl::malloc_shared(sizeof(Double),*theGridAccelerator); - Double identity; zeroit(identity); + Word xorResult; xorResult = 0; + Word *d_sum =(Word *)cl::sycl::malloc_shared(sizeof(Word),*theGridAccelerator); + Word identity; identity=0; theGridAccelerator->submit([&](cl::sycl::handler &cgh) { - auto Reduction = cl::sycl::reduction(d_sum,identity,std::plus<>()); + auto Reduction = cl::sycl::reduction(d_sum,identity,std::bit_xor<>()); cgh.parallel_for(cl::sycl::range<1>{L}, Reduction, [=] (cl::sycl::id<1> index, auto &sum) { - sum +=vec[index]; + sum ^=vec[index]; }); }); theGridAccelerator->wait(); - Double ret = d_sum[0]; + Word ret = d_sum[0]; free(d_sum,*theGridAccelerator); - std::cout << " svm_reduce finished "< command-line +env > environment $CMD & done -wait +# Suspicious wait is allowing jobs to collide and knock out +#wait + +sleep 6500 for n in ` eval echo {1..$NN} ` do THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 ` -DIR=repro.$PBS_JOBID/node-$n-$THIS_NODE +DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE cd $DIR From 4b87259c1bebbb5fcbfe27bd51bee3449f239fbc Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 22 Mar 2024 15:43:49 +0000 Subject: [PATCH 20/33] New config command for sunspot --- systems/Sunspot/config-command | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/systems/Sunspot/config-command b/systems/Sunspot/config-command index e59ef515..dbfe43c1 100644 --- a/systems/Sunspot/config-command +++ b/systems/Sunspot/config-command @@ -1,4 +1,4 @@ -TOOLS=$HOME/tools + ../../configure \ --enable-simd=GPU \ --enable-gen-simd-width=64 \ @@ -11,6 +11,6 @@ TOOLS=$HOME/tools --enable-unified=no \ MPICXX=mpicxx \ CXX=icpx \ - LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/" \ - CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include" + LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl" \ + CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel" From 500b119f3de4cf6b8e9b995f135663f307d113aa Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 22 Mar 2024 15:55:23 +0000 Subject: [PATCH 21/33] Deterministic MPI --- systems/Sunspot/sourceme.sh | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 systems/Sunspot/sourceme.sh diff --git a/systems/Sunspot/sourceme.sh b/systems/Sunspot/sourceme.sh new file mode 100644 index 00000000..b6bbd561 --- /dev/null +++ b/systems/Sunspot/sourceme.sh @@ -0,0 +1,2 @@ +module load oneapi/eng-compiler/2023.05.15.003 +module load mpich/51.2/icc-all-deterministic-pmix-gpu From 434c3e7f1d7e33492ad2ac7e1a4d8779050b550c Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 25 Mar 2024 14:32:44 +0000 Subject: [PATCH 22/33] We have a choice of GET or PUT across NVlink --- Grid/communicator/Communicator_mpi3.cc | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index e7d7a96d..89b042e9 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -348,6 +348,7 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, return offbytes; } +#undef NVLINK_GET // Define to use get instead of put DMA double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, void *xmit, int dest,int dox, @@ -380,9 +381,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vectorShmBufferTranslate(from,xmit); + assert(shm!=NULL); + acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes); +#endif } if (dox) { + // rcrc = crc32(rcrc,(unsigned char *)recv,bytes); if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) { tag= dir+_processor*32; ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq); @@ -390,9 +397,12 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vectorShmBufferTranslate(dest,recv); assert(shm!=NULL); acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes); +#endif + } } @@ -402,6 +412,8 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector status(nreq); From 1f53458af88869b3537b7c7d5f164f8e476658c4 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 26 Mar 2024 00:37:19 +0000 Subject: [PATCH 23/33] Options to bounce through a host buffer if --disable-accelerator-aware-mpi --- Grid/communicator/SharedMemory.cc | 21 ++++++++++++ Grid/communicator/SharedMemory.h | 16 +++++++-- Grid/communicator/SharedMemoryMPI.cc | 50 +++++----------------------- 3 files changed, 44 insertions(+), 43 deletions(-) diff --git a/Grid/communicator/SharedMemory.cc b/Grid/communicator/SharedMemory.cc index ec42dd87..7f2bd324 100644 --- a/Grid/communicator/SharedMemory.cc +++ b/Grid/communicator/SharedMemory.cc @@ -40,6 +40,7 @@ int GlobalSharedMemory::_ShmAlloc; uint64_t GlobalSharedMemory::_ShmAllocBytes; std::vector GlobalSharedMemory::WorldShmCommBufs; +void * GlobalSharedMemory::HostCommBuf; Grid_MPI_Comm GlobalSharedMemory::WorldShmComm; int GlobalSharedMemory::WorldShmRank; @@ -66,6 +67,26 @@ void GlobalSharedMemory::SharedMemoryFree(void) ///////////////////////////////// // Alloc, free shmem region ///////////////////////////////// +#ifndef ACCELERATOR_AWARE_MPI +void *SharedMemory::HostBufferMalloc(size_t bytes){ + void *ptr = (void *)host_heap_top; + host_heap_top += bytes; + host_heap_bytes+= bytes; + if (host_heap_bytes >= host_heap_size) { + std::cout<< " HostBufferMalloc exceeded heap size -- try increasing with --shm flag" < WorldShmCommBufs; - +#ifndef ACCELERATOR_AWARE_MPI + static void *HostCommBuf; +#endif static Grid_MPI_Comm WorldComm; static int WorldRank; static int WorldSize; @@ -120,6 +122,13 @@ private: size_t heap_bytes; size_t heap_size; +#ifndef ACCELERATOR_AWARE_MPI + size_t host_heap_top; // set in free all + size_t host_heap_bytes;// set in free all + void *HostCommBuf; // set in SetCommunicator + size_t host_heap_size; // set in SetCommunicator +#endif + protected: Grid_MPI_Comm ShmComm; // for barriers @@ -151,7 +160,10 @@ public: void *ShmBufferTranslate(int rank,void * local_p); void *ShmBufferMalloc(size_t bytes); void ShmBufferFreeAll(void) ; - +#ifndef ACCELERATOR_AWARE_MPI + void *HostBufferMalloc(size_t bytes); + void HostBufferFreeAll(void); +#endif ////////////////////////////////////////////////////////////////////////// // Make info on Nodes & ranks and Shared memory available ////////////////////////////////////////////////////////////////////////// diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index 64a86c4b..01921f94 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -512,46 +512,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) // Hugetlbfs mapping intended //////////////////////////////////////////////////////////////////////////////////////////// #if defined(GRID_CUDA) ||defined(GRID_HIP) || defined(GRID_SYCL) - -//if defined(GRID_SYCL) -#if 0 -void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) -{ - void * ShmCommBuf ; - assert(_ShmSetup==1); - assert(_ShmAlloc==0); - - ////////////////////////////////////////////////////////////////////////////////////////////////////////// - // allocate the pointer array for shared windows for our group - ////////////////////////////////////////////////////////////////////////////////////////////////////////// - MPI_Barrier(WorldShmComm); - WorldShmCommBufs.resize(WorldShmSize); - - /////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Each MPI rank should allocate our own buffer - /////////////////////////////////////////////////////////////////////////////////////////////////////////// - ShmCommBuf = acceleratorAllocDevice(bytes); - - if (ShmCommBuf == (void *)NULL ) { - std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl; - exit(EXIT_FAILURE); - } - - std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes - << "bytes at "<< std::hex<< ShmCommBuf < Date: Tue, 26 Mar 2024 00:38:41 +0000 Subject: [PATCH 24/33] Merge needs a fence on SYCL --- .../action/fermion/implementation/WilsonKernelsImplementation.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index 90bee389..90defc54 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -462,6 +462,7 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField autoView(st_v , st,AcceleratorRead); if( interior && exterior ) { + acceleratorFenceComputeStream(); if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;} #ifndef GRID_CUDA @@ -495,6 +496,7 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField autoView(st_v ,st,AcceleratorRead); if( interior && exterior ) { + acceleratorFenceComputeStream(); if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag); return;} #ifndef GRID_CUDA From f32c275376559ef4380d2649e9458ecdc7562e1b Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 26 Mar 2024 00:42:00 +0000 Subject: [PATCH 25/33] Updated config options for MPI not being aware of GPU --- configure.ac | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/configure.ac b/configure.ac index c16d90f6..8e8d67af 100644 --- a/configure.ac +++ b/configure.ac @@ -226,23 +226,14 @@ case ${ac_SFW_FP16} in esac ############### Default to accelerator cshift, but revert to host if UCX is buggy or other reasons -AC_ARG_ENABLE([accelerator-cshift], - [AS_HELP_STRING([--enable-accelerator-cshift=yes|no],[run cshift on the device])], - [ac_ACC_CSHIFT=${enable_accelerator_cshift}], [ac_ACC_CSHIFT=yes]) +AC_ARG_ENABLE([accelerator-aware-mpi], + [AS_HELP_STRING([--enable-accelerator-aware-mpi=yes|no],[run mpi transfers from device])], + [ac_ACCELERATOR_AWARE_MPI=${enable_accelerator_aware_mpi}], [ac_ACCELERATOR_AWARE_MPI=yes]) -AC_ARG_ENABLE([ucx-buggy], - [AS_HELP_STRING([--enable-ucx-buggy=yes|no],[enable workaround for UCX device buffer bugs])], - [ac_UCXBUGGY=${enable_ucx_buggy}], [ac_UCXBUGGY=no]) - -case ${ac_UCXBUGGY} in +case ${ac_ACCELERATOR_AWARE_MPI} in yes) - ac_ACC_CSHIFT=no;; - *);; -esac - -case ${ac_ACC_CSHIFT} in - yes) - AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ UCX device buffer bugs are not present]);; + AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on host]) + AC_DEFINE([ACCELERATOR_AWARE_MPI],[1],[ Stencil can use device pointers]);; *);; esac From 59b0cc11dfa394b3eaf3c96e455a27c802b6c9f6 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 26 Mar 2024 00:42:40 +0000 Subject: [PATCH 26/33] REduce the time in single --- tests/Test_dwf_mixedcg_prec.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/Test_dwf_mixedcg_prec.cc b/tests/Test_dwf_mixedcg_prec.cc index f7df05b9..3d21aff4 100644 --- a/tests/Test_dwf_mixedcg_prec.cc +++ b/tests/Test_dwf_mixedcg_prec.cc @@ -176,7 +176,7 @@ int main (int argc, char ** argv) std::cout << " FlightRecorder is OK! "< CG(1.0e-8,10000); From 93769eacd3e072c178b4c91c59f7d4206fa221e8 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 26 Mar 2024 14:10:24 +0000 Subject: [PATCH 27/33] Updated configure for bounce through host --- systems/Aurora/config-command | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/systems/Aurora/config-command b/systems/Aurora/config-command index 689747c9..678acb4b 100644 --- a/systems/Aurora/config-command +++ b/systems/Aurora/config-command @@ -1,16 +1,16 @@ -TOOLS=$HOME/tools + ../../configure \ --enable-simd=GPU \ --enable-gen-simd-width=64 \ --enable-comms=mpi-auto \ - --enable-accelerator-cshift \ --disable-gparity \ --disable-fermion-reps \ --enable-shm=nvlink \ --enable-accelerator=sycl \ + --enable-accelerator-aware-mpi=no\ --enable-unified=no \ MPICXX=mpicxx \ CXX=icpx \ - LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/ -L${MKLROOT}/lib -qmkl=parallel " \ - CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include -qmkl=parallel" + LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl" \ + CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel" From f7b8163016fefea5de04b483ad6369ed6464170e Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 26 Mar 2024 14:11:40 +0000 Subject: [PATCH 28/33] Deterministic MPI reduce options --- systems/Aurora/sourceme-sunspot-deterministic.sh | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 systems/Aurora/sourceme-sunspot-deterministic.sh diff --git a/systems/Aurora/sourceme-sunspot-deterministic.sh b/systems/Aurora/sourceme-sunspot-deterministic.sh new file mode 100644 index 00000000..b6bbd561 --- /dev/null +++ b/systems/Aurora/sourceme-sunspot-deterministic.sh @@ -0,0 +1,2 @@ +module load oneapi/eng-compiler/2023.05.15.003 +module load mpich/51.2/icc-all-deterministic-pmix-gpu From 49e9e4ed0ea9bd7721bcbedb7dca0a3641711b5f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 26 Mar 2024 14:14:06 +0000 Subject: [PATCH 29/33] Fences --- Grid/stencil/Stencil.h | 133 +++++++++++++++++------------------------ 1 file changed, 54 insertions(+), 79 deletions(-) diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index ef3aa821..80acb4ae 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -70,57 +70,6 @@ struct DefaultImplParams { void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask, int off,std::vector > & table); -/* -template -void Gather_plane_simple_table (commVector >& table,const Lattice &rhs,cobj *buffer,compressor &compress, int off,int so) __attribute__((noinline)); - -template -void Gather_plane_simple_table (commVector >& table,const Lattice &rhs,cobj *buffer,compressor &compress, int off,int so) -{ - int num=table.size(); - std::pair *table_v = & table[0]; - - auto rhs_v = rhs.View(AcceleratorRead); - accelerator_forNB( i,num, vobj::Nsimd(), { - compress.Compress(buffer[off+table_v[i].first],rhs_v[so+table_v[i].second]); - }); - rhs_v.ViewClose(); -} - -/////////////////////////////////////////////////////////////////// -// Gather for when there *is* need to SIMD split with compression -/////////////////////////////////////////////////////////////////// -template -void Gather_plane_exchange_table(const Lattice &rhs, - commVector pointers, - int dimension,int plane, - int cbmask,compressor &compress,int type) __attribute__((noinline)); - -template -void Gather_plane_exchange_table(commVector >& table, - const Lattice &rhs, - std::vector &pointers,int dimension,int plane,int cbmask, - compressor &compress,int type) -{ - assert( (table.size()&0x1)==0); - int num=table.size()/2; - int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane - - auto rhs_v = rhs.View(AcceleratorRead); - auto rhs_p = &rhs_v[0]; - auto p0=&pointers[0][0]; - auto p1=&pointers[1][0]; - auto tp=&table[0]; - accelerator_forNB(j, num, vobj::Nsimd(), { - compress.CompressExchange(p0,p1, rhs_p, j, - so+tp[2*j ].second, - so+tp[2*j+1].second, - type); - }); - rhs_v.ViewClose(); -} -*/ - void DslashResetCounts(void); void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full); void DslashLogFull(void); @@ -258,6 +207,10 @@ public: struct Packet { void * send_buf; void * recv_buf; +#ifndef ACCELERATOR_AWARE_MPI + void * host_send_buf; // Allocate this if not MPI_CUDA_AWARE + void * host_recv_buf; // Allocate this if not MPI_CUDA_AWARE +#endif Integer to_rank; Integer from_rank; Integer do_send; @@ -324,7 +277,7 @@ public: Vector surface_list; stencilVector _entries; // Resident in managed memory - commVector _entries_device; // Resident in managed memory + commVector _entries_device; // Resident in device memory std::vector Packets; std::vector Mergers; std::vector MergersSHM; @@ -408,33 +361,16 @@ public: // Use OpenMP Tasks for cleaner ??? // must be called *inside* parallel region ////////////////////////////////////////// - /* - void CommunicateThreaded() - { -#ifdef GRID_OMP - int mythread = omp_get_thread_num(); - int nthreads = CartesianCommunicator::nCommThreads; -#else - int mythread = 0; - int nthreads = 1; -#endif - if (nthreads == -1) nthreads = 1; - if (mythread < nthreads) { - for (int i = mythread; i < Packets.size(); i += nthreads) { - uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf, - Packets[i].to_rank, - Packets[i].recv_buf, - Packets[i].from_rank, - Packets[i].bytes,i); - } - } - } - */ //////////////////////////////////////////////////////////////////////// // Non blocking send and receive. Necessarily parallel. //////////////////////////////////////////////////////////////////////// void CommunicateBegin(std::vector > &reqs) { + // All GPU kernel tasks must complete + // accelerator_barrier(); // All kernels should ALREADY be complete + // _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer + // But the HaloGather had a barrier too. +#ifdef ACCELERATOR_AWARE_MPI for(int i=0;iStencilSendToRecvFromBegin(MpiReqs, Packets[i].send_buf, @@ -443,16 +379,54 @@ public: Packets[i].from_rank,Packets[i].do_recv, Packets[i].xbytes,Packets[i].rbytes,i); } +#else +#warning "Using COPY VIA HOST BUFFERS IN STENCIL" + for(int i=0;iHostBufferMalloc(Packets[i].xbytes); + Packets[i].host_recv_buf = _grid->HostBufferMalloc(Packets[i].rbytes); + if ( Packets[i].do_send ) { + acceleratorCopyFromDevice(Packets[i].send_buf, Packets[i].host_send_buf,Packets[i].xbytes); + } + _grid->StencilSendToRecvFromBegin(MpiReqs, + Packets[i].host_send_buf, + Packets[i].to_rank,Packets[i].do_send, + Packets[i].host_recv_buf, + Packets[i].from_rank,Packets[i].do_recv, + Packets[i].xbytes,Packets[i].rbytes,i); + } +#endif + // Get comms started then run checksums + // Having this PRIOR to the dslash seems to make Sunspot work... (!) + for(int i=0;i > &reqs) { - _grid->StencilSendToRecvFromComplete(MpiReqs,0); + _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done if ( this->partialDirichlet ) DslashLogPartial(); else if ( this->fullDirichlet ) DslashLogDirichlet(); else DslashLogFull(); - acceleratorCopySynchronise(); + // acceleratorCopySynchronise() is in the StencilSendToRecvFromComplete + // accelerator_barrier(); _grid->StencilBarrier(); +#ifndef ACCELERATOR_AWARE_MPI +#warning "Using COPY VIA HOST BUFFERS IN STENCIL" + for(int i=0;iHostBufferFreeAll(); +#endif + // run any checksums + for(int i=0;i void HaloGather(const Lattice &source,compressor &compress) { + // accelerator_barrier(); _grid->StencilBarrier();// Synch shared memory on a single nodes assert(source.Grid()==_grid); @@ -540,10 +515,9 @@ public: compress.Point(point); HaloGatherDir(source,compress,point,face_idx); } - accelerator_barrier(); + accelerator_barrier(); // All my local gathers are complete face_table_computed=1; assert(u_comm_offset==_unified_buffer_size); - } ///////////////////////// @@ -579,6 +553,7 @@ public: accelerator_forNB(j, words, cobj::Nsimd(), { coalescedWrite(to[j] ,coalescedRead(from [j])); }); + acceleratorFenceComputeStream(); } } @@ -669,6 +644,7 @@ public: for(int i=0;i Date: Tue, 26 Mar 2024 14:41:25 +0000 Subject: [PATCH 30/33] Acclerator ware MPI guard on the Unix domain sockets --- Grid/communicator/SharedMemoryMPI.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index 01921f94..2600ce9c 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -39,9 +39,11 @@ Author: Christoph Lehner #include #endif #ifdef GRID_SYCL +#ifdef ACCELERATOR_AWARE_MPI #define GRID_SYCL_LEVEL_ZERO_IPC +#define SHM_SOCKETS +#endif #include -#define SHM_SOCKETS #endif #include From 3ef2a41518c0819867e1d83e97e427998ccd10cd Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 26 Mar 2024 14:50:32 +0000 Subject: [PATCH 31/33] ifdef guard ommitted --- Grid/communicator/SharedMemory.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Grid/communicator/SharedMemory.cc b/Grid/communicator/SharedMemory.cc index 7f2bd324..3445b077 100644 --- a/Grid/communicator/SharedMemory.cc +++ b/Grid/communicator/SharedMemory.cc @@ -40,7 +40,9 @@ int GlobalSharedMemory::_ShmAlloc; uint64_t GlobalSharedMemory::_ShmAllocBytes; std::vector GlobalSharedMemory::WorldShmCommBufs; +#ifndef ACCELERATOR_AWARE_MPI void * GlobalSharedMemory::HostCommBuf; +#endif Grid_MPI_Comm GlobalSharedMemory::WorldShmComm; int GlobalSharedMemory::WorldShmRank; From da593796123f99307b486350f8b2ef6ae7d2c375 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 26 Mar 2024 17:03:20 +0000 Subject: [PATCH 32/33] Large reg file for double --- systems/Aurora/tests/repro1gpu.pbs | 1 + systems/Aurora/tests/reproN.pbs | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/systems/Aurora/tests/repro1gpu.pbs b/systems/Aurora/tests/repro1gpu.pbs index 3b95b404..283a9343 100644 --- a/systems/Aurora/tests/repro1gpu.pbs +++ b/systems/Aurora/tests/repro1gpu.pbs @@ -30,6 +30,7 @@ export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE +export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" cd $PBS_O_WORKDIR diff --git a/systems/Aurora/tests/reproN.pbs b/systems/Aurora/tests/reproN.pbs index fde4f3a9..293e7ade 100644 --- a/systems/Aurora/tests/reproN.pbs +++ b/systems/Aurora/tests/reproN.pbs @@ -26,9 +26,10 @@ export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 +export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" export GRID_PRINT_ENTIRE_LOG=0 -export GRID_CHECKSUM_RECV_BUF=1 +export GRID_CHECKSUM_RECV_BUF=0 export GRID_CHECKSUM_SEND_BUF=0 export MPICH_OFI_NIC_POLICY=GPU From ff2ea5de181f8ef75ee7129206e06edeae9bb5d6 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 11 Apr 2024 14:25:45 -0400 Subject: [PATCH 33/33] Update Tensor_traits.h --- Grid/tensors/Tensor_traits.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/Grid/tensors/Tensor_traits.h b/Grid/tensors/Tensor_traits.h index 536e17f1..8bfdedcf 100644 --- a/Grid/tensors/Tensor_traits.h +++ b/Grid/tensors/Tensor_traits.h @@ -405,11 +405,4 @@ NAMESPACE_BEGIN(Grid); NAMESPACE_END(Grid); -#ifdef GRID_SYCL -template struct -sycl::is_device_copyable::value && (!std::is_trivially_copyable::value), - void>::type> - : public std::true_type {}; -#endif