From a77cd50b2fe0e11b067a5883a109ff6a4782a514 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 11 Jul 2025 14:36:10 +0000 Subject: [PATCH] Update comms logging in Cshift --- Grid/cshift/Cshift_mpi.h | 17 +++++++++++++++ Grid/log/Log.cc | 3 +++ Grid/log/Log.h | 1 + benchmarks/Benchmark_comms.cc | 3 ++- systems/Aurora/benchmarks/bench2.pbs | 23 +++++++-------------- systems/Aurora/config-command | 31 ++++++++++++++-------------- systems/Aurora/sourceme.sh | 17 +++++++-------- 7 files changed, 54 insertions(+), 41 deletions(-) diff --git a/Grid/cshift/Cshift_mpi.h b/Grid/cshift/Cshift_mpi.h index 05ee946b..fc0cc2eb 100644 --- a/Grid/cshift/Cshift_mpi.h +++ b/Grid/cshift/Cshift_mpi.h @@ -186,6 +186,14 @@ template void Cshift_comms(Lattice &ret,const Lattice &r recv_from_rank, bytes); acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes); + std::cout << GridLogComms<< " Cshift: " + <<" dim"<ThisRank() + <<" Coor "<ThisProcessorCoor() + <<" send "< void Cshift_comms_simd(Lattice &ret,const LatticeThisRank() + <<" Coor "<ThisProcessorCoor() + <<" send "< &logstreams) { GridLogDebug.Active(0); GridLogPerformance.Active(0); GridLogDslash.Active(0); + GridLogComms.Active(0); GridLogIntegrator.Active(1); GridLogColours.Active(0); GridLogHMC.Active(1); @@ -97,6 +99,7 @@ void GridLogConfigure(std::vector &logstreams) { if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1); if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1); if (logstreams[i] == std::string("Dslash")) GridLogDslash.Active(1); + if (logstreams[i] == std::string("Comms")) GridLogComms.Active(1); if (logstreams[i] == std::string("NoIntegrator"))GridLogIntegrator.Active(0); if (logstreams[i] == std::string("NoHMC")) GridLogHMC.Active(0); if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1); diff --git a/Grid/log/Log.h b/Grid/log/Log.h index 370b0428..ec6becd6 100644 --- a/Grid/log/Log.h +++ b/Grid/log/Log.h @@ -180,6 +180,7 @@ extern GridLogger GridLogError; extern GridLogger GridLogWarning; extern GridLogger GridLogMessage; extern GridLogger GridLogDebug; +extern GridLogger GridLogComms; extern GridLogger GridLogPerformance; extern GridLogger GridLogDslash; extern GridLogger GridLogIterative; diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc index 6696c8eb..7755635a 100644 --- a/benchmarks/Benchmark_comms.cc +++ b/benchmarks/Benchmark_comms.cc @@ -219,7 +219,8 @@ int main (int argc, char ** argv) int comm_proc = mpi_layout[mu]-1; Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); } - int tid = omp_get_thread_num(); + // int tid = omp_get_thread_num(); + int tid = 0; tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,1, (void *)&rbuf[dir][0], recv_from_rank,1, bytes,tid); diff --git a/systems/Aurora/benchmarks/bench2.pbs b/systems/Aurora/benchmarks/bench2.pbs index aebed04e..3c74b6bc 100644 --- a/systems/Aurora/benchmarks/bench2.pbs +++ b/systems/Aurora/benchmarks/bench2.pbs @@ -1,7 +1,8 @@ #!/bin/bash -##PBS -q EarlyAppAccess #PBS -q debug +#PBS -l filesystems=flare +#PBS -l filesystems=home #PBS -l select=2 #PBS -l walltime=00:20:00 #PBS -A LatticeQCD_aesp_CNDA @@ -14,26 +15,18 @@ cp $PBS_NODEFILE nodefile export OMP_NUM_THREADS=4 export MPICH_OFI_NIC_POLICY=GPU - -#export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 -#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE -#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE -#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST -#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 -#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 -#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 -#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 -#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 -#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 -#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 - +export MPICH_CH4_SHM=XPMEM +export MPIR_CVAR_DEBUG_SUMMARY=1 +export MPICH_DBG_LEVEL=VERBOSE +export MPICH_DBG_CLASS=ALL # # Local vol 16.16.16.32 # #VOL=32.64.64.96 +mpiexec -np 1 -ppn 1 -envall mpivars -for VOL in 32.32.32.96 32.64.64.96 +for VOL in 32.32.32.96 do for AT in 32 do diff --git a/systems/Aurora/config-command b/systems/Aurora/config-command index 08b77f4f..f1634665 100644 --- a/systems/Aurora/config-command +++ b/systems/Aurora/config-command @@ -1,26 +1,27 @@ -#Ahead of time compile for PVC +export MPFR=`spack find --paths mpfr | grep ^mpfr | awk '{print $2}' ` +export GMP=`spack find --paths gmp | grep ^gmp | awk '{print $2}' ` +export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' ` +export UNWIND=`spack find --paths libunwind | grep ^libunwind | awk '{print $2}' ` -export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc" -export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/ -fPIC" - -#JIT compile -#export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl " -#export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions " - -../configure \ +../../configure \ --enable-simd=GPU \ - --enable-reduction=grid \ --enable-gen-simd-width=64 \ --enable-comms=mpi-auto \ - --enable-debug \ - --prefix $HOME/gpt-install \ --disable-gparity \ --disable-fermion-reps \ - --with-lime=$CLIME \ --enable-shm=nvlink \ --enable-accelerator=sycl \ - --enable-accelerator-aware-mpi=no\ + --enable-accelerator-aware-mpi=no \ --enable-unified=no \ + --enable-debug \ + --with-lime=$CLIME \ + --with-gmp=$GMP \ + --with-mpfr=$MPFR \ + --with-unwind=$UNWIND \ MPICXX=mpicxx \ - CXX=icpx + CXX=icpx \ + LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc -lnuma" \ + CXXFLAGS="-fPIC -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel" + + diff --git a/systems/Aurora/sourceme.sh b/systems/Aurora/sourceme.sh index 89126f5b..4009609c 100644 --- a/systems/Aurora/sourceme.sh +++ b/systems/Aurora/sourceme.sh @@ -1,16 +1,13 @@ -#module load oneapi/release/2023.12.15.001 -#module load mpich/icc-all-debug-pmix-gpu/52.2 -#module load mpich-config/mode/deterministic -#module load intel_compute_runtime/release/821.35 -module load pti-gpu - -source ~/spack/share/spack/setup-env.sh -spack load c-lime -spack load openssl -export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' ` export HTTP_PROXY=http://proxy.alcf.anl.gov:3128 export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128 export http_proxy=http://proxy.alcf.anl.gov:3128 export https_proxy=http://proxy.alcf.anl.gov:3128 git config --global http.proxy http://proxy.alcf.anl.gov:3128 + +source ~/spack/share/spack/setup-env.sh +spack load c-lime +spack load openssl@3.3.1%gcc@12.2.0 +spack load unwind +export UNWIND=`spack find --paths libunwind | grep ^libunwind | awk '{print $2}' ` +export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' ` export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"