From b2ce760cf4c5ceb7ee537e7b90f871735a65a772 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 25 Feb 2025 16:55:23 +0000 Subject: [PATCH 01/15] Verbose issue with GPT --- Grid/util/Lexicographic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/util/Lexicographic.h b/Grid/util/Lexicographic.h index 422e42ee..bd73b0d3 100644 --- a/Grid/util/Lexicographic.h +++ b/Grid/util/Lexicographic.h @@ -50,7 +50,7 @@ namespace Grid{ int64_t index64; IndexFromCoorReversed(coor,index64,dims); if ( index64>=2*1024*1024*1024LL ){ - std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "< Date: Tue, 25 Feb 2025 16:57:27 +0000 Subject: [PATCH 02/15] Only throw if there is a pending list entry in CommsComplete --- Grid/communicator/Communicator_none.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/communicator/Communicator_none.cc b/Grid/communicator/Communicator_none.cc index f162a903..3dee8f4d 100644 --- a/Grid/communicator/Communicator_none.cc +++ b/Grid/communicator/Communicator_none.cc @@ -91,7 +91,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit, { assert(0); } -void CartesianCommunicator::CommsComplete(std::vector &list){ assert(0);} +void CartesianCommunicator::CommsComplete(std::vector &list){ assert(list.size()==0);} void CartesianCommunicator::SendToRecvFromBegin(std::vector &list, void *xmit, int dest, From 311e2aab3f48fc76923ae6a25b723de25317396f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 26 Feb 2025 11:42:52 -0500 Subject: [PATCH 03/15] Update Accelerator.h --- Grid/threads/Accelerator.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index f3371f98..b5aaccb4 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -674,7 +674,7 @@ inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) acceleratorCopySynchronise(); } -template void acceleratorPut(T& dev,T&host) +template void acceleratorPut(T& dev,const T&host) { acceleratorCopyToDevice(&host,&dev,sizeof(T)); } From 6ae809ed40937718d17402710f659685914dbbc7 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 27 Feb 2025 20:11:45 +0000 Subject: [PATCH 04/15] Print not liked on GPT compile --- Grid/util/Lexicographic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/util/Lexicographic.h b/Grid/util/Lexicographic.h index bd73b0d3..b4063e48 100644 --- a/Grid/util/Lexicographic.h +++ b/Grid/util/Lexicographic.h @@ -50,7 +50,7 @@ namespace Grid{ int64_t index64; IndexFromCoorReversed(coor,index64,dims); if ( index64>=2*1024*1024*1024LL ){ - std::cout << " IndexFromCoorReversed overflow"< Date: Thu, 27 Feb 2025 20:12:12 +0000 Subject: [PATCH 05/15] Config command that makes GPT happier --- systems/Aurora/config-command | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/systems/Aurora/config-command b/systems/Aurora/config-command index 6e5512ff..08b77f4f 100644 --- a/systems/Aurora/config-command +++ b/systems/Aurora/config-command @@ -1,18 +1,19 @@ #Ahead of time compile for PVC -export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib" -export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/" +export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc" +export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/ -fPIC" #JIT compile #export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl " #export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions " -../../configure \ +../configure \ --enable-simd=GPU \ --enable-reduction=grid \ --enable-gen-simd-width=64 \ --enable-comms=mpi-auto \ --enable-debug \ + --prefix $HOME/gpt-install \ --disable-gparity \ --disable-fermion-reps \ --with-lime=$CLIME \ From eb8a008a8f1f5f8b8df2fc656faa8bf3b8342ab5 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 5 Mar 2025 12:41:59 -0500 Subject: [PATCH 06/15] Create WorkArounds.txt --- systems/WorkArounds.txt | 90 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 systems/WorkArounds.txt diff --git a/systems/WorkArounds.txt b/systems/WorkArounds.txt new file mode 100644 index 00000000..4d432589 --- /dev/null +++ b/systems/WorkArounds.txt @@ -0,0 +1,90 @@ +The purpose of this file is to collate all non-obvious known magic shell variables +and compiler flags required for either correctness or performance on various systems. + +A repository of work-arounds. + +Contents: +1. Interconnect + MPI +2. Compilation + + +* 1. INTERCONNECT + MPI + +-------------------------------------------------------------------- +MPI2-IO correctness: force OpenMPI to use the MPICH romio implementation for parallel I/O +-------------------------------------------------------------------- +export OMPI_MCA_io=romio321 + +-------------------------------------- +ROMIO fail with > 2GB per node read (32 bit issue) +-------------------------------------- + +Use later MPICH + +https://github.com/paboyle/Grid/issues/381 + +https://github.com/pmodels/mpich/commit/3a479ab0 + +-------------------------------------------------------------------- +Slingshot: Frontier and Perlmutter libfabric slow down +and physical memory fragmentation +-------------------------------------------------------------------- +export FI_MR_CACHE_MONITOR=disabled +or +export FI_MR_CACHE_MONITOR=kdreg2 + +-------------------------------------------------------------------- +Mellanox performance with A100 GPU +-------------------------------------------------------------------- +export OMPI_MCA_btl=^uct,openib +export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc +export UCX_RNDV_SCHEME=put_zcopy +export UCX_RNDV_THRESH=16384 +export UCX_IB_GPU_DIRECT_RDMA=yes + +-------------------------------------------------------------------- +Mellanox + A100 correctness +-------------------------------------------------------------------- +export UCX_MEMTYPE_CACHE=n + +-------------------------------------------------------------------- +MPICH/Aurora/PVC correctness and performance (Peter Boyle) +-------------------------------------------------------------------- + +https://github.com/pmodels/mpich/issues/7302 + +--enable-cuda-aware-mpi=no +(Grid's internal D-H-H-D pipeline mode, avoid device memory in MPI) + +Ideally use MPICH with fix: + +https://github.com/pmodels/mpich/pull/7312 + +Ideally: +MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE=generic + +Alternatives: +export MPIR_CVAR_NOLOCAL=1 +export MPIR_CVAR_CH4_IPC_GPU_P2P_THRESHOLD=1000000000 + + +* 2. COMPILATION + +-------------------------------------------------------------------- +G++ bugs +-------------------------------------------------------------------- + + +-------------------------------------------------------------------- +AMD GPU nodes : multiple ROCM versions broken; use 5.3.0 +-------------------------------------------------------------------- +https://github.com/paboyle/Grid/issues/464 + +-------------------------------------------------------------------- +Aurora/PVC + +SYCL ahead of time compilation (fixes rare runtime JIT errors and faster runtime, PB) +SYCL slow link and relocatable code issues (Christoph Lehner) +-------------------------------------------------------------------- +export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc" +export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -fPIC" From bc12dbbb38570ef4d7bbfd703204426b5179f466 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 5 Mar 2025 12:48:56 -0500 Subject: [PATCH 07/15] Update WorkArounds.txt --- systems/WorkArounds.txt | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/systems/WorkArounds.txt b/systems/WorkArounds.txt index 4d432589..b9570cbf 100644 --- a/systems/WorkArounds.txt +++ b/systems/WorkArounds.txt @@ -71,9 +71,28 @@ export MPIR_CVAR_CH4_IPC_GPU_P2P_THRESHOLD=1000000000 * 2. COMPILATION -------------------------------------------------------------------- -G++ bugs +G++ compiler breakage / graveyard -------------------------------------------------------------------- +9.3.0, 10.3.1, +https://github.com/paboyle/Grid/issues/290 +https://github.com/paboyle/Grid/issues/264 + +Working (-) Broken (X): + +4.9.0 - +4.9.1 - +5.1.0 X +5.2.0 X +5.3.0 X +5.4.0 X +6.1.0 X +6.2.0 X +6.3.0 - +7.1.0 - +8.0.0 (HEAD) - + +https://github.com/paboyle/Grid/issues/100 -------------------------------------------------------------------- AMD GPU nodes : multiple ROCM versions broken; use 5.3.0 From 3624bd3d220a0b7a8a8e7792c3228b5630742843 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 5 Mar 2025 13:45:09 -0500 Subject: [PATCH 08/15] Update WorkArounds.txt --- systems/WorkArounds.txt | 63 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 58 insertions(+), 5 deletions(-) diff --git a/systems/WorkArounds.txt b/systems/WorkArounds.txt index b9570cbf..96aa602b 100644 --- a/systems/WorkArounds.txt +++ b/systems/WorkArounds.txt @@ -7,8 +7,9 @@ Contents: 1. Interconnect + MPI 2. Compilation - +************************ * 1. INTERCONNECT + MPI +************************ -------------------------------------------------------------------- MPI2-IO correctness: force OpenMPI to use the MPICH romio implementation for parallel I/O @@ -33,6 +34,31 @@ export FI_MR_CACHE_MONITOR=disabled or export FI_MR_CACHE_MONITOR=kdreg2 + +-------------------------------------------------------------------- +Frontier/LumiG +-------------------------------------------------------------------- + +Hiding ROCR_VISIBLE_DEVICES triggers SDMA engines to be used for GPU-GPU + +cat << EOF > select_gpu +#!/bin/bash +export MPICH_GPU_SUPPORT_ENABLED=1 +export MPICH_SMP_SINGLE_COPY_MODE=XPMEM +export GPU_MAP=(0 1 2 3 7 6 5 4) +export NUMA_MAP=(3 3 1 1 2 2 0 0) +export GPU=\${GPU_MAP[\$SLURM_LOCALID]} +export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]} +export HIP_VISIBLE_DEVICES=\$GPU +unset ROCR_VISIBLE_DEVICES +echo RANK \$SLURM_LOCALID using GPU \$GPU +exec numactl -m \$NUMA -N \$NUMA \$* +EOF +chmod +x ./select_gpu + +srun ./select_gpu BINARY + + -------------------------------------------------------------------- Mellanox performance with A100 GPU -------------------------------------------------------------------- @@ -54,9 +80,12 @@ MPICH/Aurora/PVC correctness and performance (Peter Boyle) https://github.com/pmodels/mpich/issues/7302 --enable-cuda-aware-mpi=no -(Grid's internal D-H-H-D pipeline mode, avoid device memory in MPI) +--enable-unified=no -Ideally use MPICH with fix: +Grid's internal D-H-H-D pipeline mode, avoid device memory in MPI +Do not use SVM + +Ideally use MPICH with fix to issue 7302: https://github.com/pmodels/mpich/pull/7312 @@ -67,8 +96,24 @@ Alternatives: export MPIR_CVAR_NOLOCAL=1 export MPIR_CVAR_CH4_IPC_GPU_P2P_THRESHOLD=1000000000 +-------------------------------------------------------------------- +MPICH/Aurora/PVC correctness and performance (James Osborne) +-------------------------------------------------------------------- +Broken: +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 + +This gives good peformance without requiring +--enable-cuda-aware-mpi=no + +But is an open issue reported by James Osborne +https://github.com/pmodels/mpich/issues/7139 + +Possibly resolved but unclear if in the installed software yet. + +************************ * 2. COMPILATION +************************ -------------------------------------------------------------------- G++ compiler breakage / graveyard @@ -95,15 +140,23 @@ Working (-) Broken (X): https://github.com/paboyle/Grid/issues/100 -------------------------------------------------------------------- -AMD GPU nodes : multiple ROCM versions broken; use 5.3.0 +AMD GPU nodes : -------------------------------------------------------------------- + +multiple ROCM versions broken; use 5.3.0 +manifests itself as wrong results in fp32 + https://github.com/paboyle/Grid/issues/464 -------------------------------------------------------------------- Aurora/PVC +-------------------------------------------------------------------- SYCL ahead of time compilation (fixes rare runtime JIT errors and faster runtime, PB) SYCL slow link and relocatable code issues (Christoph Lehner) --------------------------------------------------------------------- +Opt large register file required for good performance in fp64 + + +export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc" export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -fPIC" From 267a39d9433fc2defd143b438812a0d4526613c3 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 5 Mar 2025 13:49:43 -0500 Subject: [PATCH 09/15] Update WorkArounds.txt --- systems/WorkArounds.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/systems/WorkArounds.txt b/systems/WorkArounds.txt index 96aa602b..1c3b8084 100644 --- a/systems/WorkArounds.txt +++ b/systems/WorkArounds.txt @@ -34,6 +34,14 @@ export FI_MR_CACHE_MONITOR=disabled or export FI_MR_CACHE_MONITOR=kdreg2 +-------------------------------------------------------------------- +Perlmutter +-------------------------------------------------------------------- + +export MPICH_RDMA_ENABLED_CUDA=1 +export MPICH_GPU_IPC_ENABLED=1 +export MPICH_GPU_EAGER_REGISTER_HOST_MEM=0 +export MPICH_GPU_NO_ASYNC_MEMCPY=0 -------------------------------------------------------------------- Frontier/LumiG From 795769c636bc24f72ea8e3b8ee4cd952366a388a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 5 Mar 2025 13:50:41 -0500 Subject: [PATCH 10/15] Update WorkArounds.txt --- systems/WorkArounds.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/systems/WorkArounds.txt b/systems/WorkArounds.txt index 1c3b8084..a62ba495 100644 --- a/systems/WorkArounds.txt +++ b/systems/WorkArounds.txt @@ -68,7 +68,7 @@ srun ./select_gpu BINARY -------------------------------------------------------------------- -Mellanox performance with A100 GPU +Mellanox performance with A100 GPU (Tursa, Booster, Leonardo) -------------------------------------------------------------------- export OMPI_MCA_btl=^uct,openib export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc @@ -77,12 +77,12 @@ export UCX_RNDV_THRESH=16384 export UCX_IB_GPU_DIRECT_RDMA=yes -------------------------------------------------------------------- -Mellanox + A100 correctness +Mellanox + A100 correctness (Tursa, Booster, Leonardo) -------------------------------------------------------------------- export UCX_MEMTYPE_CACHE=n -------------------------------------------------------------------- -MPICH/Aurora/PVC correctness and performance (Peter Boyle) +MPICH/Aurora/PVC correctness and performance -------------------------------------------------------------------- https://github.com/pmodels/mpich/issues/7302 @@ -105,7 +105,7 @@ export MPIR_CVAR_NOLOCAL=1 export MPIR_CVAR_CH4_IPC_GPU_P2P_THRESHOLD=1000000000 -------------------------------------------------------------------- -MPICH/Aurora/PVC correctness and performance (James Osborne) +MPICH/Aurora/PVC correctness and performance -------------------------------------------------------------------- Broken: @@ -114,7 +114,7 @@ export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 This gives good peformance without requiring --enable-cuda-aware-mpi=no -But is an open issue reported by James Osborne +But is an open issue reported by James Osborn https://github.com/pmodels/mpich/issues/7139 Possibly resolved but unclear if in the installed software yet. From e8ff9d8e50ee929dbd9505b0b32b591511ef27d5 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 5 Mar 2025 14:00:04 -0500 Subject: [PATCH 11/15] Update WorkArounds.txt --- systems/WorkArounds.txt | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/systems/WorkArounds.txt b/systems/WorkArounds.txt index a62ba495..7bae046f 100644 --- a/systems/WorkArounds.txt +++ b/systems/WorkArounds.txt @@ -168,3 +168,21 @@ Opt large register file required for good performance in fp64 export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc" export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -fPIC" + + +************************ +* 3. Visual profile tools +************************ + +-------------------------------------------------------------------- +Frontier/rocprof +-------------------------------------------------------------------- + +-------------------------------------------------------------------- +Aurora/unitrace +-------------------------------------------------------------------- + + +-------------------------------------------------------------------- +Tursa/nsight-sys +-------------------------------------------------------------------- From ad6db92690f0d22470971115ad9f4c51dea281a0 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 5 Mar 2025 14:00:26 -0500 Subject: [PATCH 12/15] Update WorkArounds.txt --- systems/WorkArounds.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/systems/WorkArounds.txt b/systems/WorkArounds.txt index 7bae046f..eda07223 100644 --- a/systems/WorkArounds.txt +++ b/systems/WorkArounds.txt @@ -6,6 +6,7 @@ A repository of work-arounds. Contents: 1. Interconnect + MPI 2. Compilation +3. Profiling ************************ * 1. INTERCONNECT + MPI From a1cdda833fc1652dc8916233ed3f942d3a0337fb Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 5 Mar 2025 14:04:23 -0500 Subject: [PATCH 13/15] Update WorkArounds.txt --- systems/WorkArounds.txt | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/systems/WorkArounds.txt b/systems/WorkArounds.txt index eda07223..7191b4ff 100644 --- a/systems/WorkArounds.txt +++ b/systems/WorkArounds.txt @@ -170,6 +170,23 @@ export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc" export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -fPIC" +-------------------------------------------------------------------- +Aurora/PVC useful extra options +-------------------------------------------------------------------- + +Host only sanitizer: +-Xarch_host -fsanitize=leak +-Xarch_host -fsanitize=address + +Deterministic MPI reduction: +export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 +export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0 +export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling +unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE +unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE +unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE + + ************************ * 3. Visual profile tools From 1d22841811b3378d38e69809bf8d73e35fbd6451 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 6 Mar 2025 03:19:35 +0000 Subject: [PATCH 14/15] Working on aurora, GPT issue turned up is fixed --- Grid/communicator/Communicator_mpi3.cc | 30 ++++++++++++++++---------- Grid/threads/Accelerator.h | 2 +- Grid/util/Init.cc | 10 ++++++++- 3 files changed, 29 insertions(+), 13 deletions(-) diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index b667d32e..38b9f9c6 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -746,26 +746,34 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list,int dir) { - // int nreq=list.size(); + acceleratorCopySynchronise(); // Complete all pending copy transfers D2D - // if (nreq==0) return; - // std::vector status(nreq); - // std::vector MpiRequests(nreq); + std::vector status; + std::vector MpiRequests; + + for(int r=0;r0) { + status.resize(MpiRequests.size()); + int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing. + assert(ierr==0); + } - // int ierr = MPI_Waitall(nreq,&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing. - // assert(ierr==0); - // for(int r=0;rHostBufferFreeAll(); // Clean up the buffer allocs diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index b5aaccb4..28c3aa0a 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -676,7 +676,7 @@ inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) template void acceleratorPut(T& dev,const T&host) { - acceleratorCopyToDevice(&host,&dev,sizeof(T)); + acceleratorCopyToDevice((void *)&host,&dev,sizeof(T)); } template T acceleratorGet(T& dev) { diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index 1424667e..feb44645 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -509,7 +509,14 @@ void Grid_init(int *argc,char ***argv) Grid_default_latt, Grid_default_mpi); - + if( GridCmdOptionExists(*argv,*argv+*argc,"--flightrecorder") ){ + std::cout << GridLogMessage <<" Enabling flight recorder " < Date: Thu, 6 Mar 2025 14:18:43 -0500 Subject: [PATCH 15/15] Makinig LLVM happy --- Grid/threads/Accelerator.h | 38 ++++++++--------- Grid/threads/Threads.h | 6 +-- benchmarks/Benchmark_usqcd.cc | 69 +++++++++++++++++-------------- systems/sdcc-genoa/bench.slurm | 32 ++++++++++++++ systems/sdcc-genoa/bench2.slurm | 36 ++++++++++++++++ systems/sdcc-genoa/config-command | 16 +++++++ systems/sdcc-genoa/sourceme.sh | 4 ++ 7 files changed, 147 insertions(+), 54 deletions(-) create mode 100644 systems/sdcc-genoa/bench.slurm create mode 100644 systems/sdcc-genoa/bench2.slurm create mode 100644 systems/sdcc-genoa/config-command create mode 100644 systems/sdcc-genoa/sourceme.sh diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 28c3aa0a..b2a40e7b 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -245,12 +245,12 @@ inline void *acceleratorAllocDevice(size_t bytes) inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);}; inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);}; inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);}; -inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);} -inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);} -inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);} -inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);} +inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);} +inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);} +inline void acceleratorCopyToDeviceAsync(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);} +inline void acceleratorCopyFromDeviceAsync(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);} inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);} -inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch +inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream); } @@ -359,12 +359,12 @@ inline int acceleratorEventIsComplete(acceleratorEvent_t ev) return (ev.get_info() == sycl::info::event_command_status::complete); } -inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes);} -inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); } -inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); } +inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes);} +inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(const void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); } +inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(const void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); } -inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} -inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} +inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} +inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();} inline int acceleratorIsCommunicable(void *ptr) @@ -511,19 +511,19 @@ inline void *acceleratorAllocDevice(size_t bytes) inline void acceleratorFreeHost(void *ptr){ auto discard=hipFree(ptr);}; inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);}; inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);}; -inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);} -inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);} +inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);} +inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);} inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);} -inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch +inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch { auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream); } -inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { +inline void acceleratorCopyToDeviceAsync(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream); } -inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { +inline void acceleratorCopyFromDeviceAsync(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream); } inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); }; @@ -583,9 +583,9 @@ inline void acceleratorMem(void) accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific -inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); } -inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);} -inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes);} +inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); } +inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);} +inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes);} inline void acceleratorCopySynchronise(void) {}; inline int acceleratorIsCommunicable(void *ptr){ return 1; } @@ -668,7 +668,7 @@ accelerator_inline void acceleratorFence(void) return; } -inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) +inline void acceleratorCopyDeviceToDevice(const void *from,void *to,size_t bytes) { acceleratorCopyDeviceToDeviceAsynch(from,to,bytes); acceleratorCopySynchronise(); diff --git a/Grid/threads/Threads.h b/Grid/threads/Threads.h index 6887134d..cdb4fa62 100644 --- a/Grid/threads/Threads.h +++ b/Grid/threads/Threads.h @@ -73,9 +73,9 @@ Author: paboyle #define thread_critical DO_PRAGMA(omp critical) #ifdef GRID_OMP -inline void thread_bcopy(void *from, void *to,size_t bytes) +inline void thread_bcopy(const void *from, void *to,size_t bytes) { - uint64_t *ufrom = (uint64_t *)from; + const uint64_t *ufrom = (const uint64_t *)from; uint64_t *uto = (uint64_t *)to; assert(bytes%8==0); uint64_t words=bytes/8; @@ -84,7 +84,7 @@ inline void thread_bcopy(void *from, void *to,size_t bytes) }); } #else -inline void thread_bcopy(void *from, void *to,size_t bytes) +inline void thread_bcopy(const void *from, void *to,size_t bytes) { bcopy(from,to,bytes); } diff --git a/benchmarks/Benchmark_usqcd.cc b/benchmarks/Benchmark_usqcd.cc index e400138b..4b50121e 100644 --- a/benchmarks/Benchmark_usqcd.cc +++ b/benchmarks/Benchmark_usqcd.cc @@ -492,17 +492,18 @@ public: } FGrid->Barrier(); double t1=usecond(); - uint64_t ncall = 500; - - FGrid->Broadcast(0,&ncall,sizeof(ncall)); + uint64_t no = 50; + uint64_t ni = 100; // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); - for(uint64_t i=0;i t_time(no); + for(uint64_t i=0;iBarrier(); double t1=usecond(); - uint64_t ncall = 500; - FGrid->Broadcast(0,&ncall,sizeof(ncall)); + uint64_t no = 50; + uint64_t ni = 100; // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); - for(uint64_t i=0;i t_time(no); + for(uint64_t i=0;iBarrier(); - double t1=usecond(); - uint64_t ncall = 500; - - FGrid->Broadcast(0,&ncall,sizeof(ncall)); + uint64_t ni = 100; + uint64_t no = 50; // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); - for(uint64_t i=0;i t_time(no); + for(uint64_t i=0;iBarrier(); @@ -814,20 +818,21 @@ public: double mf_hi, mf_lo, mf_err; timestat.statistics(t_time); - mf_hi = flops/timestat.min; - mf_lo = flops/timestat.max; + mf_hi = flops/timestat.min*ni; + mf_lo = flops/timestat.max*ni; mf_err= flops/timestat.min * timestat.err/timestat.mean; - mflops = flops/timestat.mean; + mflops = flops/timestat.mean*ni; mflops_all.push_back(mflops); if ( mflops_best == 0 ) mflops_best = mflops; if ( mflops_worst== 0 ) mflops_worst= mflops; if ( mflops>mflops_best ) mflops_best = mflops; if ( mflops L_list({8,12,16,24}); + std::vector L_list({8,12,16,24,32}); int selm1=sel-1; std::vector clover; diff --git a/systems/sdcc-genoa/bench.slurm b/systems/sdcc-genoa/bench.slurm new file mode 100644 index 00000000..2c7f6c32 --- /dev/null +++ b/systems/sdcc-genoa/bench.slurm @@ -0,0 +1,32 @@ +#!/bin/bash +#SBATCH --partition lqcd +#SBATCH --time=00:50:00 +#SBATCH -A lqcdtest +#SBATCH -q lqcd +#SBATCH --exclusive +#SBATCH --nodes=1 +#SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=64 +#SBATCH --qos lqcd + +source sourceme.sh + +export PLACES=(1:16:4 1:32:2 0:64:1); +export THR=(16 32 64) + +for t in 2 +do + +export OMP_NUM_THREADS=${THR[$t]} +export OMP_PLACES=${PLACES[$t]} +export thr=${THR[$t]} + +#for vol in 24.24.24.24 32.32.32.32 48.48.48.96 +for vol in 48.48.48.96 +do +srun -N1 -n1 ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid $vol --dslash-asm --shm 8192 > $vol.1node.thr$thr +done +#srun -N1 -n1 ./benchmarks/Benchmark_usqcd --mpi 1.1.1.1 --grid $vol > usqcd.1node.thr$thr +done + diff --git a/systems/sdcc-genoa/bench2.slurm b/systems/sdcc-genoa/bench2.slurm new file mode 100644 index 00000000..be21c816 --- /dev/null +++ b/systems/sdcc-genoa/bench2.slurm @@ -0,0 +1,36 @@ +#!/bin/bash +#SBATCH --partition lqcd +#SBATCH --time=00:50:00 +#SBATCH -A lqcdtest +#SBATCH -q lqcd +#SBATCH --exclusive +#SBATCH --nodes=2 +#SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054 +#SBATCH --ntasks=2 +#SBATCH --cpus-per-task=64 +#SBATCH --qos lqcd + +source sourceme.sh + +export PLACES=(1:16:4 1:32:2 0:64:1); +export THR=(16 32 64) + +nodes=2 +mpi=1.1.1.2 + +for t in 2 +do + +export OMP_NUM_THREADS=${THR[$t]} +export OMP_PLACES=${PLACES[$t]} +export thr=${THR[$t]} + +#srun -N$nodes -n$nodes ./benchmarks/Benchmark_usqcd --mpi $mpi --grid 32.32.32.32 > usqcd.n$nodes.thr$thr + +for vol in 64.64.64.128 +do +srun -N$nodes -n$nodes ./benchmarks/Benchmark_dwf_fp32 --mpi $mpi --grid $vol --dslash-asm --comms-overlap --shm 8192 > $vol.n$nodes.overlap.thr$thr +done + +done + diff --git a/systems/sdcc-genoa/config-command b/systems/sdcc-genoa/config-command new file mode 100644 index 00000000..d992e1da --- /dev/null +++ b/systems/sdcc-genoa/config-command @@ -0,0 +1,16 @@ +../../configure \ +--enable-comms=mpi-auto \ +--enable-unified=yes \ +--enable-shm=shmopen \ +--enable-shm-fast-path=shmopen \ +--enable-accelerator=none \ +--enable-simd=AVX512 \ +--disable-accelerator-cshift \ +--disable-fermion-reps \ +--disable-gparity \ +CXX=clang++ \ +MPICXX=mpicxx \ +CXXFLAGS="-std=c++17" + + + diff --git a/systems/sdcc-genoa/sourceme.sh b/systems/sdcc-genoa/sourceme.sh new file mode 100644 index 00000000..4f37888c --- /dev/null +++ b/systems/sdcc-genoa/sourceme.sh @@ -0,0 +1,4 @@ +source $HOME/spack/share/spack/setup-env.sh +spack load llvm@17.0.4 +export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-17.0.4-laufdrcip63ivkadmtgoepwmj3dtztdu/lib:$LD_LIBRARY_PATH +module load openmpi