From 8a098889fccb9916d30f95706c4af896e401b21f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 30 Apr 2024 21:15:08 +0100 Subject: [PATCH 1/8] Update FlightRecorder.cc --- Grid/util/FlightRecorder.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Grid/util/FlightRecorder.cc b/Grid/util/FlightRecorder.cc index 4b8e0346..32fcd48b 100644 --- a/Grid/util/FlightRecorder.cc +++ b/Grid/util/FlightRecorder.cc @@ -290,7 +290,9 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes) deviceVector dev(1); acceleratorCopyToDevice(&word,&dev[0],sizeof(uint64_t)); acceleratorCopySynchronise(); +#ifndef GRID_COMMS_NONE MPI_Barrier(MPI_COMM_WORLD); +#endif } } void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank) From 24602e1259e9c63260e2fe3eafc738f0f6b22211 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 7 May 2024 17:28:38 +0000 Subject: [PATCH 2/8] Accidental synchronise --- Grid/util/FlightRecorder.cc | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/Grid/util/FlightRecorder.cc b/Grid/util/FlightRecorder.cc index 32fcd48b..60d18fb6 100644 --- a/Grid/util/FlightRecorder.cc +++ b/Grid/util/FlightRecorder.cc @@ -247,9 +247,12 @@ void FlightRecorder::ReductionLog(double local,double global) } void FlightRecorder::xmitLog(void *buf,uint64_t bytes) { + if(LoggingMode == LoggingModeNone) return; + if ( ChecksumCommsSend ){ uint64_t *ubuf = (uint64_t *)buf; if(LoggingMode == LoggingModeNone) return; + #ifdef GRID_SYCL uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t)); if(LoggingMode == LoggingModePrint) { @@ -284,14 +287,6 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes) } XmitLoggingCounter++; } -#endif - } else { - uint64_t word = 1; - deviceVector dev(1); - acceleratorCopyToDevice(&word,&dev[0],sizeof(uint64_t)); - acceleratorCopySynchronise(); -#ifndef GRID_COMMS_NONE - MPI_Barrier(MPI_COMM_WORLD); #endif } } From cd52e3cbc2567d8df96a165efb1a2967759163eb Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 7 May 2024 18:38:15 +0000 Subject: [PATCH 3/8] Jobs on subspot --- systems/Aurora/benchmarks/bench1.pbs | 67 ++++++++++++++++++++++++++++ systems/Aurora/benchmarks/bench2.pbs | 55 +++++++++++++++++++++++ 2 files changed, 122 insertions(+) create mode 100644 systems/Aurora/benchmarks/bench1.pbs create mode 100644 systems/Aurora/benchmarks/bench2.pbs diff --git a/systems/Aurora/benchmarks/bench1.pbs b/systems/Aurora/benchmarks/bench1.pbs new file mode 100644 index 00000000..49bc0b24 --- /dev/null +++ b/systems/Aurora/benchmarks/bench1.pbs @@ -0,0 +1,67 @@ +#!/bin/bash + +#PBS -q debug +#PBS -l select=1 +#PBS -l walltime=00:20:00 +#PBS -A LatticeQCD_aesp_CNDA + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +cd $PBS_O_WORKDIR + +source ../sourceme.sh +module load pti-gpu + +#cat $PBS_NODEFILE + +export OMP_NUM_THREADS=4 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 + +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE +#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST + +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +# 12 ppn, 2 nodes, 24 ranks +# +CMD="mpiexec -np 12 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_comms_host_device --mpi 2.2.1.3 --grid 24.32.32.24 \ + --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32" +#$CMD | tee 1node.comms + + +CMD="mpiexec -np 1 -ppn 1 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 \ + --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 " +#$CMD | tee 1tile.dwf + +CMD="mpiexec -np 12 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 32.32.32.48 \ + --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 1node.32.32.32.48.dwf + + +CMD="mpiexec -np 12 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.64.32.96 \ + --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +#$CMD | tee 1node.64.64.32.96.dwf + +CMD="mpiexec -np 12 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.32.32.48 \ + --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +#$CMD | tee 1node.64.32.32.48.dwf + diff --git a/systems/Aurora/benchmarks/bench2.pbs b/systems/Aurora/benchmarks/bench2.pbs new file mode 100644 index 00000000..ea469cda --- /dev/null +++ b/systems/Aurora/benchmarks/bench2.pbs @@ -0,0 +1,55 @@ +#!/bin/bash + +#PBS -q workq +#PBS -l select=2 +#PBS -l walltime=00:20:00 +#PBS -A LatticeQCD_aesp_CNDA + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +cd $PBS_O_WORKDIR + +source ../sourceme.sh +module load pti-gpu + +#cat $PBS_NODEFILE + +export OMP_NUM_THREADS=4 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 + +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE +#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST + +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +# 12 ppn, 2 nodes, 24 ranks +# +CMD="mpiexec -np 24 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_comms_host_device --mpi 2.2.2.3 --grid 24.32.32.24 \ + --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32" +$CMD | tee 2node.comms + + +CMD="mpiexec -np 24 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.32.64.48 \ + --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 2node.32.32.64.48.dwf + + +CMD="mpiexec -np 24 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 64.64.64.96 \ + --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 2node.64.64.64.96.dwf + From 057f86c1de80febeaccf66776a806ab3652de6a7 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 7 May 2024 18:42:50 +0000 Subject: [PATCH 4/8] 2 queues works ok in performance --- Grid/threads/Accelerator.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc index 19411b62..13466ca0 100644 --- a/Grid/threads/Accelerator.cc +++ b/Grid/threads/Accelerator.cc @@ -210,8 +210,8 @@ void acceleratorInit(void) cl::sycl::gpu_selector selector; cl::sycl::device selectedDevice { selector }; theGridAccelerator = new sycl::queue (selectedDevice); - // theCopyAccelerator = new sycl::queue (selectedDevice); - theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway. + theCopyAccelerator = new sycl::queue (selectedDevice); + // theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway. #ifdef GRID_SYCL_LEVEL_ZERO_IPC zeInit(0); From 5c4c9f721a6bf0faa5b751219b238cb7c110bf87 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 7 May 2024 18:44:49 +0000 Subject: [PATCH 5/8] Remove pbs file and replace with bench1 and bench2 for 1 and 2 nodes --- systems/Aurora/benchmarks/bench12.pbs | 60 --------------------------- 1 file changed, 60 deletions(-) delete mode 100644 systems/Aurora/benchmarks/bench12.pbs diff --git a/systems/Aurora/benchmarks/bench12.pbs b/systems/Aurora/benchmarks/bench12.pbs deleted file mode 100644 index ee3cb381..00000000 --- a/systems/Aurora/benchmarks/bench12.pbs +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash - -## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 - -#PBS -q EarlyAppAccess -#PBS -l select=2 -#PBS -l walltime=01:00:00 -#PBS -A LatticeQCD_aesp_CNDA - -#export OMP_PROC_BIND=spread -#unset OMP_PLACES - -cd $PBS_O_WORKDIR - -source ../sourceme.sh - -export OMP_NUM_THREADS=3 -export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 - - -#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE -#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE -#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST - -export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 -export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 -export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 -export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 -export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 -export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 -export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 -export MPICH_OFI_NIC_POLICY=GPU - -CMD="mpiexec -np 24 -ppn 12 -envall \ - ./gpu_tile_compact.sh \ - ./Benchmark_comms_host_device --mpi 2.3.2.2 --grid 32.24.32.192 \ - --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" - -#$CMD - -CMD="mpiexec -np 24 -ppn 12 -envall \ - ./gpu_tile_compact.sh \ - ./Benchmark_dwf_fp32 --mpi 2.3.2.2 --grid 64.96.64.64 --comms-overlap \ - --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" - -#$CMD - -CMD="mpiexec -np 1 -ppn 1 -envall \ - ./gpu_tile_compact.sh \ - ./Benchmark_dwf --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \ - --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" - -$CMD - -CMD="mpiexec -np 1 -ppn 1 -envall \ - ./gpu_tile_compact.sh \ - ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \ - --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" - -$CMD From c29322810290daac38813da5c1fc1fdeba81efdd Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 7 May 2024 18:45:21 +0000 Subject: [PATCH 6/8] layout control --- systems/Aurora/benchmarks/gpu_tile_compact.sh | 39 ++++++++++--------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/systems/Aurora/benchmarks/gpu_tile_compact.sh b/systems/Aurora/benchmarks/gpu_tile_compact.sh index 5cab1ee3..099a0ded 100755 --- a/systems/Aurora/benchmarks/gpu_tile_compact.sh +++ b/systems/Aurora/benchmarks/gpu_tile_compact.sh @@ -1,33 +1,34 @@ #!/bin/bash -export NUMA_MAP=(2 2 2 3 3 3 2 2 2 3 3 3 ) -#export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 ) -export NUMA_PMAP=(0 0 0 1 1 1 0 0 0 1 1 1 ) -export NIC_MAP=(0 1 2 4 5 6 0 1 2 4 5 6 ) -export GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 ) -export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 ) +#export NUMA_MAP=(2 2 2 3 3 3 2 2 2 3 3 3 ) +#export NUMA_MAP=(0 0 1 1 0 0 1 1 0 0 1 1); +#export GPU_MAP=(0.0 0.1 3.0 3.1 1.0 1.1 4.0 4.1 2.0 2.1 5.0 5.1) + +export NUMA_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 ); +export GPU_MAP=(0.0 1.0 2.0 3.0 4.0 5.0 0.1 1.1 2.1 3.1 4.1 5.1 ) export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]} -export NUMAP=${NUMA_PMAP[$PALS_LOCAL_RANKID]} -export NIC=${NIC_MAP[$PALS_LOCAL_RANKID]} export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]} -export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]} -#export GRID_MPICH_NIC_BIND=$NIC -#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id - unset EnableWalkerPartition export EnableImplicitScaling=0 -export ZE_AFFINITY_MASK=$gpu_id.$tile_id +export ZE_AFFINITY_MASK=$gpu_id export ONEAPI_DEVICE_FILTER=gpu,level_zero -#export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1 -#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0 -#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0 +export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:5 #export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2 -#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 +export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1 -#echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA " +echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA " -numactl -m $NUMA -N $NUMAP "$@" +if [ $PALS_RANKID = "0" ] +then +# numactl -m $NUMA -N $NUMA onetrace --chrome-device-timeline "$@" +# numactl -m $NUMA -N $NUMA unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@" + numactl -m $NUMA -N $NUMA "$@" +else + numactl -m $NUMA -N $NUMA "$@" +fi From 7aa12b446f22d3679e4895d9c02cd4feb4c51cf5 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 7 May 2024 18:45:40 +0000 Subject: [PATCH 7/8] New config command for sunspot --- systems/Aurora/config-command | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/systems/Aurora/config-command b/systems/Aurora/config-command index 678acb4b..58eb8a03 100644 --- a/systems/Aurora/config-command +++ b/systems/Aurora/config-command @@ -7,7 +7,7 @@ --disable-fermion-reps \ --enable-shm=nvlink \ --enable-accelerator=sycl \ - --enable-accelerator-aware-mpi=no\ + --enable-accelerator-aware-mpi=yes\ --enable-unified=no \ MPICXX=mpicxx \ CXX=icpx \ From ccf147d6c19e10a8a1640727ec939196917c468c Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 7 May 2024 18:45:56 +0000 Subject: [PATCH 8/8] Select the compiler that gives better performance on sunspot --- systems/Aurora/sourceme.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/systems/Aurora/sourceme.sh b/systems/Aurora/sourceme.sh index 8951e43c..b43b3b71 100644 --- a/systems/Aurora/sourceme.sh +++ b/systems/Aurora/sourceme.sh @@ -1,7 +1,9 @@ #export ONEAPI_DEVICE_SELECTOR=level_zero:0.0 -module use /soft/modulefiles -module load intel_compute_runtime/release/agama-devel-682.22 +module load oneapi/release/2023.12.15.001 + +#module use /soft/modulefiles +#module load intel_compute_runtime/release/agama-devel-682.22 export FI_CXI_DEFAULT_CQ_SIZE=131072 export FI_CXI_CQ_FILL_PERCENT=20