diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc index 0d9a8874..95823de5 100644 --- a/Grid/threads/Accelerator.cc +++ b/Grid/threads/Accelerator.cc @@ -210,8 +210,8 @@ void acceleratorInit(void) cl::sycl::gpu_selector selector; cl::sycl::device selectedDevice { selector }; theGridAccelerator = new sycl::queue (selectedDevice); - // theCopyAccelerator = new sycl::queue (selectedDevice); - theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway. + theCopyAccelerator = new sycl::queue (selectedDevice); + // theCopyAccelerator = theGridAccelerator; // Should proceed concurrently anyway. #ifdef GRID_SYCL_LEVEL_ZERO_IPC zeInit(0); diff --git a/Grid/util/FlightRecorder.cc b/Grid/util/FlightRecorder.cc index 8e805575..60d18fb6 100644 --- a/Grid/util/FlightRecorder.cc +++ b/Grid/util/FlightRecorder.cc @@ -247,9 +247,12 @@ void FlightRecorder::ReductionLog(double local,double global) } void FlightRecorder::xmitLog(void *buf,uint64_t bytes) { + if(LoggingMode == LoggingModeNone) return; + if ( ChecksumCommsSend ){ uint64_t *ubuf = (uint64_t *)buf; if(LoggingMode == LoggingModeNone) return; + #ifdef GRID_SYCL uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t)); if(LoggingMode == LoggingModePrint) { diff --git a/systems/Aurora/benchmarks/bench1.pbs b/systems/Aurora/benchmarks/bench1.pbs new file mode 100644 index 00000000..49bc0b24 --- /dev/null +++ b/systems/Aurora/benchmarks/bench1.pbs @@ -0,0 +1,67 @@ +#!/bin/bash + +#PBS -q debug +#PBS -l select=1 +#PBS -l walltime=00:20:00 +#PBS -A LatticeQCD_aesp_CNDA + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +cd $PBS_O_WORKDIR + +source ../sourceme.sh +module load pti-gpu + +#cat $PBS_NODEFILE + +export OMP_NUM_THREADS=4 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 + +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE +#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST + +#export 
MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +# 12 ppn, 1 node, 12 ranks +# +CMD="mpiexec -np 12 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_comms_host_device --mpi 2.2.1.3 --grid 24.32.32.24 \ + --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32" +#$CMD | tee 1node.comms + + +CMD="mpiexec -np 1 -ppn 1 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 \ + --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 " +#$CMD | tee 1tile.dwf + +CMD="mpiexec -np 12 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 32.32.32.48 \ + --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 1node.32.32.32.48.dwf + + +CMD="mpiexec -np 12 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.64.32.96 \ + --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +#$CMD | tee 1node.64.64.32.96.dwf + +CMD="mpiexec -np 12 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.32.32.48 \ + --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +#$CMD | tee 1node.64.32.32.48.dwf + diff --git a/systems/Aurora/benchmarks/bench12.pbs b/systems/Aurora/benchmarks/bench2.pbs similarity index 52% rename from systems/Aurora/benchmarks/bench12.pbs rename to systems/Aurora/benchmarks/bench2.pbs index ee3cb381..ea469cda 100644 --- a/systems/Aurora/benchmarks/bench12.pbs +++ b/systems/Aurora/benchmarks/bench2.pbs @@ -1,10 +1,8 @@ 
#!/bin/bash -## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 - -#PBS -q EarlyAppAccess +#PBS -q workq #PBS -l select=2 -#PBS -l walltime=01:00:00 +#PBS -l walltime=00:20:00 #PBS -A LatticeQCD_aesp_CNDA #export OMP_PROC_BIND=spread @@ -13,11 +11,13 @@ cd $PBS_O_WORKDIR source ../sourceme.sh +module load pti-gpu -export OMP_NUM_THREADS=3 +#cat $PBS_NODEFILE + +export OMP_NUM_THREADS=4 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 - #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST @@ -31,30 +31,25 @@ export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 export MPICH_OFI_NIC_POLICY=GPU +# 12 ppn, 2 nodes, 24 ranks +# CMD="mpiexec -np 24 -ppn 12 -envall \ ./gpu_tile_compact.sh \ - ./Benchmark_comms_host_device --mpi 2.3.2.2 --grid 32.24.32.192 \ - --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + ./Benchmark_comms_host_device --mpi 2.2.2.3 --grid 24.32.32.24 \ + --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32" +$CMD | tee 2node.comms -#$CMD CMD="mpiexec -np 24 -ppn 12 -envall \ ./gpu_tile_compact.sh \ - ./Benchmark_dwf_fp32 --mpi 2.3.2.2 --grid 64.96.64.64 --comms-overlap \ - --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.32.64.48 \ + --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 2node.32.32.64.48.dwf -#$CMD -CMD="mpiexec -np 1 -ppn 1 -envall \ +CMD="mpiexec -np 24 -ppn 12 -envall \ ./gpu_tile_compact.sh \ - ./Benchmark_dwf --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \ - --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 64.64.64.96 \ + --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 
2node.64.64.64.96.dwf -$CMD - -CMD="mpiexec -np 1 -ppn 1 -envall \ - ./gpu_tile_compact.sh \ - ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \ - --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" - -$CMD diff --git a/systems/Aurora/benchmarks/gpu_tile_compact.sh b/systems/Aurora/benchmarks/gpu_tile_compact.sh index 5cab1ee3..099a0ded 100755 --- a/systems/Aurora/benchmarks/gpu_tile_compact.sh +++ b/systems/Aurora/benchmarks/gpu_tile_compact.sh @@ -1,33 +1,34 @@ #!/bin/bash -export NUMA_MAP=(2 2 2 3 3 3 2 2 2 3 3 3 ) -#export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 ) -export NUMA_PMAP=(0 0 0 1 1 1 0 0 0 1 1 1 ) -export NIC_MAP=(0 1 2 4 5 6 0 1 2 4 5 6 ) -export GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 ) -export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 ) +#export NUMA_MAP=(2 2 2 3 3 3 2 2 2 3 3 3 ) +#export NUMA_MAP=(0 0 1 1 0 0 1 1 0 0 1 1); +#export GPU_MAP=(0.0 0.1 3.0 3.1 1.0 1.1 4.0 4.1 2.0 2.1 5.0 5.1) + +export NUMA_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 ); +export GPU_MAP=(0.0 1.0 2.0 3.0 4.0 5.0 0.1 1.1 2.1 3.1 4.1 5.1 ) export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]} -export NUMAP=${NUMA_PMAP[$PALS_LOCAL_RANKID]} -export NIC=${NIC_MAP[$PALS_LOCAL_RANKID]} export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]} -export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]} -#export GRID_MPICH_NIC_BIND=$NIC -#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id - unset EnableWalkerPartition export EnableImplicitScaling=0 -export ZE_AFFINITY_MASK=$gpu_id.$tile_id +export ZE_AFFINITY_MASK=$gpu_id export ONEAPI_DEVICE_FILTER=gpu,level_zero -#export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1 -#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0 -#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0 +export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:5 #export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2 -#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 +export 
SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1 -#echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA " +echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA " -numactl -m $NUMA -N $NUMAP "$@" +if [ $PALS_RANKID = "0" ] +then +# numactl -m $NUMA -N $NUMA onetrace --chrome-device-timeline "$@" +# numactl -m $NUMA -N $NUMA unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@" + numactl -m $NUMA -N $NUMA "$@" +else + numactl -m $NUMA -N $NUMA "$@" +fi diff --git a/systems/Aurora/config-command b/systems/Aurora/config-command index 678acb4b..58eb8a03 100644 --- a/systems/Aurora/config-command +++ b/systems/Aurora/config-command @@ -7,7 +7,7 @@ --disable-fermion-reps \ --enable-shm=nvlink \ --enable-accelerator=sycl \ - --enable-accelerator-aware-mpi=no\ + --enable-accelerator-aware-mpi=yes\ --enable-unified=no \ MPICXX=mpicxx \ CXX=icpx \ diff --git a/systems/Aurora/sourceme.sh b/systems/Aurora/sourceme.sh index 8951e43c..b43b3b71 100644 --- a/systems/Aurora/sourceme.sh +++ b/systems/Aurora/sourceme.sh @@ -1,7 +1,9 @@ #export ONEAPI_DEVICE_SELECTOR=level_zero:0.0 -module use /soft/modulefiles -module load intel_compute_runtime/release/agama-devel-682.22 +module load oneapi/release/2023.12.15.001 + +#module use /soft/modulefiles +#module load intel_compute_runtime/release/agama-devel-682.22 export FI_CXI_DEFAULT_CQ_SIZE=131072 export FI_CXI_CQ_FILL_PERCENT=20