diff --git a/systems/Aurora/benchmarks/bench1024.pbs b/systems/Aurora/benchmarks/bench1024.pbs index 88f0100a..2e99ae4b 100644 --- a/systems/Aurora/benchmarks/bench1024.pbs +++ b/systems/Aurora/benchmarks/bench1024.pbs @@ -25,12 +25,16 @@ export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 -export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 export MPICH_OFI_NIC_POLICY=GPU +export FI_CXI_CQ_FILL_PERCENT=10 +export FI_CXI_DEFAULT_CQ_SIZE=262144 +#export FI_CXI_DEFAULT_CQ_SIZE=131072 +#export FI_CXI_CQ_FILL_PERCENT=20 # 12 ppn, 32 nodes, 384 ranks # @@ -45,12 +49,12 @@ CMD="mpiexec -np 12288 -ppn 12 -envall \ ./gpu_tile_compact.sh \ ./Benchmark_dwf_fp32 --mpi 8.8.8.24 --grid 128.128.128.384 \ --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" -$CMD | tee 1024node.dwf.small +$CMD | tee 1024node.dwf.small.cq CMD="mpiexec -np 12288 -ppn 12 -envall \ ./gpu_tile_compact.sh \ ./Benchmark_dwf_fp32 --mpi 16.8.8.12 --grid 256.256.256.384 \ --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" -$CMD | tee 1024node.dwf +$CMD | tee 1024node.dwf.cq diff --git a/systems/Aurora/benchmarks/bench12.pbs b/systems/Aurora/benchmarks/bench12.pbs index 96f6143f..ee3cb381 100644 --- a/systems/Aurora/benchmarks/bench12.pbs +++ b/systems/Aurora/benchmarks/bench12.pbs @@ -17,6 +17,7 @@ source ../sourceme.sh export OMP_NUM_THREADS=3 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 + #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST @@ -35,11 +36,25 @@ CMD="mpiexec -np 24 -ppn 12 -envall \ ./Benchmark_comms_host_device --mpi 2.3.2.2 --grid 32.24.32.192 \ --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" -$CMD +#$CMD CMD="mpiexec -np 24 -ppn 12 -envall \ ./gpu_tile_compact.sh \ ./Benchmark_dwf_fp32 --mpi 2.3.2.2 --grid 64.96.64.64 --comms-overlap \ --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" +#$CMD + +CMD="mpiexec -np 1 -ppn 1 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +$CMD + +CMD="mpiexec -np 1 -ppn 1 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + $CMD diff --git a/systems/Aurora/config-command b/systems/Aurora/config-command index e59ef515..689747c9 100644 --- a/systems/Aurora/config-command +++ b/systems/Aurora/config-command @@ -11,6 +11,6 @@ TOOLS=$HOME/tools --enable-unified=no \ MPICXX=mpicxx \ CXX=icpx \ - LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/" \ - CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include" + LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/ -L${MKLROOT}/lib -qmkl=parallel " \ + CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include -qmkl=parallel" diff --git a/systems/Aurora/sourceme.sh b/systems/Aurora/sourceme.sh index 7a2b3815..effb2d5d 100644 --- a/systems/Aurora/sourceme.sh +++ b/systems/Aurora/sourceme.sh @@ -3,6 +3,19 @@ module use /soft/modulefiles module load intel_compute_runtime/release/agama-devel-682.22 +export FI_CXI_DEFAULT_CQ_SIZE=131072 +export FI_CXI_CQ_FILL_PERCENT=20 + +export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" +#export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode" + +# +# -ftarget-register-alloc-mode=pvc:default +# -ftarget-register-alloc-mode=pvc:small +# -ftarget-register-alloc-mode=pvc:large +# -ftarget-register-alloc-mode=pvc:auto +# + export HTTP_PROXY=http://proxy.alcf.anl.gov:3128 export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128 export http_proxy=http://proxy.alcf.anl.gov:3128