mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-12 20:27:06 +01:00
Significantly better performance on Aurora without using pipeline mode
This commit is contained in:
@ -27,10 +27,22 @@ export MPICH_OFI_NIC_POLICY=GPU
|
||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
|
||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
|
||||
|
||||
#
|
||||
# Local vol 16.16.16.32
|
||||
#
|
||||
|
||||
#VOL=32.64.64.96
|
||||
|
||||
for VOL in 32.32.32.96 32.64.64.96
|
||||
do
|
||||
for AT in 32
|
||||
do
|
||||
CMD="mpiexec -np 24 -ppn 12 -envall \
|
||||
./gpu_tile.sh ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.64.64.96 \
|
||||
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 8 "
|
||||
./gpu_tile.sh ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid $VOL \
|
||||
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads $AT --comms-overlap "
|
||||
|
||||
echo $CMD
|
||||
$CMD
|
||||
done
|
||||
done
|
||||
|
||||
|
@ -5,11 +5,11 @@
|
||||
#export GPU_MAP=(0.0 0.1 3.0 3.1 1.0 1.1 4.0 4.1 2.0 2.1 5.0 5.1)
|
||||
|
||||
export NUMA_PMAP=(0 0 0 1 1 1 0 0 0 1 1 1 );
|
||||
export NUMA_MMAP=(2 2 2 3 3 3 3 2 2 2 2 3 3 3 );
|
||||
export NUMA_HMAP=(2 2 2 3 3 3 3 2 2 2 2 3 3 3 );
|
||||
export GPU_MAP=(0.0 1.0 2.0 3.0 4.0 5.0 0.1 1.1 2.1 3.1 4.1 5.1 )
|
||||
|
||||
export NUMAP=${NUMA_PMAP[$PALS_LOCAL_RANKID]}
|
||||
export NUMAM=${NUMA_PMAP[$PALS_LOCAL_RANKID]}
|
||||
export NUMAH=${NUMA_HMAP[$PALS_LOCAL_RANKID]}
|
||||
export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
|
||||
|
||||
unset EnableWalkerPartition
|
||||
@ -19,17 +19,19 @@ export ONEAPI_DEVICE_FILTER=gpu,level_zero
|
||||
|
||||
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
|
||||
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
|
||||
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:5
|
||||
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:7
|
||||
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
|
||||
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
|
||||
#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
|
||||
|
||||
export MPI_BUF_NUMA=$NUMAH
|
||||
|
||||
echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
|
||||
|
||||
if [ $PALS_RANKID = "0" ]
|
||||
then
|
||||
numactl -m $NUMAM -N $NUMAP unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@"
|
||||
# numactl -m $NUMAM -N $NUMAP "$@"
|
||||
numactl -p $NUMAP -N $NUMAP unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@"
|
||||
# numactl -p $NUMAP -N $NUMAP "$@"
|
||||
else
|
||||
numactl -m $NUMAM -N $NUMAP "$@"
|
||||
numactl -p $NUMAP -N $NUMAP "$@"
|
||||
fi
|
||||
|
@ -1,6 +1,7 @@
|
||||
#Ahead of time compile for PVC
|
||||
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
|
||||
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions "
|
||||
|
||||
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib"
|
||||
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/"
|
||||
|
||||
#JIT compile
|
||||
#export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
|
||||
|
Reference in New Issue
Block a user