mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 14:04:32 +00:00 
			
		
		
		
	Merge branch 'develop' of https://github.com/paboyle/Grid into develop
This commit is contained in:
		
							
								
								
									
										67
									
								
								systems/Aurora/benchmarks/bench1.pbs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										67
									
								
								systems/Aurora/benchmarks/bench1.pbs
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,67 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
#PBS -q debug
 | 
			
		||||
#PBS -l select=1
 | 
			
		||||
#PBS -l walltime=00:20:00
 | 
			
		||||
#PBS -A LatticeQCD_aesp_CNDA
 | 
			
		||||
 | 
			
		||||
#export OMP_PROC_BIND=spread
 | 
			
		||||
#unset OMP_PLACES
 | 
			
		||||
 | 
			
		||||
cd $PBS_O_WORKDIR
 | 
			
		||||
 | 
			
		||||
source ../sourceme.sh
 | 
			
		||||
module load pti-gpu
 | 
			
		||||
 | 
			
		||||
#cat $PBS_NODEFILE
 | 
			
		||||
 | 
			
		||||
export OMP_NUM_THREADS=4
 | 
			
		||||
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 | 
			
		||||
 | 
			
		||||
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 | 
			
		||||
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 | 
			
		||||
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
 | 
			
		||||
 | 
			
		||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 | 
			
		||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
 | 
			
		||||
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
 | 
			
		||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
 | 
			
		||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
 | 
			
		||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 | 
			
		||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 | 
			
		||||
export MPICH_OFI_NIC_POLICY=GPU
 | 
			
		||||
 | 
			
		||||
# 12 ppn, 2 nodes, 24 ranks
 | 
			
		||||
#
 | 
			
		||||
CMD="mpiexec -np 12 -ppn 12  -envall \
 | 
			
		||||
	     ./gpu_tile_compact.sh \
 | 
			
		||||
	     ./Benchmark_comms_host_device --mpi 2.2.1.3 --grid 24.32.32.24 \
 | 
			
		||||
		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32" 
 | 
			
		||||
#$CMD | tee 1node.comms
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
CMD="mpiexec -np 1 -ppn 1  -envall \
 | 
			
		||||
	     ./gpu_tile_compact.sh \
 | 
			
		||||
	     ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 \
 | 
			
		||||
		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 "
 | 
			
		||||
#$CMD | tee 1tile.dwf
 | 
			
		||||
 | 
			
		||||
CMD="mpiexec -np 12 -ppn 12  -envall \
 | 
			
		||||
	     ./gpu_tile_compact.sh \
 | 
			
		||||
	     ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 32.32.32.48 \
 | 
			
		||||
		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
 | 
			
		||||
$CMD | tee 1node.32.32.32.48.dwf
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
CMD="mpiexec -np 12 -ppn 12  -envall \
 | 
			
		||||
	     ./gpu_tile_compact.sh \
 | 
			
		||||
	     ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.64.32.96 \
 | 
			
		||||
		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
 | 
			
		||||
#$CMD | tee 1node.64.64.32.96.dwf
 | 
			
		||||
 | 
			
		||||
CMD="mpiexec -np 12 -ppn 12  -envall \
 | 
			
		||||
	     ./gpu_tile_compact.sh \
 | 
			
		||||
	     ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.32.32.48 \
 | 
			
		||||
		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
 | 
			
		||||
#$CMD | tee 1node.64.32.32.48.dwf
 | 
			
		||||
 | 
			
		||||
@@ -1,10 +1,8 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
 | 
			
		||||
 | 
			
		||||
#PBS -q EarlyAppAccess
 | 
			
		||||
#PBS -q workq
 | 
			
		||||
#PBS -l select=2
 | 
			
		||||
#PBS -l walltime=01:00:00
 | 
			
		||||
#PBS -l walltime=00:20:00
 | 
			
		||||
#PBS -A LatticeQCD_aesp_CNDA
 | 
			
		||||
 | 
			
		||||
#export OMP_PROC_BIND=spread
 | 
			
		||||
@@ -13,11 +11,13 @@
 | 
			
		||||
cd $PBS_O_WORKDIR
 | 
			
		||||
 | 
			
		||||
source ../sourceme.sh
 | 
			
		||||
module load pti-gpu
 | 
			
		||||
 | 
			
		||||
export OMP_NUM_THREADS=3
 | 
			
		||||
#cat $PBS_NODEFILE
 | 
			
		||||
 | 
			
		||||
export OMP_NUM_THREADS=4
 | 
			
		||||
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 | 
			
		||||
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 | 
			
		||||
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
 | 
			
		||||
@@ -31,30 +31,25 @@ export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 | 
			
		||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 | 
			
		||||
export MPICH_OFI_NIC_POLICY=GPU
 | 
			
		||||
 | 
			
		||||
# 12 ppn, 2 nodes, 24 ranks
 | 
			
		||||
#
 | 
			
		||||
CMD="mpiexec -np 24 -ppn 12  -envall \
 | 
			
		||||
	     ./gpu_tile_compact.sh \
 | 
			
		||||
	     ./Benchmark_comms_host_device --mpi 2.3.2.2 --grid 32.24.32.192 \
 | 
			
		||||
		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
 | 
			
		||||
	     ./Benchmark_comms_host_device --mpi 2.2.2.3 --grid 24.32.32.24 \
 | 
			
		||||
		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32" 
 | 
			
		||||
$CMD | tee 2node.comms
 | 
			
		||||
 | 
			
		||||
#$CMD 
 | 
			
		||||
 | 
			
		||||
CMD="mpiexec -np 24 -ppn 12  -envall \
 | 
			
		||||
	     ./gpu_tile_compact.sh \
 | 
			
		||||
	     ./Benchmark_dwf_fp32 --mpi 2.3.2.2 --grid 64.96.64.64 --comms-overlap \
 | 
			
		||||
		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
 | 
			
		||||
	     ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.32.64.48 \
 | 
			
		||||
		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
 | 
			
		||||
$CMD | tee 2node.32.32.64.48.dwf
 | 
			
		||||
 | 
			
		||||
#$CMD 
 | 
			
		||||
 | 
			
		||||
CMD="mpiexec -np 1 -ppn 1  -envall \
 | 
			
		||||
CMD="mpiexec -np 24 -ppn 12  -envall \
 | 
			
		||||
	     ./gpu_tile_compact.sh \
 | 
			
		||||
	     ./Benchmark_dwf --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \
 | 
			
		||||
		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
 | 
			
		||||
	     ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 64.64.64.96 \
 | 
			
		||||
		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
 | 
			
		||||
$CMD | tee 2node.64.64.64.96.dwf
 | 
			
		||||
 | 
			
		||||
$CMD 
 | 
			
		||||
 | 
			
		||||
CMD="mpiexec -np 1 -ppn 1  -envall \
 | 
			
		||||
	     ./gpu_tile_compact.sh \
 | 
			
		||||
	     ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \
 | 
			
		||||
		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
 | 
			
		||||
 | 
			
		||||
$CMD 
 | 
			
		||||
@@ -1,33 +1,34 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
export NUMA_MAP=(2 2 2 3 3 3 2 2 2 3 3 3 )
 | 
			
		||||
#export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
 | 
			
		||||
export NUMA_PMAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
 | 
			
		||||
export  NIC_MAP=(0 1 2 4 5 6 0 1 2 4 5 6 )
 | 
			
		||||
export  GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )
 | 
			
		||||
export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
 | 
			
		||||
#export NUMA_MAP=(2 2 2 3 3 3 2 2 2 3 3 3 )
 | 
			
		||||
#export NUMA_MAP=(0 0 1 1 0 0 1 1 0 0 1 1);
 | 
			
		||||
#export  GPU_MAP=(0.0 0.1 3.0 3.1 1.0 1.1 4.0 4.1 2.0 2.1 5.0 5.1)
 | 
			
		||||
 | 
			
		||||
export NUMA_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 );
 | 
			
		||||
export  GPU_MAP=(0.0 1.0 2.0 3.0 4.0 5.0 0.1 1.1 2.1 3.1 4.1 5.1 )
 | 
			
		||||
 | 
			
		||||
export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
 | 
			
		||||
export NUMAP=${NUMA_PMAP[$PALS_LOCAL_RANKID]}
 | 
			
		||||
export NIC=${NIC_MAP[$PALS_LOCAL_RANKID]}
 | 
			
		||||
export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
 | 
			
		||||
export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}
 | 
			
		||||
  
 | 
			
		||||
#export GRID_MPICH_NIC_BIND=$NIC
 | 
			
		||||
#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id
 | 
			
		||||
 | 
			
		||||
unset EnableWalkerPartition
 | 
			
		||||
export EnableImplicitScaling=0
 | 
			
		||||
export ZE_AFFINITY_MASK=$gpu_id.$tile_id
 | 
			
		||||
export ZE_AFFINITY_MASK=$gpu_id
 | 
			
		||||
export ONEAPI_DEVICE_FILTER=gpu,level_zero
 | 
			
		||||
 | 
			
		||||
#export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
 | 
			
		||||
#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
 | 
			
		||||
#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 | 
			
		||||
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
 | 
			
		||||
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 | 
			
		||||
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:5
 | 
			
		||||
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
 | 
			
		||||
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
 | 
			
		||||
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
 | 
			
		||||
#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
 | 
			
		||||
 | 
			
		||||
#echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
 | 
			
		||||
echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
 | 
			
		||||
 | 
			
		||||
numactl -m $NUMA -N $NUMAP  "$@"
 | 
			
		||||
if [ $PALS_RANKID = "0" ]
 | 
			
		||||
then
 | 
			
		||||
#    numactl -m $NUMA -N $NUMA onetrace --chrome-device-timeline  "$@"
 | 
			
		||||
#    numactl -m $NUMA -N $NUMA unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@"
 | 
			
		||||
    numactl -m $NUMA -N $NUMA  "$@"
 | 
			
		||||
else 
 | 
			
		||||
    numactl -m $NUMA -N $NUMA  "$@"
 | 
			
		||||
fi
 | 
			
		||||
 
 | 
			
		||||
@@ -7,7 +7,7 @@
 | 
			
		||||
	--disable-fermion-reps \
 | 
			
		||||
	--enable-shm=nvlink \
 | 
			
		||||
	--enable-accelerator=sycl \
 | 
			
		||||
	--enable-accelerator-aware-mpi=no\
 | 
			
		||||
	--enable-accelerator-aware-mpi=yes\
 | 
			
		||||
	--enable-unified=no \
 | 
			
		||||
	MPICXX=mpicxx \
 | 
			
		||||
	CXX=icpx \
 | 
			
		||||
 
 | 
			
		||||
@@ -1,7 +1,9 @@
 | 
			
		||||
#export ONEAPI_DEVICE_SELECTOR=level_zero:0.0
 | 
			
		||||
 | 
			
		||||
module use /soft/modulefiles
 | 
			
		||||
module load intel_compute_runtime/release/agama-devel-682.22
 | 
			
		||||
module load oneapi/release/2023.12.15.001
 | 
			
		||||
 | 
			
		||||
#module use /soft/modulefiles
 | 
			
		||||
#module load intel_compute_runtime/release/agama-devel-682.22
 | 
			
		||||
 | 
			
		||||
export FI_CXI_DEFAULT_CQ_SIZE=131072
 | 
			
		||||
export FI_CXI_CQ_FILL_PERCENT=20
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user