mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-09 23:45:36 +00:00
Merge branch 'develop' of https://github.com/paboyle/Grid into develop
This commit is contained in:
commit
9a1ad6a5eb
@ -210,8 +210,8 @@ void acceleratorInit(void)
|
||||
cl::sycl::gpu_selector selector;
|
||||
cl::sycl::device selectedDevice { selector };
|
||||
theGridAccelerator = new sycl::queue (selectedDevice);
|
||||
// theCopyAccelerator = new sycl::queue (selectedDevice);
|
||||
theCopyAccelerator = theGridAccelerator; // Should proceed concurrently anyway.
|
||||
theCopyAccelerator = new sycl::queue (selectedDevice);
|
||||
// theCopyAccelerator = theGridAccelerator; // Should proceed concurrently anyway.
|
||||
|
||||
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
|
||||
zeInit(0);
|
||||
|
@ -247,9 +247,12 @@ void FlightRecorder::ReductionLog(double local,double global)
|
||||
}
|
||||
void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
|
||||
{
|
||||
if(LoggingMode == LoggingModeNone) return;
|
||||
|
||||
if ( ChecksumCommsSend ){
|
||||
uint64_t *ubuf = (uint64_t *)buf;
|
||||
if(LoggingMode == LoggingModeNone) return;
|
||||
|
||||
#ifdef GRID_SYCL
|
||||
uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t));
|
||||
if(LoggingMode == LoggingModePrint) {
|
||||
|
67
systems/Aurora/benchmarks/bench1.pbs
Normal file
67
systems/Aurora/benchmarks/bench1.pbs
Normal file
@ -0,0 +1,67 @@
|
||||
#!/bin/bash
|
||||
|
||||
#PBS -q debug
|
||||
#PBS -l select=1
|
||||
#PBS -l walltime=00:20:00
|
||||
#PBS -A LatticeQCD_aesp_CNDA
|
||||
|
||||
#export OMP_PROC_BIND=spread
|
||||
#unset OMP_PLACES
|
||||
|
||||
cd $PBS_O_WORKDIR
|
||||
|
||||
source ../sourceme.sh
|
||||
module load pti-gpu
|
||||
|
||||
#cat $PBS_NODEFILE
|
||||
|
||||
export OMP_NUM_THREADS=4
|
||||
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
|
||||
|
||||
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
|
||||
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
|
||||
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
|
||||
|
||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
|
||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
|
||||
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
|
||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
|
||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
|
||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
|
||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
|
||||
export MPICH_OFI_NIC_POLICY=GPU
|
||||
|
||||
# 12 ppn, 1 node, 12 ranks
|
||||
#
|
||||
CMD="mpiexec -np 12 -ppn 12 -envall \
|
||||
./gpu_tile_compact.sh \
|
||||
./Benchmark_comms_host_device --mpi 2.2.1.3 --grid 24.32.32.24 \
|
||||
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"
|
||||
#$CMD | tee 1node.comms
|
||||
|
||||
|
||||
CMD="mpiexec -np 1 -ppn 1 -envall \
|
||||
./gpu_tile_compact.sh \
|
||||
./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 \
|
||||
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 "
|
||||
#$CMD | tee 1tile.dwf
|
||||
|
||||
CMD="mpiexec -np 12 -ppn 12 -envall \
|
||||
./gpu_tile_compact.sh \
|
||||
./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 32.32.32.48 \
|
||||
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
|
||||
$CMD | tee 1node.32.32.32.48.dwf
|
||||
|
||||
|
||||
CMD="mpiexec -np 12 -ppn 12 -envall \
|
||||
./gpu_tile_compact.sh \
|
||||
./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.64.32.96 \
|
||||
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
|
||||
#$CMD | tee 1node.64.64.32.96.dwf
|
||||
|
||||
CMD="mpiexec -np 12 -ppn 12 -envall \
|
||||
./gpu_tile_compact.sh \
|
||||
./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.32.32.48 \
|
||||
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
|
||||
#$CMD | tee 1node.64.32.32.48.dwf
|
||||
|
@ -1,10 +1,8 @@
|
||||
#!/bin/bash
|
||||
|
||||
## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
|
||||
|
||||
#PBS -q EarlyAppAccess
|
||||
#PBS -q workq
|
||||
#PBS -l select=2
|
||||
#PBS -l walltime=01:00:00
|
||||
#PBS -l walltime=00:20:00
|
||||
#PBS -A LatticeQCD_aesp_CNDA
|
||||
|
||||
#export OMP_PROC_BIND=spread
|
||||
@ -13,11 +11,13 @@
|
||||
cd $PBS_O_WORKDIR
|
||||
|
||||
source ../sourceme.sh
|
||||
module load pti-gpu
|
||||
|
||||
export OMP_NUM_THREADS=3
|
||||
#cat $PBS_NODEFILE
|
||||
|
||||
export OMP_NUM_THREADS=4
|
||||
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
|
||||
|
||||
|
||||
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
|
||||
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
|
||||
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
|
||||
@ -31,30 +31,25 @@ export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
|
||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
|
||||
export MPICH_OFI_NIC_POLICY=GPU
|
||||
|
||||
# 12 ppn, 2 nodes, 24 ranks
|
||||
#
|
||||
CMD="mpiexec -np 24 -ppn 12 -envall \
|
||||
./gpu_tile_compact.sh \
|
||||
./Benchmark_comms_host_device --mpi 2.3.2.2 --grid 32.24.32.192 \
|
||||
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
|
||||
./Benchmark_comms_host_device --mpi 2.2.2.3 --grid 24.32.32.24 \
|
||||
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"
|
||||
$CMD | tee 2node.comms
|
||||
|
||||
#$CMD
|
||||
|
||||
CMD="mpiexec -np 24 -ppn 12 -envall \
|
||||
./gpu_tile_compact.sh \
|
||||
./Benchmark_dwf_fp32 --mpi 2.3.2.2 --grid 64.96.64.64 --comms-overlap \
|
||||
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
|
||||
./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.32.64.48 \
|
||||
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
|
||||
$CMD | tee 2node.32.32.64.48.dwf
|
||||
|
||||
#$CMD
|
||||
|
||||
CMD="mpiexec -np 1 -ppn 1 -envall \
|
||||
CMD="mpiexec -np 24 -ppn 12 -envall \
|
||||
./gpu_tile_compact.sh \
|
||||
./Benchmark_dwf --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \
|
||||
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
|
||||
./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 64.64.64.96 \
|
||||
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
|
||||
$CMD | tee 2node.64.64.64.96.dwf
|
||||
|
||||
$CMD
|
||||
|
||||
CMD="mpiexec -np 1 -ppn 1 -envall \
|
||||
./gpu_tile_compact.sh \
|
||||
./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \
|
||||
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
|
||||
|
||||
$CMD
|
@ -1,33 +1,34 @@
|
||||
#!/bin/bash
|
||||
|
||||
export NUMA_MAP=(2 2 2 3 3 3 2 2 2 3 3 3 )
|
||||
#export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
|
||||
export NUMA_PMAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
|
||||
export NIC_MAP=(0 1 2 4 5 6 0 1 2 4 5 6 )
|
||||
export GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )
|
||||
export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
|
||||
#export NUMA_MAP=(2 2 2 3 3 3 2 2 2 3 3 3 )
|
||||
#export NUMA_MAP=(0 0 1 1 0 0 1 1 0 0 1 1);
|
||||
#export GPU_MAP=(0.0 0.1 3.0 3.1 1.0 1.1 4.0 4.1 2.0 2.1 5.0 5.1)
|
||||
|
||||
export NUMA_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 );
|
||||
export GPU_MAP=(0.0 1.0 2.0 3.0 4.0 5.0 0.1 1.1 2.1 3.1 4.1 5.1 )
|
||||
|
||||
export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
|
||||
export NUMAP=${NUMA_PMAP[$PALS_LOCAL_RANKID]}
|
||||
export NIC=${NIC_MAP[$PALS_LOCAL_RANKID]}
|
||||
export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
|
||||
export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}
|
||||
|
||||
#export GRID_MPICH_NIC_BIND=$NIC
|
||||
#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id
|
||||
|
||||
unset EnableWalkerPartition
|
||||
export EnableImplicitScaling=0
|
||||
export ZE_AFFINITY_MASK=$gpu_id.$tile_id
|
||||
export ZE_AFFINITY_MASK=$gpu_id
|
||||
export ONEAPI_DEVICE_FILTER=gpu,level_zero
|
||||
|
||||
#export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
|
||||
#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
|
||||
#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
|
||||
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
|
||||
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
|
||||
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:5
|
||||
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
|
||||
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
|
||||
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
|
||||
#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
|
||||
|
||||
#echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
|
||||
echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
|
||||
|
||||
numactl -m $NUMA -N $NUMAP "$@"
|
||||
if [ $PALS_RANKID = "0" ]
|
||||
then
|
||||
# numactl -m $NUMA -N $NUMA onetrace --chrome-device-timeline "$@"
|
||||
# numactl -m $NUMA -N $NUMA unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@"
|
||||
numactl -m $NUMA -N $NUMA "$@"
|
||||
else
|
||||
numactl -m $NUMA -N $NUMA "$@"
|
||||
fi
|
||||
|
@ -7,7 +7,7 @@
|
||||
--disable-fermion-reps \
|
||||
--enable-shm=nvlink \
|
||||
--enable-accelerator=sycl \
|
||||
--enable-accelerator-aware-mpi=no\
|
||||
--enable-accelerator-aware-mpi=yes\
|
||||
--enable-unified=no \
|
||||
MPICXX=mpicxx \
|
||||
CXX=icpx \
|
||||
|
@ -1,7 +1,9 @@
|
||||
#export ONEAPI_DEVICE_SELECTOR=level_zero:0.0
|
||||
|
||||
module use /soft/modulefiles
|
||||
module load intel_compute_runtime/release/agama-devel-682.22
|
||||
module load oneapi/release/2023.12.15.001
|
||||
|
||||
#module use /soft/modulefiles
|
||||
#module load intel_compute_runtime/release/agama-devel-682.22
|
||||
|
||||
export FI_CXI_DEFAULT_CQ_SIZE=131072
|
||||
export FI_CXI_CQ_FILL_PERCENT=20
|
||||
|
Loading…
Reference in New Issue
Block a user