Initial commit

commit ade190016a

1  .gitignore  vendored  Normal file
@@ -0,0 +1 @@
*.code-workspace
BIN  2-racks/rack-power.db  Normal file
Binary file not shown.
5  2-racks/size-C0/16-nodes/.geom  Normal file
@@ -0,0 +1,5 @@
nnodes : 16
ntasks : 64
partition : gpu
mpi-geom : 2.2.2.8
grid-geom : 48.48.48.96
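The geometry file pins down the run shape used throughout this directory: 64 MPI ranks in a 2.2.2.8 process grid over a 48.48.48.96 global lattice. A quick sanity check of those numbers, sketched in the same shell style as the wrapper scripts below (nothing here is part of the commit):

# Check that the MPI geometry matches ntasks and gives an integer local volume per rank.
mpi_geom=(2 2 2 8)
grid_geom=(48 48 48 96)
ranks=$(( mpi_geom[0] * mpi_geom[1] * mpi_geom[2] * mpi_geom[3] ))
echo "ranks = ${ranks}"    # 64, matching 'ntasks : 64' (16 nodes x 4 GPUs)
for i in 0 1 2 3; do
    echo "local dim ${i} = $(( grid_geom[i] / mpi_geom[i] ))"    # 24 24 24 12
done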
13  2-racks/size-C0/16-nodes/cpu-mpi-wrapper.sh  Normal file
@@ -0,0 +1,13 @@
#!/usr/bin/env bash

lrank=$OMPI_COMM_WORLD_LOCAL_RANK
numa=${lrank}
cpus="$(( lrank*16 ))-$(( (lrank+1)*16-1 ))"
places="$(( lrank*16 )):$(( (lrank+1)*16 ))"

BINDING="taskset -c ${cpus} numactl -m ${numa}"
export OMP_PLACES=${places}

echo "$(hostname) - ${lrank} binding='${BINDING}'"

${BINDING} "$@"
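cpu-mpi-wrapper.sh assigns 16 cores and one NUMA node per local rank and is meant to sit between the MPI launcher and the application, the same way gpu-mpi-wrapper.sh is used by the job scripts further down. A minimal usage sketch (the binary path and rank count are illustrative, not taken from this commit):

# mpirun must not bind ranks itself (--bind-to none), otherwise the
# taskset/numactl binding chosen inside the wrapper would be overridden.
mpirun -np 64 --bind-to none \
    ./cpu-mpi-wrapper.sh ./Benchmark_dwf_fp32 --mpi 2.2.2.8 --grid 48.48.48.96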
1  2-racks/size-C0/16-nodes/dwf_fp32.tok  Symbolic link
@@ -0,0 +1 @@
../dwf_fp32.tok
14  2-racks/size-C0/16-nodes/gpu-mpi-wrapper.sh  Executable file
@@ -0,0 +1,14 @@
#!/usr/bin/env bash

lrank=$OMPI_COMM_WORLD_LOCAL_RANK
numa1=$(( 2 * lrank))
numa2=$(( 2 * lrank + 1 ))
netdev=mlx5_${lrank}:1

export CUDA_VISIBLE_DEVICES=$OMPI_COMM_WORLD_LOCAL_RANK
export UCX_NET_DEVICES=${netdev}
BINDING="--interleave=$numa1,$numa2"

echo "$(hostname) - $lrank device=$CUDA_VISIBLE_DEVICES binding=$BINDING"

numactl ${BINDING} "$@"
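Each of the four local ranks per node gets one GPU, one mlx5 NIC port, and a pair of NUMA nodes to interleave allocations over. The mapping follows directly from the arithmetic above and matches the binding lines echoed in the job logs below (a sketch, not part of the commit):

# Reproduce the per-rank mapping computed by gpu-mpi-wrapper.sh for local ranks 0..3.
for lrank in 0 1 2 3; do
    echo "rank ${lrank}: GPU ${lrank}, NIC mlx5_${lrank}:1, NUMA interleave $(( 2*lrank )),$(( 2*lrank + 1 ))"
done
# rank 0: GPU 0, NIC mlx5_0:1, NUMA interleave 0,1
# rank 1: GPU 1, NIC mlx5_1:1, NUMA interleave 2,3
# rank 2: GPU 2, NIC mlx5_2:1, NUMA interleave 4,5
# rank 3: GPU 3, NIC mlx5_3:1, NUMA interleave 6,7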
@@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32
4310  2-racks/size-C0/16-nodes/job/power-16A-1005.64059/elf  Normal file
File diff suppressed because it is too large
@@ -0,0 +1,2 @@
Sat Aug 20 20:25:12 BST 2022
epoch 1661023512
2062  2-racks/size-C0/16-nodes/job/power-16A-1005.64059/env  Normal file
File diff suppressed because one or more lines are too long
26  2-racks/size-C0/16-nodes/job/power-16A-1005.64059/ldd  Normal file
@@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffef5f3f000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000015459e0bd000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000015459dcf5000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000015459d803000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000015459d4d9000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000015459d1f8000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000015459cf97000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000015459e044000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000015459cbb7000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000015459b45b000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000015459b08b000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000015459adea000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000015459acbf000)
libm.so.6 => /lib64/libm.so.6 (0x000015459a93d000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000015459a706000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000015459a4ee000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000015459a2ce000)
libc.so.6 => /lib64/libc.so.6 (0x0000154599f09000)
libdl.so.2 => /lib64/libdl.so.2 (0x0000154599d05000)
/lib64/ld-linux-x86-64.so.2 (0x000015459df0d000)
librt.so.1 => /lib64/librt.so.1 (0x0000154599afd000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000015459df78000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000015459df73000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x00001545999f1000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x00001545997e7000)
libutil.so.1 => /lib64/libutil.so.1 (0x00001545995e3000)
286  2-racks/size-C0/16-nodes/job/power-16A-1005.64059/log  Normal file
@@ -0,0 +1,286 @@
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 3
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 3
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
local rank 1 device 0 bus id: 0000:44:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 0 device 0 bus id: 0000:03:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 2 device 0 bus id: 0000:84:00.0
|
||||||
|
local rank 0 device 0 bus id: 0000:03:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 3 device 0 bus id: 0000:C4:00.0
|
||||||
|
SharedMemoryMpi: World communicator of size 64
|
||||||
|
SharedMemoryMpi: Node communicator of size 4
|
||||||
|
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14ea00000000 for comms buffers
|
||||||
|
Setting up IPC
|
||||||
|
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|_ | | | | | | | | | | | | _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|_ GGGG RRRR III DDDD _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G GG RRRR I D D _|__
|
||||||
|
__|_ G G R R I D D _|__
|
||||||
|
__|_ GGGG R R III DDDD _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
| | | | | | | | | | | | | |
|
||||||
|
|
||||||
|
|
||||||
|
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
|
||||||
|
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : MPI is initialised and logging filters activated
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : Requested 2147483648 byte stencil comms buffers
|
||||||
|
Grid : Message : MemoryManager Cache 34004218675 bytes
|
||||||
|
Grid : Message : MemoryManager::Init() setting up
|
||||||
|
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
|
||||||
|
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
|
||||||
|
Grid : Message : MemoryManager::Init() Using cudaMalloc
|
||||||
|
Grid : Message : 1.499143 s : Grid Layout
|
||||||
|
Grid : Message : 1.499148 s : Global lattice size : 48 48 48 96
|
||||||
|
Grid : Message : 1.499155 s : OpenMP threads : 4
|
||||||
|
Grid : Message : 1.499157 s : MPI tasks : 2 2 2 8
|
||||||
|
Grid : Message : 1.515541 s : Making s innermost grids
|
||||||
|
Grid : Message : 1.532470 s : Initialising 4d RNG
|
||||||
|
Grid : Message : 1.550455 s : Intialising parallel RNG with unique string 'The 4D RNG'
|
||||||
|
Grid : Message : 1.550491 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
|
||||||
|
Grid : Message : 1.937366 s : Initialising 5d RNG
|
||||||
|
Grid : Message : 2.163040 s : Intialising parallel RNG with unique string 'The 5D RNG'
|
||||||
|
Grid : Message : 2.163078 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
|
||||||
|
Grid : Message : 7.467109 s : Initialised RNGs
|
||||||
|
Grid : Message : 8.261272 s : Drawing gauge field
|
||||||
|
Grid : Message : 8.380110 s : Random gauge initialised
|
||||||
|
Grid : Message : 8.388989 s : Setting up Cshift based reference
|
||||||
|
Grid : Message : 13.599668 s : *****************************************************************
|
||||||
|
Grid : Message : 13.599694 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
|
||||||
|
Grid : Message : 13.599696 s : *****************************************************************
|
||||||
|
Grid : Message : 13.599700 s : *****************************************************************
|
||||||
|
Grid : Message : 13.599702 s : * Benchmarking DomainWallFermionR::Dhop
|
||||||
|
Grid : Message : 13.599705 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 13.599708 s : * VComplexF size is 64 B
|
||||||
|
Grid : Message : 13.599710 s : * SINGLE precision
|
||||||
|
Grid : Message : 13.599712 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 13.599716 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 13.599719 s : *****************************************************************
|
||||||
|
Grid : Message : 14.992290 s : Called warmup
|
||||||
|
Grid : Message : 104.236264 s : Called Dw 30000 times in 9.01365e+07 us
|
||||||
|
Grid : Message : 104.236329 s : mflop/s = 7.46293e+07
|
||||||
|
Grid : Message : 104.236331 s : mflop/s per rank = 1.16608e+06
|
||||||
|
Grid : Message : 104.236333 s : mflop/s per node = 4.66433e+06
|
||||||
|
Grid : Message : 104.236335 s : RF GiB/s (base 2) = 151645
|
||||||
|
Grid : Message : 104.236337 s : mem GiB/s (base 2) = 94778.1
|
||||||
|
Grid : Message : 104.236908 s : norm diff 1.05775e-13
|
||||||
|
Grid : Message : 104.247209 s : #### Dhop calls report
|
||||||
|
Grid : Message : 104.247215 s : WilsonFermion5D Number of DhopEO Calls : 60002
|
||||||
|
Grid : Message : 104.247219 s : WilsonFermion5D TotalTime /Calls : 1503.52 us
|
||||||
|
Grid : Message : 104.247221 s : WilsonFermion5D CommTime /Calls : 1054.2 us
|
||||||
|
Grid : Message : 104.247223 s : WilsonFermion5D FaceTime /Calls : 225.375 us
|
||||||
|
Grid : Message : 104.247225 s : WilsonFermion5D ComputeTime1/Calls : 3.01152 us
|
||||||
|
Grid : Message : 104.247227 s : WilsonFermion5D ComputeTime2/Calls : 236.377 us
|
||||||
|
Grid : Message : 104.247294 s : Average mflops/s per call : 3.59587e+10
|
||||||
|
Grid : Message : 104.247300 s : Average mflops/s per call per rank : 5.61855e+08
|
||||||
|
Grid : Message : 104.247303 s : Average mflops/s per call per node : 2.24742e+09
|
||||||
|
Grid : Message : 104.247305 s : Average mflops/s per call (full) : 7.59233e+07
|
||||||
|
Grid : Message : 104.247307 s : Average mflops/s per call per rank (full): 1.1863e+06
|
||||||
|
Grid : Message : 104.247309 s : Average mflops/s per call per node (full): 4.7452e+06
|
||||||
|
Grid : Message : 104.247311 s : WilsonFermion5D Stencil
|
||||||
|
Grid : Message : 104.247312 s : WilsonFermion5D StencilEven
|
||||||
|
Grid : Message : 104.247313 s : WilsonFermion5D StencilOdd
|
||||||
|
Grid : Message : 104.247314 s : WilsonFermion5D Stencil Reporti()
|
||||||
|
Grid : Message : 104.247315 s : WilsonFermion5D StencilEven Reporti()
|
||||||
|
Grid : Message : 104.247316 s : WilsonFermion5D StencilOdd Reporti()
|
||||||
|
Grid : Message : 112.998074 s : Compare to naive wilson implementation Dag to verify correctness
|
||||||
|
Grid : Message : 112.998099 s : Called DwDag
|
||||||
|
Grid : Message : 112.998100 s : norm dag result 12.0422
|
||||||
|
Grid : Message : 113.585000 s : norm dag ref 12.0422
|
||||||
|
Grid : Message : 113.380300 s : norm dag diff 7.28899e-14
|
||||||
|
Grid : Message : 113.140290 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
|
||||||
|
Grid : Message : 113.790730 s : src_e0.5
|
||||||
|
Grid : Message : 113.153215 s : src_o0.5
|
||||||
|
Grid : Message : 113.170341 s : *********************************************************
|
||||||
|
Grid : Message : 113.170346 s : * Benchmarking DomainWallFermionF::DhopEO
|
||||||
|
Grid : Message : 113.170347 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 113.170353 s : * SINGLE precision
|
||||||
|
Grid : Message : 113.170356 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 113.170357 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 113.170361 s : *********************************************************
|
||||||
|
Grid : Message : 161.702832 s : Deo mflop/s = 6.93159e+07
|
||||||
|
Grid : Message : 161.702861 s : Deo mflop/s per rank 1.08306e+06
|
||||||
|
Grid : Message : 161.702863 s : Deo mflop/s per node 4.33224e+06
|
||||||
|
Grid : Message : 161.702866 s : #### Dhop calls report
|
||||||
|
Grid : Message : 161.702868 s : WilsonFermion5D Number of DhopEO Calls : 30001
|
||||||
|
Grid : Message : 161.702870 s : WilsonFermion5D TotalTime /Calls : 1617.57 us
|
||||||
|
Grid : Message : 161.702872 s : WilsonFermion5D CommTime /Calls : 1105.14 us
|
||||||
|
Grid : Message : 161.702874 s : WilsonFermion5D FaceTime /Calls : 294.218 us
|
||||||
|
Grid : Message : 161.702876 s : WilsonFermion5D ComputeTime1/Calls : 4.85114 us
|
||||||
|
Grid : Message : 161.702878 s : WilsonFermion5D ComputeTime2/Calls : 241.569 us
|
||||||
|
Grid : Message : 161.702900 s : Average mflops/s per call : 2.0686e+10
|
||||||
|
Grid : Message : 161.702904 s : Average mflops/s per call per rank : 3.23219e+08
|
||||||
|
Grid : Message : 161.702906 s : Average mflops/s per call per node : 1.29288e+09
|
||||||
|
Grid : Message : 161.702908 s : Average mflops/s per call (full) : 7.05701e+07
|
||||||
|
Grid : Message : 161.702912 s : Average mflops/s per call per rank (full): 1.10266e+06
|
||||||
|
Grid : Message : 161.702914 s : Average mflops/s per call per node (full): 4.41063e+06
|
||||||
|
Grid : Message : 161.702920 s : WilsonFermion5D Stencil
|
||||||
|
Grid : Message : 161.702922 s : WilsonFermion5D StencilEven
|
||||||
|
Grid : Message : 161.702923 s : WilsonFermion5D StencilOdd
|
||||||
|
Grid : Message : 161.702926 s : WilsonFermion5D Stencil Reporti()
|
||||||
|
Grid : Message : 161.702927 s : WilsonFermion5D StencilEven Reporti()
|
||||||
|
Grid : Message : 161.702928 s : WilsonFermion5D StencilOdd Reporti()
|
||||||
|
Grid : Message : 161.722751 s : r_e6.02106
|
||||||
|
Grid : Message : 161.724439 s : r_o6.0211
|
||||||
|
Grid : Message : 161.725861 s : res12.0422
|
||||||
|
Grid : Message : 161.827558 s : norm diff 0
|
||||||
|
Grid : Message : 161.972191 s : norm diff even 0
|
||||||
|
Grid : Message : 162.433730 s : norm diff odd 0
|
1  2-racks/size-C0/16-nodes/job/power-16A-1005.64059/nodes  Normal file
@@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]
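The nodes file stores the compressed Slurm nodelist for the job. The script below expands it with scontrol before looping over hosts; the same expansion can be reproduced on a login node (a sketch, assuming Slurm's scontrol is available):

# Expand the compressed nodelist into one hostname per line, as the job script does.
scontrol show hostnames 'tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]'
# tu-c0r1n00
# tu-c0r1n03
# ...
# tu-c0r2n21    (16 hosts in total)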
112  2-racks/size-C0/16-nodes/job/power-16A-1005.64059/script  Executable file
@@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170

# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa

#SBATCH -J power-16A-1005
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue

set -e

# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n

# IO environment ###############################################################

if [ 16 -eq 1 ]; then
    export OMPI_MCA_io=ompio
else
    export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3

# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
    source "${env_dir}/env-gpu.sh"
else
    echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
    exit 1
fi
spack load sshpass

# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''

# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"

date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi

# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1005

# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
    ${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"

# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
    ./gpu-mpi-wrapper.sh \
    ${app} "${par}" "${opt[@]}" \
    --mpi 2.2.2.8 \
    --accelerator-threads 8 \
    --grid 48.48.48.96 \
    --shm 2048 &> "${job_info_dir}/log"

# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"

# reset GPUS ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"

# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"

# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
    ${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
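The only parameter that changes between the job scripts captured in this commit is freq: before the run, nvidia-smi -ac pins the application clocks to 1215 MHz (memory) and ${freq} MHz (graphics) on every node, and the final loop restores the default 1215,1410 afterwards. Condensed to a single-node sketch:

# Bracket the benchmark with a fixed graphics clock, then restore the default.
freq=1005                      # 1020 in the next job in this series
nvidia-smi -ac 1215,${freq}    # applications clocks: <memory MHz>,<graphics MHz>
# ... run the benchmark and the dmon monitoring here ...
nvidia-smi -ac 1215,1410       # back to the default application clocks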
@@ -0,0 +1,2 @@
Sat Aug 20 20:22:21 BST 2022
epoch 1661023341
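Together with the two-line date stamp recorded earlier in this diff (Sat Aug 20 20:25:12 BST 2022, epoch 1661023512), this start stamp appears to bracket job power-16A-1005.64059, giving its wall time directly:

echo "$(( 1661023512 - 1661023341 )) s"    # 171 s, i.e. 2 min 51 s between the two stamps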
@@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32
4310  2-racks/size-C0/16-nodes/job/power-16A-1020.64063/elf  Normal file
File diff suppressed because it is too large
@@ -0,0 +1,2 @@
Sat Aug 20 20:37:35 BST 2022
epoch 1661024255
2062  2-racks/size-C0/16-nodes/job/power-16A-1020.64063/env  Normal file
File diff suppressed because one or more lines are too long
26  2-racks/size-C0/16-nodes/job/power-16A-1020.64063/ldd  Normal file
@@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffff456d000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x0000154c9a375000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x0000154c99fad000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x0000154c99abb000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x0000154c99791000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x0000154c994b0000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x0000154c9924f000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x0000154c9a2fc000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x0000154c98e6f000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x0000154c97713000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x0000154c97343000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x0000154c970a2000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x0000154c96f77000)
libm.so.6 => /lib64/libm.so.6 (0x0000154c96bf5000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x0000154c969be000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x0000154c967a6000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x0000154c96586000)
libc.so.6 => /lib64/libc.so.6 (0x0000154c961c1000)
libdl.so.2 => /lib64/libdl.so.2 (0x0000154c95fbd000)
/lib64/ld-linux-x86-64.so.2 (0x0000154c9a1c5000)
librt.so.1 => /lib64/librt.so.1 (0x0000154c95db5000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x0000154c9a230000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x0000154c9a22b000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x0000154c95ca9000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x0000154c95a9f000)
libutil.so.1 => /lib64/libutil.so.1 (0x0000154c9589b000)
286  2-racks/size-C0/16-nodes/job/power-16A-1020.64063/log  Normal file
@@ -0,0 +1,286 @@
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 3
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 3
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
local rank 1 device 0 bus id: 0000:44:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 0 device 0 bus id: 0000:03:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 3 device 0 bus id: 0000:C4:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 0 device 0 bus id: 0000:03:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 2 device 0 bus id: 0000:84:00.0
|
||||||
|
SharedMemoryMpi: World communicator of size 64
|
||||||
|
SharedMemoryMpi: Node communicator of size 4
|
||||||
|
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14d8e0000000 for comms buffers
|
||||||
|
Setting up IPC
|
||||||
|
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|_ | | | | | | | | | | | | _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|_ GGGG RRRR III DDDD _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G GG RRRR I D D _|__
|
||||||
|
__|_ G G R R I D D _|__
|
||||||
|
__|_ GGGG R R III DDDD _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
| | | | | | | | | | | | | |
|
||||||
|
|
||||||
|
|
||||||
|
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
|
||||||
|
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : MPI is initialised and logging filters activated
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : Requested 2147483648 byte stencil comms buffers
|
||||||
|
Grid : Message : MemoryManager Cache 34004218675 bytes
|
||||||
|
Grid : Message : MemoryManager::Init() setting up
|
||||||
|
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
|
||||||
|
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
|
||||||
|
Grid : Message : MemoryManager::Init() Using cudaMalloc
|
||||||
|
Grid : Message : 1.312638 s : Grid Layout
|
||||||
|
Grid : Message : 1.312643 s : Global lattice size : 48 48 48 96
|
||||||
|
Grid : Message : 1.312650 s : OpenMP threads : 4
|
||||||
|
Grid : Message : 1.312652 s : MPI tasks : 2 2 2 8
|
||||||
|
Grid : Message : 1.327971 s : Making s innermost grids
|
||||||
|
Grid : Message : 1.344471 s : Initialising 4d RNG
|
||||||
|
Grid : Message : 1.361018 s : Intialising parallel RNG with unique string 'The 4D RNG'
|
||||||
|
Grid : Message : 1.361045 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
|
||||||
|
Grid : Message : 1.837887 s : Initialising 5d RNG
|
||||||
|
Grid : Message : 2.844490 s : Intialising parallel RNG with unique string 'The 5D RNG'
|
||||||
|
Grid : Message : 2.845110 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
|
||||||
|
Grid : Message : 7.428202 s : Initialised RNGs
|
||||||
|
Grid : Message : 8.439960 s : Drawing gauge field
|
||||||
|
Grid : Message : 8.560999 s : Random gauge initialised
|
||||||
|
Grid : Message : 8.573339 s : Setting up Cshift based reference
|
||||||
|
Grid : Message : 13.695651 s : *****************************************************************
|
||||||
|
Grid : Message : 13.695676 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
|
||||||
|
Grid : Message : 13.695677 s : *****************************************************************
|
||||||
|
Grid : Message : 13.695678 s : *****************************************************************
|
||||||
|
Grid : Message : 13.695679 s : * Benchmarking DomainWallFermionR::Dhop
|
||||||
|
Grid : Message : 13.695680 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 13.695681 s : * VComplexF size is 64 B
|
||||||
|
Grid : Message : 13.695682 s : * SINGLE precision
|
||||||
|
Grid : Message : 13.695684 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 13.695685 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 13.695686 s : *****************************************************************
|
||||||
|
Grid : Message : 14.234933 s : Called warmup
|
||||||
|
Grid : Message : 103.428452 s : Called Dw 30000 times in 8.91932e+07 us
|
||||||
|
Grid : Message : 103.428517 s : mflop/s = 7.54186e+07
|
||||||
|
Grid : Message : 103.428519 s : mflop/s per rank = 1.17842e+06
|
||||||
|
Grid : Message : 103.428521 s : mflop/s per node = 4.71366e+06
|
||||||
|
Grid : Message : 103.428523 s : RF GiB/s (base 2) = 153249
|
||||||
|
Grid : Message : 103.428525 s : mem GiB/s (base 2) = 95780.5
|
||||||
|
Grid : Message : 103.429097 s : norm diff 1.05775e-13
|
||||||
|
Grid : Message : 103.439111 s : #### Dhop calls report
|
||||||
|
Grid : Message : 103.439118 s : WilsonFermion5D Number of DhopEO Calls : 60002
|
||||||
|
Grid : Message : 103.439122 s : WilsonFermion5D TotalTime /Calls : 1487.69 us
|
||||||
|
Grid : Message : 103.439124 s : WilsonFermion5D CommTime /Calls : 1041.46 us
|
||||||
|
Grid : Message : 103.439126 s : WilsonFermion5D FaceTime /Calls : 222.459 us
|
||||||
|
Grid : Message : 103.439128 s : WilsonFermion5D ComputeTime1/Calls : 2.85969 us
|
||||||
|
Grid : Message : 103.439130 s : WilsonFermion5D ComputeTime2/Calls : 236.325 us
|
||||||
|
Grid : Message : 103.439201 s : Average mflops/s per call : 3.60313e+10
|
||||||
|
Grid : Message : 103.439207 s : Average mflops/s per call per rank : 5.62989e+08
|
||||||
|
Grid : Message : 103.439209 s : Average mflops/s per call per node : 2.25196e+09
|
||||||
|
Grid : Message : 103.439211 s : Average mflops/s per call (full) : 7.67311e+07
|
||||||
|
Grid : Message : 103.439213 s : Average mflops/s per call per rank (full): 1.19892e+06
|
||||||
|
Grid : Message : 103.439215 s : Average mflops/s per call per node (full): 4.7957e+06
|
||||||
|
Grid : Message : 103.439217 s : WilsonFermion5D Stencil
|
||||||
|
Grid : Message : 103.439218 s : WilsonFermion5D StencilEven
|
||||||
|
Grid : Message : 103.439219 s : WilsonFermion5D StencilOdd
|
||||||
|
Grid : Message : 103.439220 s : WilsonFermion5D Stencil Reporti()
|
||||||
|
Grid : Message : 103.439221 s : WilsonFermion5D StencilEven Reporti()
|
||||||
|
Grid : Message : 103.439222 s : WilsonFermion5D StencilOdd Reporti()
|
||||||
|
Grid : Message : 112.177904 s : Compare to naive wilson implementation Dag to verify correctness
|
||||||
|
Grid : Message : 112.177939 s : Called DwDag
|
||||||
|
Grid : Message : 112.177940 s : norm dag result 12.0422
|
||||||
|
Grid : Message : 112.186235 s : norm dag ref 12.0422
|
||||||
|
Grid : Message : 112.189309 s : norm dag diff 7.28899e-14
|
||||||
|
Grid : Message : 112.200523 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
|
||||||
|
Grid : Message : 112.263704 s : src_e0.5
|
||||||
|
Grid : Message : 112.335429 s : src_o0.5
|
||||||
|
Grid : Message : 112.352238 s : *********************************************************
|
||||||
|
Grid : Message : 112.352244 s : * Benchmarking DomainWallFermionF::DhopEO
|
||||||
|
Grid : Message : 112.352246 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 112.352248 s : * SINGLE precision
|
||||||
|
Grid : Message : 112.352250 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 112.352253 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 112.352254 s : *********************************************************
|
||||||
|
Grid : Message : 160.328889 s : Deo mflop/s = 7.01193e+07
|
||||||
|
Grid : Message : 160.328922 s : Deo mflop/s per rank 1.09561e+06
|
||||||
|
Grid : Message : 160.328924 s : Deo mflop/s per node 4.38246e+06
|
||||||
|
Grid : Message : 160.328927 s : #### Dhop calls report
|
||||||
|
Grid : Message : 160.328929 s : WilsonFermion5D Number of DhopEO Calls : 30001
|
||||||
|
Grid : Message : 160.328931 s : WilsonFermion5D TotalTime /Calls : 1599.04 us
|
||||||
|
Grid : Message : 160.328933 s : WilsonFermion5D CommTime /Calls : 1088.05 us
|
||||||
|
Grid : Message : 160.328935 s : WilsonFermion5D FaceTime /Calls : 294.436 us
|
||||||
|
Grid : Message : 160.328937 s : WilsonFermion5D ComputeTime1/Calls : 4.78577 us
|
||||||
|
Grid : Message : 160.328939 s : WilsonFermion5D ComputeTime2/Calls : 241.411 us
|
||||||
|
Grid : Message : 160.328966 s : Average mflops/s per call : 2.07599e+10
|
||||||
|
Grid : Message : 160.328971 s : Average mflops/s per call per rank : 3.24373e+08
|
||||||
|
Grid : Message : 160.328975 s : Average mflops/s per call per node : 1.29749e+09
|
||||||
|
Grid : Message : 160.328980 s : Average mflops/s per call (full) : 7.13878e+07
|
||||||
|
Grid : Message : 160.328983 s : Average mflops/s per call per rank (full): 1.11543e+06
|
||||||
|
Grid : Message : 160.328987 s : Average mflops/s per call per node (full): 4.46174e+06
|
||||||
|
Grid : Message : 160.328989 s : WilsonFermion5D Stencil
|
||||||
|
Grid : Message : 160.328990 s : WilsonFermion5D StencilEven
|
||||||
|
Grid : Message : 160.328992 s : WilsonFermion5D StencilOdd
|
||||||
|
Grid : Message : 160.328995 s : WilsonFermion5D Stencil Reporti()
|
||||||
|
Grid : Message : 160.328997 s : WilsonFermion5D StencilEven Reporti()
|
||||||
|
Grid : Message : 160.329000 s : WilsonFermion5D StencilOdd Reporti()
|
||||||
|
Grid : Message : 160.348014 s : r_e6.02106
|
||||||
|
Grid : Message : 160.350033 s : r_o6.0211
|
||||||
|
Grid : Message : 160.351497 s : res12.0422
|
||||||
|
Grid : Message : 160.466811 s : norm diff 0
|
||||||
|
Grid : Message : 160.599190 s : norm diff even 0
|
||||||
|
Grid : Message : 160.669838 s : norm diff odd 0
|
1  2-racks/size-C0/16-nodes/job/power-16A-1020.64063/nodes  Normal file
@@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]
112
2-racks/size-C0/16-nodes/job/power-16A-1020.64063/script
Executable file
112
2-racks/size-C0/16-nodes/job/power-16A-1020.64063/script
Executable file
@ -0,0 +1,112 @@
|
|||||||
|
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170

# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa

#SBATCH -J power-16A-1020
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue

set -e

# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n

# IO environment ###############################################################

if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3

# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass

# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''

# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"

date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi

# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1020

# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"

# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 48.48.48.96 \
--shm 2048 &> "${job_info_dir}/log"

# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"

# reset GPUS ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"

# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"

# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
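The geometry flags in the mpirun call above have to be mutually consistent: the --mpi decomposition must multiply out to the number of MPI ranks requested from SLURM, and dividing --grid by --mpi gives the local lattice handled by each rank. A minimal sketch of that arithmetic for this job (the variable names are illustrative and not part of the script):

mpi_geom=(2 2 2 8)        # --mpi 2.2.2.8
grid_geom=(48 48 48 96)   # --grid 48.48.48.96
ranks=$(( mpi_geom[0] * mpi_geom[1] * mpi_geom[2] * mpi_geom[3] ))
echo "ranks = ${ranks}"   # 64, matching --ntasks=64 (16 nodes x 4 GPUs per node)
for i in 0 1 2 3; do
  echo "local extent $i = $(( grid_geom[i] / mpi_geom[i] ))"   # 24 24 24 12 per rank
done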
@ -0,0 +1,2 @@
Sat Aug 20 20:34:46 BST 2022
epoch 1661024086
@ -0,0 +1 @@
6a99c164661d090b82990d130b305895  /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32
4310
2-racks/size-C0/16-nodes/job/power-16A-1035.64067/elf
Normal file
File diff suppressed because it is too large
@ -0,0 +1,2 @@
Sat Aug 20 20:43:25 BST 2022
epoch 1661024605
2062
2-racks/size-C0/16-nodes/job/power-16A-1035.64067/env
Normal file
File diff suppressed because one or more lines are too long
26
2-racks/size-C0/16-nodes/job/power-16A-1035.64067/ldd
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
linux-vdso.so.1 (0x00007ffd625a8000)
|
||||||
|
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014ff21a6a000)
|
||||||
|
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014ff216a2000)
|
||||||
|
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014ff211b0000)
|
||||||
|
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014ff20e86000)
|
||||||
|
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014ff20ba5000)
|
||||||
|
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014ff20944000)
|
||||||
|
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014ff219f1000)
|
||||||
|
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014ff20564000)
|
||||||
|
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014ff1ee08000)
|
||||||
|
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014ff1ea38000)
|
||||||
|
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014ff1e797000)
|
||||||
|
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014ff1e66c000)
|
||||||
|
libm.so.6 => /lib64/libm.so.6 (0x000014ff1e2ea000)
|
||||||
|
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014ff1e0b3000)
|
||||||
|
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014ff1de9b000)
|
||||||
|
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014ff1dc7b000)
|
||||||
|
libc.so.6 => /lib64/libc.so.6 (0x000014ff1d8b6000)
|
||||||
|
libdl.so.2 => /lib64/libdl.so.2 (0x000014ff1d6b2000)
|
||||||
|
/lib64/ld-linux-x86-64.so.2 (0x000014ff218ba000)
|
||||||
|
librt.so.1 => /lib64/librt.so.1 (0x000014ff1d4aa000)
|
||||||
|
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014ff21925000)
|
||||||
|
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014ff21920000)
|
||||||
|
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014ff1d39e000)
|
||||||
|
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014ff1d194000)
|
||||||
|
libutil.so.1 => /lib64/libutil.so.1 (0x000014ff1cf90000)
|
286
2-racks/size-C0/16-nodes/job/power-16A-1035.64067/log
Normal file
@ -0,0 +1,286 @@
|
|||||||
|
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 3
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 3
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
local rank 1 device 0 bus id: 0000:44:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 0 device 0 bus id: 0000:03:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 0 device 0 bus id: 0000:03:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 2 device 0 bus id: 0000:84:00.0
|
||||||
|
local rank 3 device 0 bus id: 0000:C4:00.0
|
||||||
|
SharedMemoryMpi: World communicator of size 64
|
||||||
|
SharedMemoryMpi: Node communicator of size 4
|
||||||
|
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x146a80000000 for comms buffers
|
||||||
|
Setting up IPC
|
||||||
|
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|_ | | | | | | | | | | | | _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|_ GGGG RRRR III DDDD _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G GG RRRR I D D _|__
|
||||||
|
__|_ G G R R I D D _|__
|
||||||
|
__|_ GGGG R R III DDDD _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
| | | | | | | | | | | | | |
|
||||||
|
|
||||||
|
|
||||||
|
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
|
||||||
|
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : MPI is initialised and logging filters activated
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : Requested 2147483648 byte stencil comms buffers
|
||||||
|
Grid : Message : MemoryManager Cache 34004218675 bytes
|
||||||
|
Grid : Message : MemoryManager::Init() setting up
|
||||||
|
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
|
||||||
|
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
|
||||||
|
Grid : Message : MemoryManager::Init() Using cudaMalloc
|
||||||
|
Grid : Message : 1.623478 s : Grid Layout
|
||||||
|
Grid : Message : 1.623482 s : Global lattice size : 48 48 48 96
|
||||||
|
Grid : Message : 1.623486 s : OpenMP threads : 4
|
||||||
|
Grid : Message : 1.623488 s : MPI tasks : 2 2 2 8
|
||||||
|
Grid : Message : 1.637678 s : Making s innermost grids
|
||||||
|
Grid : Message : 1.654638 s : Initialising 4d RNG
|
||||||
|
Grid : Message : 1.670417 s : Intialising parallel RNG with unique string 'The 4D RNG'
|
||||||
|
Grid : Message : 1.670443 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
|
||||||
|
Grid : Message : 2.165386 s : Initialising 5d RNG
|
||||||
|
Grid : Message : 2.399472 s : Intialising parallel RNG with unique string 'The 5D RNG'
|
||||||
|
Grid : Message : 2.399504 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
|
||||||
|
Grid : Message : 7.787095 s : Initialised RNGs
|
||||||
|
Grid : Message : 8.568006 s : Drawing gauge field
|
||||||
|
Grid : Message : 8.661012 s : Random gauge initialised
|
||||||
|
Grid : Message : 8.665024 s : Setting up Cshift based reference
|
||||||
|
Grid : Message : 13.760660 s : *****************************************************************
|
||||||
|
Grid : Message : 13.760685 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
|
||||||
|
Grid : Message : 13.760687 s : *****************************************************************
|
||||||
|
Grid : Message : 13.760690 s : *****************************************************************
|
||||||
|
Grid : Message : 13.760691 s : * Benchmarking DomainWallFermionR::Dhop
|
||||||
|
Grid : Message : 13.760692 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 13.760694 s : * VComplexF size is 64 B
|
||||||
|
Grid : Message : 13.760696 s : * SINGLE precision
|
||||||
|
Grid : Message : 13.760697 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 13.760698 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 13.760700 s : *****************************************************************
|
||||||
|
Grid : Message : 14.326353 s : Called warmup
|
||||||
|
Grid : Message : 102.469231 s : Called Dw 30000 times in 8.81428e+07 us
|
||||||
|
Grid : Message : 102.469296 s : mflop/s = 7.63173e+07
|
||||||
|
Grid : Message : 102.469299 s : mflop/s per rank = 1.19246e+06
|
||||||
|
Grid : Message : 102.469307 s : mflop/s per node = 4.76983e+06
|
||||||
|
Grid : Message : 102.469310 s : RF GiB/s (base 2) = 155075
|
||||||
|
Grid : Message : 102.469313 s : mem GiB/s (base 2) = 96921.9
|
||||||
|
Grid : Message : 102.469886 s : norm diff 1.05775e-13
|
||||||
|
Grid : Message : 102.480527 s : #### Dhop calls report
|
||||||
|
Grid : Message : 102.480534 s : WilsonFermion5D Number of DhopEO Calls : 60002
|
||||||
|
Grid : Message : 102.480538 s : WilsonFermion5D TotalTime /Calls : 1470.47 us
|
||||||
|
Grid : Message : 102.480540 s : WilsonFermion5D CommTime /Calls : 1029.89 us
|
||||||
|
Grid : Message : 102.480542 s : WilsonFermion5D FaceTime /Calls : 217.938 us
|
||||||
|
Grid : Message : 102.480544 s : WilsonFermion5D ComputeTime1/Calls : 3.09645 us
|
||||||
|
Grid : Message : 102.480546 s : WilsonFermion5D ComputeTime2/Calls : 235.402 us
|
||||||
|
Grid : Message : 102.480575 s : Average mflops/s per call : 3.61099e+10
|
||||||
|
Grid : Message : 102.480579 s : Average mflops/s per call per rank : 5.64217e+08
|
||||||
|
Grid : Message : 102.480581 s : Average mflops/s per call per node : 2.25687e+09
|
||||||
|
Grid : Message : 102.480583 s : Average mflops/s per call (full) : 7.76299e+07
|
||||||
|
Grid : Message : 102.480587 s : Average mflops/s per call per rank (full): 1.21297e+06
|
||||||
|
Grid : Message : 102.480590 s : Average mflops/s per call per node (full): 4.85187e+06
|
||||||
|
Grid : Message : 102.480593 s : WilsonFermion5D Stencil
|
||||||
|
Grid : Message : 102.480596 s : WilsonFermion5D StencilEven
|
||||||
|
Grid : Message : 102.480598 s : WilsonFermion5D StencilOdd
|
||||||
|
Grid : Message : 102.480600 s : WilsonFermion5D Stencil Reporti()
|
||||||
|
Grid : Message : 102.480603 s : WilsonFermion5D StencilEven Reporti()
|
||||||
|
Grid : Message : 102.480605 s : WilsonFermion5D StencilOdd Reporti()
|
||||||
|
Grid : Message : 111.202302 s : Compare to naive wilson implementation Dag to verify correctness
|
||||||
|
Grid : Message : 111.202331 s : Called DwDag
|
||||||
|
Grid : Message : 111.202332 s : norm dag result 12.0422
|
||||||
|
Grid : Message : 111.204652 s : norm dag ref 12.0422
|
||||||
|
Grid : Message : 111.207748 s : norm dag diff 7.28899e-14
|
||||||
|
Grid : Message : 111.218376 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
|
||||||
|
Grid : Message : 111.273653 s : src_e0.5
|
||||||
|
Grid : Message : 111.352934 s : src_o0.5
|
||||||
|
Grid : Message : 111.369965 s : *********************************************************
|
||||||
|
Grid : Message : 111.369970 s : * Benchmarking DomainWallFermionF::DhopEO
|
||||||
|
Grid : Message : 111.369974 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 111.369976 s : * SINGLE precision
|
||||||
|
Grid : Message : 111.369977 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 111.369981 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 111.369983 s : *********************************************************
|
||||||
|
Grid : Message : 158.806725 s : Deo mflop/s = 7.09164e+07
|
||||||
|
Grid : Message : 158.806755 s : Deo mflop/s per rank 1.10807e+06
|
||||||
|
Grid : Message : 158.806757 s : Deo mflop/s per node 4.43227e+06
|
||||||
|
Grid : Message : 158.806760 s : #### Dhop calls report
|
||||||
|
Grid : Message : 158.806762 s : WilsonFermion5D Number of DhopEO Calls : 30001
|
||||||
|
Grid : Message : 158.806764 s : WilsonFermion5D TotalTime /Calls : 1581.06 us
|
||||||
|
Grid : Message : 158.806766 s : WilsonFermion5D CommTime /Calls : 1077.77 us
|
||||||
|
Grid : Message : 158.806768 s : WilsonFermion5D FaceTime /Calls : 286.721 us
|
||||||
|
Grid : Message : 158.806770 s : WilsonFermion5D ComputeTime1/Calls : 4.98297 us
|
||||||
|
Grid : Message : 158.806772 s : WilsonFermion5D ComputeTime2/Calls : 240.035 us
|
||||||
|
Grid : Message : 158.806792 s : Average mflops/s per call : 2.0753e+10
|
||||||
|
Grid : Message : 158.806796 s : Average mflops/s per call per rank : 3.24266e+08
|
||||||
|
Grid : Message : 158.806798 s : Average mflops/s per call per node : 1.29706e+09
|
||||||
|
Grid : Message : 158.806800 s : Average mflops/s per call (full) : 7.21996e+07
|
||||||
|
Grid : Message : 158.806804 s : Average mflops/s per call per rank (full): 1.12812e+06
|
||||||
|
Grid : Message : 158.806807 s : Average mflops/s per call per node (full): 4.51247e+06
|
||||||
|
Grid : Message : 158.806809 s : WilsonFermion5D Stencil
|
||||||
|
Grid : Message : 158.806810 s : WilsonFermion5D StencilEven
|
||||||
|
Grid : Message : 158.806812 s : WilsonFermion5D StencilOdd
|
||||||
|
Grid : Message : 158.806814 s : WilsonFermion5D Stencil Reporti()
|
||||||
|
Grid : Message : 158.806816 s : WilsonFermion5D StencilEven Reporti()
|
||||||
|
Grid : Message : 158.806818 s : WilsonFermion5D StencilOdd Reporti()
|
||||||
|
Grid : Message : 158.823821 s : r_e6.02106
|
||||||
|
Grid : Message : 158.827207 s : r_o6.0211
|
||||||
|
Grid : Message : 158.828617 s : res12.0422
|
||||||
|
Grid : Message : 158.938772 s : norm diff 0
|
||||||
|
Grid : Message : 159.724700 s : norm diff even 0
|
||||||
|
Grid : Message : 159.148761 s : norm diff odd 0
|
1
2-racks/size-C0/16-nodes/job/power-16A-1035.64067/nodes
Normal file
@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]
112
2-racks/size-C0/16-nodes/job/power-16A-1035.64067/script
Executable file
@ -0,0 +1,112 @@
#!/usr/bin/env bash
|
||||||
|
# shellcheck disable=SC1091,SC2050,SC2170
|
||||||
|
|
||||||
|
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
|
||||||
|
|
||||||
|
#SBATCH -J power-16A-1035
|
||||||
|
#SBATCH -A dp207
|
||||||
|
#SBATCH -t 48:00:00
|
||||||
|
#SBATCH --nodes=16
|
||||||
|
#SBATCH --ntasks=64
|
||||||
|
#SBATCH --ntasks-per-node=4
|
||||||
|
#SBATCH --cpus-per-task=8
|
||||||
|
#SBATCH --partition=gpu
|
||||||
|
#SBATCH --gres=gpu:4
|
||||||
|
#SBATCH --output=%x.%j.out
|
||||||
|
#SBATCH --error=%x.%j.err
|
||||||
|
#SBATCH --reservation=dc-port1_61
|
||||||
|
#SBATCH --qos=reservation
|
||||||
|
#SBATCH --no-requeue
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
# OpenMP/OpenMPI/UCX environment ###############################################
|
||||||
|
export OMP_NUM_THREADS=4
|
||||||
|
export OMPI_MCA_btl=^uct,openib
|
||||||
|
export OMPI_MCA_pml=ucx
|
||||||
|
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
|
||||||
|
export UCX_RNDV_SCHEME=put_zcopy
|
||||||
|
export UCX_RNDV_THRESH=16384
|
||||||
|
export UCX_IB_GPU_DIRECT_RDMA=yes
|
||||||
|
export UCX_MEMTYPE_CACHE=n
|
||||||
|
|
||||||
|
# IO environment ###############################################################
|
||||||
|
|
||||||
|
if [ 16 -eq 1 ]; then
|
||||||
|
export OMPI_MCA_io=ompio
|
||||||
|
else
|
||||||
|
export OMPI_MCA_io=romio321
|
||||||
|
fi
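# Note (added, assumed rather than documented): the literal 16 in the test above is
# presumably the node count substituted in by whatever generated this script
# (cf. #SBATCH --nodes=16), which is why shellcheck SC2050/SC2170 ("constant
# expression") are disabled at the top; a single-node job would select OpenMPI's
# ompio component, multi-node jobs fall back to ROMIO.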
|
||||||
|
export OMPI_MCA_btl_openib_allow_ib=true
|
||||||
|
export OMPI_MCA_btl_openib_device_type=infiniband
|
||||||
|
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
|
||||||
|
|
||||||
|
# load environment #############################################################
|
||||||
|
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
|
||||||
|
source "${env_dir}/env-base.sh"
|
||||||
|
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
|
||||||
|
source "${env_dir}/env-gpu.sh"
|
||||||
|
else
|
||||||
|
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
spack load sshpass
|
||||||
|
|
||||||
|
# application and parameters ###################################################
|
||||||
|
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
|
||||||
|
opt=('--comms-overlap' '--comms-concurrent')
|
||||||
|
par=''
|
||||||
|
|
||||||
|
# collect job information ######################################################
|
||||||
|
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
|
||||||
|
mkdir -p "${job_info_dir}"
|
||||||
|
|
||||||
|
date > "${job_info_dir}/start-date"
|
||||||
|
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
|
||||||
|
set > "${job_info_dir}/env"
|
||||||
|
ldd ${app} > "${job_info_dir}/ldd"
|
||||||
|
md5sum ${app} > "${job_info_dir}/app-hash"
|
||||||
|
readelf -a ${app} > "${job_info_dir}/elf"
|
||||||
|
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
|
||||||
|
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
|
||||||
|
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
|
||||||
|
|
||||||
|
# GPU frequency control ########################################################
|
||||||
|
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
|
||||||
|
freq=1035
|
||||||
|
|
||||||
|
# set frequency
|
||||||
|
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
|
||||||
|
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
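# Note (added): "nvidia-smi -ac <memory,graphics>" sets the application clocks in
# MHz, so this pins the HBM clock at 1215 MHz and caps the SM clock at ${freq} MHz
# (1035 for this job); the loop at the end of the script puts the pair back to 1215,1410.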
|
||||||
|
done
|
||||||
|
# start NVIDIA SMI monitoring
|
||||||
|
tmp=$(mktemp)
|
||||||
|
sleep 1
|
||||||
|
coproc nvidia-smi dmon -o DT &> "${tmp}"
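# Note (added): "nvidia-smi dmon" samples per-GPU clocks, power and utilisation
# about once per second, and -o DT prefixes each sample with date and time columns.
# Running it under coproc keeps the monitor alive in the background; COPROC_PID is
# used further down to stop it with SIGINT once the benchmark has finished.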
|
||||||
|
|
||||||
|
# run! #########################################################################
|
||||||
|
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
|
||||||
|
./gpu-mpi-wrapper.sh \
|
||||||
|
${app} "${par}" "${opt[@]}" \
|
||||||
|
--mpi 2.2.2.8 \
|
||||||
|
--accelerator-threads 8 \
|
||||||
|
--grid 48.48.48.96 \
|
||||||
|
--shm 2048 &> "${job_info_dir}/log"
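# Note (added): --shm is given in MiB; 2048 MiB matches the "Requested 2147483648
# byte stencil comms buffers" line that Grid prints in the captured log for this job.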
|
||||||
|
|
||||||
|
# if we reach that point the application exited successfully ###################
|
||||||
|
touch "${job_info_dir}/success"
|
||||||
|
date > "${job_info_dir}/end-date"
|
||||||
|
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
|
||||||
|
|
||||||
|
# reset GPUS ###################################################################
|
||||||
|
# stop monitoring
|
||||||
|
kill -INT "${COPROC_PID}"
|
||||||
|
|
||||||
|
# make monitoring DB
|
||||||
|
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
|
||||||
|
|
||||||
|
# reset clocks
|
||||||
|
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
|
||||||
|
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
|
||||||
|
done
|
||||||
|
################################################################################
|
@ -0,0 +1,2 @@
Sat Aug 20 20:40:36 BST 2022
epoch 1661024436
@ -0,0 +1 @@
6a99c164661d090b82990d130b305895  /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32
4310
2-racks/size-C0/16-nodes/job/power-16A-1050.64071/elf
Normal file
File diff suppressed because it is too large
@ -0,0 +1,2 @@
Sat Aug 20 20:49:15 BST 2022
epoch 1661024955
2062
2-racks/size-C0/16-nodes/job/power-16A-1050.64071/env
Normal file
File diff suppressed because one or more lines are too long
26
2-racks/size-C0/16-nodes/job/power-16A-1050.64071/ldd
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
linux-vdso.so.1 (0x00007ffe2b5fb000)
|
||||||
|
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x00001470cbce5000)
|
||||||
|
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x00001470cb91d000)
|
||||||
|
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x00001470cb42b000)
|
||||||
|
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x00001470cb101000)
|
||||||
|
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x00001470cae20000)
|
||||||
|
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x00001470cabbf000)
|
||||||
|
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x00001470cbc6c000)
|
||||||
|
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x00001470ca7df000)
|
||||||
|
libcuda.so.1 => /lib64/libcuda.so.1 (0x00001470c9083000)
|
||||||
|
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x00001470c8cb3000)
|
||||||
|
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x00001470c8a12000)
|
||||||
|
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x00001470c88e7000)
|
||||||
|
libm.so.6 => /lib64/libm.so.6 (0x00001470c8565000)
|
||||||
|
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x00001470c832e000)
|
||||||
|
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x00001470c8116000)
|
||||||
|
libpthread.so.0 => /lib64/libpthread.so.0 (0x00001470c7ef6000)
|
||||||
|
libc.so.6 => /lib64/libc.so.6 (0x00001470c7b31000)
|
||||||
|
libdl.so.2 => /lib64/libdl.so.2 (0x00001470c792d000)
|
||||||
|
/lib64/ld-linux-x86-64.so.2 (0x00001470cbb35000)
|
||||||
|
librt.so.1 => /lib64/librt.so.1 (0x00001470c7725000)
|
||||||
|
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x00001470cbba0000)
|
||||||
|
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x00001470cbb9b000)
|
||||||
|
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x00001470c7619000)
|
||||||
|
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x00001470c740f000)
|
||||||
|
libutil.so.1 => /lib64/libutil.so.1 (0x00001470c720b000)
|
286
2-racks/size-C0/16-nodes/job/power-16A-1050.64071/log
Normal file
@ -0,0 +1,286 @@
|
|||||||
|
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 3
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 3
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
local rank 3 device 0 bus id: 0000:C4:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 2 device 0 bus id: 0000:84:00.0
|
||||||
|
local rank 0 device 0 bus id: 0000:03:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 1 device 0 bus id: 0000:44:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 0 device 0 bus id: 0000:03:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
SharedMemoryMpi: World communicator of size 64
|
||||||
|
SharedMemoryMpi: Node communicator of size 4
|
||||||
|
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14f600000000 for comms buffers
|
||||||
|
Setting up IPC
|
||||||
|
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|_ | | | | | | | | | | | | _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|_ GGGG RRRR III DDDD _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G GG RRRR I D D _|__
|
||||||
|
__|_ G G R R I D D _|__
|
||||||
|
__|_ GGGG R R III DDDD _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
| | | | | | | | | | | | | |
|
||||||
|
|
||||||
|
|
||||||
|
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
|
||||||
|
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : MPI is initialised and logging filters activated
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : Requested 2147483648 byte stencil comms buffers
|
||||||
|
Grid : Message : MemoryManager Cache 34004218675 bytes
|
||||||
|
Grid : Message : MemoryManager::Init() setting up
|
||||||
|
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
|
||||||
|
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
|
||||||
|
Grid : Message : MemoryManager::Init() Using cudaMalloc
|
||||||
|
Grid : Message : 1.720184 s : Grid Layout
|
||||||
|
Grid : Message : 1.720188 s : Global lattice size : 48 48 48 96
|
||||||
|
Grid : Message : 1.720196 s : OpenMP threads : 4
|
||||||
|
Grid : Message : 1.720199 s : MPI tasks : 2 2 2 8
|
||||||
|
Grid : Message : 1.735275 s : Making s innermost grids
|
||||||
|
Grid : Message : 1.752323 s : Initialising 4d RNG
|
||||||
|
Grid : Message : 1.768478 s : Intialising parallel RNG with unique string 'The 4D RNG'
|
||||||
|
Grid : Message : 1.768504 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
|
||||||
|
Grid : Message : 2.201838 s : Initialising 5d RNG
|
||||||
|
Grid : Message : 2.438683 s : Intialising parallel RNG with unique string 'The 5D RNG'
|
||||||
|
Grid : Message : 2.438714 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
|
||||||
|
Grid : Message : 7.906459 s : Initialised RNGs
|
||||||
|
Grid : Message : 8.718015 s : Drawing gauge field
|
||||||
|
Grid : Message : 8.851801 s : Random gauge initialised
|
||||||
|
Grid : Message : 8.862438 s : Setting up Cshift based reference
|
||||||
|
Grid : Message : 13.896599 s : *****************************************************************
|
||||||
|
Grid : Message : 13.896621 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
|
||||||
|
Grid : Message : 13.896622 s : *****************************************************************
|
||||||
|
Grid : Message : 13.896623 s : *****************************************************************
|
||||||
|
Grid : Message : 13.896624 s : * Benchmarking DomainWallFermionR::Dhop
|
||||||
|
Grid : Message : 13.896625 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 13.896626 s : * VComplexF size is 64 B
|
||||||
|
Grid : Message : 13.896627 s : * SINGLE precision
|
||||||
|
Grid : Message : 13.896628 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 13.896629 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 13.896630 s : *****************************************************************
|
||||||
|
Grid : Message : 14.428387 s : Called warmup
|
||||||
|
Grid : Message : 101.915473 s : Called Dw 30000 times in 8.74869e+07 us
|
||||||
|
Grid : Message : 101.915527 s : mflop/s = 7.68895e+07
|
||||||
|
Grid : Message : 101.915529 s : mflop/s per rank = 1.2014e+06
|
||||||
|
Grid : Message : 101.915531 s : mflop/s per node = 4.80559e+06
|
||||||
|
Grid : Message : 101.915533 s : RF GiB/s (base 2) = 156238
|
||||||
|
Grid : Message : 101.915535 s : mem GiB/s (base 2) = 97648.5
|
||||||
|
Grid : Message : 101.916107 s : norm diff 1.05775e-13
|
||||||
|
Grid : Message : 101.926218 s : #### Dhop calls report
|
||||||
|
Grid : Message : 101.926225 s : WilsonFermion5D Number of DhopEO Calls : 60002
|
||||||
|
Grid : Message : 101.926228 s : WilsonFermion5D TotalTime /Calls : 1459.21 us
|
||||||
|
Grid : Message : 101.926230 s : WilsonFermion5D CommTime /Calls : 1016.78 us
|
||||||
|
Grid : Message : 101.926232 s : WilsonFermion5D FaceTime /Calls : 219.506 us
|
||||||
|
Grid : Message : 101.926234 s : WilsonFermion5D ComputeTime1/Calls : 2.78512 us
|
||||||
|
Grid : Message : 101.926236 s : WilsonFermion5D ComputeTime2/Calls : 235.25 us
|
||||||
|
Grid : Message : 101.926330 s : Average mflops/s per call : 3.60206e+10
|
||||||
|
Grid : Message : 101.926334 s : Average mflops/s per call per rank : 5.62822e+08
|
||||||
|
Grid : Message : 101.926336 s : Average mflops/s per call per node : 2.25129e+09
|
||||||
|
Grid : Message : 101.926338 s : Average mflops/s per call (full) : 7.82287e+07
|
||||||
|
Grid : Message : 101.926340 s : Average mflops/s per call per rank (full): 1.22232e+06
|
||||||
|
Grid : Message : 101.926342 s : Average mflops/s per call per node (full): 4.88929e+06
Grid : Message : 101.926344 s : WilsonFermion5D Stencil
Grid : Message : 101.926345 s : WilsonFermion5D StencilEven
Grid : Message : 101.926346 s : WilsonFermion5D StencilOdd
Grid : Message : 101.926347 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 101.926348 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 101.926349 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 110.616405 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 110.616430 s : Called DwDag
Grid : Message : 110.616431 s : norm dag result 12.0422
Grid : Message : 110.621134 s : norm dag ref 12.0422
Grid : Message : 110.624323 s : norm dag diff 7.28899e-14
Grid : Message : 110.637247 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 110.698940 s : src_e0.5
Grid : Message : 110.766761 s : src_o0.5
Grid : Message : 110.783307 s : *********************************************************
Grid : Message : 110.783311 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 110.783313 s : * Vectorising space-time by 8
Grid : Message : 110.783315 s : * SINGLE precision
Grid : Message : 110.783316 s : * Using Overlapped Comms/Compute
Grid : Message : 110.783317 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 110.783318 s : *********************************************************
Grid : Message : 157.764942 s : Deo mflop/s = 7.16075e+07
Grid : Message : 157.764976 s : Deo mflop/s per rank 1.11887e+06
Grid : Message : 157.764978 s : Deo mflop/s per node 4.47547e+06
Grid : Message : 157.764981 s : #### Dhop calls report
Grid : Message : 157.764983 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 157.764985 s : WilsonFermion5D TotalTime /Calls : 1565.89 us
Grid : Message : 157.764987 s : WilsonFermion5D CommTime /Calls : 1058.27 us
Grid : Message : 157.764989 s : WilsonFermion5D FaceTime /Calls : 292.487 us
Grid : Message : 157.764991 s : WilsonFermion5D ComputeTime1/Calls : 4.72584 us
Grid : Message : 157.764993 s : WilsonFermion5D ComputeTime2/Calls : 239.678 us
Grid : Message : 157.765020 s : Average mflops/s per call : 2.07994e+10
Grid : Message : 157.765024 s : Average mflops/s per call per rank : 3.2499e+08
Grid : Message : 157.765027 s : Average mflops/s per call per node : 1.29996e+09
Grid : Message : 157.765031 s : Average mflops/s per call (full) : 7.28994e+07
Grid : Message : 157.765035 s : Average mflops/s per call per rank (full): 1.13905e+06
Grid : Message : 157.765039 s : Average mflops/s per call per node (full): 4.55621e+06
Grid : Message : 157.765042 s : WilsonFermion5D Stencil
Grid : Message : 157.765044 s : WilsonFermion5D StencilEven
Grid : Message : 157.765046 s : WilsonFermion5D StencilOdd
Grid : Message : 157.765049 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 157.765051 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 157.765053 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 157.783731 s : r_e6.02106
Grid : Message : 157.786036 s : r_o6.0211
Grid : Message : 157.787470 s : res12.0422
Grid : Message : 157.905573 s : norm diff 0
Grid : Message : 158.337590 s : norm diff even 0
Grid : Message : 158.959010 s : norm diff odd 0
1
2-racks/size-C0/16-nodes/job/power-16A-1050.64071/nodes
Normal file
@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]
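The nodes file above records the Slurm nodelist in its compressed bracket form. The job script below expands it back into individual hostnames with scontrol before looping over the nodes; a minimal illustration of that expansion (the loop body here is a placeholder, not taken from the repository):

#!/usr/bin/env bash
# Expand a compressed Slurm nodelist into one hostname per line.
nodelist='tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]'
for h in $(scontrol show hostnames "${nodelist}"); do
    echo "per-node command would run on ${h}"   # placeholder action
done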
112
2-racks/size-C0/16-nodes/job/power-16A-1050.64071/script
Executable file
@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170

# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa

#SBATCH -J power-16A-1050
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue

set -e

# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n

# IO environment ###############################################################

if [ 16 -eq 1 ]; then
    export OMPI_MCA_io=ompio
else
    export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3

# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
    source "${env_dir}/env-gpu.sh"
else
    echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
    exit 1
fi
spack load sshpass

# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''

# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"

date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi

# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1050

# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
    ${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"

# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
    ./gpu-mpi-wrapper.sh \
    ${app} "${par}" "${opt[@]}" \
    --mpi 2.2.2.8 \
    --accelerator-threads 8 \
    --grid 48.48.48.96 \
    --shm 2048 &> "${job_info_dir}/log"

# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"

# reset GPUS ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"

# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"

# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
    ${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
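The dmon-to-db.sh helper called near the end of the script is referenced but not included in this part of the listing. As a rough sketch of the kind of conversion it performs, nvidia-smi dmon text into an SQLite table, one could imagine something like the following; the script name, schema, and column layout here are assumptions for illustration, not the actual helper or the schema of smi-dmon-16A.db:

#!/usr/bin/env bash
# Hypothetical sketch: load 'nvidia-smi dmon' samples into an SQLite table.
# Usage: ./dmon-to-db-sketch.sh <dmon-output-file> <db-file> <tag>
in_file=$1; db=$2; tag=$3

sqlite3 "${db}" 'CREATE TABLE IF NOT EXISTS dmon (tag TEXT, sample TEXT);'

# Skip the '#' header lines that dmon prints and store each sample row
# verbatim, together with the run label (e.g. clock_limit_1050).
grep -v '^#' "${in_file}" | while read -r line; do
    printf "INSERT INTO dmon (tag, sample) VALUES ('%s', '%s');\n" "${tag}" "${line}"
done | sqlite3 "${db}"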
@ -0,0 +1,2 @@
Sat Aug 20 20:46:27 BST 2022
epoch 1661024788
@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32
4310
2-racks/size-C0/16-nodes/job/power-16A-1065.64076/elf
Normal file
File diff suppressed because it is too large
@ -0,0 +1,2 @@
Sat Aug 20 20:55:03 BST 2022
epoch 1661025303
2062
2-racks/size-C0/16-nodes/job/power-16A-1065.64076/env
Normal file
File diff suppressed because one or more lines are too long
26
2-racks/size-C0/16-nodes/job/power-16A-1065.64076/ldd
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
linux-vdso.so.1 (0x00007ffd9b1d1000)
|
||||||
|
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014a2805dc000)
|
||||||
|
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014a280214000)
|
||||||
|
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014a27fd22000)
|
||||||
|
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014a27f9f8000)
|
||||||
|
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014a27f717000)
|
||||||
|
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014a27f4b6000)
|
||||||
|
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014a280563000)
|
||||||
|
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014a27f0d6000)
|
||||||
|
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014a27d97a000)
|
||||||
|
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014a27d5aa000)
|
||||||
|
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014a27d309000)
|
||||||
|
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014a27d1de000)
|
||||||
|
libm.so.6 => /lib64/libm.so.6 (0x000014a27ce5c000)
|
||||||
|
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014a27cc25000)
|
||||||
|
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014a27ca0d000)
|
||||||
|
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014a27c7ed000)
|
||||||
|
libc.so.6 => /lib64/libc.so.6 (0x000014a27c428000)
|
||||||
|
libdl.so.2 => /lib64/libdl.so.2 (0x000014a27c224000)
|
||||||
|
/lib64/ld-linux-x86-64.so.2 (0x000014a28042c000)
|
||||||
|
librt.so.1 => /lib64/librt.so.1 (0x000014a27c01c000)
|
||||||
|
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014a280497000)
|
||||||
|
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014a280492000)
|
||||||
|
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014a27bf10000)
|
||||||
|
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014a27bd06000)
|
||||||
|
libutil.so.1 => /lib64/libutil.so.1 (0x000014a27bb02000)
|
286
2-racks/size-C0/16-nodes/job/power-16A-1065.64076/log
Normal file
@ -0,0 +1,286 @@
|
|||||||
|
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 3
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 3
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
local rank 2 device 0 bus id: 0000:84:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 0 device 0 bus id: 0000:03:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 3 device 0 bus id: 0000:C4:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 1 device 0 bus id: 0000:44:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 0 device 0 bus id: 0000:03:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
SharedMemoryMpi: World communicator of size 64
|
||||||
|
SharedMemoryMpi: Node communicator of size 4
|
||||||
|
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x150120000000 for comms buffers
|
||||||
|
Setting up IPC
|
||||||
|
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|_ | | | | | | | | | | | | _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|_ GGGG RRRR III DDDD _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G GG RRRR I D D _|__
|
||||||
|
__|_ G G R R I D D _|__
|
||||||
|
__|_ GGGG R R III DDDD _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
| | | | | | | | | | | | | |
|
||||||
|
|
||||||
|
|
||||||
|
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
|
||||||
|
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : MPI is initialised and logging filters activated
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : Requested 2147483648 byte stencil comms buffers
|
||||||
|
Grid : Message : MemoryManager Cache 34004218675 bytes
|
||||||
|
Grid : Message : MemoryManager::Init() setting up
|
||||||
|
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
|
||||||
|
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
|
||||||
|
Grid : Message : MemoryManager::Init() Using cudaMalloc
|
||||||
|
Grid : Message : 1.428183 s : Grid Layout
|
||||||
|
Grid : Message : 1.428187 s : Global lattice size : 48 48 48 96
|
||||||
|
Grid : Message : 1.428193 s : OpenMP threads : 4
|
||||||
|
Grid : Message : 1.428196 s : MPI tasks : 2 2 2 8
|
||||||
|
Grid : Message : 1.443217 s : Making s innermost grids
|
||||||
|
Grid : Message : 1.455165 s : Initialising 4d RNG
|
||||||
|
Grid : Message : 1.471981 s : Intialising parallel RNG with unique string 'The 4D RNG'
|
||||||
|
Grid : Message : 1.472007 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
|
||||||
|
Grid : Message : 1.853366 s : Initialising 5d RNG
|
||||||
|
Grid : Message : 2.875960 s : Intialising parallel RNG with unique string 'The 5D RNG'
|
||||||
|
Grid : Message : 2.876470 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
|
||||||
|
Grid : Message : 7.305707 s : Initialised RNGs
|
||||||
|
Grid : Message : 8.397843 s : Drawing gauge field
|
||||||
|
Grid : Message : 8.484443 s : Random gauge initialised
|
||||||
|
Grid : Message : 8.488387 s : Setting up Cshift based reference
|
||||||
|
Grid : Message : 13.563627 s : *****************************************************************
|
||||||
|
Grid : Message : 13.563653 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
|
||||||
|
Grid : Message : 13.563655 s : *****************************************************************
|
||||||
|
Grid : Message : 13.563658 s : *****************************************************************
|
||||||
|
Grid : Message : 13.563659 s : * Benchmarking DomainWallFermionR::Dhop
|
||||||
|
Grid : Message : 13.563660 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 13.563663 s : * VComplexF size is 64 B
|
||||||
|
Grid : Message : 13.563665 s : * SINGLE precision
|
||||||
|
Grid : Message : 13.563667 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 13.563668 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 13.563669 s : *****************************************************************
|
||||||
|
Grid : Message : 14.958310 s : Called warmup
|
||||||
|
Grid : Message : 101.445133 s : Called Dw 30000 times in 8.73489e+07 us
|
||||||
|
Grid : Message : 101.445198 s : mflop/s = 7.7011e+07
|
||||||
|
Grid : Message : 101.445200 s : mflop/s per rank = 1.2033e+06
|
||||||
|
Grid : Message : 101.445202 s : mflop/s per node = 4.81319e+06
|
||||||
|
Grid : Message : 101.445204 s : RF GiB/s (base 2) = 156485
|
||||||
|
Grid : Message : 101.445206 s : mem GiB/s (base 2) = 97802.9
|
||||||
|
Grid : Message : 101.445777 s : norm diff 1.05775e-13
|
||||||
|
Grid : Message : 101.455931 s : #### Dhop calls report
|
||||||
|
Grid : Message : 101.455939 s : WilsonFermion5D Number of DhopEO Calls : 60002
|
||||||
|
Grid : Message : 101.455943 s : WilsonFermion5D TotalTime /Calls : 1457.12 us
|
||||||
|
Grid : Message : 101.455945 s : WilsonFermion5D CommTime /Calls : 1014.92 us
|
||||||
|
Grid : Message : 101.455947 s : WilsonFermion5D FaceTime /Calls : 219.441 us
|
||||||
|
Grid : Message : 101.455949 s : WilsonFermion5D ComputeTime1/Calls : 2.84344 us
|
||||||
|
Grid : Message : 101.455951 s : WilsonFermion5D ComputeTime2/Calls : 235.367 us
|
||||||
|
Grid : Message : 101.455978 s : Average mflops/s per call : 3.61947e+10
|
||||||
|
Grid : Message : 101.455982 s : Average mflops/s per call per rank : 5.65543e+08
|
||||||
|
Grid : Message : 101.455984 s : Average mflops/s per call per node : 2.26217e+09
|
||||||
|
Grid : Message : 101.455986 s : Average mflops/s per call (full) : 7.83407e+07
|
||||||
|
Grid : Message : 101.455990 s : Average mflops/s per call per rank (full): 1.22407e+06
|
||||||
|
Grid : Message : 101.455992 s : Average mflops/s per call per node (full): 4.8963e+06
|
||||||
|
Grid : Message : 101.455994 s : WilsonFermion5D Stencil
|
||||||
|
Grid : Message : 101.455995 s : WilsonFermion5D StencilEven
|
||||||
|
Grid : Message : 101.455999 s : WilsonFermion5D StencilOdd
|
||||||
|
Grid : Message : 101.456001 s : WilsonFermion5D Stencil Reporti()
|
||||||
|
Grid : Message : 101.456002 s : WilsonFermion5D StencilEven Reporti()
|
||||||
|
Grid : Message : 101.456004 s : WilsonFermion5D StencilOdd Reporti()
|
||||||
|
Grid : Message : 110.188024 s : Compare to naive wilson implementation Dag to verify correctness
|
||||||
|
Grid : Message : 110.188051 s : Called DwDag
|
||||||
|
Grid : Message : 110.188052 s : norm dag result 12.0422
|
||||||
|
Grid : Message : 110.200211 s : norm dag ref 12.0422
|
||||||
|
Grid : Message : 110.203215 s : norm dag diff 7.28899e-14
|
||||||
|
Grid : Message : 110.213199 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
|
||||||
|
Grid : Message : 110.281787 s : src_e0.5
|
||||||
|
Grid : Message : 110.353808 s : src_o0.5
|
||||||
|
Grid : Message : 110.370985 s : *********************************************************
|
||||||
|
Grid : Message : 110.370991 s : * Benchmarking DomainWallFermionF::DhopEO
|
||||||
|
Grid : Message : 110.370992 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 110.370995 s : * SINGLE precision
|
||||||
|
Grid : Message : 110.370997 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 110.370998 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 110.371000 s : *********************************************************
|
||||||
|
Grid : Message : 157.314519 s : Deo mflop/s = 7.16631e+07
|
||||||
|
Grid : Message : 157.314545 s : Deo mflop/s per rank 1.11974e+06
|
||||||
|
Grid : Message : 157.314547 s : Deo mflop/s per node 4.47894e+06
|
||||||
|
Grid : Message : 157.314550 s : #### Dhop calls report
|
||||||
|
Grid : Message : 157.314552 s : WilsonFermion5D Number of DhopEO Calls : 30001
|
||||||
|
Grid : Message : 157.314554 s : WilsonFermion5D TotalTime /Calls : 1564.64 us
|
||||||
|
Grid : Message : 157.314556 s : WilsonFermion5D CommTime /Calls : 1060.37 us
|
||||||
|
Grid : Message : 157.314558 s : WilsonFermion5D FaceTime /Calls : 287.98 us
|
||||||
|
Grid : Message : 157.314560 s : WilsonFermion5D ComputeTime1/Calls : 4.91794 us
|
||||||
|
Grid : Message : 157.314562 s : WilsonFermion5D ComputeTime2/Calls : 239.551 us
|
||||||
|
Grid : Message : 157.314587 s : Average mflops/s per call : 2.07265e+10
|
||||||
|
Grid : Message : 157.314591 s : Average mflops/s per call per rank : 3.23852e+08
|
||||||
|
Grid : Message : 157.314593 s : Average mflops/s per call per node : 1.29541e+09
|
||||||
|
Grid : Message : 157.314596 s : Average mflops/s per call (full) : 7.29577e+07
|
||||||
|
Grid : Message : 157.314600 s : Average mflops/s per call per rank (full): 1.13996e+06
|
||||||
|
Grid : Message : 157.314602 s : Average mflops/s per call per node (full): 4.55985e+06
|
||||||
|
Grid : Message : 157.314605 s : WilsonFermion5D Stencil
|
||||||
|
Grid : Message : 157.314606 s : WilsonFermion5D StencilEven
|
||||||
|
Grid : Message : 157.314608 s : WilsonFermion5D StencilOdd
|
||||||
|
Grid : Message : 157.314610 s : WilsonFermion5D Stencil Reporti()
|
||||||
|
Grid : Message : 157.314613 s : WilsonFermion5D StencilEven Reporti()
|
||||||
|
Grid : Message : 157.314614 s : WilsonFermion5D StencilOdd Reporti()
|
||||||
|
Grid : Message : 157.334523 s : r_e6.02106
|
||||||
|
Grid : Message : 157.336050 s : r_o6.0211
|
||||||
|
Grid : Message : 157.337424 s : res12.0422
|
||||||
|
Grid : Message : 157.450236 s : norm diff 0
|
||||||
|
Grid : Message : 157.586163 s : norm diff even 0
|
||||||
|
Grid : Message : 157.657558 s : norm diff odd 0
|
1
2-racks/size-C0/16-nodes/job/power-16A-1065.64076/nodes
Normal file
@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]
112
2-racks/size-C0/16-nodes/job/power-16A-1065.64076/script
Executable file
@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170

# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa

#SBATCH -J power-16A-1065
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue

set -e

# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n

# IO environment ###############################################################

if [ 16 -eq 1 ]; then
    export OMPI_MCA_io=ompio
else
    export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3

# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
    source "${env_dir}/env-gpu.sh"
else
    echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
    exit 1
fi
spack load sshpass

# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''

# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"

date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi

# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1065

# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
    ${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"

# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
    ./gpu-mpi-wrapper.sh \
    ${app} "${par}" "${opt[@]}" \
    --mpi 2.2.2.8 \
    --accelerator-threads 8 \
    --grid 48.48.48.96 \
    --shm 2048 &> "${job_info_dir}/log"

# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"

# reset GPUS ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"

# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"

# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
    ${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
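The per-frequency job scripts in this listing (1050, 1065, 1080, ...) differ only in the job name and the freq variable. A hypothetical way to generate such a sweep from a single template is sketched below; the template file, the @FREQ@ placeholder, and the submission loop are illustrative assumptions, not part of the repository:

#!/usr/bin/env bash
# Hypothetical sweep launcher: one Slurm job per GPU clock limit.
for freq in 1050 1065 1080; do
    # script.template is assumed to contain @FREQ@ where the clock limit goes
    sed -e "s/@FREQ@/${freq}/g" script.template > "script-${freq}.sh"
    sbatch "script-${freq}.sh"
done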
@ -0,0 +1,2 @@
Sat Aug 20 20:52:16 BST 2022
epoch 1661025136
@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32
4310
2-racks/size-C0/16-nodes/job/power-16A-1080.64082/elf
Normal file
File diff suppressed because it is too large
@ -0,0 +1,2 @@
Sat Aug 20 21:00:52 BST 2022
epoch 1661025652
2062
2-racks/size-C0/16-nodes/job/power-16A-1080.64082/env
Normal file
File diff suppressed because one or more lines are too long
26
2-racks/size-C0/16-nodes/job/power-16A-1080.64082/ldd
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
linux-vdso.so.1 (0x00007ffceffcb000)
|
||||||
|
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014c73048f000)
|
||||||
|
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014c7300c7000)
|
||||||
|
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014c72fbd5000)
|
||||||
|
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014c72f8ab000)
|
||||||
|
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014c72f5ca000)
|
||||||
|
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014c72f369000)
|
||||||
|
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014c730416000)
|
||||||
|
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014c72ef89000)
|
||||||
|
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014c72d82d000)
|
||||||
|
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014c72d45d000)
|
||||||
|
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014c72d1bc000)
|
||||||
|
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014c72d091000)
|
||||||
|
libm.so.6 => /lib64/libm.so.6 (0x000014c72cd0f000)
|
||||||
|
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014c72cad8000)
|
||||||
|
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014c72c8c0000)
|
||||||
|
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014c72c6a0000)
|
||||||
|
libc.so.6 => /lib64/libc.so.6 (0x000014c72c2db000)
|
||||||
|
libdl.so.2 => /lib64/libdl.so.2 (0x000014c72c0d7000)
|
||||||
|
/lib64/ld-linux-x86-64.so.2 (0x000014c7302df000)
|
||||||
|
librt.so.1 => /lib64/librt.so.1 (0x000014c72becf000)
|
||||||
|
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014c73034a000)
|
||||||
|
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014c730345000)
|
||||||
|
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014c72bdc3000)
|
||||||
|
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014c72bbb9000)
|
||||||
|
libutil.so.1 => /lib64/libutil.so.1 (0x000014c72b9b5000)
|
286
2-racks/size-C0/16-nodes/job/power-16A-1080.64082/log
Normal file
@ -0,0 +1,286 @@
|
|||||||
|
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 3
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 3
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
local rank 0 device 0 bus id: 0000:03:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 1 device 0 bus id: 0000:44:00.0
|
||||||
|
local rank 2 device 0 bus id: 0000:84:00.0
|
||||||
|
local rank 3 device 0 bus id: 0000:C4:00.0
|
||||||
|
local rank 0 device 0 bus id: 0000:03:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
SharedMemoryMpi: World communicator of size 64
|
||||||
|
SharedMemoryMpi: Node communicator of size 4
|
||||||
|
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x1548a0000000 for comms buffers
|
||||||
|
Setting up IPC
|
||||||
|
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|_ | | | | | | | | | | | | _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|_ GGGG RRRR III DDDD _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G GG RRRR I D D _|__
|
||||||
|
__|_ G G R R I D D _|__
|
||||||
|
__|_ GGGG R R III DDDD _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
| | | | | | | | | | | | | |
|
||||||
|
|
||||||
|
|
||||||
|
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
|
||||||
|
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : MPI is initialised and logging filters activated
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : Requested 2147483648 byte stencil comms buffers
|
||||||
|
Grid : Message : MemoryManager Cache 34004218675 bytes
|
||||||
|
Grid : Message : MemoryManager::Init() setting up
|
||||||
|
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
|
||||||
|
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
|
||||||
|
Grid : Message : MemoryManager::Init() Using cudaMalloc
|
||||||
|
Grid : Message : 1.498999 s : Grid Layout
|
||||||
|
Grid : Message : 1.499003 s : Global lattice size : 48 48 48 96
|
||||||
|
Grid : Message : 1.499009 s : OpenMP threads : 4
|
||||||
|
Grid : Message : 1.499010 s : MPI tasks : 2 2 2 8
|
||||||
|
Grid : Message : 1.516697 s : Making s innermost grids
|
||||||
|
Grid : Message : 1.528026 s : Initialising 4d RNG
|
||||||
|
Grid : Message : 1.543296 s : Intialising parallel RNG with unique string 'The 4D RNG'
|
||||||
|
Grid : Message : 1.543322 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
|
||||||
|
Grid : Message : 1.803104 s : Initialising 5d RNG
|
||||||
|
Grid : Message : 2.280210 s : Intialising parallel RNG with unique string 'The 5D RNG'
|
||||||
|
Grid : Message : 2.280810 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
|
||||||
|
Grid : Message : 7.463560 s : Initialised RNGs
|
||||||
|
Grid : Message : 8.316566 s : Drawing gauge field
|
||||||
|
Grid : Message : 8.441882 s : Random gauge initialised
|
||||||
|
Grid : Message : 8.454498 s : Setting up Cshift based reference
|
||||||
|
Grid : Message : 13.615874 s : *****************************************************************
|
||||||
|
Grid : Message : 13.615901 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
|
||||||
|
Grid : Message : 13.615903 s : *****************************************************************
|
||||||
|
Grid : Message : 13.615904 s : *****************************************************************
|
||||||
|
Grid : Message : 13.615905 s : * Benchmarking DomainWallFermionR::Dhop
|
||||||
|
Grid : Message : 13.615906 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 13.615910 s : * VComplexF size is 64 B
|
||||||
|
Grid : Message : 13.615912 s : * SINGLE precision
|
||||||
|
Grid : Message : 13.615914 s : * Using Overlapped Comms/Compute
Grid : Message : 13.615916 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 13.615918 s : *****************************************************************
Grid : Message : 14.175758 s : Called warmup
Grid : Message : 100.948265 s : Called Dw 30000 times in 8.67724e+07 us
Grid : Message : 100.948328 s : mflop/s = 7.75226e+07
Grid : Message : 100.948330 s : mflop/s per rank = 1.21129e+06
Grid : Message : 100.948332 s : mflop/s per node = 4.84516e+06
Grid : Message : 100.948334 s : RF GiB/s (base 2) = 157524
Grid : Message : 100.948336 s : mem GiB/s (base 2) = 98452.5
Grid : Message : 100.948912 s : norm diff 1.05775e-13
Grid : Message : 100.958922 s : #### Dhop calls report
Grid : Message : 100.958930 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 100.958934 s : WilsonFermion5D TotalTime /Calls : 1447.35 us
Grid : Message : 100.958936 s : WilsonFermion5D CommTime /Calls : 1006.18 us
Grid : Message : 100.958938 s : WilsonFermion5D FaceTime /Calls : 218.625 us
Grid : Message : 100.958940 s : WilsonFermion5D ComputeTime1/Calls : 2.6472 us
Grid : Message : 100.958942 s : WilsonFermion5D ComputeTime2/Calls : 235.108 us
Grid : Message : 100.958970 s : Average mflops/s per call : 3.6261e+10
Grid : Message : 100.958974 s : Average mflops/s per call per rank : 5.66578e+08
Grid : Message : 100.958976 s : Average mflops/s per call per node : 2.26631e+09
Grid : Message : 100.958978 s : Average mflops/s per call (full) : 7.88698e+07
Grid : Message : 100.958981 s : Average mflops/s per call per rank (full): 1.23234e+06
Grid : Message : 100.958983 s : Average mflops/s per call per node (full): 4.92936e+06
Grid : Message : 100.958986 s : WilsonFermion5D Stencil
Grid : Message : 100.958987 s : WilsonFermion5D StencilEven
Grid : Message : 100.958988 s : WilsonFermion5D StencilOdd
Grid : Message : 100.958991 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 100.958992 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 100.958995 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 109.635912 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 109.635940 s : Called DwDag
Grid : Message : 109.635941 s : norm dag result 12.0422
Grid : Message : 109.641498 s : norm dag ref 12.0422
Grid : Message : 109.644623 s : norm dag diff 7.28899e-14
Grid : Message : 109.654599 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 109.718075 s : src_e0.5
Grid : Message : 109.790285 s : src_o0.5
Grid : Message : 109.807211 s : *********************************************************
Grid : Message : 109.807217 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 109.807219 s : * Vectorising space-time by 8
Grid : Message : 109.807221 s : * SINGLE precision
Grid : Message : 109.807224 s : * Using Overlapped Comms/Compute
Grid : Message : 109.807225 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 109.807226 s : *********************************************************
Grid : Message : 156.357075 s : Deo mflop/s = 7.22704e+07
Grid : Message : 156.357109 s : Deo mflop/s per rank 1.12923e+06
Grid : Message : 156.357111 s : Deo mflop/s per node 4.5169e+06
Grid : Message : 156.357114 s : #### Dhop calls report
Grid : Message : 156.357116 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 156.357118 s : WilsonFermion5D TotalTime /Calls : 1551.51 us
Grid : Message : 156.357120 s : WilsonFermion5D CommTime /Calls : 1049.38 us
Grid : Message : 156.357122 s : WilsonFermion5D FaceTime /Calls : 285.792 us
Grid : Message : 156.357124 s : WilsonFermion5D ComputeTime1/Calls : 4.81357 us
Grid : Message : 156.357126 s : WilsonFermion5D ComputeTime2/Calls : 239.16 us
Grid : Message : 156.357146 s : Average mflops/s per call : 2.07719e+10
Grid : Message : 156.357150 s : Average mflops/s per call per rank : 3.24561e+08
Grid : Message : 156.357152 s : Average mflops/s per call per node : 1.29824e+09
Grid : Message : 156.357154 s : Average mflops/s per call (full) : 7.35747e+07
Grid : Message : 156.357158 s : Average mflops/s per call per rank (full): 1.1496e+06
Grid : Message : 156.357161 s : Average mflops/s per call per node (full): 4.59842e+06
Grid : Message : 156.357163 s : WilsonFermion5D Stencil
Grid : Message : 156.357165 s : WilsonFermion5D StencilEven
Grid : Message : 156.357166 s : WilsonFermion5D StencilOdd
Grid : Message : 156.357168 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 156.357175 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 156.357176 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 156.375718 s : r_e6.02106
Grid : Message : 156.378883 s : r_o6.0211
Grid : Message : 156.380335 s : res12.0422
Grid : Message : 156.489162 s : norm diff 0
Grid : Message : 156.617774 s : norm diff even 0
Grid : Message : 156.694536 s : norm diff odd 0
1
2-racks/size-C0/16-nodes/job/power-16A-1080.64082/nodes
Normal file
@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]
112
2-racks/size-C0/16-nodes/job/power-16A-1080.64082/script
Executable file
@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170

# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa

#SBATCH -J power-16A-1080
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue

set -e

# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n

# IO environment ###############################################################

if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3

# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass

# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''

# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"

date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi

# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1080

# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"

# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 48.48.48.96 \
--shm 2048 &> "${job_info_dir}/log"

# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"

# reset GPUS ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"

# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"

# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
@ -0,0 +1,2 @@
Sat Aug 20 20:58:06 BST 2022
epoch 1661025486
@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32
4310
2-racks/size-C0/16-nodes/job/power-16A-1095.64087/elf
Normal file
File diff suppressed because it is too large
@ -0,0 +1,2 @@
Sat Aug 20 21:06:38 BST 2022
epoch 1661025998
2062
2-racks/size-C0/16-nodes/job/power-16A-1095.64087/env
Normal file
File diff suppressed because one or more lines are too long
26
2-racks/size-C0/16-nodes/job/power-16A-1095.64087/ldd
Normal file
@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffc219f0000)
|
||||||
|
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014aa89605000)
|
||||||
|
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014aa8923d000)
|
||||||
|
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014aa88d4b000)
|
||||||
|
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014aa88a21000)
|
||||||
|
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014aa88740000)
|
||||||
|
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014aa884df000)
|
||||||
|
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014aa8958c000)
|
||||||
|
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014aa880ff000)
|
||||||
|
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014aa869a3000)
|
||||||
|
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014aa865d3000)
|
||||||
|
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014aa86332000)
|
||||||
|
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014aa86207000)
|
||||||
|
libm.so.6 => /lib64/libm.so.6 (0x000014aa85e85000)
|
||||||
|
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014aa85c4e000)
|
||||||
|
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014aa85a36000)
|
||||||
|
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014aa85816000)
|
||||||
|
libc.so.6 => /lib64/libc.so.6 (0x000014aa85451000)
|
||||||
|
libdl.so.2 => /lib64/libdl.so.2 (0x000014aa8524d000)
|
||||||
|
/lib64/ld-linux-x86-64.so.2 (0x000014aa89455000)
|
||||||
|
librt.so.1 => /lib64/librt.so.1 (0x000014aa85045000)
|
||||||
|
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014aa894c0000)
|
||||||
|
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014aa894bb000)
|
||||||
|
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014aa84f39000)
|
||||||
|
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014aa84d2f000)
|
||||||
|
libutil.so.1 => /lib64/libutil.so.1 (0x000014aa84b2b000)
|
286
2-racks/size-C0/16-nodes/job/power-16A-1095.64087/log
Normal file
@ -0,0 +1,286 @@
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 3
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 3
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 0 device 0 bus id: 0000:03:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 2 device 0 bus id: 0000:84:00.0
|
||||||
|
local rank 3 device 0 bus id: 0000:C4:00.0
|
||||||
|
local rank 1 device 0 bus id: 0000:44:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 0 device 0 bus id: 0000:03:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
SharedMemoryMpi: World communicator of size 64
|
||||||
|
SharedMemoryMpi: Node communicator of size 4
|
||||||
|
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x146d00000000 for comms buffers
|
||||||
|
Setting up IPC
|
||||||
|
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|_ | | | | | | | | | | | | _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|_ GGGG RRRR III DDDD _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G GG RRRR I D D _|__
|
||||||
|
__|_ G G R R I D D _|__
|
||||||
|
__|_ GGGG R R III DDDD _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
| | | | | | | | | | | | | |
|
||||||
|
|
||||||
|
|
||||||
|
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
|
||||||
|
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : MPI is initialised and logging filters activated
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : Requested 2147483648 byte stencil comms buffers
|
||||||
|
Grid : Message : MemoryManager Cache 34004218675 bytes
|
||||||
|
Grid : Message : MemoryManager::Init() setting up
|
||||||
|
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
|
||||||
|
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
|
||||||
|
Grid : Message : MemoryManager::Init() Using cudaMalloc
|
||||||
|
Grid : Message : 1.412895 s : Grid Layout
|
||||||
|
Grid : Message : 1.412899 s : Global lattice size : 48 48 48 96
|
||||||
|
Grid : Message : 1.412905 s : OpenMP threads : 4
|
||||||
|
Grid : Message : 1.412909 s : MPI tasks : 2 2 2 8
|
||||||
|
Grid : Message : 1.428319 s : Making s innermost grids
|
||||||
|
Grid : Message : 1.445373 s : Initialising 4d RNG
|
||||||
|
Grid : Message : 1.461658 s : Intialising parallel RNG with unique string 'The 4D RNG'
|
||||||
|
Grid : Message : 1.461680 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
|
||||||
|
Grid : Message : 1.902912 s : Initialising 5d RNG
|
||||||
|
Grid : Message : 2.141255 s : Intialising parallel RNG with unique string 'The 5D RNG'
|
||||||
|
Grid : Message : 2.141291 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
|
||||||
|
Grid : Message : 7.353326 s : Initialised RNGs
|
||||||
|
Grid : Message : 8.518633 s : Drawing gauge field
|
||||||
|
Grid : Message : 8.626652 s : Random gauge initialised
|
||||||
|
Grid : Message : 8.630634 s : Setting up Cshift based reference
|
||||||
|
Grid : Message : 13.722925 s : *****************************************************************
|
||||||
|
Grid : Message : 13.722949 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
|
||||||
|
Grid : Message : 13.722950 s : *****************************************************************
|
||||||
|
Grid : Message : 13.722951 s : *****************************************************************
|
||||||
|
Grid : Message : 13.722952 s : * Benchmarking DomainWallFermionR::Dhop
|
||||||
|
Grid : Message : 13.722953 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 13.722954 s : * VComplexF size is 64 B
|
||||||
|
Grid : Message : 13.722955 s : * SINGLE precision
|
||||||
|
Grid : Message : 13.722956 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 13.722957 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 13.722958 s : *****************************************************************
|
||||||
|
Grid : Message : 14.254628 s : Called warmup
|
||||||
|
Grid : Message : 100.327406 s : Called Dw 30000 times in 8.60725e+07 us
|
||||||
|
Grid : Message : 100.327470 s : mflop/s = 7.8153e+07
|
||||||
|
Grid : Message : 100.327472 s : mflop/s per rank = 1.22114e+06
|
||||||
|
Grid : Message : 100.327474 s : mflop/s per node = 4.88456e+06
|
||||||
|
Grid : Message : 100.327476 s : RF GiB/s (base 2) = 158805
|
||||||
|
Grid : Message : 100.327478 s : mem GiB/s (base 2) = 99253.2
|
||||||
|
Grid : Message : 100.328051 s : norm diff 1.05775e-13
|
||||||
|
Grid : Message : 100.337927 s : #### Dhop calls report
|
||||||
|
Grid : Message : 100.337935 s : WilsonFermion5D Number of DhopEO Calls : 60002
|
||||||
|
Grid : Message : 100.337943 s : WilsonFermion5D TotalTime /Calls : 1435.69 us
|
||||||
|
Grid : Message : 100.337946 s : WilsonFermion5D CommTime /Calls : 996.547 us
|
||||||
|
Grid : Message : 100.337949 s : WilsonFermion5D FaceTime /Calls : 217.079 us
|
||||||
|
Grid : Message : 100.337953 s : WilsonFermion5D ComputeTime1/Calls : 2.78067 us
|
||||||
|
Grid : Message : 100.337955 s : WilsonFermion5D ComputeTime2/Calls : 234.472 us
|
||||||
|
Grid : Message : 100.337971 s : Average mflops/s per call : 3.63872e+10
|
||||||
|
Grid : Message : 100.337974 s : Average mflops/s per call per rank : 5.68549e+08
|
||||||
|
Grid : Message : 100.337976 s : Average mflops/s per call per node : 2.2742e+09
|
||||||
|
Grid : Message : 100.337980 s : Average mflops/s per call (full) : 7.95104e+07
|
||||||
|
Grid : Message : 100.337982 s : Average mflops/s per call per rank (full): 1.24235e+06
|
||||||
|
Grid : Message : 100.337986 s : Average mflops/s per call per node (full): 4.9694e+06
|
||||||
|
Grid : Message : 100.337988 s : WilsonFermion5D Stencil
|
||||||
|
Grid : Message : 100.337990 s : WilsonFermion5D StencilEven
|
||||||
|
Grid : Message : 100.337992 s : WilsonFermion5D StencilOdd
|
||||||
|
Grid : Message : 100.337995 s : WilsonFermion5D Stencil Reporti()
|
||||||
|
Grid : Message : 100.337998 s : WilsonFermion5D StencilEven Reporti()
|
||||||
|
Grid : Message : 100.338000 s : WilsonFermion5D StencilOdd Reporti()
|
||||||
|
Grid : Message : 109.354730 s : Compare to naive wilson implementation Dag to verify correctness
|
||||||
|
Grid : Message : 109.355200 s : Called DwDag
|
||||||
|
Grid : Message : 109.355210 s : norm dag result 12.0422
|
||||||
|
Grid : Message : 109.404420 s : norm dag ref 12.0422
|
||||||
|
Grid : Message : 109.435430 s : norm dag diff 7.28899e-14
|
||||||
|
Grid : Message : 109.565940 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
|
||||||
|
Grid : Message : 109.123204 s : src_e0.5
|
||||||
|
Grid : Message : 109.194082 s : src_o0.5
|
||||||
|
Grid : Message : 109.211743 s : *********************************************************
|
||||||
|
Grid : Message : 109.211749 s : * Benchmarking DomainWallFermionF::DhopEO
|
||||||
|
Grid : Message : 109.211751 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 109.211754 s : * SINGLE precision
|
||||||
|
Grid : Message : 109.211756 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 109.211759 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 109.211761 s : *********************************************************
|
||||||
|
Grid : Message : 155.351395 s : Deo mflop/s = 7.29132e+07
|
||||||
|
Grid : Message : 155.351424 s : Deo mflop/s per rank 1.13927e+06
|
||||||
|
Grid : Message : 155.351427 s : Deo mflop/s per node 4.55708e+06
|
||||||
|
Grid : Message : 155.351433 s : #### Dhop calls report
|
||||||
|
Grid : Message : 155.351436 s : WilsonFermion5D Number of DhopEO Calls : 30001
|
||||||
|
Grid : Message : 155.351440 s : WilsonFermion5D TotalTime /Calls : 1537.8 us
|
||||||
|
Grid : Message : 155.351445 s : WilsonFermion5D CommTime /Calls : 1037.77 us
|
||||||
|
Grid : Message : 155.351449 s : WilsonFermion5D FaceTime /Calls : 285.044 us
|
||||||
|
Grid : Message : 155.351453 s : WilsonFermion5D ComputeTime1/Calls : 4.8771 us
|
||||||
|
Grid : Message : 155.351457 s : WilsonFermion5D ComputeTime2/Calls : 237.861 us
|
||||||
|
Grid : Message : 155.351481 s : Average mflops/s per call : 2.07287e+10
|
||||||
|
Grid : Message : 155.351485 s : Average mflops/s per call per rank : 3.23886e+08
|
||||||
|
Grid : Message : 155.351488 s : Average mflops/s per call per node : 1.29554e+09
|
||||||
|
Grid : Message : 155.351492 s : Average mflops/s per call (full) : 7.42306e+07
|
||||||
|
Grid : Message : 155.351496 s : Average mflops/s per call per rank (full): 1.15985e+06
|
||||||
|
Grid : Message : 155.351500 s : Average mflops/s per call per node (full): 4.63942e+06
|
||||||
|
Grid : Message : 155.351504 s : WilsonFermion5D Stencil
|
||||||
|
Grid : Message : 155.351506 s : WilsonFermion5D StencilEven
|
||||||
|
Grid : Message : 155.351508 s : WilsonFermion5D StencilOdd
|
||||||
|
Grid : Message : 155.351511 s : WilsonFermion5D Stencil Reporti()
|
||||||
|
Grid : Message : 155.351513 s : WilsonFermion5D StencilEven Reporti()
|
||||||
|
Grid : Message : 155.351515 s : WilsonFermion5D StencilOdd Reporti()
|
||||||
|
Grid : Message : 155.370290 s : r_e6.02106
|
||||||
|
Grid : Message : 155.372244 s : r_o6.0211
|
||||||
|
Grid : Message : 155.373660 s : res12.0422
|
||||||
|
Grid : Message : 155.495172 s : norm diff 0
|
||||||
|
Grid : Message : 155.622362 s : norm diff even 0
|
||||||
|
Grid : Message : 155.695812 s : norm diff odd 0
|
1
2-racks/size-C0/16-nodes/job/power-16A-1095.64087/nodes
Normal file
@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]
112
2-racks/size-C0/16-nodes/job/power-16A-1095.64087/script
Executable file
@ -0,0 +1,112 @@
#!/usr/bin/env bash
|
||||||
|
# shellcheck disable=SC1091,SC2050,SC2170
|
||||||
|
|
||||||
|
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
|
||||||
|
|
||||||
|
#SBATCH -J power-16A-1095
|
||||||
|
#SBATCH -A dp207
|
||||||
|
#SBATCH -t 48:00:00
|
||||||
|
#SBATCH --nodes=16
|
||||||
|
#SBATCH --ntasks=64
|
||||||
|
#SBATCH --ntasks-per-node=4
|
||||||
|
#SBATCH --cpus-per-task=8
|
||||||
|
#SBATCH --partition=gpu
|
||||||
|
#SBATCH --gres=gpu:4
|
||||||
|
#SBATCH --output=%x.%j.out
|
||||||
|
#SBATCH --error=%x.%j.err
|
||||||
|
#SBATCH --reservation=dc-port1_61
|
||||||
|
#SBATCH --qos=reservation
|
||||||
|
#SBATCH --no-requeue
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
# OpenMP/OpenMPI/UCX environment ###############################################
|
||||||
|
export OMP_NUM_THREADS=4
|
||||||
|
export OMPI_MCA_btl=^uct,openib
|
||||||
|
export OMPI_MCA_pml=ucx
|
||||||
|
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
|
||||||
|
export UCX_RNDV_SCHEME=put_zcopy
|
||||||
|
export UCX_RNDV_THRESH=16384
|
||||||
|
export UCX_IB_GPU_DIRECT_RDMA=yes
|
||||||
|
export UCX_MEMTYPE_CACHE=n
|
||||||
|
|
||||||
|
# IO environment ###############################################################
|
||||||
|
|
||||||
|
if [ 16 -eq 1 ]; then
|
||||||
|
export OMPI_MCA_io=ompio
|
||||||
|
else
|
||||||
|
export OMPI_MCA_io=romio321
|
||||||
|
fi
|
||||||
|
export OMPI_MCA_btl_openib_allow_ib=true
|
||||||
|
export OMPI_MCA_btl_openib_device_type=infiniband
|
||||||
|
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
|
||||||
|
|
||||||
|
# load environment #############################################################
|
||||||
|
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
|
||||||
|
source "${env_dir}/env-base.sh"
|
||||||
|
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
|
||||||
|
source "${env_dir}/env-gpu.sh"
|
||||||
|
else
|
||||||
|
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
spack load sshpass
|
||||||
|
|
||||||
|
# application and parameters ###################################################
|
||||||
|
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
|
||||||
|
opt=('--comms-overlap' '--comms-concurrent')
|
||||||
|
par=''
|
||||||
|
|
||||||
|
# collect job information ######################################################
|
||||||
|
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
|
||||||
|
mkdir -p "${job_info_dir}"
|
||||||
|
|
||||||
|
date > "${job_info_dir}/start-date"
|
||||||
|
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
|
||||||
|
set > "${job_info_dir}/env"
|
||||||
|
ldd ${app} > "${job_info_dir}/ldd"
|
||||||
|
md5sum ${app} > "${job_info_dir}/app-hash"
|
||||||
|
readelf -a ${app} > "${job_info_dir}/elf"
|
||||||
|
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
|
||||||
|
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
|
||||||
|
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
|
||||||
|
|
||||||
|
# GPU frequency control ########################################################
|
||||||
|
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
|
||||||
|
freq=1095
|
||||||
|
|
||||||
|
# set frequency
|
||||||
|
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
|
||||||
|
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
|
||||||
|
done
|
||||||
|
# start NVIDIA SMI monitoring
|
||||||
|
tmp=$(mktemp)
|
||||||
|
sleep 1
|
||||||
|
coproc nvidia-smi dmon -o DT &> "${tmp}"
|
||||||
|
|
||||||
|
# run! #########################################################################
|
||||||
|
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
|
||||||
|
./gpu-mpi-wrapper.sh \
|
||||||
|
${app} "${par}" "${opt[@]}" \
|
||||||
|
--mpi 2.2.2.8 \
|
||||||
|
--accelerator-threads 8 \
|
||||||
|
--grid 48.48.48.96 \
|
||||||
|
--shm 2048 &> "${job_info_dir}/log"
|
||||||
|
|
||||||
|
# if we reach that point the application exited successfully ###################
|
||||||
|
touch "${job_info_dir}/success"
|
||||||
|
date > "${job_info_dir}/end-date"
|
||||||
|
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
|
||||||
|
|
||||||
|
# reset GPUS ###################################################################
|
||||||
|
# stop monitoring
|
||||||
|
kill -INT "${COPROC_PID}"
|
||||||
|
|
||||||
|
# make monitoring DB
|
||||||
|
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
|
||||||
|
|
||||||
|
# reset clocks
|
||||||
|
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
|
||||||
|
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
|
||||||
|
done
|
||||||
|
################################################################################
@ -0,0 +1,2 @@
Sat Aug 20 21:03:53 BST 2022
epoch 1661025833
@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32
4310
2-racks/size-C0/16-nodes/job/power-16A-1110.64091/elf
Normal file
File diff suppressed because it is too large
@ -0,0 +1,2 @@
Sat Aug 20 21:12:23 BST 2022
epoch 1661026343
2062
2-racks/size-C0/16-nodes/job/power-16A-1110.64091/env
Normal file
File diff suppressed because one or more lines are too long
26
2-racks/size-C0/16-nodes/job/power-16A-1110.64091/ldd
Normal file
@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffdef5db000)
|
||||||
|
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x0000152bce209000)
|
||||||
|
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x0000152bcde41000)
|
||||||
|
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x0000152bcd94f000)
|
||||||
|
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x0000152bcd625000)
|
||||||
|
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x0000152bcd344000)
|
||||||
|
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x0000152bcd0e3000)
|
||||||
|
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x0000152bce190000)
|
||||||
|
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x0000152bccd03000)
|
||||||
|
libcuda.so.1 => /lib64/libcuda.so.1 (0x0000152bcb5a7000)
|
||||||
|
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x0000152bcb1d7000)
|
||||||
|
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x0000152bcaf36000)
|
||||||
|
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x0000152bcae0b000)
|
||||||
|
libm.so.6 => /lib64/libm.so.6 (0x0000152bcaa89000)
|
||||||
|
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x0000152bca852000)
|
||||||
|
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x0000152bca63a000)
|
||||||
|
libpthread.so.0 => /lib64/libpthread.so.0 (0x0000152bca41a000)
|
||||||
|
libc.so.6 => /lib64/libc.so.6 (0x0000152bca055000)
|
||||||
|
libdl.so.2 => /lib64/libdl.so.2 (0x0000152bc9e51000)
|
||||||
|
/lib64/ld-linux-x86-64.so.2 (0x0000152bce059000)
|
||||||
|
librt.so.1 => /lib64/librt.so.1 (0x0000152bc9c49000)
|
||||||
|
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x0000152bce0c4000)
|
||||||
|
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x0000152bce0bf000)
|
||||||
|
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x0000152bc9b3d000)
|
||||||
|
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x0000152bc9933000)
|
||||||
|
libutil.so.1 => /lib64/libutil.so.1 (0x0000152bc972f000)
|
286
2-racks/size-C0/16-nodes/job/power-16A-1110.64091/log
Normal file
@ -0,0 +1,286 @@
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 3
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
local rank 1 device 0 bus id: 0000:44:00.0
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 3
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
local rank 2 device 0 bus id: 0000:84:00.0
|
||||||
|
local rank 3 device 0 bus id: 0000:C4:00.0
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 0 device 0 bus id: 0000:03:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 0 device 0 bus id: 0000:03:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
SharedMemoryMpi: World communicator of size 64
|
||||||
|
SharedMemoryMpi: Node communicator of size 4
|
||||||
|
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x147320000000 for comms buffers
|
||||||
|
Setting up IPC
|
||||||
|
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|_ | | | | | | | | | | | | _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|_ GGGG RRRR III DDDD _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G GG RRRR I D D _|__
|
||||||
|
__|_ G G R R I D D _|__
|
||||||
|
__|_ GGGG R R III DDDD _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
| | | | | | | | | | | | | |
|
||||||
|
|
||||||
|
|
||||||
|
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
|
||||||
|
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : MPI is initialised and logging filters activated
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : Requested 2147483648 byte stencil comms buffers
|
||||||
|
Grid : Message : MemoryManager Cache 34004218675 bytes
|
||||||
|
Grid : Message : MemoryManager::Init() setting up
|
||||||
|
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
|
||||||
|
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
|
||||||
|
Grid : Message : MemoryManager::Init() Using cudaMalloc
|
||||||
|
Grid : Message : 1.574553 s : Grid Layout
|
||||||
|
Grid : Message : 1.574555 s : Global lattice size : 48 48 48 96
|
||||||
|
Grid : Message : 1.574559 s : OpenMP threads : 4
|
||||||
|
Grid : Message : 1.574561 s : MPI tasks : 2 2 2 8
|
||||||
|
Grid : Message : 1.590560 s : Making s innermost grids
|
||||||
|
Grid : Message : 1.602336 s : Initialising 4d RNG
|
||||||
|
Grid : Message : 1.619266 s : Intialising parallel RNG with unique string 'The 4D RNG'
|
||||||
|
Grid : Message : 1.619291 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
|
||||||
|
Grid : Message : 1.883640 s : Initialising 5d RNG
|
||||||
|
Grid : Message : 2.117383 s : Intialising parallel RNG with unique string 'The 5D RNG'
|
||||||
|
Grid : Message : 2.117419 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
|
||||||
|
Grid : Message : 7.594282 s : Initialised RNGs
|
||||||
|
Grid : Message : 8.809615 s : Drawing gauge field
|
||||||
|
Grid : Message : 8.954788 s : Random gauge initialised
|
||||||
|
Grid : Message : 8.965668 s : Setting up Cshift based reference
|
||||||
|
Grid : Message : 13.965128 s : *****************************************************************
|
||||||
|
Grid : Message : 13.965152 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
|
||||||
|
Grid : Message : 13.965153 s : *****************************************************************
|
||||||
|
Grid : Message : 13.965154 s : *****************************************************************
|
||||||
|
Grid : Message : 13.965155 s : * Benchmarking DomainWallFermionR::Dhop
|
||||||
|
Grid : Message : 13.965156 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 13.965157 s : * VComplexF size is 64 B
|
||||||
|
Grid : Message : 13.965159 s : * SINGLE precision
|
||||||
|
Grid : Message : 13.965160 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 13.965161 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 13.965162 s : *****************************************************************
|
||||||
|
Grid : Message : 14.515202 s : Called warmup
|
||||||
|
Grid : Message : 99.730150 s : Called Dw 30000 times in 8.52149e+07 us
|
||||||
|
Grid : Message : 99.730204 s : mflop/s = 7.89395e+07
|
||||||
|
Grid : Message : 99.730206 s : mflop/s per rank = 1.23343e+06
|
||||||
|
Grid : Message : 99.730208 s : mflop/s per node = 4.93372e+06
|
||||||
|
Grid : Message : 99.730210 s : RF GiB/s (base 2) = 160403
|
||||||
|
Grid : Message : 99.730212 s : mem GiB/s (base 2) = 100252
|
||||||
|
Grid : Message : 99.730784 s : norm diff 1.05775e-13
|
||||||
|
Grid : Message : 99.740621 s : #### Dhop calls report
|
||||||
|
Grid : Message : 99.740628 s : WilsonFermion5D Number of DhopEO Calls : 60002
|
||||||
|
Grid : Message : 99.740631 s : WilsonFermion5D TotalTime /Calls : 1421.72 us
|
||||||
|
Grid : Message : 99.740633 s : WilsonFermion5D CommTime /Calls : 984.801 us
|
||||||
|
Grid : Message : 99.740635 s : WilsonFermion5D FaceTime /Calls : 215.72 us
|
||||||
|
Grid : Message : 99.740637 s : WilsonFermion5D ComputeTime1/Calls : 2.65594 us
|
||||||
|
Grid : Message : 99.740639 s : WilsonFermion5D ComputeTime2/Calls : 233.727 us
|
||||||
|
Grid : Message : 99.740655 s : Average mflops/s per call : 3.59268e+10
|
||||||
|
Grid : Message : 99.740658 s : Average mflops/s per call per rank : 5.61356e+08
|
||||||
|
Grid : Message : 99.740660 s : Average mflops/s per call per node : 2.24542e+09
|
||||||
|
Grid : Message : 99.740662 s : Average mflops/s per call (full) : 8.02916e+07
|
||||||
|
Grid : Message : 99.740665 s : Average mflops/s per call per rank (full): 1.25456e+06
|
||||||
|
Grid : Message : 99.740667 s : Average mflops/s per call per node (full): 5.01823e+06
|
||||||
|
Grid : Message : 99.740669 s : WilsonFermion5D Stencil
|
||||||
|
Grid : Message : 99.740670 s : WilsonFermion5D StencilEven
|
||||||
|
Grid : Message : 99.740672 s : WilsonFermion5D StencilOdd
|
||||||
|
Grid : Message : 99.740673 s : WilsonFermion5D Stencil Reporti()
|
||||||
|
Grid : Message : 99.740675 s : WilsonFermion5D StencilEven Reporti()
|
||||||
|
Grid : Message : 99.740679 s : WilsonFermion5D StencilOdd Reporti()
|
||||||
|
Grid : Message : 108.466783 s : Compare to naive wilson implementation Dag to verify correctness
|
||||||
|
Grid : Message : 108.466816 s : Called DwDag
|
||||||
|
Grid : Message : 108.466817 s : norm dag result 12.0422
|
||||||
|
Grid : Message : 108.470193 s : norm dag ref 12.0422
|
||||||
|
Grid : Message : 108.473428 s : norm dag diff 7.28899e-14
|
||||||
|
Grid : Message : 108.486838 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
|
||||||
|
Grid : Message : 108.550312 s : src_e0.5
|
||||||
|
Grid : Message : 108.623836 s : src_o0.5
|
||||||
|
Grid : Message : 108.640541 s : *********************************************************
|
||||||
|
Grid : Message : 108.640545 s : * Benchmarking DomainWallFermionF::DhopEO
|
||||||
|
Grid : Message : 108.640546 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 108.640548 s : * SINGLE precision
|
||||||
|
Grid : Message : 108.640553 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 108.640555 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 108.640556 s : *********************************************************
|
||||||
|
Grid : Message : 154.233908 s : Deo mflop/s = 7.37872e+07
|
||||||
|
Grid : Message : 154.233941 s : Deo mflop/s per rank 1.15293e+06
|
||||||
|
Grid : Message : 154.233943 s : Deo mflop/s per node 4.6117e+06
|
||||||
|
Grid : Message : 154.233946 s : #### Dhop calls report
|
||||||
|
Grid : Message : 154.233948 s : WilsonFermion5D Number of DhopEO Calls : 30001
|
||||||
|
Grid : Message : 154.233950 s : WilsonFermion5D TotalTime /Calls : 1519.59 us
|
||||||
|
Grid : Message : 154.233952 s : WilsonFermion5D CommTime /Calls : 1019.64 us
|
||||||
|
Grid : Message : 154.233954 s : WilsonFermion5D FaceTime /Calls : 288.201 us
|
||||||
|
Grid : Message : 154.233956 s : WilsonFermion5D ComputeTime1/Calls : 4.91837 us
|
||||||
|
Grid : Message : 154.233958 s : WilsonFermion5D ComputeTime2/Calls : 236.348 us
|
||||||
|
Grid : Message : 154.233977 s : Average mflops/s per call : 2.07539e+10
|
||||||
|
Grid : Message : 154.233980 s : Average mflops/s per call per rank : 3.24279e+08
|
||||||
|
Grid : Message : 154.233982 s : Average mflops/s per call per node : 1.29712e+09
|
||||||
|
Grid : Message : 154.233984 s : Average mflops/s per call (full) : 7.51203e+07
|
||||||
|
Grid : Message : 154.233986 s : Average mflops/s per call per rank (full): 1.17375e+06
|
||||||
|
Grid : Message : 154.233988 s : Average mflops/s per call per node (full): 4.69502e+06
|
||||||
|
Grid : Message : 154.233991 s : WilsonFermion5D Stencil
|
||||||
|
Grid : Message : 154.233992 s : WilsonFermion5D StencilEven
|
||||||
|
Grid : Message : 154.233993 s : WilsonFermion5D StencilOdd
|
||||||
|
Grid : Message : 154.233994 s : WilsonFermion5D Stencil Reporti()
|
||||||
|
Grid : Message : 154.233995 s : WilsonFermion5D StencilEven Reporti()
|
||||||
|
Grid : Message : 154.233996 s : WilsonFermion5D StencilOdd Reporti()
|
||||||
|
Grid : Message : 154.253979 s : r_e6.02106
|
||||||
|
Grid : Message : 154.255883 s : r_o6.0211
|
||||||
|
Grid : Message : 154.257289 s : res12.0422
|
||||||
|
Grid : Message : 154.364123 s : norm diff 0
|
||||||
|
Grid : Message : 154.496590 s : norm diff even 0
|
||||||
|
Grid : Message : 154.572879 s : norm diff odd 0
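
For reference, a minimal sketch (not one of the committed files; the log path below is only an example) that cross-checks the headline figures reported above: with 64 MPI ranks on 16 nodes, the per-rank and per-node mflop/s are simply the aggregate divided by 64 and 16, e.g. 7.89395e+07 / 16 ≈ 4.93e+06 mflop/s per node.

#!/usr/bin/env bash
# Sketch only: pull the first aggregate "mflop/s = ..." line from a job log
# and derive the per-rank / per-node figures for a 64-rank, 16-node job.
log=${1:-job/power-16A-1110.64091/log}   # example path, adjust as needed

grep -m1 'mflop/s = ' "${log}" \
  | awk -F'= ' '{printf "total %.5g  per-rank %.5g  per-node %.5g\n", $2, $2/64, $2/16}'
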
1  2-racks/size-C0/16-nodes/job/power-16A-1110.64091/nodes  Normal file
@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]
112  2-racks/size-C0/16-nodes/job/power-16A-1110.64091/script  Executable file
@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170

# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa

#SBATCH -J power-16A-1110
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue

set -e

# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n

# IO environment ###############################################################

if [ 16 -eq 1 ]; then
    export OMPI_MCA_io=ompio
else
    export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3

# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
    source "${env_dir}/env-gpu.sh"
else
    echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
    exit 1
fi
spack load sshpass

# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''

# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"

date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi

# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1110

# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
    ${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"

# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
    ./gpu-mpi-wrapper.sh \
    ${app} "${par}" "${opt[@]}" \
    --mpi 2.2.2.8 \
    --accelerator-threads 8 \
    --grid 48.48.48.96 \
    --shm 2048 &> "${job_info_dir}/log"

# if we reach this point, the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"

# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"

# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"

# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
    ${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
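
The script above pins the GPU application clocks with `nvidia-smi -ac 1215,${freq}` via remote-sudo.sh before the run. A minimal sketch for verifying that the requested limit was actually applied on every node of the job; it assumes plain ssh access to the compute nodes (an assumption about the site setup, not part of the committed workflow) and uses the standard clocks.applications.graphics query field.

#!/usr/bin/env bash
# Sketch only: report the application graphics clock on each GPU of each node
# and flag any node that does not match the target frequency.
freq=1110   # target graphics clock in MHz for this job

for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
    ssh "$h" nvidia-smi --query-gpu=clocks.applications.graphics --format=csv,noheader,nounits \
      | awk -v node="$h" -v want="$freq" '{print node, "GPU" NR-1, $1, ($1 == want ? "ok" : "MISMATCH")}'
done
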
@ -0,0 +1,2 @@
Sat Aug 20 21:09:39 BST 2022
epoch 1661026179
@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32
4310  2-racks/size-C0/16-nodes/job/power-16A-1125.64095/elf  Normal file
File diff suppressed because it is too large
@ -0,0 +1,2 @@
Sat Aug 20 21:18:10 BST 2022
epoch 1661026690
2062  2-racks/size-C0/16-nodes/job/power-16A-1125.64095/env  Normal file
File diff suppressed because one or more lines are too long
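
The start-date and end-date files recorded by each job hold a human-readable timestamp plus an epoch line. A minimal sketch (the job directory name is just an example taken from this diff) for turning the two epoch values into an elapsed wall-clock time, e.g. for power-16A-1125.64095: 1661026690 - 1661026527 = 163 s.

#!/usr/bin/env bash
# Sketch only: compute elapsed seconds from the "epoch ..." lines in the
# start-date and end-date files of one job-info directory.
dir=${1:-job/power-16A-1125.64095}   # example job-info directory

start=$(awk '/^epoch/{print $2}' "${dir}/start-date")
end=$(awk '/^epoch/{print $2}' "${dir}/end-date")
echo "${dir}: $(( end - start )) s elapsed"
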
26  2-racks/size-C0/16-nodes/job/power-16A-1125.64095/ldd  Normal file
@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffe04b26000)
|
||||||
|
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014ffbc78a000)
|
||||||
|
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014ffbc3c2000)
|
||||||
|
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014ffbbed0000)
|
||||||
|
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014ffbbba6000)
|
||||||
|
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014ffbb8c5000)
|
||||||
|
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014ffbb664000)
|
||||||
|
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014ffbc711000)
|
||||||
|
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014ffbb284000)
|
||||||
|
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014ffb9b28000)
|
||||||
|
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014ffb9758000)
|
||||||
|
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014ffb94b7000)
|
||||||
|
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014ffb938c000)
|
||||||
|
libm.so.6 => /lib64/libm.so.6 (0x000014ffb900a000)
|
||||||
|
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014ffb8dd3000)
|
||||||
|
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014ffb8bbb000)
|
||||||
|
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014ffb899b000)
|
||||||
|
libc.so.6 => /lib64/libc.so.6 (0x000014ffb85d6000)
|
||||||
|
libdl.so.2 => /lib64/libdl.so.2 (0x000014ffb83d2000)
|
||||||
|
/lib64/ld-linux-x86-64.so.2 (0x000014ffbc5da000)
|
||||||
|
librt.so.1 => /lib64/librt.so.1 (0x000014ffb81ca000)
|
||||||
|
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014ffbc645000)
|
||||||
|
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014ffbc640000)
|
||||||
|
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014ffb80be000)
|
||||||
|
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014ffb7eb4000)
|
||||||
|
libutil.so.1 => /lib64/libutil.so.1 (0x000014ffb7cb0000)
|
286  2-racks/size-C0/16-nodes/job/power-16A-1125.64095/log  Normal file
@ -0,0 +1,286 @@
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 3
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 3
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 0 device 0 bus id: 0000:03:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 0 device 0 bus id: 0000:03:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 2 device 0 bus id: 0000:84:00.0
|
||||||
|
local rank 1 device 0 bus id: 0000:44:00.0
|
||||||
|
local rank 3 device 0 bus id: 0000:C4:00.0
|
||||||
|
SharedMemoryMpi: World communicator of size 64
|
||||||
|
SharedMemoryMpi: Node communicator of size 4
|
||||||
|
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x146500000000 for comms buffers
|
||||||
|
Setting up IPC
|
||||||
|
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|_ | | | | | | | | | | | | _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|_ GGGG RRRR III DDDD _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G GG RRRR I D D _|__
|
||||||
|
__|_ G G R R I D D _|__
|
||||||
|
__|_ GGGG R R III DDDD _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
| | | | | | | | | | | | | |
|
||||||
|
|
||||||
|
|
||||||
|
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
|
||||||
|
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : MPI is initialised and logging filters activated
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : Requested 2147483648 byte stencil comms buffers
|
||||||
|
Grid : Message : MemoryManager Cache 34004218675 bytes
|
||||||
|
Grid : Message : MemoryManager::Init() setting up
|
||||||
|
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
|
||||||
|
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
|
||||||
|
Grid : Message : MemoryManager::Init() Using cudaMalloc
|
||||||
|
Grid : Message : 1.503072 s : Grid Layout
|
||||||
|
Grid : Message : 1.503076 s : Global lattice size : 48 48 48 96
|
||||||
|
Grid : Message : 1.503081 s : OpenMP threads : 4
|
||||||
|
Grid : Message : 1.503083 s : MPI tasks : 2 2 2 8
|
||||||
|
Grid : Message : 1.518479 s : Making s innermost grids
|
||||||
|
Grid : Message : 1.535611 s : Initialising 4d RNG
|
||||||
|
Grid : Message : 1.551229 s : Intialising parallel RNG with unique string 'The 4D RNG'
|
||||||
|
Grid : Message : 1.551252 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
|
||||||
|
Grid : Message : 1.805667 s : Initialising 5d RNG
|
||||||
|
Grid : Message : 2.356490 s : Intialising parallel RNG with unique string 'The 5D RNG'
|
||||||
|
Grid : Message : 2.357030 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
|
||||||
|
Grid : Message : 7.303785 s : Initialised RNGs
|
||||||
|
Grid : Message : 8.385261 s : Drawing gauge field
|
||||||
|
Grid : Message : 8.496485 s : Random gauge initialised
|
||||||
|
Grid : Message : 8.509783 s : Setting up Cshift based reference
|
||||||
|
Grid : Message : 13.609539 s : *****************************************************************
|
||||||
|
Grid : Message : 13.609564 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
|
||||||
|
Grid : Message : 13.609566 s : *****************************************************************
|
||||||
|
Grid : Message : 13.609568 s : *****************************************************************
|
||||||
|
Grid : Message : 13.609573 s : * Benchmarking DomainWallFermionR::Dhop
|
||||||
|
Grid : Message : 13.609575 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 13.609577 s : * VComplexF size is 64 B
|
||||||
|
Grid : Message : 13.609579 s : * SINGLE precision
|
||||||
|
Grid : Message : 13.609582 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 13.609584 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 13.609586 s : *****************************************************************
|
||||||
|
Grid : Message : 14.155991 s : Called warmup
|
||||||
|
Grid : Message : 98.420612 s : Called Dw 30000 times in 8.42644e+07 us
|
||||||
|
Grid : Message : 98.420675 s : mflop/s = 7.983e+07
|
||||||
|
Grid : Message : 98.420677 s : mflop/s per rank = 1.24734e+06
|
||||||
|
Grid : Message : 98.420679 s : mflop/s per node = 4.98937e+06
|
||||||
|
Grid : Message : 98.420681 s : RF GiB/s (base 2) = 162213
|
||||||
|
Grid : Message : 98.420683 s : mem GiB/s (base 2) = 101383
|
||||||
|
Grid : Message : 98.421254 s : norm diff 1.05775e-13
|
||||||
|
Grid : Message : 98.431170 s : #### Dhop calls report
|
||||||
|
Grid : Message : 98.431178 s : WilsonFermion5D Number of DhopEO Calls : 60002
|
||||||
|
Grid : Message : 98.431182 s : WilsonFermion5D TotalTime /Calls : 1405.63 us
|
||||||
|
Grid : Message : 98.431184 s : WilsonFermion5D CommTime /Calls : 961.451 us
|
||||||
|
Grid : Message : 98.431186 s : WilsonFermion5D FaceTime /Calls : 222.433 us
|
||||||
|
Grid : Message : 98.431188 s : WilsonFermion5D ComputeTime1/Calls : 2.80214 us
|
||||||
|
Grid : Message : 98.431190 s : WilsonFermion5D ComputeTime2/Calls : 234.1 us
|
||||||
|
Grid : Message : 98.431212 s : Average mflops/s per call : 3.60793e+10
|
||||||
|
Grid : Message : 98.431216 s : Average mflops/s per call per rank : 5.63738e+08
|
||||||
|
Grid : Message : 98.431218 s : Average mflops/s per call per node : 2.25495e+09
|
||||||
|
Grid : Message : 98.431220 s : Average mflops/s per call (full) : 8.12107e+07
|
||||||
|
Grid : Message : 98.431224 s : Average mflops/s per call per rank (full): 1.26892e+06
|
||||||
|
Grid : Message : 98.431226 s : Average mflops/s per call per node (full): 5.07567e+06
|
||||||
|
Grid : Message : 98.431229 s : WilsonFermion5D Stencil
|
||||||
|
Grid : Message : 98.431230 s : WilsonFermion5D StencilEven
|
||||||
|
Grid : Message : 98.431235 s : WilsonFermion5D StencilOdd
|
||||||
|
Grid : Message : 98.431239 s : WilsonFermion5D Stencil Reporti()
|
||||||
|
Grid : Message : 98.431240 s : WilsonFermion5D StencilEven Reporti()
|
||||||
|
Grid : Message : 98.431241 s : WilsonFermion5D StencilOdd Reporti()
|
||||||
|
Grid : Message : 107.161203 s : Compare to naive wilson implementation Dag to verify correctness
|
||||||
|
Grid : Message : 107.161230 s : Called DwDag
|
||||||
|
Grid : Message : 107.161231 s : norm dag result 12.0422
|
||||||
|
Grid : Message : 107.163717 s : norm dag ref 12.0422
|
||||||
|
Grid : Message : 107.166717 s : norm dag diff 7.28899e-14
|
||||||
|
Grid : Message : 107.181064 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
|
||||||
|
Grid : Message : 107.248613 s : src_e0.5
|
||||||
|
Grid : Message : 107.314227 s : src_o0.5
|
||||||
|
Grid : Message : 107.331787 s : *********************************************************
|
||||||
|
Grid : Message : 107.331790 s : * Benchmarking DomainWallFermionF::DhopEO
|
||||||
|
Grid : Message : 107.331792 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 107.331794 s : * SINGLE precision
|
||||||
|
Grid : Message : 107.331795 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 107.331796 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 107.331797 s : *********************************************************
|
||||||
|
Grid : Message : 152.337360 s : Deo mflop/s = 7.47496e+07
|
||||||
|
Grid : Message : 152.337387 s : Deo mflop/s per rank 1.16796e+06
|
||||||
|
Grid : Message : 152.337390 s : Deo mflop/s per node 4.67185e+06
|
||||||
|
Grid : Message : 152.337396 s : #### Dhop calls report
|
||||||
|
Grid : Message : 152.337399 s : WilsonFermion5D Number of DhopEO Calls : 30001
|
||||||
|
Grid : Message : 152.337402 s : WilsonFermion5D TotalTime /Calls : 1500 us
|
||||||
|
Grid : Message : 152.337405 s : WilsonFermion5D CommTime /Calls : 1002.91 us
|
||||||
|
Grid : Message : 152.337408 s : WilsonFermion5D FaceTime /Calls : 282.963 us
|
||||||
|
Grid : Message : 152.337410 s : WilsonFermion5D ComputeTime1/Calls : 4.71911 us
|
||||||
|
Grid : Message : 152.337412 s : WilsonFermion5D ComputeTime2/Calls : 237.647 us
|
||||||
|
Grid : Message : 152.337435 s : Average mflops/s per call : 2.07759e+10
|
||||||
|
Grid : Message : 152.337439 s : Average mflops/s per call per rank : 3.24624e+08
|
||||||
|
Grid : Message : 152.337441 s : Average mflops/s per call per node : 1.29849e+09
|
||||||
|
Grid : Message : 152.337445 s : Average mflops/s per call (full) : 7.61013e+07
|
||||||
|
Grid : Message : 152.337448 s : Average mflops/s per call per rank (full): 1.18908e+06
|
||||||
|
Grid : Message : 152.337451 s : Average mflops/s per call per node (full): 4.75633e+06
|
||||||
|
Grid : Message : 152.337453 s : WilsonFermion5D Stencil
|
||||||
|
Grid : Message : 152.337456 s : WilsonFermion5D StencilEven
|
||||||
|
Grid : Message : 152.337457 s : WilsonFermion5D StencilOdd
|
||||||
|
Grid : Message : 152.337459 s : WilsonFermion5D Stencil Reporti()
|
||||||
|
Grid : Message : 152.337462 s : WilsonFermion5D StencilEven Reporti()
|
||||||
|
Grid : Message : 152.337463 s : WilsonFermion5D StencilOdd Reporti()
|
||||||
|
Grid : Message : 152.358219 s : r_e6.02106
|
||||||
|
Grid : Message : 152.359968 s : r_o6.0211
|
||||||
|
Grid : Message : 152.361373 s : res12.0422
|
||||||
|
Grid : Message : 152.467780 s : norm diff 0
|
||||||
|
Grid : Message : 152.609427 s : norm diff even 0
|
||||||
|
Grid : Message : 152.675745 s : norm diff odd 0
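
The logs above report "Global lattice size : 48 48 48 96" and "MPI tasks : 2 2 2 8". A minimal sketch (not part of the committed scripts) checking that this decomposition multiplies to the 64 SLURM tasks requested by the job script and giving the local lattice volume handled by each rank (24 x 24 x 24 x 12).

#!/usr/bin/env bash
# Sketch only: verify the MPI decomposition against the task count and derive
# the per-rank local lattice from the global one.
mpi='2.2.2.8'
grid='48.48.48.96'

IFS=. read -r mx my mz mt <<< "${mpi}"
IFS=. read -r gx gy gz gt <<< "${grid}"

echo "ranks      : $(( mx * my * mz * mt ))"
echo "local size : $(( gx / mx )) $(( gy / my )) $(( gz / mz )) $(( gt / mt ))"
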
1  2-racks/size-C0/16-nodes/job/power-16A-1125.64095/nodes  Normal file
@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]
112  2-racks/size-C0/16-nodes/job/power-16A-1125.64095/script  Executable file
@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170

# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa

#SBATCH -J power-16A-1125
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue

set -e

# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n

# IO environment ###############################################################

if [ 16 -eq 1 ]; then
    export OMPI_MCA_io=ompio
else
    export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3

# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
    source "${env_dir}/env-gpu.sh"
else
    echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
    exit 1
fi
spack load sshpass

# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''

# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"

date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi

# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1125

# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
    ${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"

# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
    ./gpu-mpi-wrapper.sh \
    ${app} "${par}" "${opt[@]}" \
    --mpi 2.2.2.8 \
    --accelerator-threads 8 \
    --grid 48.48.48.96 \
    --shm 2048 &> "${job_info_dir}/log"

# if we reach this point, the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"

# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"

# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"

# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
    ${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
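
Both job scripts capture `nvidia-smi dmon -o DT` into a temporary file and hand it to dmon-to-db.sh, which is not included in this diff. As a stand-in illustration only (the script name, its arguments, and the CSV target are assumptions, not the actual helper), the raw dmon capture can be flattened into CSV for later analysis like this:

#!/usr/bin/env bash
# Sketch only: convert whitespace-separated `nvidia-smi dmon -o DT` output
# (date and time columns first, '#'-prefixed header lines) into CSV.
raw=${1:?usage: dmon-to-csv.sh <dmon-output> <out.csv>}   # hypothetical helper
out=${2:?}

grep -v '^#' "${raw}" | awk 'BEGIN{OFS=","} {$1=$1; print}' > "${out}"
echo "wrote $(wc -l < "${out}") samples to ${out}"
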
@ -0,0 +1,2 @@
Sat Aug 20 21:15:27 BST 2022
epoch 1661026527
@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32
4310  2-racks/size-C0/16-nodes/job/power-16A-1140.64100/elf  Normal file
File diff suppressed because it is too large
2062  2-racks/size-C0/16-nodes/job/power-16A-1140.64100/env  Normal file
File diff suppressed because one or more lines are too long
Some files were not shown because too many files have changed in this diff