Initial commit

commit ade190016a
Date: 2022-09-07 17:31:28 +01:00
8502 changed files with 4552538 additions and 0 deletions

BIN 2-racks/rack-power.db (new file)

Binary file not shown.

View File

@ -0,0 +1,5 @@
nnodes : 16
ntasks : 64
partition : gpu
mpi-geom : 2.2.2.8
grid-geom : 48.48.48.96
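
The MPI and lattice geometries above are mutually consistent: 2.2.2.8 gives
2*2*2*8 = 64 ranks (= ntasks, 4 per node on 16 nodes) and splits the
48.48.48.96 lattice into 24.24.24.12 local volumes. A minimal bash sanity
check of these values:

IFS=. read -r mx my mz mt <<< '2.2.2.8'     # mpi-geom
IFS=. read -r gx gy gz gt <<< '48.48.48.96' # grid-geom
echo "ranks         : $(( mx * my * mz * mt ))"                            # 64
echo "local lattice : $(( gx/mx )).$(( gy/my )).$(( gz/mz )).$(( gt/mt ))" # 24.24.24.12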

View File

@ -0,0 +1,13 @@
#!/usr/bin/env bash
# bind each local MPI rank to its own block of 16 cores and matching NUMA node
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
numa=${lrank}
cpus="$(( lrank*16 ))-$(( (lrank+1)*16-1 ))"
# 16 single-core OpenMP places starting at the rank's first core; fixed from
# the original "start:end" value, which is not valid OMP_PLACES place-list syntax
places="{$(( lrank*16 ))}:16"
BINDING="taskset -c ${cpus} numactl -m ${numa}"
export OMP_PLACES="${places}"
echo "$(hostname) - ${lrank} binding='${BINDING}'"
${BINDING} "$@"   # unquoted on purpose: BINDING is a command prefix
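
A usage sketch for this wrapper (the wrapper filename, rank count, and
application arguments are illustrative; only gpu-mpi-wrapper.sh is named
elsewhere in this commit): with 4 ranks on a node, rank lrank runs on cores
lrank*16..lrank*16+15 and allocates memory from NUMA node lrank.

mpirun -np 4 --bind-to none ./cpu-mpi-wrapper.sh ./Benchmark_dwf_fp32 --grid 48.48.48.96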

View File

@ -0,0 +1 @@
../dwf_fp32.tok

View File

@ -0,0 +1,14 @@
#!/usr/bin/env bash
# one MPI rank per GPU: select the rank's GPU and network adapter, and
# interleave its memory across the two NUMA domains local to that GPU
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
numa1=$(( 2 * lrank ))
numa2=$(( 2 * lrank + 1 ))
netdev=mlx5_${lrank}:1
export CUDA_VISIBLE_DEVICES=$OMPI_COMM_WORLD_LOCAL_RANK
export UCX_NET_DEVICES=${netdev}
BINDING="--interleave=$numa1,$numa2"
echo "$(hostname) - $lrank device=$CUDA_VISIBLE_DEVICES binding=$BINDING"
numactl ${BINDING} "$@"   # unquoted on purpose: BINDING is a single option word
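
On a 4-GPU node this wrapper produces the mapping echoed in the logs below
(one rank per GPU, one HCA per rank, memory interleaved over the GPU-local
NUMA pair):

# rank 0 -> GPU 0, mlx5_0:1, numactl --interleave=0,1
# rank 1 -> GPU 1, mlx5_1:1, numactl --interleave=2,3
# rank 2 -> GPU 2, mlx5_2:1, numactl --interleave=4,5
# rank 3 -> GPU 3, mlx5_3:1, numactl --interleave=6,7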

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Sat Aug 20 20:25:12 BST 2022
epoch 1661023512

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffef5f3f000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000015459e0bd000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000015459dcf5000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000015459d803000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000015459d4d9000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000015459d1f8000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000015459cf97000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000015459e044000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000015459cbb7000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000015459b45b000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000015459b08b000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000015459adea000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000015459acbf000)
libm.so.6 => /lib64/libm.so.6 (0x000015459a93d000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000015459a706000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000015459a4ee000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000015459a2ce000)
libc.so.6 => /lib64/libc.so.6 (0x0000154599f09000)
libdl.so.2 => /lib64/libdl.so.2 (0x0000154599d05000)
/lib64/ld-linux-x86-64.so.2 (0x000015459df0d000)
librt.so.1 => /lib64/librt.so.1 (0x0000154599afd000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000015459df78000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000015459df73000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x00001545999f1000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x00001545997e7000)
libutil.so.1 => /lib64/libutil.so.1 (0x00001545995e3000)

View File

@ -0,0 +1,286 @@
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 1 device 0 bus id: 0000:44:00.0
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14ea00000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.499143 s : Grid Layout
Grid : Message : 1.499148 s : Global lattice size : 48 48 48 96
Grid : Message : 1.499155 s : OpenMP threads : 4
Grid : Message : 1.499157 s : MPI tasks : 2 2 2 8
Grid : Message : 1.515541 s : Making s innermost grids
Grid : Message : 1.532470 s : Initialising 4d RNG
Grid : Message : 1.550455 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.550491 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.937366 s : Initialising 5d RNG
Grid : Message : 2.163040 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 2.163078 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 7.467109 s : Initialised RNGs
Grid : Message : 8.261272 s : Drawing gauge field
Grid : Message : 8.380110 s : Random gauge initialised
Grid : Message : 8.388989 s : Setting up Cshift based reference
Grid : Message : 13.599668 s : *****************************************************************
Grid : Message : 13.599694 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 13.599696 s : *****************************************************************
Grid : Message : 13.599700 s : *****************************************************************
Grid : Message : 13.599702 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 13.599705 s : * Vectorising space-time by 8
Grid : Message : 13.599708 s : * VComplexF size is 64 B
Grid : Message : 13.599710 s : * SINGLE precision
Grid : Message : 13.599712 s : * Using Overlapped Comms/Compute
Grid : Message : 13.599716 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 13.599719 s : *****************************************************************
Grid : Message : 14.992290 s : Called warmup
Grid : Message : 104.236264 s : Called Dw 30000 times in 9.01365e+07 us
Grid : Message : 104.236329 s : mflop/s = 7.46293e+07
Grid : Message : 104.236331 s : mflop/s per rank = 1.16608e+06
Grid : Message : 104.236333 s : mflop/s per node = 4.66433e+06
Grid : Message : 104.236335 s : RF GiB/s (base 2) = 151645
Grid : Message : 104.236337 s : mem GiB/s (base 2) = 94778.1
Grid : Message : 104.236908 s : norm diff 1.05775e-13
Grid : Message : 104.247209 s : #### Dhop calls report
Grid : Message : 104.247215 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 104.247219 s : WilsonFermion5D TotalTime /Calls : 1503.52 us
Grid : Message : 104.247221 s : WilsonFermion5D CommTime /Calls : 1054.2 us
Grid : Message : 104.247223 s : WilsonFermion5D FaceTime /Calls : 225.375 us
Grid : Message : 104.247225 s : WilsonFermion5D ComputeTime1/Calls : 3.01152 us
Grid : Message : 104.247227 s : WilsonFermion5D ComputeTime2/Calls : 236.377 us
Grid : Message : 104.247294 s : Average mflops/s per call : 3.59587e+10
Grid : Message : 104.247300 s : Average mflops/s per call per rank : 5.61855e+08
Grid : Message : 104.247303 s : Average mflops/s per call per node : 2.24742e+09
Grid : Message : 104.247305 s : Average mflops/s per call (full) : 7.59233e+07
Grid : Message : 104.247307 s : Average mflops/s per call per rank (full): 1.1863e+06
Grid : Message : 104.247309 s : Average mflops/s per call per node (full): 4.7452e+06
Grid : Message : 104.247311 s : WilsonFermion5D Stencil
Grid : Message : 104.247312 s : WilsonFermion5D StencilEven
Grid : Message : 104.247313 s : WilsonFermion5D StencilOdd
Grid : Message : 104.247314 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 104.247315 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 104.247316 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 112.998074 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 112.998099 s : Called DwDag
Grid : Message : 112.998100 s : norm dag result 12.0422
Grid : Message : 113.585000 s : norm dag ref 12.0422
Grid : Message : 113.380300 s : norm dag diff 7.28899e-14
Grid : Message : 113.140290 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 113.790730 s : src_e0.5
Grid : Message : 113.153215 s : src_o0.5
Grid : Message : 113.170341 s : *********************************************************
Grid : Message : 113.170346 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 113.170347 s : * Vectorising space-time by 8
Grid : Message : 113.170353 s : * SINGLE precision
Grid : Message : 113.170356 s : * Using Overlapped Comms/Compute
Grid : Message : 113.170357 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 113.170361 s : *********************************************************
Grid : Message : 161.702832 s : Deo mflop/s = 6.93159e+07
Grid : Message : 161.702861 s : Deo mflop/s per rank 1.08306e+06
Grid : Message : 161.702863 s : Deo mflop/s per node 4.33224e+06
Grid : Message : 161.702866 s : #### Dhop calls report
Grid : Message : 161.702868 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 161.702870 s : WilsonFermion5D TotalTime /Calls : 1617.57 us
Grid : Message : 161.702872 s : WilsonFermion5D CommTime /Calls : 1105.14 us
Grid : Message : 161.702874 s : WilsonFermion5D FaceTime /Calls : 294.218 us
Grid : Message : 161.702876 s : WilsonFermion5D ComputeTime1/Calls : 4.85114 us
Grid : Message : 161.702878 s : WilsonFermion5D ComputeTime2/Calls : 241.569 us
Grid : Message : 161.702900 s : Average mflops/s per call : 2.0686e+10
Grid : Message : 161.702904 s : Average mflops/s per call per rank : 3.23219e+08
Grid : Message : 161.702906 s : Average mflops/s per call per node : 1.29288e+09
Grid : Message : 161.702908 s : Average mflops/s per call (full) : 7.05701e+07
Grid : Message : 161.702912 s : Average mflops/s per call per rank (full): 1.10266e+06
Grid : Message : 161.702914 s : Average mflops/s per call per node (full): 4.41063e+06
Grid : Message : 161.702920 s : WilsonFermion5D Stencil
Grid : Message : 161.702922 s : WilsonFermion5D StencilEven
Grid : Message : 161.702923 s : WilsonFermion5D StencilOdd
Grid : Message : 161.702926 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 161.702927 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 161.702928 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 161.722751 s : r_e6.02106
Grid : Message : 161.724439 s : r_o6.0211
Grid : Message : 161.725861 s : res12.0422
Grid : Message : 161.827558 s : norm diff 0
Grid : Message : 161.972191 s : norm diff even 0
Grid : Message : 162.433730 s : norm diff odd 0
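
The reported Dw rate can be reproduced from the call count and wall time,
assuming Grid's 1320 flop/site Wilson-kernel convention and Ls=16 (neither
value is printed explicitly in the log):

vol=$(( 48 * 48 * 48 * 96 ))              # global 4d volume
flop=$(( 1320 * vol * 16 * 30000 ))       # 1320 flop/site, Ls=16, 30000 calls
echo "mflop/s = $(( flop / 90136500 ))"   # 9.01365e+07 us -> ~7.46e+07, as logged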

View File

@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]
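
The nodes file stores the compressed SLURM hostlist; expanding it with the
same scontrol call the job script uses yields the 16 hosts that appear in the
log above:

scontrol show hostnames 'tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]'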

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-16A-1005
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
# the literal 16 below is the node count substituted by the job template (see
# the SC2050/SC2170 shellcheck disables above); multi-node jobs use romio321
if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd "${app}" > "${job_info_dir}/ldd"
md5sum "${app}" > "${job_info_dir}/app-hash"
readelf -a "${app}" > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1005
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
# coproc runs dmon asynchronously; bash stores its PID in COPROC_PID, which is
# used after the run to stop monitoring
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
# ${par} is empty here: pass it only when set, so the binary does not receive
# an empty argument
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
"${app}" ${par:+"${par}"} "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 48.48.48.96 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
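
Apart from the job name and freq (1005 MHz here, 1020 MHz in the script
further below), the two job scripts in this section are identical; the
frequency-sweep pattern reduces to:

nvidia-smi -ac 1215,"${freq}"   # pin memory,graphics application clocks for the run
# ... run the benchmark under dmon monitoring ...
nvidia-smi -ac 1215,1410        # restore the A100 default graphics clock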

View File

@ -0,0 +1,2 @@
Sat Aug 20 20:22:21 BST 2022
epoch 1661023341

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Sat Aug 20 20:37:35 BST 2022
epoch 1661024255

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffff456d000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x0000154c9a375000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x0000154c99fad000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x0000154c99abb000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x0000154c99791000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x0000154c994b0000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x0000154c9924f000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x0000154c9a2fc000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x0000154c98e6f000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x0000154c97713000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x0000154c97343000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x0000154c970a2000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x0000154c96f77000)
libm.so.6 => /lib64/libm.so.6 (0x0000154c96bf5000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x0000154c969be000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x0000154c967a6000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x0000154c96586000)
libc.so.6 => /lib64/libc.so.6 (0x0000154c961c1000)
libdl.so.2 => /lib64/libdl.so.2 (0x0000154c95fbd000)
/lib64/ld-linux-x86-64.so.2 (0x0000154c9a1c5000)
librt.so.1 => /lib64/librt.so.1 (0x0000154c95db5000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x0000154c9a230000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x0000154c9a22b000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x0000154c95ca9000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x0000154c95a9f000)
libutil.so.1 => /lib64/libutil.so.1 (0x0000154c9589b000)

View File

@ -0,0 +1,286 @@
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 1 device 0 bus id: 0000:44:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14d8e0000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.312638 s : Grid Layout
Grid : Message : 1.312643 s : Global lattice size : 48 48 48 96
Grid : Message : 1.312650 s : OpenMP threads : 4
Grid : Message : 1.312652 s : MPI tasks : 2 2 2 8
Grid : Message : 1.327971 s : Making s innermost grids
Grid : Message : 1.344471 s : Initialising 4d RNG
Grid : Message : 1.361018 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.361045 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.837887 s : Initialising 5d RNG
Grid : Message : 2.844490 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 2.845110 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 7.428202 s : Initialised RNGs
Grid : Message : 8.439960 s : Drawing gauge field
Grid : Message : 8.560999 s : Random gauge initialised
Grid : Message : 8.573339 s : Setting up Cshift based reference
Grid : Message : 13.695651 s : *****************************************************************
Grid : Message : 13.695676 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 13.695677 s : *****************************************************************
Grid : Message : 13.695678 s : *****************************************************************
Grid : Message : 13.695679 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 13.695680 s : * Vectorising space-time by 8
Grid : Message : 13.695681 s : * VComplexF size is 64 B
Grid : Message : 13.695682 s : * SINGLE precision
Grid : Message : 13.695684 s : * Using Overlapped Comms/Compute
Grid : Message : 13.695685 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 13.695686 s : *****************************************************************
Grid : Message : 14.234933 s : Called warmup
Grid : Message : 103.428452 s : Called Dw 30000 times in 8.91932e+07 us
Grid : Message : 103.428517 s : mflop/s = 7.54186e+07
Grid : Message : 103.428519 s : mflop/s per rank = 1.17842e+06
Grid : Message : 103.428521 s : mflop/s per node = 4.71366e+06
Grid : Message : 103.428523 s : RF GiB/s (base 2) = 153249
Grid : Message : 103.428525 s : mem GiB/s (base 2) = 95780.5
Grid : Message : 103.429097 s : norm diff 1.05775e-13
Grid : Message : 103.439111 s : #### Dhop calls report
Grid : Message : 103.439118 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 103.439122 s : WilsonFermion5D TotalTime /Calls : 1487.69 us
Grid : Message : 103.439124 s : WilsonFermion5D CommTime /Calls : 1041.46 us
Grid : Message : 103.439126 s : WilsonFermion5D FaceTime /Calls : 222.459 us
Grid : Message : 103.439128 s : WilsonFermion5D ComputeTime1/Calls : 2.85969 us
Grid : Message : 103.439130 s : WilsonFermion5D ComputeTime2/Calls : 236.325 us
Grid : Message : 103.439201 s : Average mflops/s per call : 3.60313e+10
Grid : Message : 103.439207 s : Average mflops/s per call per rank : 5.62989e+08
Grid : Message : 103.439209 s : Average mflops/s per call per node : 2.25196e+09
Grid : Message : 103.439211 s : Average mflops/s per call (full) : 7.67311e+07
Grid : Message : 103.439213 s : Average mflops/s per call per rank (full): 1.19892e+06
Grid : Message : 103.439215 s : Average mflops/s per call per node (full): 4.7957e+06
Grid : Message : 103.439217 s : WilsonFermion5D Stencil
Grid : Message : 103.439218 s : WilsonFermion5D StencilEven
Grid : Message : 103.439219 s : WilsonFermion5D StencilOdd
Grid : Message : 103.439220 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 103.439221 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 103.439222 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 112.177904 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 112.177939 s : Called DwDag
Grid : Message : 112.177940 s : norm dag result 12.0422
Grid : Message : 112.186235 s : norm dag ref 12.0422
Grid : Message : 112.189309 s : norm dag diff 7.28899e-14
Grid : Message : 112.200523 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 112.263704 s : src_e0.5
Grid : Message : 112.335429 s : src_o0.5
Grid : Message : 112.352238 s : *********************************************************
Grid : Message : 112.352244 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 112.352246 s : * Vectorising space-time by 8
Grid : Message : 112.352248 s : * SINGLE precision
Grid : Message : 112.352250 s : * Using Overlapped Comms/Compute
Grid : Message : 112.352253 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 112.352254 s : *********************************************************
Grid : Message : 160.328889 s : Deo mflop/s = 7.01193e+07
Grid : Message : 160.328922 s : Deo mflop/s per rank 1.09561e+06
Grid : Message : 160.328924 s : Deo mflop/s per node 4.38246e+06
Grid : Message : 160.328927 s : #### Dhop calls report
Grid : Message : 160.328929 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 160.328931 s : WilsonFermion5D TotalTime /Calls : 1599.04 us
Grid : Message : 160.328933 s : WilsonFermion5D CommTime /Calls : 1088.05 us
Grid : Message : 160.328935 s : WilsonFermion5D FaceTime /Calls : 294.436 us
Grid : Message : 160.328937 s : WilsonFermion5D ComputeTime1/Calls : 4.78577 us
Grid : Message : 160.328939 s : WilsonFermion5D ComputeTime2/Calls : 241.411 us
Grid : Message : 160.328966 s : Average mflops/s per call : 2.07599e+10
Grid : Message : 160.328971 s : Average mflops/s per call per rank : 3.24373e+08
Grid : Message : 160.328975 s : Average mflops/s per call per node : 1.29749e+09
Grid : Message : 160.328980 s : Average mflops/s per call (full) : 7.13878e+07
Grid : Message : 160.328983 s : Average mflops/s per call per rank (full): 1.11543e+06
Grid : Message : 160.328987 s : Average mflops/s per call per node (full): 4.46174e+06
Grid : Message : 160.328989 s : WilsonFermion5D Stencil
Grid : Message : 160.328990 s : WilsonFermion5D StencilEven
Grid : Message : 160.328992 s : WilsonFermion5D StencilOdd
Grid : Message : 160.328995 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 160.328997 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 160.329000 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 160.348014 s : r_e6.02106
Grid : Message : 160.350033 s : r_o6.0211
Grid : Message : 160.351497 s : res12.0422
Grid : Message : 160.466811 s : norm diff 0
Grid : Message : 160.599190 s : norm diff even 0
Grid : Message : 160.669838 s : norm diff odd 0

View File

@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-16A-1020
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd "${app}" > "${job_info_dir}/ldd"
md5sum "${app}" > "${job_info_dir}/app-hash"
readelf -a "${app}" > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1020
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
"${app}" ${par:+"${par}"} "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 48.48.48.96 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################

View File

@ -0,0 +1,2 @@
Sat Aug 20 20:34:46 BST 2022
epoch 1661024086

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Sat Aug 20 20:43:25 BST 2022
epoch 1661024605

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffd625a8000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014ff21a6a000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014ff216a2000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014ff211b0000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014ff20e86000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014ff20ba5000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014ff20944000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014ff219f1000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014ff20564000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014ff1ee08000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014ff1ea38000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014ff1e797000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014ff1e66c000)
libm.so.6 => /lib64/libm.so.6 (0x000014ff1e2ea000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014ff1e0b3000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014ff1de9b000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014ff1dc7b000)
libc.so.6 => /lib64/libc.so.6 (0x000014ff1d8b6000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014ff1d6b2000)
/lib64/ld-linux-x86-64.so.2 (0x000014ff218ba000)
librt.so.1 => /lib64/librt.so.1 (0x000014ff1d4aa000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014ff21925000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014ff21920000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014ff1d39e000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014ff1d194000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014ff1cf90000)

View File

@ -0,0 +1,286 @@
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 1 device 0 bus id: 0000:44:00.0
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
local rank 3 device 0 bus id: 0000:C4:00.0
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x146a80000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.623478 s : Grid Layout
Grid : Message : 1.623482 s : Global lattice size : 48 48 48 96
Grid : Message : 1.623486 s : OpenMP threads : 4
Grid : Message : 1.623488 s : MPI tasks : 2 2 2 8
Grid : Message : 1.637678 s : Making s innermost grids
Grid : Message : 1.654638 s : Initialising 4d RNG
Grid : Message : 1.670417 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.670443 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 2.165386 s : Initialising 5d RNG
Grid : Message : 2.399472 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 2.399504 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 7.787095 s : Initialised RNGs
Grid : Message : 8.568006 s : Drawing gauge field
Grid : Message : 8.661012 s : Random gauge initialised
Grid : Message : 8.665024 s : Setting up Cshift based reference
Grid : Message : 13.760660 s : *****************************************************************
Grid : Message : 13.760685 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 13.760687 s : *****************************************************************
Grid : Message : 13.760690 s : *****************************************************************
Grid : Message : 13.760691 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 13.760692 s : * Vectorising space-time by 8
Grid : Message : 13.760694 s : * VComplexF size is 64 B
Grid : Message : 13.760696 s : * SINGLE precision
Grid : Message : 13.760697 s : * Using Overlapped Comms/Compute
Grid : Message : 13.760698 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 13.760700 s : *****************************************************************
Grid : Message : 14.326353 s : Called warmup
Grid : Message : 102.469231 s : Called Dw 30000 times in 8.81428e+07 us
Grid : Message : 102.469296 s : mflop/s = 7.63173e+07
Grid : Message : 102.469299 s : mflop/s per rank = 1.19246e+06
Grid : Message : 102.469307 s : mflop/s per node = 4.76983e+06
Grid : Message : 102.469310 s : RF GiB/s (base 2) = 155075
Grid : Message : 102.469313 s : mem GiB/s (base 2) = 96921.9
Grid : Message : 102.469886 s : norm diff 1.05775e-13
Grid : Message : 102.480527 s : #### Dhop calls report
Grid : Message : 102.480534 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 102.480538 s : WilsonFermion5D TotalTime /Calls : 1470.47 us
Grid : Message : 102.480540 s : WilsonFermion5D CommTime /Calls : 1029.89 us
Grid : Message : 102.480542 s : WilsonFermion5D FaceTime /Calls : 217.938 us
Grid : Message : 102.480544 s : WilsonFermion5D ComputeTime1/Calls : 3.09645 us
Grid : Message : 102.480546 s : WilsonFermion5D ComputeTime2/Calls : 235.402 us
Grid : Message : 102.480575 s : Average mflops/s per call : 3.61099e+10
Grid : Message : 102.480579 s : Average mflops/s per call per rank : 5.64217e+08
Grid : Message : 102.480581 s : Average mflops/s per call per node : 2.25687e+09
Grid : Message : 102.480583 s : Average mflops/s per call (full) : 7.76299e+07
Grid : Message : 102.480587 s : Average mflops/s per call per rank (full): 1.21297e+06
Grid : Message : 102.480590 s : Average mflops/s per call per node (full): 4.85187e+06
Grid : Message : 102.480593 s : WilsonFermion5D Stencil
Grid : Message : 102.480596 s : WilsonFermion5D StencilEven
Grid : Message : 102.480598 s : WilsonFermion5D StencilOdd
Grid : Message : 102.480600 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 102.480603 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 102.480605 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 111.202302 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 111.202331 s : Called DwDag
Grid : Message : 111.202332 s : norm dag result 12.0422
Grid : Message : 111.204652 s : norm dag ref 12.0422
Grid : Message : 111.207748 s : norm dag diff 7.28899e-14
Grid : Message : 111.218376 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 111.273653 s : src_e0.5
Grid : Message : 111.352934 s : src_o0.5
Grid : Message : 111.369965 s : *********************************************************
Grid : Message : 111.369970 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 111.369974 s : * Vectorising space-time by 8
Grid : Message : 111.369976 s : * SINGLE precision
Grid : Message : 111.369977 s : * Using Overlapped Comms/Compute
Grid : Message : 111.369981 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 111.369983 s : *********************************************************
Grid : Message : 158.806725 s : Deo mflop/s = 7.09164e+07
Grid : Message : 158.806755 s : Deo mflop/s per rank 1.10807e+06
Grid : Message : 158.806757 s : Deo mflop/s per node 4.43227e+06
Grid : Message : 158.806760 s : #### Dhop calls report
Grid : Message : 158.806762 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 158.806764 s : WilsonFermion5D TotalTime /Calls : 1581.06 us
Grid : Message : 158.806766 s : WilsonFermion5D CommTime /Calls : 1077.77 us
Grid : Message : 158.806768 s : WilsonFermion5D FaceTime /Calls : 286.721 us
Grid : Message : 158.806770 s : WilsonFermion5D ComputeTime1/Calls : 4.98297 us
Grid : Message : 158.806772 s : WilsonFermion5D ComputeTime2/Calls : 240.035 us
Grid : Message : 158.806792 s : Average mflops/s per call : 2.0753e+10
Grid : Message : 158.806796 s : Average mflops/s per call per rank : 3.24266e+08
Grid : Message : 158.806798 s : Average mflops/s per call per node : 1.29706e+09
Grid : Message : 158.806800 s : Average mflops/s per call (full) : 7.21996e+07
Grid : Message : 158.806804 s : Average mflops/s per call per rank (full): 1.12812e+06
Grid : Message : 158.806807 s : Average mflops/s per call per node (full): 4.51247e+06
Grid : Message : 158.806809 s : WilsonFermion5D Stencil
Grid : Message : 158.806810 s : WilsonFermion5D StencilEven
Grid : Message : 158.806812 s : WilsonFermion5D StencilOdd
Grid : Message : 158.806814 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 158.806816 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 158.806818 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 158.823821 s : r_e6.02106
Grid : Message : 158.827207 s : r_o6.0211
Grid : Message : 158.828617 s : res12.0422
Grid : Message : 158.938772 s : norm diff 0
Grid : Message : 159.724700 s : norm diff even 0
Grid : Message : 159.148761 s : norm diff odd 0
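The per-rank and per-node figures in the Dhop report above are simply the aggregate mflop/s divided by the 64 MPI ranks and 16 nodes reported by SharedMemoryMpi. A minimal sketch of that check, with the aggregate figure copied from the "mflop/s =" line above:

awk 'BEGIN {
    total = 7.63173e+07                  # "mflop/s =" from the report above
    printf "per rank: %.5e\n", total/64  # matches 1.19246e+06
    printf "per node: %.5e\n", total/16  # matches 4.76983e+06
}'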

View File

@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-16A-1035
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1035
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 48.48.48.96 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUS ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
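In the script above, the literal 16 in the IO branch (if [ 16 -eq 1 ]) is the node count baked in by whatever generated the script, which is why shellcheck warning SC2050 is disabled at the top. A sketch of the equivalent dynamic test, assuming the standard SLURM_JOB_NUM_NODES variable is available inside the job:

# hypothetical dynamic form of the baked-in node-count test:
# single-node jobs can use OMPIO, multi-node jobs fall back to ROMIO
if [ "${SLURM_JOB_NUM_NODES}" -eq 1 ]; then
    export OMPI_MCA_io=ompio
else
    export OMPI_MCA_io=romio321
fi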

View File

@ -0,0 +1,2 @@
Sat Aug 20 20:40:36 BST 2022
epoch 1661024436

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Sat Aug 20 20:49:15 BST 2022
epoch 1661024955

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffe2b5fb000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x00001470cbce5000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x00001470cb91d000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x00001470cb42b000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x00001470cb101000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x00001470cae20000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x00001470cabbf000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x00001470cbc6c000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x00001470ca7df000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x00001470c9083000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x00001470c8cb3000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x00001470c8a12000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x00001470c88e7000)
libm.so.6 => /lib64/libm.so.6 (0x00001470c8565000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x00001470c832e000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x00001470c8116000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x00001470c7ef6000)
libc.so.6 => /lib64/libc.so.6 (0x00001470c7b31000)
libdl.so.2 => /lib64/libdl.so.2 (0x00001470c792d000)
/lib64/ld-linux-x86-64.so.2 (0x00001470cbb35000)
librt.so.1 => /lib64/librt.so.1 (0x00001470c7725000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x00001470cbba0000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x00001470cbb9b000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x00001470c7619000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x00001470c740f000)
libutil.so.1 => /lib64/libutil.so.1 (0x00001470c720b000)

View File

@ -0,0 +1,286 @@
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 3 device 0 bus id: 0000:C4:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
local rank 1 device 0 bus id: 0000:44:00.0
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14f600000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.720184 s : Grid Layout
Grid : Message : 1.720188 s : Global lattice size : 48 48 48 96
Grid : Message : 1.720196 s : OpenMP threads : 4
Grid : Message : 1.720199 s : MPI tasks : 2 2 2 8
Grid : Message : 1.735275 s : Making s innermost grids
Grid : Message : 1.752323 s : Initialising 4d RNG
Grid : Message : 1.768478 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.768504 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 2.201838 s : Initialising 5d RNG
Grid : Message : 2.438683 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 2.438714 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 7.906459 s : Initialised RNGs
Grid : Message : 8.718015 s : Drawing gauge field
Grid : Message : 8.851801 s : Random gauge initialised
Grid : Message : 8.862438 s : Setting up Cshift based reference
Grid : Message : 13.896599 s : *****************************************************************
Grid : Message : 13.896621 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 13.896622 s : *****************************************************************
Grid : Message : 13.896623 s : *****************************************************************
Grid : Message : 13.896624 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 13.896625 s : * Vectorising space-time by 8
Grid : Message : 13.896626 s : * VComplexF size is 64 B
Grid : Message : 13.896627 s : * SINGLE precision
Grid : Message : 13.896628 s : * Using Overlapped Comms/Compute
Grid : Message : 13.896629 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 13.896630 s : *****************************************************************
Grid : Message : 14.428387 s : Called warmup
Grid : Message : 101.915473 s : Called Dw 30000 times in 8.74869e+07 us
Grid : Message : 101.915527 s : mflop/s = 7.68895e+07
Grid : Message : 101.915529 s : mflop/s per rank = 1.2014e+06
Grid : Message : 101.915531 s : mflop/s per node = 4.80559e+06
Grid : Message : 101.915533 s : RF GiB/s (base 2) = 156238
Grid : Message : 101.915535 s : mem GiB/s (base 2) = 97648.5
Grid : Message : 101.916107 s : norm diff 1.05775e-13
Grid : Message : 101.926218 s : #### Dhop calls report
Grid : Message : 101.926225 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 101.926228 s : WilsonFermion5D TotalTime /Calls : 1459.21 us
Grid : Message : 101.926230 s : WilsonFermion5D CommTime /Calls : 1016.78 us
Grid : Message : 101.926232 s : WilsonFermion5D FaceTime /Calls : 219.506 us
Grid : Message : 101.926234 s : WilsonFermion5D ComputeTime1/Calls : 2.78512 us
Grid : Message : 101.926236 s : WilsonFermion5D ComputeTime2/Calls : 235.25 us
Grid : Message : 101.926330 s : Average mflops/s per call : 3.60206e+10
Grid : Message : 101.926334 s : Average mflops/s per call per rank : 5.62822e+08
Grid : Message : 101.926336 s : Average mflops/s per call per node : 2.25129e+09
Grid : Message : 101.926338 s : Average mflops/s per call (full) : 7.82287e+07
Grid : Message : 101.926340 s : Average mflops/s per call per rank (full): 1.22232e+06
Grid : Message : 101.926342 s : Average mflops/s per call per node (full): 4.88929e+06
Grid : Message : 101.926344 s : WilsonFermion5D Stencil
Grid : Message : 101.926345 s : WilsonFermion5D StencilEven
Grid : Message : 101.926346 s : WilsonFermion5D StencilOdd
Grid : Message : 101.926347 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 101.926348 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 101.926349 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 110.616405 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 110.616430 s : Called DwDag
Grid : Message : 110.616431 s : norm dag result 12.0422
Grid : Message : 110.621134 s : norm dag ref 12.0422
Grid : Message : 110.624323 s : norm dag diff 7.28899e-14
Grid : Message : 110.637247 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 110.698940 s : src_e0.5
Grid : Message : 110.766761 s : src_o0.5
Grid : Message : 110.783307 s : *********************************************************
Grid : Message : 110.783311 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 110.783313 s : * Vectorising space-time by 8
Grid : Message : 110.783315 s : * SINGLE precision
Grid : Message : 110.783316 s : * Using Overlapped Comms/Compute
Grid : Message : 110.783317 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 110.783318 s : *********************************************************
Grid : Message : 157.764942 s : Deo mflop/s = 7.16075e+07
Grid : Message : 157.764976 s : Deo mflop/s per rank 1.11887e+06
Grid : Message : 157.764978 s : Deo mflop/s per node 4.47547e+06
Grid : Message : 157.764981 s : #### Dhop calls report
Grid : Message : 157.764983 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 157.764985 s : WilsonFermion5D TotalTime /Calls : 1565.89 us
Grid : Message : 157.764987 s : WilsonFermion5D CommTime /Calls : 1058.27 us
Grid : Message : 157.764989 s : WilsonFermion5D FaceTime /Calls : 292.487 us
Grid : Message : 157.764991 s : WilsonFermion5D ComputeTime1/Calls : 4.72584 us
Grid : Message : 157.764993 s : WilsonFermion5D ComputeTime2/Calls : 239.678 us
Grid : Message : 157.765020 s : Average mflops/s per call : 2.07994e+10
Grid : Message : 157.765024 s : Average mflops/s per call per rank : 3.2499e+08
Grid : Message : 157.765027 s : Average mflops/s per call per node : 1.29996e+09
Grid : Message : 157.765031 s : Average mflops/s per call (full) : 7.28994e+07
Grid : Message : 157.765035 s : Average mflops/s per call per rank (full): 1.13905e+06
Grid : Message : 157.765039 s : Average mflops/s per call per node (full): 4.55621e+06
Grid : Message : 157.765042 s : WilsonFermion5D Stencil
Grid : Message : 157.765044 s : WilsonFermion5D StencilEven
Grid : Message : 157.765046 s : WilsonFermion5D StencilOdd
Grid : Message : 157.765049 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 157.765051 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 157.765053 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 157.783731 s : r_e6.02106
Grid : Message : 157.786036 s : r_o6.0211
Grid : Message : 157.787470 s : res12.0422
Grid : Message : 157.905573 s : norm diff 0
Grid : Message : 158.337590 s : norm diff even 0
Grid : Message : 158.959010 s : norm diff odd 0
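The headline throughput lines in these logs share a fixed format, which makes run-to-run comparison easy to script. A minimal extraction sketch, assuming one of the logs above is saved as a file named log (hypothetical name):

# pull the aggregate Dhop and DhopEO figures out of a Grid benchmark log;
# the pattern deliberately skips the "Average mflops/s per call" lines
grep -E 'mflop/s (=|per (rank|node))' log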

View File

@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-16A-1050
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1050
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 48.48.48.96 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUS ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
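The batch scripts in this dump are identical apart from the job name and the freq value (1035, 1050, 1065, ...), so the whole clock-limit sweep can be generated from a single template. A sketch, assuming a template file power-16A.sh.in with a @FREQ@ placeholder (both hypothetical):

# hypothetical sweep driver: stamp each clock limit into the template and submit
for freq in 1035 1050 1065; do
    sed "s/@FREQ@/${freq}/g" power-16A.sh.in > "power-16A-${freq}.sh"
    sbatch "power-16A-${freq}.sh"
done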

View File

@ -0,0 +1,2 @@
Sat Aug 20 20:46:27 BST 2022
epoch 1661024788
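Each job's start-date and end-date files record an epoch line, so wall time is a single subtraction; assuming this start-date and the 20:49:15 end-date above belong to the same power-16A-1050 job directory, that gives 1661024955 - 1661024788 = 167 s, consistent with the ~158 s final timestamp in its log plus launch overhead. A sketch, assuming the job_info_dir layout from the scripts:

# hypothetical wall-time check from the recorded epochs
start=$(awk '/^epoch/ {print $2}' "${job_info_dir}/start-date")
end=$(awk '/^epoch/ {print $2}' "${job_info_dir}/end-date")
echo "wall time: $(( end - start )) s"   # 1661024955 - 1661024788 = 167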

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Sat Aug 20 20:55:03 BST 2022
epoch 1661025303

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffd9b1d1000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014a2805dc000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014a280214000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014a27fd22000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014a27f9f8000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014a27f717000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014a27f4b6000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014a280563000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014a27f0d6000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014a27d97a000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014a27d5aa000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014a27d309000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014a27d1de000)
libm.so.6 => /lib64/libm.so.6 (0x000014a27ce5c000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014a27cc25000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014a27ca0d000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014a27c7ed000)
libc.so.6 => /lib64/libc.so.6 (0x000014a27c428000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014a27c224000)
/lib64/ld-linux-x86-64.so.2 (0x000014a28042c000)
librt.so.1 => /lib64/librt.so.1 (0x000014a27c01c000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014a280497000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014a280492000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014a27bf10000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014a27bd06000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014a27bb02000)
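The ldd snapshots above record the benchmark's resolved runtime dependencies at submission time. A common sanity check before a run, assuming app is set to the benchmark path as in the scripts:

# flag any shared libraries the loader cannot resolve
if ldd "${app}" | grep -q 'not found'; then
    echo "error: unresolved dependencies for ${app}" 1>&2
fi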

View File

@ -0,0 +1,286 @@
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 2 device 0 bus id: 0000:84:00.0
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 1 device 0 bus id: 0000:44:00.0
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x150120000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.428183 s : Grid Layout
Grid : Message : 1.428187 s : Global lattice size : 48 48 48 96
Grid : Message : 1.428193 s : OpenMP threads : 4
Grid : Message : 1.428196 s : MPI tasks : 2 2 2 8
Grid : Message : 1.443217 s : Making s innermost grids
Grid : Message : 1.455165 s : Initialising 4d RNG
Grid : Message : 1.471981 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.472007 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.853366 s : Initialising 5d RNG
Grid : Message : 2.875960 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 2.876470 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 7.305707 s : Initialised RNGs
Grid : Message : 8.397843 s : Drawing gauge field
Grid : Message : 8.484443 s : Random gauge initialised
Grid : Message : 8.488387 s : Setting up Cshift based reference
Grid : Message : 13.563627 s : *****************************************************************
Grid : Message : 13.563653 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 13.563655 s : *****************************************************************
Grid : Message : 13.563658 s : *****************************************************************
Grid : Message : 13.563659 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 13.563660 s : * Vectorising space-time by 8
Grid : Message : 13.563663 s : * VComplexF size is 64 B
Grid : Message : 13.563665 s : * SINGLE precision
Grid : Message : 13.563667 s : * Using Overlapped Comms/Compute
Grid : Message : 13.563668 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 13.563669 s : *****************************************************************
Grid : Message : 14.958310 s : Called warmup
Grid : Message : 101.445133 s : Called Dw 30000 times in 8.73489e+07 us
Grid : Message : 101.445198 s : mflop/s = 7.7011e+07
Grid : Message : 101.445200 s : mflop/s per rank = 1.2033e+06
Grid : Message : 101.445202 s : mflop/s per node = 4.81319e+06
Grid : Message : 101.445204 s : RF GiB/s (base 2) = 156485
Grid : Message : 101.445206 s : mem GiB/s (base 2) = 97802.9
Grid : Message : 101.445777 s : norm diff 1.05775e-13
Grid : Message : 101.455931 s : #### Dhop calls report
Grid : Message : 101.455939 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 101.455943 s : WilsonFermion5D TotalTime /Calls : 1457.12 us
Grid : Message : 101.455945 s : WilsonFermion5D CommTime /Calls : 1014.92 us
Grid : Message : 101.455947 s : WilsonFermion5D FaceTime /Calls : 219.441 us
Grid : Message : 101.455949 s : WilsonFermion5D ComputeTime1/Calls : 2.84344 us
Grid : Message : 101.455951 s : WilsonFermion5D ComputeTime2/Calls : 235.367 us
Grid : Message : 101.455978 s : Average mflops/s per call : 3.61947e+10
Grid : Message : 101.455982 s : Average mflops/s per call per rank : 5.65543e+08
Grid : Message : 101.455984 s : Average mflops/s per call per node : 2.26217e+09
Grid : Message : 101.455986 s : Average mflops/s per call (full) : 7.83407e+07
Grid : Message : 101.455990 s : Average mflops/s per call per rank (full): 1.22407e+06
Grid : Message : 101.455992 s : Average mflops/s per call per node (full): 4.8963e+06
Grid : Message : 101.455994 s : WilsonFermion5D Stencil
Grid : Message : 101.455995 s : WilsonFermion5D StencilEven
Grid : Message : 101.455999 s : WilsonFermion5D StencilOdd
Grid : Message : 101.456001 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 101.456002 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 101.456004 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 110.188024 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 110.188051 s : Called DwDag
Grid : Message : 110.188052 s : norm dag result 12.0422
Grid : Message : 110.200211 s : norm dag ref 12.0422
Grid : Message : 110.203215 s : norm dag diff 7.28899e-14
Grid : Message : 110.213199 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 110.281787 s : src_e0.5
Grid : Message : 110.353808 s : src_o0.5
Grid : Message : 110.370985 s : *********************************************************
Grid : Message : 110.370991 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 110.370992 s : * Vectorising space-time by 8
Grid : Message : 110.370995 s : * SINGLE precision
Grid : Message : 110.370997 s : * Using Overlapped Comms/Compute
Grid : Message : 110.370998 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 110.371000 s : *********************************************************
Grid : Message : 157.314519 s : Deo mflop/s = 7.16631e+07
Grid : Message : 157.314545 s : Deo mflop/s per rank 1.11974e+06
Grid : Message : 157.314547 s : Deo mflop/s per node 4.47894e+06
Grid : Message : 157.314550 s : #### Dhop calls report
Grid : Message : 157.314552 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 157.314554 s : WilsonFermion5D TotalTime /Calls : 1564.64 us
Grid : Message : 157.314556 s : WilsonFermion5D CommTime /Calls : 1060.37 us
Grid : Message : 157.314558 s : WilsonFermion5D FaceTime /Calls : 287.98 us
Grid : Message : 157.314560 s : WilsonFermion5D ComputeTime1/Calls : 4.91794 us
Grid : Message : 157.314562 s : WilsonFermion5D ComputeTime2/Calls : 239.551 us
Grid : Message : 157.314587 s : Average mflops/s per call : 2.07265e+10
Grid : Message : 157.314591 s : Average mflops/s per call per rank : 3.23852e+08
Grid : Message : 157.314593 s : Average mflops/s per call per node : 1.29541e+09
Grid : Message : 157.314596 s : Average mflops/s per call (full) : 7.29577e+07
Grid : Message : 157.314600 s : Average mflops/s per call per rank (full): 1.13996e+06
Grid : Message : 157.314602 s : Average mflops/s per call per node (full): 4.55985e+06
Grid : Message : 157.314605 s : WilsonFermion5D Stencil
Grid : Message : 157.314606 s : WilsonFermion5D StencilEven
Grid : Message : 157.314608 s : WilsonFermion5D StencilOdd
Grid : Message : 157.314610 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 157.314613 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 157.314614 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 157.334523 s : r_e6.02106
Grid : Message : 157.336050 s : r_o6.0211
Grid : Message : 157.337424 s : res12.0422
Grid : Message : 157.450236 s : norm diff 0
Grid : Message : 157.586163 s : norm diff even 0
Grid : Message : 157.657558 s : norm diff odd 0
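
The throughput lines in the Dw summary above follow from simple arithmetic on the logged call count and wall time. A minimal sketch reproducing them, assuming a domain-wall flop count of 1320 flops per site and Ls=16 (neither figure appears in the log; both are inferred from the reported numbers):

#!/usr/bin/env bash
# hedged reconstruction of the Dw summary above; 1320 flops/site and Ls=16
# are assumptions inferred from the logged figures, not read from the log
vol=$((48*48*48*96))   # global lattice volume, from "Global lattice size"
Ls=16                  # fifth-dimension extent (assumption)
ncalls=30000           # "Called Dw 30000 times"
t_us=87348900          # "in 8.73489e+07 us"
awk -v v="$vol" -v Ls="$Ls" -v n="$ncalls" -v t="$t_us" 'BEGIN {
    fpc = 1320 * v * Ls                       # flops per Dw call
    mflops = fpc * n / t                      # flop/us is the same as Mflop/s
    printf "mflop/s          = %.6g\n", mflops        # ~7.7011e+07, as logged
    printf "mflop/s per rank = %.6g\n", mflops / 64   # 64 MPI ranks
    printf "mflop/s per node = %.6g\n", mflops / 16   # 16 nodes
}'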

View File

@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]
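
The bracket notation above is SLURM's compressed hostlist: eight nodes in each of the two racks, sixteen in total. It can be expanded with the same command the job scripts use:

scontrol show hostnames 'tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]'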

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-16A-1065
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 16 -eq 1 ]; then
    export OMPI_MCA_io=ompio
else
    export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1065
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
    ${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
    ./gpu-mpi-wrapper.sh \
    ${app} "${par}" "${opt[@]}" \
    --mpi 2.2.2.8 \
    --accelerator-threads 8 \
    --grid 48.48.48.96 \
    --shm 2048 &> "${job_info_dir}/log"
# if we reach this point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
    ${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
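
For reference, a hypothetical way to submit the script above from the 2-racks directory (its on-disk filename is not shown here, so the name below is an assumption); the collected artifacts end up under job/power-16A-1065.<jobid>/:

sbatch power-16A-1065.sh   # hypothetical filename
# afterwards the job directory holds: log, env, ldd, app-hash, elf, nodes,
# script, start-date, end-date and, on clean exit, the empty success file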

View File

@ -0,0 +1,2 @@
Sat Aug 20 20:52:16 BST 2022
epoch 1661025136

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Sat Aug 20 21:00:52 BST 2022
epoch 1661025652

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffceffcb000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014c73048f000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014c7300c7000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014c72fbd5000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014c72f8ab000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014c72f5ca000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014c72f369000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014c730416000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014c72ef89000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014c72d82d000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014c72d45d000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014c72d1bc000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014c72d091000)
libm.so.6 => /lib64/libm.so.6 (0x000014c72cd0f000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014c72cad8000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014c72c8c0000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014c72c6a0000)
libc.so.6 => /lib64/libc.so.6 (0x000014c72c2db000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014c72c0d7000)
/lib64/ld-linux-x86-64.so.2 (0x000014c7302df000)
librt.so.1 => /lib64/librt.so.1 (0x000014c72becf000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014c73034a000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014c730345000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014c72bdc3000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014c72bbb9000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014c72b9b5000)
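
The listing above shows the benchmark linked against the ompi_gpu OpenMPI prefix. A hedged way to confirm that this build is CUDA-aware:

ompi_info --parsable --all | grep mpi_built_with_cuda_support:value
# expect: mca:mpi:base:param:mpi_built_with_cuda_support:value:true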

View File

@ -0,0 +1,286 @@
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 1 device 0 bus id: 0000:44:00.0
local rank 2 device 0 bus id: 0000:84:00.0
local rank 3 device 0 bus id: 0000:C4:00.0
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648 bytes at 0x1548a0000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommitted changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.498999 s : Grid Layout
Grid : Message : 1.499003 s : Global lattice size : 48 48 48 96
Grid : Message : 1.499009 s : OpenMP threads : 4
Grid : Message : 1.499010 s : MPI tasks : 2 2 2 8
Grid : Message : 1.516697 s : Making s innermost grids
Grid : Message : 1.528026 s : Initialising 4d RNG
Grid : Message : 1.543296 s : Initialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.543322 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.803104 s : Initialising 5d RNG
Grid : Message : 2.280210 s : Initialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 2.280810 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 7.463560 s : Initialised RNGs
Grid : Message : 8.316566 s : Drawing gauge field
Grid : Message : 8.441882 s : Random gauge initialised
Grid : Message : 8.454498 s : Setting up Cshift based reference
Grid : Message : 13.615874 s : *****************************************************************
Grid : Message : 13.615901 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 13.615903 s : *****************************************************************
Grid : Message : 13.615904 s : *****************************************************************
Grid : Message : 13.615905 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 13.615906 s : * Vectorising space-time by 8
Grid : Message : 13.615910 s : * VComplexF size is 64 B
Grid : Message : 13.615912 s : * SINGLE precision
Grid : Message : 13.615914 s : * Using Overlapped Comms/Compute
Grid : Message : 13.615916 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 13.615918 s : *****************************************************************
Grid : Message : 14.175758 s : Called warmup
Grid : Message : 100.948265 s : Called Dw 30000 times in 8.67724e+07 us
Grid : Message : 100.948328 s : mflop/s = 7.75226e+07
Grid : Message : 100.948330 s : mflop/s per rank = 1.21129e+06
Grid : Message : 100.948332 s : mflop/s per node = 4.84516e+06
Grid : Message : 100.948334 s : RF GiB/s (base 2) = 157524
Grid : Message : 100.948336 s : mem GiB/s (base 2) = 98452.5
Grid : Message : 100.948912 s : norm diff 1.05775e-13
Grid : Message : 100.958922 s : #### Dhop calls report
Grid : Message : 100.958930 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 100.958934 s : WilsonFermion5D TotalTime /Calls : 1447.35 us
Grid : Message : 100.958936 s : WilsonFermion5D CommTime /Calls : 1006.18 us
Grid : Message : 100.958938 s : WilsonFermion5D FaceTime /Calls : 218.625 us
Grid : Message : 100.958940 s : WilsonFermion5D ComputeTime1/Calls : 2.6472 us
Grid : Message : 100.958942 s : WilsonFermion5D ComputeTime2/Calls : 235.108 us
Grid : Message : 100.958970 s : Average mflops/s per call : 3.6261e+10
Grid : Message : 100.958974 s : Average mflops/s per call per rank : 5.66578e+08
Grid : Message : 100.958976 s : Average mflops/s per call per node : 2.26631e+09
Grid : Message : 100.958978 s : Average mflops/s per call (full) : 7.88698e+07
Grid : Message : 100.958981 s : Average mflops/s per call per rank (full): 1.23234e+06
Grid : Message : 100.958983 s : Average mflops/s per call per node (full): 4.92936e+06
Grid : Message : 100.958986 s : WilsonFermion5D Stencil
Grid : Message : 100.958987 s : WilsonFermion5D StencilEven
Grid : Message : 100.958988 s : WilsonFermion5D StencilOdd
Grid : Message : 100.958991 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 100.958992 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 100.958995 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 109.635912 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 109.635940 s : Called DwDag
Grid : Message : 109.635941 s : norm dag result 12.0422
Grid : Message : 109.641498 s : norm dag ref 12.0422
Grid : Message : 109.644623 s : norm dag diff 7.28899e-14
Grid : Message : 109.654599 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 109.718075 s : src_e0.5
Grid : Message : 109.790285 s : src_o0.5
Grid : Message : 109.807211 s : *********************************************************
Grid : Message : 109.807217 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 109.807219 s : * Vectorising space-time by 8
Grid : Message : 109.807221 s : * SINGLE precision
Grid : Message : 109.807224 s : * Using Overlapped Comms/Compute
Grid : Message : 109.807225 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 109.807226 s : *********************************************************
Grid : Message : 156.357075 s : Deo mflop/s = 7.22704e+07
Grid : Message : 156.357109 s : Deo mflop/s per rank 1.12923e+06
Grid : Message : 156.357111 s : Deo mflop/s per node 4.5169e+06
Grid : Message : 156.357114 s : #### Dhop calls report
Grid : Message : 156.357116 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 156.357118 s : WilsonFermion5D TotalTime /Calls : 1551.51 us
Grid : Message : 156.357120 s : WilsonFermion5D CommTime /Calls : 1049.38 us
Grid : Message : 156.357122 s : WilsonFermion5D FaceTime /Calls : 285.792 us
Grid : Message : 156.357124 s : WilsonFermion5D ComputeTime1/Calls : 4.81357 us
Grid : Message : 156.357126 s : WilsonFermion5D ComputeTime2/Calls : 239.16 us
Grid : Message : 156.357146 s : Average mflops/s per call : 2.07719e+10
Grid : Message : 156.357150 s : Average mflops/s per call per rank : 3.24561e+08
Grid : Message : 156.357152 s : Average mflops/s per call per node : 1.29824e+09
Grid : Message : 156.357154 s : Average mflops/s per call (full) : 7.35747e+07
Grid : Message : 156.357158 s : Average mflops/s per call per rank (full): 1.1496e+06
Grid : Message : 156.357161 s : Average mflops/s per call per node (full): 4.59842e+06
Grid : Message : 156.357163 s : WilsonFermion5D Stencil
Grid : Message : 156.357165 s : WilsonFermion5D StencilEven
Grid : Message : 156.357166 s : WilsonFermion5D StencilOdd
Grid : Message : 156.357168 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 156.357175 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 156.357176 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 156.375718 s : r_e6.02106
Grid : Message : 156.378883 s : r_o6.0211
Grid : Message : 156.380335 s : res12.0422
Grid : Message : 156.489162 s : norm diff 0
Grid : Message : 156.617774 s : norm diff even 0
Grid : Message : 156.694536 s : norm diff odd 0
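
Because the run uses --comms-overlap, the components in the Dhop calls reports above are timed independently and can sum to more than TotalTime/Calls; the excess is plausibly the communication hidden behind compute. A quick check on the first report's numbers:

awk 'BEGIN {
    total = 1447.35                              # TotalTime /Calls (us)
    sum = 1006.18 + 218.625 + 2.6472 + 235.108   # Comm + Face + Compute1 + Compute2
    printf "component sum = %.2f us\n", sum          # ~1462.56 us
    printf "overlapped    = %.2f us\n", sum - total  # ~15.2 us hidden by overlap
}'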

View File

@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-16A-1080
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 16 -eq 1 ]; then
    export OMPI_MCA_io=ompio
else
    export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1080
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
    ${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
    ./gpu-mpi-wrapper.sh \
    ${app} "${par}" "${opt[@]}" \
    --mpi 2.2.2.8 \
    --accelerator-threads 8 \
    --grid 48.48.48.96 \
    --shm 2048 &> "${job_info_dir}/log"
# if we reach this point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
    ${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
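
The nvidia-smi -ac calls above pin the <memory,SM> application clocks to 1215,1080 MHz for this run and restore 1215,1410 MHz at the end. A hedged sketch for checking the applied clocks by hand on a compute node:

nvidia-smi --query-gpu=clocks.applications.memory,clocks.applications.graphics --format=csv
nvidia-smi -ac 1215,1410   # restore the defaults; needs root, hence remote-sudo.sh in the script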

View File

@ -0,0 +1,2 @@
Sat Aug 20 20:58:06 BST 2022
epoch 1661025486

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Sat Aug 20 21:06:38 BST 2022
epoch 1661025998

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffc219f0000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014aa89605000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014aa8923d000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014aa88d4b000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014aa88a21000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014aa88740000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014aa884df000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014aa8958c000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014aa880ff000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014aa869a3000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014aa865d3000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014aa86332000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014aa86207000)
libm.so.6 => /lib64/libm.so.6 (0x000014aa85e85000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014aa85c4e000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014aa85a36000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014aa85816000)
libc.so.6 => /lib64/libc.so.6 (0x000014aa85451000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014aa8524d000)
/lib64/ld-linux-x86-64.so.2 (0x000014aa89455000)
librt.so.1 => /lib64/librt.so.1 (0x000014aa85045000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014aa894c0000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014aa894bb000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014aa84f39000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014aa84d2f000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014aa84b2b000)

View File

@ -0,0 +1,286 @@
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
local rank 3 device 0 bus id: 0000:C4:00.0
local rank 1 device 0 bus id: 0000:44:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648 bytes at 0x146d00000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommitted changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.412895 s : Grid Layout
Grid : Message : 1.412899 s : Global lattice size : 48 48 48 96
Grid : Message : 1.412905 s : OpenMP threads : 4
Grid : Message : 1.412909 s : MPI tasks : 2 2 2 8
Grid : Message : 1.428319 s : Making s innermost grids
Grid : Message : 1.445373 s : Initialising 4d RNG
Grid : Message : 1.461658 s : Initialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.461680 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.902912 s : Initialising 5d RNG
Grid : Message : 2.141255 s : Initialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 2.141291 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 7.353326 s : Initialised RNGs
Grid : Message : 8.518633 s : Drawing gauge field
Grid : Message : 8.626652 s : Random gauge initialised
Grid : Message : 8.630634 s : Setting up Cshift based reference
Grid : Message : 13.722925 s : *****************************************************************
Grid : Message : 13.722949 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 13.722950 s : *****************************************************************
Grid : Message : 13.722951 s : *****************************************************************
Grid : Message : 13.722952 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 13.722953 s : * Vectorising space-time by 8
Grid : Message : 13.722954 s : * VComplexF size is 64 B
Grid : Message : 13.722955 s : * SINGLE precision
Grid : Message : 13.722956 s : * Using Overlapped Comms/Compute
Grid : Message : 13.722957 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 13.722958 s : *****************************************************************
Grid : Message : 14.254628 s : Called warmup
Grid : Message : 100.327406 s : Called Dw 30000 times in 8.60725e+07 us
Grid : Message : 100.327470 s : mflop/s = 7.8153e+07
Grid : Message : 100.327472 s : mflop/s per rank = 1.22114e+06
Grid : Message : 100.327474 s : mflop/s per node = 4.88456e+06
Grid : Message : 100.327476 s : RF GiB/s (base 2) = 158805
Grid : Message : 100.327478 s : mem GiB/s (base 2) = 99253.2
Grid : Message : 100.328051 s : norm diff 1.05775e-13
Grid : Message : 100.337927 s : #### Dhop calls report
Grid : Message : 100.337935 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 100.337943 s : WilsonFermion5D TotalTime /Calls : 1435.69 us
Grid : Message : 100.337946 s : WilsonFermion5D CommTime /Calls : 996.547 us
Grid : Message : 100.337949 s : WilsonFermion5D FaceTime /Calls : 217.079 us
Grid : Message : 100.337953 s : WilsonFermion5D ComputeTime1/Calls : 2.78067 us
Grid : Message : 100.337955 s : WilsonFermion5D ComputeTime2/Calls : 234.472 us
Grid : Message : 100.337971 s : Average mflops/s per call : 3.63872e+10
Grid : Message : 100.337974 s : Average mflops/s per call per rank : 5.68549e+08
Grid : Message : 100.337976 s : Average mflops/s per call per node : 2.2742e+09
Grid : Message : 100.337980 s : Average mflops/s per call (full) : 7.95104e+07
Grid : Message : 100.337982 s : Average mflops/s per call per rank (full): 1.24235e+06
Grid : Message : 100.337986 s : Average mflops/s per call per node (full): 4.9694e+06
Grid : Message : 100.337988 s : WilsonFermion5D Stencil
Grid : Message : 100.337990 s : WilsonFermion5D StencilEven
Grid : Message : 100.337992 s : WilsonFermion5D StencilOdd
Grid : Message : 100.337995 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 100.337998 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 100.338000 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 109.354730 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 109.355200 s : Called DwDag
Grid : Message : 109.355210 s : norm dag result 12.0422
Grid : Message : 109.404420 s : norm dag ref 12.0422
Grid : Message : 109.435430 s : norm dag diff 7.28899e-14
Grid : Message : 109.565940 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 109.123204 s : src_e0.5
Grid : Message : 109.194082 s : src_o0.5
Grid : Message : 109.211743 s : *********************************************************
Grid : Message : 109.211749 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 109.211751 s : * Vectorising space-time by 8
Grid : Message : 109.211754 s : * SINGLE precision
Grid : Message : 109.211756 s : * Using Overlapped Comms/Compute
Grid : Message : 109.211759 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 109.211761 s : *********************************************************
Grid : Message : 155.351395 s : Deo mflop/s = 7.29132e+07
Grid : Message : 155.351424 s : Deo mflop/s per rank 1.13927e+06
Grid : Message : 155.351427 s : Deo mflop/s per node 4.55708e+06
Grid : Message : 155.351433 s : #### Dhop calls report
Grid : Message : 155.351436 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 155.351440 s : WilsonFermion5D TotalTime /Calls : 1537.8 us
Grid : Message : 155.351445 s : WilsonFermion5D CommTime /Calls : 1037.77 us
Grid : Message : 155.351449 s : WilsonFermion5D FaceTime /Calls : 285.044 us
Grid : Message : 155.351453 s : WilsonFermion5D ComputeTime1/Calls : 4.8771 us
Grid : Message : 155.351457 s : WilsonFermion5D ComputeTime2/Calls : 237.861 us
Grid : Message : 155.351481 s : Average mflops/s per call : 2.07287e+10
Grid : Message : 155.351485 s : Average mflops/s per call per rank : 3.23886e+08
Grid : Message : 155.351488 s : Average mflops/s per call per node : 1.29554e+09
Grid : Message : 155.351492 s : Average mflops/s per call (full) : 7.42306e+07
Grid : Message : 155.351496 s : Average mflops/s per call per rank (full): 1.15985e+06
Grid : Message : 155.351500 s : Average mflops/s per call per node (full): 4.63942e+06
Grid : Message : 155.351504 s : WilsonFermion5D Stencil
Grid : Message : 155.351506 s : WilsonFermion5D StencilEven
Grid : Message : 155.351508 s : WilsonFermion5D StencilOdd
Grid : Message : 155.351511 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 155.351513 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 155.351515 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 155.370290 s : r_e6.02106
Grid : Message : 155.372244 s : r_o6.0211
Grid : Message : 155.373660 s : res12.0422
Grid : Message : 155.495172 s : norm diff 0
Grid : Message : 155.622362 s : norm diff even 0
Grid : Message : 155.695812 s : norm diff odd 0
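
Reading the three Dw summaries in this directory together (the run-to-frequency mapping is inferred from the file ordering, not stated in the logs): 1065 MHz gives 7.7011e+07 mflop/s, 1080 MHz gives 7.75226e+07 and 1095 MHz gives 7.8153e+07, so throughput grows markedly more slowly than the SM clock, as expected for a communication-dominated benchmark:

awk 'BEGIN {
    printf "1080 MHz: %+.2f%% mflop/s for %+.2f%% clock\n", (7.75226/7.7011-1)*100, (1080/1065-1)*100
    printf "1095 MHz: %+.2f%% mflop/s for %+.2f%% clock\n", (7.8153/7.7011-1)*100, (1095/1065-1)*100
}'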

View File

@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-16A-1095
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 16 -eq 1 ]; then
    export OMPI_MCA_io=ompio
else
    export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1095
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
    ${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
    ./gpu-mpi-wrapper.sh \
    ${app} "${par}" "${opt[@]}" \
    --mpi 2.2.2.8 \
    --accelerator-threads 8 \
    --grid 48.48.48.96 \
    --shm 2048 &> "${job_info_dir}/log"
# if we reach this point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
    ${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
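
The monitoring coprocess above samples per-GPU counters once per second (dmon's default interval); -o DT prefixes each row with date and time so that dmon-to-db.sh can align samples with the start/end epochs recorded in the job directory. A hedged sketch of an equivalent manual invocation with explicit metric groups:

# p: power/temperature, u: utilisation, c: clocks, m: framebuffer memory
nvidia-smi dmon -o DT -s pucm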

View File

@ -0,0 +1,2 @@
Sat Aug 20 21:03:53 BST 2022
epoch 1661025833

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Sat Aug 20 21:12:23 BST 2022
epoch 1661026343

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffdef5db000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x0000152bce209000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x0000152bcde41000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x0000152bcd94f000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x0000152bcd625000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x0000152bcd344000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x0000152bcd0e3000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x0000152bce190000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x0000152bccd03000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x0000152bcb5a7000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x0000152bcb1d7000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x0000152bcaf36000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x0000152bcae0b000)
libm.so.6 => /lib64/libm.so.6 (0x0000152bcaa89000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x0000152bca852000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x0000152bca63a000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x0000152bca41a000)
libc.so.6 => /lib64/libc.so.6 (0x0000152bca055000)
libdl.so.2 => /lib64/libdl.so.2 (0x0000152bc9e51000)
/lib64/ld-linux-x86-64.so.2 (0x0000152bce059000)
librt.so.1 => /lib64/librt.so.1 (0x0000152bc9c49000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x0000152bce0c4000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x0000152bce0bf000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x0000152bc9b3d000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x0000152bc9933000)
libutil.so.1 => /lib64/libutil.so.1 (0x0000152bc972f000)
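The ldd listings record how the binary resolved its Spack-provided dependencies at job time; a small sketch for flagging unresolved libraries before a run (using the app path defined in the job scripts):

# fail early if any shared library does not resolve
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
if ldd "${app}" | grep -q 'not found'; then
    echo "error: unresolved shared libraries for ${app}" 1>&2
    exit 1
fi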

View File

@ -0,0 +1,286 @@
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 1 device 0 bus id: 0000:44:00.0
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 2 device 0 bus id: 0000:84:00.0
local rank 3 device 0 bus id: 0000:C4:00.0
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x147320000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.574553 s : Grid Layout
Grid : Message : 1.574555 s : Global lattice size : 48 48 48 96
Grid : Message : 1.574559 s : OpenMP threads : 4
Grid : Message : 1.574561 s : MPI tasks : 2 2 2 8
Grid : Message : 1.590560 s : Making s innermost grids
Grid : Message : 1.602336 s : Initialising 4d RNG
Grid : Message : 1.619266 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.619291 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.883640 s : Initialising 5d RNG
Grid : Message : 2.117383 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 2.117419 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 7.594282 s : Initialised RNGs
Grid : Message : 8.809615 s : Drawing gauge field
Grid : Message : 8.954788 s : Random gauge initialised
Grid : Message : 8.965668 s : Setting up Cshift based reference
Grid : Message : 13.965128 s : *****************************************************************
Grid : Message : 13.965152 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 13.965153 s : *****************************************************************
Grid : Message : 13.965154 s : *****************************************************************
Grid : Message : 13.965155 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 13.965156 s : * Vectorising space-time by 8
Grid : Message : 13.965157 s : * VComplexF size is 64 B
Grid : Message : 13.965159 s : * SINGLE precision
Grid : Message : 13.965160 s : * Using Overlapped Comms/Compute
Grid : Message : 13.965161 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 13.965162 s : *****************************************************************
Grid : Message : 14.515202 s : Called warmup
Grid : Message : 99.730150 s : Called Dw 30000 times in 8.52149e+07 us
Grid : Message : 99.730204 s : mflop/s = 7.89395e+07
Grid : Message : 99.730206 s : mflop/s per rank = 1.23343e+06
Grid : Message : 99.730208 s : mflop/s per node = 4.93372e+06
Grid : Message : 99.730210 s : RF GiB/s (base 2) = 160403
Grid : Message : 99.730212 s : mem GiB/s (base 2) = 100252
Grid : Message : 99.730784 s : norm diff 1.05775e-13
Grid : Message : 99.740621 s : #### Dhop calls report
Grid : Message : 99.740628 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 99.740631 s : WilsonFermion5D TotalTime /Calls : 1421.72 us
Grid : Message : 99.740633 s : WilsonFermion5D CommTime /Calls : 984.801 us
Grid : Message : 99.740635 s : WilsonFermion5D FaceTime /Calls : 215.72 us
Grid : Message : 99.740637 s : WilsonFermion5D ComputeTime1/Calls : 2.65594 us
Grid : Message : 99.740639 s : WilsonFermion5D ComputeTime2/Calls : 233.727 us
Grid : Message : 99.740655 s : Average mflops/s per call : 3.59268e+10
Grid : Message : 99.740658 s : Average mflops/s per call per rank : 5.61356e+08
Grid : Message : 99.740660 s : Average mflops/s per call per node : 2.24542e+09
Grid : Message : 99.740662 s : Average mflops/s per call (full) : 8.02916e+07
Grid : Message : 99.740665 s : Average mflops/s per call per rank (full): 1.25456e+06
Grid : Message : 99.740667 s : Average mflops/s per call per node (full): 5.01823e+06
Grid : Message : 99.740669 s : WilsonFermion5D Stencil
Grid : Message : 99.740670 s : WilsonFermion5D StencilEven
Grid : Message : 99.740672 s : WilsonFermion5D StencilOdd
Grid : Message : 99.740673 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 99.740675 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 99.740679 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 108.466783 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 108.466816 s : Called DwDag
Grid : Message : 108.466817 s : norm dag result 12.0422
Grid : Message : 108.470193 s : norm dag ref 12.0422
Grid : Message : 108.473428 s : norm dag diff 7.28899e-14
Grid : Message : 108.486838 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 108.550312 s : src_e0.5
Grid : Message : 108.623836 s : src_o0.5
Grid : Message : 108.640541 s : *********************************************************
Grid : Message : 108.640545 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 108.640546 s : * Vectorising space-time by 8
Grid : Message : 108.640548 s : * SINGLE precision
Grid : Message : 108.640553 s : * Using Overlapped Comms/Compute
Grid : Message : 108.640555 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 108.640556 s : *********************************************************
Grid : Message : 154.233908 s : Deo mflop/s = 7.37872e+07
Grid : Message : 154.233941 s : Deo mflop/s per rank 1.15293e+06
Grid : Message : 154.233943 s : Deo mflop/s per node 4.6117e+06
Grid : Message : 154.233946 s : #### Dhop calls report
Grid : Message : 154.233948 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 154.233950 s : WilsonFermion5D TotalTime /Calls : 1519.59 us
Grid : Message : 154.233952 s : WilsonFermion5D CommTime /Calls : 1019.64 us
Grid : Message : 154.233954 s : WilsonFermion5D FaceTime /Calls : 288.201 us
Grid : Message : 154.233956 s : WilsonFermion5D ComputeTime1/Calls : 4.91837 us
Grid : Message : 154.233958 s : WilsonFermion5D ComputeTime2/Calls : 236.348 us
Grid : Message : 154.233977 s : Average mflops/s per call : 2.07539e+10
Grid : Message : 154.233980 s : Average mflops/s per call per rank : 3.24279e+08
Grid : Message : 154.233982 s : Average mflops/s per call per node : 1.29712e+09
Grid : Message : 154.233984 s : Average mflops/s per call (full) : 7.51203e+07
Grid : Message : 154.233986 s : Average mflops/s per call per rank (full): 1.17375e+06
Grid : Message : 154.233988 s : Average mflops/s per call per node (full): 4.69502e+06
Grid : Message : 154.233991 s : WilsonFermion5D Stencil
Grid : Message : 154.233992 s : WilsonFermion5D StencilEven
Grid : Message : 154.233993 s : WilsonFermion5D StencilOdd
Grid : Message : 154.233994 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 154.233995 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 154.233996 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 154.253979 s : r_e6.02106
Grid : Message : 154.255883 s : r_o6.0211
Grid : Message : 154.257289 s : res12.0422
Grid : Message : 154.364123 s : norm diff 0
Grid : Message : 154.496590 s : norm diff even 0
Grid : Message : 154.572879 s : norm diff odd 0
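The "Grid : Message" lines in the log above have a fixed layout, so the headline figures can be harvested mechanically; a minimal awk sketch, with the field layout assumed from this log and job_info_dir as in the scripts:

# extract the overlapped-comms Dhop figure of merit from a benchmark log
awk '/mflop\/s per node =/ {print "Dhop mflop/s per node:", $NF}' "${job_info_dir}/log"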

View File

@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]
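The nodes files store the nodelist in SLURM's compressed bracket notation; it expands to the sixteen individual hostnames with scontrol, exactly as the clock-setting loops in the job scripts do:

# expand the compressed nodelist into one hostname per line
scontrol show hostnames 'tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]'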

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-16A-1110
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1110
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 48.48.48.96 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point, the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
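The job scripts in this directory differ only in the job-name suffix and the freq value (1110 above, 1125 below), which suggests they are instances of a single template; a hedged sketch of such a generator, where the template file and its @FREQ@ placeholder are assumptions rather than files in this repository:

#!/usr/bin/env bash
# sketch: stamp out one job script per GPU clock limit from a template
for freq in 1110 1125; do
    sed "s/@FREQ@/${freq}/g" power-16A.sh.in > "power-16A-${freq}.sh"
done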

View File

@ -0,0 +1,2 @@
Sat Aug 20 21:09:39 BST 2022
epoch 1661026179

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,2 @@
Sat Aug 20 21:18:10 BST 2022
epoch 1661026690

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffe04b26000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014ffbc78a000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014ffbc3c2000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014ffbbed0000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014ffbbba6000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014ffbb8c5000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014ffbb664000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014ffbc711000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014ffbb284000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014ffb9b28000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014ffb9758000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014ffb94b7000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014ffb938c000)
libm.so.6 => /lib64/libm.so.6 (0x000014ffb900a000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014ffb8dd3000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014ffb8bbb000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014ffb899b000)
libc.so.6 => /lib64/libc.so.6 (0x000014ffb85d6000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014ffb83d2000)
/lib64/ld-linux-x86-64.so.2 (0x000014ffbc5da000)
librt.so.1 => /lib64/librt.so.1 (0x000014ffb81ca000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014ffbc645000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014ffbc640000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014ffb80be000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014ffb7eb4000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014ffb7cb0000)

View File

@ -0,0 +1,286 @@
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
local rank 1 device 0 bus id: 0000:44:00.0
local rank 3 device 0 bus id: 0000:C4:00.0
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x146500000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.503072 s : Grid Layout
Grid : Message : 1.503076 s : Global lattice size : 48 48 48 96
Grid : Message : 1.503081 s : OpenMP threads : 4
Grid : Message : 1.503083 s : MPI tasks : 2 2 2 8
Grid : Message : 1.518479 s : Making s innermost grids
Grid : Message : 1.535611 s : Initialising 4d RNG
Grid : Message : 1.551229 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.551252 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.805667 s : Initialising 5d RNG
Grid : Message : 2.356490 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 2.357030 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 7.303785 s : Initialised RNGs
Grid : Message : 8.385261 s : Drawing gauge field
Grid : Message : 8.496485 s : Random gauge initialised
Grid : Message : 8.509783 s : Setting up Cshift based reference
Grid : Message : 13.609539 s : *****************************************************************
Grid : Message : 13.609564 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 13.609566 s : *****************************************************************
Grid : Message : 13.609568 s : *****************************************************************
Grid : Message : 13.609573 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 13.609575 s : * Vectorising space-time by 8
Grid : Message : 13.609577 s : * VComplexF size is 64 B
Grid : Message : 13.609579 s : * SINGLE precision
Grid : Message : 13.609582 s : * Using Overlapped Comms/Compute
Grid : Message : 13.609584 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 13.609586 s : *****************************************************************
Grid : Message : 14.155991 s : Called warmup
Grid : Message : 98.420612 s : Called Dw 30000 times in 8.42644e+07 us
Grid : Message : 98.420675 s : mflop/s = 7.983e+07
Grid : Message : 98.420677 s : mflop/s per rank = 1.24734e+06
Grid : Message : 98.420679 s : mflop/s per node = 4.98937e+06
Grid : Message : 98.420681 s : RF GiB/s (base 2) = 162213
Grid : Message : 98.420683 s : mem GiB/s (base 2) = 101383
Grid : Message : 98.421254 s : norm diff 1.05775e-13
Grid : Message : 98.431170 s : #### Dhop calls report
Grid : Message : 98.431178 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 98.431182 s : WilsonFermion5D TotalTime /Calls : 1405.63 us
Grid : Message : 98.431184 s : WilsonFermion5D CommTime /Calls : 961.451 us
Grid : Message : 98.431186 s : WilsonFermion5D FaceTime /Calls : 222.433 us
Grid : Message : 98.431188 s : WilsonFermion5D ComputeTime1/Calls : 2.80214 us
Grid : Message : 98.431190 s : WilsonFermion5D ComputeTime2/Calls : 234.1 us
Grid : Message : 98.431212 s : Average mflops/s per call : 3.60793e+10
Grid : Message : 98.431216 s : Average mflops/s per call per rank : 5.63738e+08
Grid : Message : 98.431218 s : Average mflops/s per call per node : 2.25495e+09
Grid : Message : 98.431220 s : Average mflops/s per call (full) : 8.12107e+07
Grid : Message : 98.431224 s : Average mflops/s per call per rank (full): 1.26892e+06
Grid : Message : 98.431226 s : Average mflops/s per call per node (full): 5.07567e+06
Grid : Message : 98.431229 s : WilsonFermion5D Stencil
Grid : Message : 98.431230 s : WilsonFermion5D StencilEven
Grid : Message : 98.431235 s : WilsonFermion5D StencilOdd
Grid : Message : 98.431239 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 98.431240 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 98.431241 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 107.161203 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 107.161230 s : Called DwDag
Grid : Message : 107.161231 s : norm dag result 12.0422
Grid : Message : 107.163717 s : norm dag ref 12.0422
Grid : Message : 107.166717 s : norm dag diff 7.28899e-14
Grid : Message : 107.181064 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 107.248613 s : src_e0.5
Grid : Message : 107.314227 s : src_o0.5
Grid : Message : 107.331787 s : *********************************************************
Grid : Message : 107.331790 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 107.331792 s : * Vectorising space-time by 8
Grid : Message : 107.331794 s : * SINGLE precision
Grid : Message : 107.331795 s : * Using Overlapped Comms/Compute
Grid : Message : 107.331796 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 107.331797 s : *********************************************************
Grid : Message : 152.337360 s : Deo mflop/s = 7.47496e+07
Grid : Message : 152.337387 s : Deo mflop/s per rank 1.16796e+06
Grid : Message : 152.337390 s : Deo mflop/s per node 4.67185e+06
Grid : Message : 152.337396 s : #### Dhop calls report
Grid : Message : 152.337399 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 152.337402 s : WilsonFermion5D TotalTime /Calls : 1500 us
Grid : Message : 152.337405 s : WilsonFermion5D CommTime /Calls : 1002.91 us
Grid : Message : 152.337408 s : WilsonFermion5D FaceTime /Calls : 282.963 us
Grid : Message : 152.337410 s : WilsonFermion5D ComputeTime1/Calls : 4.71911 us
Grid : Message : 152.337412 s : WilsonFermion5D ComputeTime2/Calls : 237.647 us
Grid : Message : 152.337435 s : Average mflops/s per call : 2.07759e+10
Grid : Message : 152.337439 s : Average mflops/s per call per rank : 3.24624e+08
Grid : Message : 152.337441 s : Average mflops/s per call per node : 1.29849e+09
Grid : Message : 152.337445 s : Average mflops/s per call (full) : 7.61013e+07
Grid : Message : 152.337448 s : Average mflops/s per call per rank (full): 1.18908e+06
Grid : Message : 152.337451 s : Average mflops/s per call per node (full): 4.75633e+06
Grid : Message : 152.337453 s : WilsonFermion5D Stencil
Grid : Message : 152.337456 s : WilsonFermion5D StencilEven
Grid : Message : 152.337457 s : WilsonFermion5D StencilOdd
Grid : Message : 152.337459 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 152.337462 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 152.337463 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 152.358219 s : r_e6.02106
Grid : Message : 152.359968 s : r_o6.0211
Grid : Message : 152.361373 s : res12.0422
Grid : Message : 152.467780 s : norm diff 0
Grid : Message : 152.609427 s : norm diff even 0
Grid : Message : 152.675745 s : norm diff odd 0

View File

@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-16A-1125
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1125
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 48.48.48.96 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point, the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
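nvidia-smi dmon with '-o DT' prefixes each sample row with date and time columns, which is presumably what dmon-to-db.sh (not included here) keys on when building the monitoring database; a minimal sketch of capturing and stopping such a trace the way these scripts do:

# sketch: capture a short timestamped GPU telemetry trace, then stop it
tmp=$(mktemp)
coproc nvidia-smi dmon -o DT &> "${tmp}"
sleep 30                      # sampling window
kill -INT "${COPROC_PID}"
head "${tmp}"                 # rows carry YYYYMMDD HH:MM:SS columns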

View File

@ -0,0 +1,2 @@
Sat Aug 20 21:15:27 BST 2022
epoch 1661026527

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,2 @@
Sat Aug 20 21:23:53 BST 2022
epoch 1661027033

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffebcf65000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014c5058a0000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014c5054d8000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014c504fe6000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014c504cbc000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014c5049db000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014c50477a000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014c505827000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014c50439a000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014c502c3e000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014c50286e000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014c5025cd000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014c5024a2000)
libm.so.6 => /lib64/libm.so.6 (0x000014c502120000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014c501ee9000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014c501cd1000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014c501ab1000)
libc.so.6 => /lib64/libc.so.6 (0x000014c5016ec000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014c5014e8000)
/lib64/ld-linux-x86-64.so.2 (0x000014c5056f0000)
librt.so.1 => /lib64/librt.so.1 (0x000014c5012e0000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014c50575b000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014c505756000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014c5011d4000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014c500fca000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014c500dc6000)

Some files were not shown because too many files have changed in this diff