Initial commit

2022-09-07 17:31:28 +01:00
commit ade190016a
8502 changed files with 4552538 additions and 0 deletions

@@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large

@@ -0,0 +1,2 @@
Tue Aug 23 02:04:53 BST 2022
epoch 1661216693

File diff suppressed because one or more lines are too long

@@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007fffb8bf1000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014bae8f40000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014bae8b78000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014bae8686000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014bae835c000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014bae807b000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014bae7e1a000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014bae8ec7000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014bae7a3a000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014bae62de000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014bae5f0e000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014bae5c6d000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014bae5b42000)
libm.so.6 => /lib64/libm.so.6 (0x000014bae57c0000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014bae5589000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014bae5371000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014bae5151000)
libc.so.6 => /lib64/libc.so.6 (0x000014bae4d8c000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014bae4b88000)
/lib64/ld-linux-x86-64.so.2 (0x000014bae8d90000)
librt.so.1 => /lib64/librt.so.1 (0x000014bae4980000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014bae8dfb000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014bae8df6000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014bae4874000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014bae466a000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014bae4466000)

@@ -0,0 +1,286 @@
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 2 device 0 bus id: 0000:84:00.0
local rank 3 device 0 bus id: 0000:C4:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 1 device 0 bus id: 0000:44:00.0
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x152200000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.564675 s : Grid Layout
Grid : Message : 1.564678 s : Global lattice size : 64 64 64 256
Grid : Message : 1.564685 s : OpenMP threads : 4
Grid : Message : 1.564688 s : MPI tasks : 2 2 2 8
Grid : Message : 1.603567 s : Making s innermost grids
Grid : Message : 1.688222 s : Initialising 4d RNG
Grid : Message : 1.786208 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.786240 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 3.391014 s : Initialising 5d RNG
Grid : Message : 4.831522 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 4.831565 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 37.988173 s : Initialised RNGs
Grid : Message : 42.605948 s : Drawing gauge field
Grid : Message : 43.494632 s : Random gauge initialised
Grid : Message : 43.507832 s : Setting up Cshift based reference
Grid : Message : 72.502242 s : *****************************************************************
Grid : Message : 72.502275 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 72.502277 s : *****************************************************************
Grid : Message : 72.502278 s : *****************************************************************
Grid : Message : 72.502279 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 72.502280 s : * Vectorising space-time by 8
Grid : Message : 72.502281 s : * VComplexF size is 64 B
Grid : Message : 72.502282 s : * SINGLE precision
Grid : Message : 72.502285 s : * Using Overlapped Comms/Compute
Grid : Message : 72.502286 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 72.502287 s : *****************************************************************
Grid : Message : 74.519440 s : Called warmup
Grid : Message : 363.130822 s : Called Dw 30000 times in 2.8861e+08 us
Grid : Message : 363.130900 s : mflop/s = 1.47327e+08
Grid : Message : 363.130902 s : mflop/s per rank = 2.30199e+06
Grid : Message : 363.130904 s : mflop/s per node = 9.20796e+06
Grid : Message : 363.130906 s : RF GiB/s (base 2) = 299366
Grid : Message : 363.130908 s : mem GiB/s (base 2) = 187104
Grid : Message : 363.134410 s : norm diff 1.06407e-13
Grid : Message : 363.184684 s : #### Dhop calls report
Grid : Message : 363.184691 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 363.184699 s : WilsonFermion5D TotalTime /Calls : 4813.86 us
Grid : Message : 363.184703 s : WilsonFermion5D CommTime /Calls : 3339.84 us
Grid : Message : 363.184707 s : WilsonFermion5D FaceTime /Calls : 483.232 us
Grid : Message : 363.184711 s : WilsonFermion5D ComputeTime1/Calls : 5.02507 us
Grid : Message : 363.184715 s : WilsonFermion5D ComputeTime2/Calls : 1003.93 us
Grid : Message : 363.184807 s : Average mflops/s per call : 1.25444e+11
Grid : Message : 363.184812 s : Average mflops/s per call per rank : 1.96006e+09
Grid : Message : 363.184814 s : Average mflops/s per call per node : 7.84022e+09
Grid : Message : 363.184817 s : Average mflops/s per call (full) : 1.49891e+08
Grid : Message : 363.184821 s : Average mflops/s per call per rank (full): 2.34205e+06
Grid : Message : 363.184824 s : Average mflops/s per call per node (full): 9.36819e+06
Grid : Message : 363.184828 s : WilsonFermion5D Stencil
Grid : Message : 363.184830 s : WilsonFermion5D StencilEven
Grid : Message : 363.184832 s : WilsonFermion5D StencilOdd
Grid : Message : 363.184834 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 363.184836 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 363.184838 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 418.891291 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 418.891319 s : Called DwDag
Grid : Message : 418.891320 s : norm dag result 12.0421
Grid : Message : 418.959967 s : norm dag ref 12.0421
Grid : Message : 418.976362 s : norm dag diff 7.21924e-14
Grid : Message : 419.178360 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 419.420606 s : src_e0.499998
Grid : Message : 419.876063 s : src_o0.500002
Grid : Message : 419.990496 s : *********************************************************
Grid : Message : 419.990499 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 419.990500 s : * Vectorising space-time by 8
Grid : Message : 419.990501 s : * SINGLE precision
Grid : Message : 419.990502 s : * Using Overlapped Comms/Compute
Grid : Message : 419.990503 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 419.990504 s : *********************************************************
Grid : Message : 563.432552 s : Deo mflop/s = 1.48268e+08
Grid : Message : 563.432592 s : Deo mflop/s per rank 2.31669e+06
Grid : Message : 563.432595 s : Deo mflop/s per node 9.26677e+06
Grid : Message : 563.432601 s : #### Dhop calls report
Grid : Message : 563.432603 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 563.432606 s : WilsonFermion5D TotalTime /Calls : 4781.1 us
Grid : Message : 563.432610 s : WilsonFermion5D CommTime /Calls : 3216.26 us
Grid : Message : 563.432613 s : WilsonFermion5D FaceTime /Calls : 613.962 us
Grid : Message : 563.432616 s : WilsonFermion5D ComputeTime1/Calls : 6.00683 us
Grid : Message : 563.432619 s : WilsonFermion5D ComputeTime2/Calls : 975.369 us
Grid : Message : 563.432642 s : Average mflops/s per call : 1.03322e+11
Grid : Message : 563.432645 s : Average mflops/s per call per rank : 1.6144e+09
Grid : Message : 563.432647 s : Average mflops/s per call per node : 6.45761e+09
Grid : Message : 563.432649 s : Average mflops/s per call (full) : 1.50918e+08
Grid : Message : 563.432652 s : Average mflops/s per call per rank (full): 2.35809e+06
Grid : Message : 563.432655 s : Average mflops/s per call per node (full): 9.43237e+06
Grid : Message : 563.432658 s : WilsonFermion5D Stencil
Grid : Message : 563.432661 s : WilsonFermion5D StencilEven
Grid : Message : 563.432664 s : WilsonFermion5D StencilOdd
Grid : Message : 563.432667 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 563.432669 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 563.432671 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 563.507368 s : r_e6.02108
Grid : Message : 563.514171 s : r_o6.02101
Grid : Message : 563.520846 s : res12.0421
Grid : Message : 564.221415 s : norm diff 0
Grid : Message : 564.968341 s : norm diff even 0
Grid : Message : 565.377980 s : norm diff odd 0
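The throughput summary above is internally consistent: with 64 MPI ranks on 16 nodes (4 per node, as reported by SharedMemoryMpi), the per-rank and per-node figures are the total divided by 64 and by 16. A minimal cross-check from a job log, assuming the 'Grid : Message' line format shown above; the file name 'log' is a placeholder for the log written by the job script:

awk -F'= ' '/ mflop\/s = /         {total = $2 + 0}
            /mflop\/s per rank = / {rank  = $2 + 0}
            /mflop\/s per node = / {node  = $2 + 0; exit}
            END {
                printf "total/64 = %.5e (log: %.5e)\n", total / 64, rank
                printf "total/16 = %.5e (log: %.5e)\n", total / 16, node
            }' log

For the run above this prints 2.30198e+06 and 9.20794e+06, matching the reported per-rank and per-node lines up to rounding of the printed total.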

@@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]
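The node list above is stored in Slurm's compressed form and expands to the 16 hosts the job ran on. The script below recovers individual hostnames the same way, via scontrol (standard on any Slurm system):

scontrol show hostnames 'tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]'
# -> tu-c0r1n00, tu-c0r1n03, ..., tu-c0r2n21, one hostname per line, 16 in total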

@@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-16A-1005
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1005
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 64.64.64.256 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point, the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
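The ./gpu-mpi-wrapper.sh invoked by mpirun above is not included in this directory, but the per-rank lines at the top of the log ('tu-c0r1n00 - 1 device=1 binding=--interleave=2,3') and Grid's own hint that a wrapping script should set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and NUMA binding suggest its shape. A minimal sketch, assuming OpenMPI's OMPI_COMM_WORLD_LOCAL_RANK, one GPU per local rank, GPU n attached to NUMA nodes 2n and 2n+1, and one HCA per GPU (the exact device mapping is an assumption):

#!/usr/bin/env bash
# hypothetical reconstruction, not the original wrapper
lrank="${OMPI_COMM_WORLD_LOCAL_RANK}"        # 0..3 with 4 tasks per node
numa="$((2 * lrank)),$((2 * lrank + 1))"     # NUMA nodes closest to this GPU
export CUDA_VISIBLE_DEVICES="${lrank}"       # mask so each rank owns one GPU
export UCX_NET_DEVICES="mlx5_${lrank}:1"     # assumed one-HCA-per-GPU mapping
echo "$(hostname -s) - ${lrank} device=${lrank} binding=--interleave=${numa}"
exec numactl --interleave="${numa}" "$@"

The 'local rank N device 0 bus id' lines later in the log are consistent with this masking: every rank addresses its GPU as device 0, each on a different PCI bus.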

@@ -0,0 +1,2 @@
Tue Aug 23 01:55:18 BST 2022
epoch 1661216118

@@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large

@@ -0,0 +1,2 @@
Tue Aug 23 02:17:27 BST 2022
epoch 1661217447

File diff suppressed because one or more lines are too long

@@ -0,0 +1,26 @@
linux-vdso.so.1 (0x0000149b2d1fc000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x0000149b2d17c000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x0000149b2cdba000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x0000149b2c8c8000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x0000149b2c59e000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x0000149b2c2bd000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x0000149b2c05c000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x0000149b2d103000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x0000149b2bc7c000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x0000149b2a520000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x0000149b2a150000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x0000149b29eaf000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x0000149b29d84000)
libm.so.6 => /lib64/libm.so.6 (0x0000149b29a02000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x0000149b297cb000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x0000149b295b3000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x0000149b29393000)
libc.so.6 => /lib64/libc.so.6 (0x0000149b28fce000)
libdl.so.2 => /lib64/libdl.so.2 (0x0000149b28dca000)
/lib64/ld-linux-x86-64.so.2 (0x0000149b2cfd2000)
librt.so.1 => /lib64/librt.so.1 (0x0000149b28bc2000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x0000149b2d037000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x0000149b2d032000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x0000149b28ab6000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x0000149b288ac000)
libutil.so.1 => /lib64/libutil.so.1 (0x0000149b286a8000)

@@ -0,0 +1,286 @@
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 2 device 0 bus id: 0000:84:00.0
local rank 1 device 0 bus id: 0000:44:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x147620000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.489093 s : Grid Layout
Grid : Message : 1.489098 s : Global lattice size : 64 64 64 256
Grid : Message : 1.489103 s : OpenMP threads : 4
Grid : Message : 1.489104 s : MPI tasks : 2 2 2 8
Grid : Message : 1.528591 s : Making s innermost grids
Grid : Message : 1.575350 s : Initialising 4d RNG
Grid : Message : 1.669100 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.669130 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 3.309927 s : Initialising 5d RNG
Grid : Message : 4.777262 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 4.777300 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 37.984452 s : Initialised RNGs
Grid : Message : 42.778467 s : Drawing gauge field
Grid : Message : 43.470532 s : Random gauge initialised
Grid : Message : 43.486586 s : Setting up Cshift based reference
Grid : Message : 72.617045 s : *****************************************************************
Grid : Message : 72.617078 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 72.617080 s : *****************************************************************
Grid : Message : 72.617081 s : *****************************************************************
Grid : Message : 72.617082 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 72.617083 s : * Vectorising space-time by 8
Grid : Message : 72.617084 s : * VComplexF size is 64 B
Grid : Message : 72.617085 s : * SINGLE precision
Grid : Message : 72.617088 s : * Using Overlapped Comms/Compute
Grid : Message : 72.617089 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 72.617090 s : *****************************************************************
Grid : Message : 74.694006 s : Called warmup
Grid : Message : 361.481577 s : Called Dw 30000 times in 2.86786e+08 us
Grid : Message : 361.481630 s : mflop/s = 1.48264e+08
Grid : Message : 361.481632 s : mflop/s per rank = 2.31663e+06
Grid : Message : 361.481634 s : mflop/s per node = 9.26652e+06
Grid : Message : 361.481636 s : RF GiB/s (base 2) = 301270
Grid : Message : 361.481638 s : mem GiB/s (base 2) = 188294
Grid : Message : 361.485135 s : norm diff 1.06407e-13
Grid : Message : 361.534528 s : #### Dhop calls report
Grid : Message : 361.534535 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 361.534538 s : WilsonFermion5D TotalTime /Calls : 4782.86 us
Grid : Message : 361.534540 s : WilsonFermion5D CommTime /Calls : 3290.85 us
Grid : Message : 361.534542 s : WilsonFermion5D FaceTime /Calls : 486.032 us
Grid : Message : 361.534544 s : WilsonFermion5D ComputeTime1/Calls : 4.80859 us
Grid : Message : 361.534546 s : WilsonFermion5D ComputeTime2/Calls : 1019.4 us
Grid : Message : 361.534640 s : Average mflops/s per call : 1.22988e+11
Grid : Message : 361.534644 s : Average mflops/s per call per rank : 1.92169e+09
Grid : Message : 361.534646 s : Average mflops/s per call per node : 7.68674e+09
Grid : Message : 361.534648 s : Average mflops/s per call (full) : 1.50863e+08
Grid : Message : 361.534650 s : Average mflops/s per call per rank (full): 2.35723e+06
Grid : Message : 361.534652 s : Average mflops/s per call per node (full): 9.42891e+06
Grid : Message : 361.534654 s : WilsonFermion5D Stencil
Grid : Message : 361.534655 s : WilsonFermion5D StencilEven
Grid : Message : 361.534656 s : WilsonFermion5D StencilOdd
Grid : Message : 361.534657 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 361.534658 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 361.534659 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 417.144436 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 417.144464 s : Called DwDag
Grid : Message : 417.144465 s : norm dag result 12.0421
Grid : Message : 417.157328 s : norm dag ref 12.0421
Grid : Message : 417.173632 s : norm dag diff 7.21924e-14
Grid : Message : 417.219769 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 417.654289 s : src_e0.499998
Grid : Message : 418.528240 s : src_o0.500002
Grid : Message : 418.197825 s : *********************************************************
Grid : Message : 418.197830 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 418.197832 s : * Vectorising space-time by 8
Grid : Message : 418.197836 s : * SINGLE precision
Grid : Message : 418.197839 s : * Using Overlapped Comms/Compute
Grid : Message : 418.197841 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 418.197843 s : *********************************************************
Grid : Message : 560.488767 s : Deo mflop/s = 1.49471e+08
Grid : Message : 560.488803 s : Deo mflop/s per rank 2.33548e+06
Grid : Message : 560.488805 s : Deo mflop/s per node 9.34194e+06
Grid : Message : 560.488807 s : #### Dhop calls report
Grid : Message : 560.488809 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 560.488811 s : WilsonFermion5D TotalTime /Calls : 4742.71 us
Grid : Message : 560.488813 s : WilsonFermion5D CommTime /Calls : 3161.44 us
Grid : Message : 560.488815 s : WilsonFermion5D FaceTime /Calls : 612.972 us
Grid : Message : 560.488817 s : WilsonFermion5D ComputeTime1/Calls : 5.88747 us
Grid : Message : 560.488819 s : WilsonFermion5D ComputeTime2/Calls : 993.403 us
Grid : Message : 560.488840 s : Average mflops/s per call : 1.03167e+11
Grid : Message : 560.488844 s : Average mflops/s per call per rank : 1.61199e+09
Grid : Message : 560.488846 s : Average mflops/s per call per node : 6.44794e+09
Grid : Message : 560.488848 s : Average mflops/s per call (full) : 1.5214e+08
Grid : Message : 560.488855 s : Average mflops/s per call per rank (full): 2.37718e+06
Grid : Message : 560.488860 s : Average mflops/s per call per node (full): 9.50872e+06
Grid : Message : 560.488863 s : WilsonFermion5D Stencil
Grid : Message : 560.488865 s : WilsonFermion5D StencilEven
Grid : Message : 560.488868 s : WilsonFermion5D StencilOdd
Grid : Message : 560.488873 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 560.488876 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 560.488878 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 560.561334 s : r_e6.02108
Grid : Message : 560.569153 s : r_o6.02101
Grid : Message : 560.575803 s : res12.0421
Grid : Message : 561.377555 s : norm diff 0
Grid : Message : 562.683230 s : norm diff even 0
Grid : Message : 562.467576 s : norm diff odd 0
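The layout lines near the top of each log tie the run options together: the global lattice 64.64.64.256 split over the 2.2.2.8 MPI decomposition gives every one of the 64 ranks a 32x32x32x32 local volume, consistent with the 'size-loc32' component of the benchmark path. A one-line check:

echo "$((64 / 2)) $((64 / 2)) $((64 / 2)) $((256 / 8))"   # -> 32 32 32 32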

@@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]

@@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-16A-1020
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1020
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 64.64.64.256 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point, the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
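Like the wrapper, dmon-to-db.sh is not part of this commit. The call above suggests it loads the nvidia-smi dmon samples collected in ${tmp} into an SQLite database, tagged with the clock limit. A minimal sketch under that assumption, using dmon's default columns (date and time from '-o DT', then gpu, pwr, gtemp, mtemp, sm, mem, enc, dec, mclk, pclk):

#!/usr/bin/env bash
# hypothetical reconstruction, not the original script
# usage: dmon-to-db.sh <dmon-output> <db-file> <tag>
dmon_out="$1"; db="$2"; tag="$3"
sqlite3 "${db}" 'CREATE TABLE IF NOT EXISTS dmon
    (tag TEXT, date TEXT, time TEXT, gpu INT, pwr REAL, gtemp REAL, mtemp REAL,
     sm REAL, mem REAL, enc REAL, dec REAL, mclk REAL, pclk REAL);'
csv="$(mktemp)"
# drop the '#' header lines, squeeze whitespace into CSV, prepend the tag column
grep -v '^#' "${dmon_out}" \
    | awk -v tag="${tag}" 'BEGIN{OFS=","} {$1 = $1; print tag, $0}' > "${csv}"
sqlite3 -cmd '.mode csv' "${db}" ".import ${csv} dmon"
rm -f "${csv}"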

@@ -0,0 +1,2 @@
Tue Aug 23 02:07:55 BST 2022
epoch 1661216875

@@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large

@@ -0,0 +1,2 @@
Tue Aug 23 02:30:01 BST 2022
epoch 1661218201

File diff suppressed because one or more lines are too long

@@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffd49dba000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x00001478590e3000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x0000147858d1b000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x0000147858829000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x00001478584ff000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014785821e000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x0000147857fbd000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014785906a000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x0000147857bdd000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x0000147856481000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x00001478560b1000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x0000147855e10000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x0000147855ce5000)
libm.so.6 => /lib64/libm.so.6 (0x0000147855963000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014785572c000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x0000147855514000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x00001478552f4000)
libc.so.6 => /lib64/libc.so.6 (0x0000147854f2f000)
libdl.so.2 => /lib64/libdl.so.2 (0x0000147854d2b000)
/lib64/ld-linux-x86-64.so.2 (0x0000147858f33000)
librt.so.1 => /lib64/librt.so.1 (0x0000147854b23000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x0000147858f9e000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x0000147858f99000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x0000147854a17000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014785480d000)
libutil.so.1 => /lib64/libutil.so.1 (0x0000147854609000)

@@ -0,0 +1,286 @@
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 2 device 0 bus id: 0000:84:00.0
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
local rank 1 device 0 bus id: 0000:44:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648 bytes at 0x14e320000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommitted changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.519784 s : Grid Layout
Grid : Message : 1.519788 s : Global lattice size : 64 64 64 256
Grid : Message : 1.519795 s : OpenMP threads : 4
Grid : Message : 1.519797 s : MPI tasks : 2 2 2 8
Grid : Message : 1.559178 s : Making s innermost grids
Grid : Message : 1.633619 s : Initialising 4d RNG
Grid : Message : 1.730190 s : Initialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.730225 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 3.601962 s : Initialising 5d RNG
Grid : Message : 5.128210 s : Initialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 5.128840 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 38.200978 s : Initialised RNGs
Grid : Message : 42.335127 s : Drawing gauge field
Grid : Message : 43.156005 s : Random gauge initialised
Grid : Message : 43.178252 s : Setting up Cshift based reference
Grid : Message : 72.129059 s : *****************************************************************
Grid : Message : 72.129086 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 72.129088 s : *****************************************************************
Grid : Message : 72.129089 s : *****************************************************************
Grid : Message : 72.129090 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 72.129091 s : * Vectorising space-time by 8
Grid : Message : 72.129092 s : * VComplexF size is 64 B
Grid : Message : 72.129093 s : * SINGLE precision
Grid : Message : 72.129096 s : * Using Overlapped Comms/Compute
Grid : Message : 72.129097 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 72.129098 s : *****************************************************************
Grid : Message : 74.239965 s : Called warmup
Grid : Message : 358.784280 s : Called Dw 30000 times in 2.84544e+08 us
Grid : Message : 358.784334 s : mflop/s = 1.49433e+08
Grid : Message : 358.784336 s : mflop/s per rank = 2.33489e+06
Grid : Message : 358.784338 s : mflop/s per node = 9.33955e+06
Grid : Message : 358.784340 s : RF GiB/s (base 2) = 303644
Grid : Message : 358.784342 s : mem GiB/s (base 2) = 189778
Grid : Message : 358.787842 s : norm diff 1.06407e-13
Grid : Message : 358.838249 s : #### Dhop calls report
Grid : Message : 358.838256 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 358.838260 s : WilsonFermion5D TotalTime /Calls : 4746.18 us
Grid : Message : 358.838262 s : WilsonFermion5D CommTime /Calls : 3270.98 us
Grid : Message : 358.838264 s : WilsonFermion5D FaceTime /Calls : 483.168 us
Grid : Message : 358.838266 s : WilsonFermion5D ComputeTime1/Calls : 4.63241 us
Grid : Message : 358.838268 s : WilsonFermion5D ComputeTime2/Calls : 1005.55 us
Grid : Message : 358.838374 s : Average mflops/s per call : 1.24606e+11
Grid : Message : 358.838377 s : Average mflops/s per call per rank : 1.94697e+09
Grid : Message : 358.838379 s : Average mflops/s per call per node : 7.78786e+09
Grid : Message : 358.838381 s : Average mflops/s per call (full) : 1.52028e+08
Grid : Message : 358.838388 s : Average mflops/s per call per rank (full): 2.37544e+06
Grid : Message : 358.838391 s : Average mflops/s per call per node (full): 9.50177e+06
Grid : Message : 358.838394 s : WilsonFermion5D Stencil
Grid : Message : 358.838396 s : WilsonFermion5D StencilEven
Grid : Message : 358.838399 s : WilsonFermion5D StencilOdd
Grid : Message : 358.838401 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 358.838404 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 358.838405 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 414.333761 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 414.333785 s : Called DwDag
Grid : Message : 414.333786 s : norm dag result 12.0421
Grid : Message : 414.361728 s : norm dag ref 12.0421
Grid : Message : 414.377939 s : norm dag diff 7.21924e-14
Grid : Message : 414.419027 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 414.775952 s : src_e0.499998
Grid : Message : 415.196810 s : src_o0.500002
Grid : Message : 415.306157 s : *********************************************************
Grid : Message : 415.306160 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 415.306161 s : * Vectorising space-time by 8
Grid : Message : 415.306162 s : * SINGLE precision
Grid : Message : 415.306163 s : * Using Overlapped Comms/Compute
Grid : Message : 415.306164 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 415.306165 s : *********************************************************
Grid : Message : 556.694065 s : Deo mflop/s = 1.50429e+08
Grid : Message : 556.694096 s : Deo mflop/s per rank 2.35046e+06
Grid : Message : 556.694098 s : Deo mflop/s per node 9.40183e+06
Grid : Message : 556.694101 s : #### Dhop calls report
Grid : Message : 556.694103 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 556.694105 s : WilsonFermion5D TotalTime /Calls : 4712.63 us
Grid : Message : 556.694107 s : WilsonFermion5D CommTime /Calls : 3147.17 us
Grid : Message : 556.694109 s : WilsonFermion5D FaceTime /Calls : 604.892 us
Grid : Message : 556.694111 s : WilsonFermion5D ComputeTime1/Calls : 5.86504 us
Grid : Message : 556.694113 s : WilsonFermion5D ComputeTime2/Calls : 986.801 us
Grid : Message : 556.694138 s : Average mflops/s per call : 1.02948e+11
Grid : Message : 556.694141 s : Average mflops/s per call per rank : 1.60857e+09
Grid : Message : 556.694143 s : Average mflops/s per call per node : 6.43427e+09
Grid : Message : 556.694145 s : Average mflops/s per call (full) : 1.53111e+08
Grid : Message : 556.694147 s : Average mflops/s per call per rank (full): 2.39236e+06
Grid : Message : 556.694151 s : Average mflops/s per call per node (full): 9.56943e+06
Grid : Message : 556.694154 s : WilsonFermion5D Stencil
Grid : Message : 556.694155 s : WilsonFermion5D StencilEven
Grid : Message : 556.694157 s : WilsonFermion5D StencilOdd
Grid : Message : 556.694158 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 556.694159 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 556.694160 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 556.767541 s : r_e6.02108
Grid : Message : 556.776129 s : r_o6.02101
Grid : Message : 556.782720 s : res12.0421
Grid : Message : 557.406532 s : norm diff 0
Grid : Message : 558.192507 s : norm diff even 0
Grid : Message : 558.691926 s : norm diff odd 0
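As a sanity check, the Dw mflop/s figure above can be reconstructed from the call count and wall time, assuming the conventional 1320 floating-point operations per 4d site per s-slice for the Wilson dslash and Ls = 16 (neither number is printed in this log, so both are assumptions):

# back-of-the-envelope check of the reported 1.49433e+08 mflop/s
flops_per_call=$(echo "1320 * 16 * 64^3 * 256" | bc)   # ~1.417e12 flop per Dw call
echo "${flops_per_call} * 30000 / 284544000" | bc      # mflop/s, since the time is in us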

View File

@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-16A-1035
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
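# force the UCX PML and disable the uct/openib BTLs; UCX is restricted to
# RC verbs, shared memory and the CUDA transports so that device buffers
# move over GPUDirect RDMA (UCX_IB_GPU_DIRECT_RDMA=yes); the pointer-type
# cache is disabled (UCX_MEMTYPE_CACHE=n), a common workaround for
# misdetected CUDA allocations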
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
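# NB: the literal 16 below is presumably the node count baked in when this
# script was generated (hence the SC2050/SC2170 suppressions above);
# ompio is only selected for single-node jobs, romio321 otherwise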
if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
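# 'nvidia-smi -ac <mem,graphics>' pins the application clocks: 1215 MHz is
# the A100 memory clock and ${freq} the graphics clock under test; the
# clocks are restored to the 1410 MHz default at the end of the job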
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1035
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
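# --mpi 2.2.2.8 decomposes the 64 ranks into a 2x2x2x8 processor grid over
# the 64.64.64.256 global lattice, i.e. a 32.32.32.32 local volume per GPU
# (the "size-loc32" of this directory); --shm 2048 reserves 2048 MiB of
# device memory for communication buffers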
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 64.64.64.256 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point, the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
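The per-rank 'device=... binding=--interleave=...' lines at the top of each log come from gpu-mpi-wrapper.sh, which the script invokes but which is not part of this commit. A minimal sketch of such a wrapper, assuming OpenMPI's OMPI_COMM_WORLD_LOCAL_RANK, four ranks per node, and one HCA per GPU (the mlx5 naming is an assumption), could look like:

#!/usr/bin/env bash
# hypothetical gpu-mpi-wrapper.sh: pin each local MPI rank to one GPU,
# one NIC and two NUMA nodes, then exec the application under numactl
lrank=${OMPI_COMM_WORLD_LOCAL_RANK:?must be launched through mpirun}
numa="$((2 * lrank)),$((2 * lrank + 1))"
export CUDA_VISIBLE_DEVICES=${lrank}       # each rank then sees its GPU as device 0
export UCX_NET_DEVICES=mlx5_${lrank}:1     # assumed HCA naming, one per GPU
echo "$(hostname) - ${lrank} device=${lrank} binding=--interleave=${numa}"
exec numactl --interleave="${numa}" "$@"

With four ranks per node this reproduces the bindings logged above (e.g. local rank 3 -> GPU 3, NUMA nodes 6,7) and is consistent with the 'local rank N device 0' lines printed by AcceleratorCudaInit.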

View File

@ -0,0 +1,2 @@
Tue Aug 23 02:20:32 BST 2022
epoch 1661217632

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Tue Aug 23 02:42:29 BST 2022
epoch 1661218949

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffe50742000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014bad727e000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014bad6eb6000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014bad69c4000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014bad669a000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014bad63b9000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014bad6158000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014bad7205000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014bad5d78000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014bad461c000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014bad424c000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014bad3fab000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014bad3e80000)
libm.so.6 => /lib64/libm.so.6 (0x000014bad3afe000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014bad38c7000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014bad36af000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014bad348f000)
libc.so.6 => /lib64/libc.so.6 (0x000014bad30ca000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014bad2ec6000)
/lib64/ld-linux-x86-64.so.2 (0x000014bad70ce000)
librt.so.1 => /lib64/librt.so.1 (0x000014bad2cbe000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014bad7139000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014bad7134000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014bad2bb2000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014bad29a8000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014bad27a4000)

View File

@ -0,0 +1,286 @@
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 1 device 0 bus id: 0000:44:00.0
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
local rank 2 device 0 bus id: 0000:84:00.0
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648 bytes at 0x1508a0000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommitted changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.547283 s : Grid Layout
Grid : Message : 1.547287 s : Global lattice size : 64 64 64 256
Grid : Message : 1.547292 s : OpenMP threads : 4
Grid : Message : 1.547294 s : MPI tasks : 2 2 2 8
Grid : Message : 1.587435 s : Making s innermost grids
Grid : Message : 1.643389 s : Initialising 4d RNG
Grid : Message : 1.736475 s : Initialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.736504 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 3.526644 s : Initialising 5d RNG
Grid : Message : 5.205300 s : Initialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 5.211500 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 38.189754 s : Initialised RNGs
Grid : Message : 42.704030 s : Drawing gauge field
Grid : Message : 43.504353 s : Random gauge initialised
Grid : Message : 43.515972 s : Setting up Cshift based reference
Grid : Message : 72.524541 s : *****************************************************************
Grid : Message : 72.524570 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 72.524572 s : *****************************************************************
Grid : Message : 72.524579 s : *****************************************************************
Grid : Message : 72.524581 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 72.524584 s : * Vectorising space-time by 8
Grid : Message : 72.524587 s : * VComplexF size is 64 B
Grid : Message : 72.524589 s : * SINGLE precision
Grid : Message : 72.524592 s : * Using Overlapped Comms/Compute
Grid : Message : 72.524593 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 72.524595 s : *****************************************************************
Grid : Message : 74.535913 s : Called warmup
Grid : Message : 358.201124 s : Called Dw 30000 times in 2.83665e+08 us
Grid : Message : 358.201171 s : mflop/s = 1.49896e+08
Grid : Message : 358.201174 s : mflop/s per rank = 2.34212e+06
Grid : Message : 358.201180 s : mflop/s per node = 9.36848e+06
Grid : Message : 358.201183 s : RF GiB/s (base 2) = 304585
Grid : Message : 358.201186 s : mem GiB/s (base 2) = 190365
Grid : Message : 358.204691 s : norm diff 1.06407e-13
Grid : Message : 358.255404 s : #### Dhop calls report
Grid : Message : 358.255412 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 358.255417 s : WilsonFermion5D TotalTime /Calls : 4731.41 us
Grid : Message : 358.255421 s : WilsonFermion5D CommTime /Calls : 3246.57 us
Grid : Message : 358.255424 s : WilsonFermion5D FaceTime /Calls : 480.383 us
Grid : Message : 358.255427 s : WilsonFermion5D ComputeTime1/Calls : 4.99283 us
Grid : Message : 358.255431 s : WilsonFermion5D ComputeTime2/Calls : 1017.68 us
Grid : Message : 358.255489 s : Average mflops/s per call : 1.25152e+11
Grid : Message : 358.255493 s : Average mflops/s per call per rank : 1.9555e+09
Grid : Message : 358.255495 s : Average mflops/s per call per node : 7.82198e+09
Grid : Message : 358.255498 s : Average mflops/s per call (full) : 1.52503e+08
Grid : Message : 358.255501 s : Average mflops/s per call per rank (full): 2.38286e+06
Grid : Message : 358.255504 s : Average mflops/s per call per node (full): 9.53145e+06
Grid : Message : 358.255508 s : WilsonFermion5D Stencil
Grid : Message : 358.255510 s : WilsonFermion5D StencilEven
Grid : Message : 358.255513 s : WilsonFermion5D StencilOdd
Grid : Message : 358.255516 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 358.255518 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 358.255521 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 413.645728 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 413.645754 s : Called DwDag
Grid : Message : 413.645755 s : norm dag result 12.0421
Grid : Message : 413.669832 s : norm dag ref 12.0421
Grid : Message : 413.685970 s : norm dag diff 7.21924e-14
Grid : Message : 413.728176 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 414.102420 s : src_e0.499998
Grid : Message : 414.504152 s : src_o0.500002
Grid : Message : 414.602493 s : *********************************************************
Grid : Message : 414.602496 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 414.602497 s : * Vectorising space-time by 8
Grid : Message : 414.602498 s : * SINGLE precision
Grid : Message : 414.602499 s : * Using Overlapped Comms/Compute
Grid : Message : 414.602501 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 414.602502 s : *********************************************************
Grid : Message : 555.126889 s : Deo mflop/s = 1.51345e+08
Grid : Message : 555.126928 s : Deo mflop/s per rank 2.36476e+06
Grid : Message : 555.126930 s : Deo mflop/s per node 9.45904e+06
Grid : Message : 555.126933 s : #### Dhop calls report
Grid : Message : 555.126935 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 555.126937 s : WilsonFermion5D TotalTime /Calls : 4683.8 us
Grid : Message : 555.126939 s : WilsonFermion5D CommTime /Calls : 3108.43 us
Grid : Message : 555.126941 s : WilsonFermion5D FaceTime /Calls : 597.314 us
Grid : Message : 555.126943 s : WilsonFermion5D ComputeTime1/Calls : 5.94274 us
Grid : Message : 555.126945 s : WilsonFermion5D ComputeTime2/Calls : 1003.24 us
Grid : Message : 555.126973 s : Average mflops/s per call : 1.04341e+11
Grid : Message : 555.126978 s : Average mflops/s per call per rank : 1.63033e+09
Grid : Message : 555.126984 s : Average mflops/s per call per node : 6.52133e+09
Grid : Message : 555.126989 s : Average mflops/s per call (full) : 1.54053e+08
Grid : Message : 555.126991 s : Average mflops/s per call per rank (full): 2.40708e+06
Grid : Message : 555.126996 s : Average mflops/s per call per node (full): 9.62833e+06
Grid : Message : 555.126998 s : WilsonFermion5D Stencil
Grid : Message : 555.127001 s : WilsonFermion5D StencilEven
Grid : Message : 555.127003 s : WilsonFermion5D StencilOdd
Grid : Message : 555.127006 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 555.127008 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 555.127011 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 555.200537 s : r_e6.02108
Grid : Message : 555.207490 s : r_o6.02101
Grid : Message : 555.214018 s : res12.0421
Grid : Message : 555.847686 s : norm diff 0
Grid : Message : 556.597525 s : norm diff even 0
Grid : Message : 557.711160 s : norm diff odd 0

View File

@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-16A-1050
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1050
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 64.64.64.256 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point, the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
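Like the wrapper, dmon-to-db.sh is not included in this commit. Assuming it loads the 'nvidia-smi dmon -o DT' samples collected above into a SQLite table named after the clock limit (the third argument), a sketch might be:

#!/usr/bin/env bash
# hypothetical dmon-to-db.sh: $1 = dmon log, $2 = sqlite database, $3 = table
log=$1; db=$2; table=$3
sqlite3 "${db}" "CREATE TABLE IF NOT EXISTS ${table}
  (date TEXT, time TEXT, gpu INT, pwr REAL, sm REAL, mclk REAL, pclk REAL);"
# default 'dmon -o DT' columns: date time gpu pwr gtemp mtemp sm mem enc dec mclk pclk
grep -v '^#' "${log}" | awk -v t="${table}" '{
  printf "INSERT INTO %s VALUES(\"%s\",\"%s\",%s,\"%s\",\"%s\",\"%s\",\"%s\");\n",
         t, $1, $2, $3, $4, $7, $11, $12
}' | sqlite3 "${db}"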

View File

@ -0,0 +1,2 @@
Tue Aug 23 02:33:02 BST 2022
epoch 1661218382

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Tue Aug 23 02:54:59 BST 2022
epoch 1661219699

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffec9b2e000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x00001470ad4fc000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x00001470ad134000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x00001470acc42000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x00001470ac918000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x00001470ac637000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x00001470ac3d6000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x00001470ad483000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x00001470abff6000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x00001470aa89a000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x00001470aa4ca000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x00001470aa229000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x00001470aa0fe000)
libm.so.6 => /lib64/libm.so.6 (0x00001470a9d7c000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x00001470a9b45000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x00001470a992d000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x00001470a970d000)
libc.so.6 => /lib64/libc.so.6 (0x00001470a9348000)
libdl.so.2 => /lib64/libdl.so.2 (0x00001470a9144000)
/lib64/ld-linux-x86-64.so.2 (0x00001470ad34c000)
librt.so.1 => /lib64/librt.so.1 (0x00001470a8f3c000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x00001470ad3b7000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x00001470ad3b2000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x00001470a8e30000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x00001470a8c26000)
libutil.so.1 => /lib64/libutil.so.1 (0x00001470a8a22000)

View File

@ -0,0 +1,286 @@
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 1 device 0 bus id: 0000:44:00.0
local rank 2 device 0 bus id: 0000:84:00.0
local rank 3 device 0 bus id: 0000:C4:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648 bytes at 0x14eca0000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommitted changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.674336 s : Grid Layout
Grid : Message : 1.674340 s : Global lattice size : 64 64 64 256
Grid : Message : 1.674346 s : OpenMP threads : 4
Grid : Message : 1.674349 s : MPI tasks : 2 2 2 8
Grid : Message : 1.716560 s : Making s innermost grids
Grid : Message : 1.771902 s : Initialising 4d RNG
Grid : Message : 1.868575 s : Initialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.868615 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 3.637285 s : Initialising 5d RNG
Grid : Message : 5.332800 s : Initialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 5.333390 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 39.118671 s : Initialised RNGs
Grid : Message : 42.869064 s : Drawing gauge field
Grid : Message : 43.712953 s : Random gauge initialised
Grid : Message : 43.724865 s : Setting up Cshift based reference
Grid : Message : 72.822608 s : *****************************************************************
Grid : Message : 72.822634 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 72.822636 s : *****************************************************************
Grid : Message : 72.822637 s : *****************************************************************
Grid : Message : 72.822638 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 72.822639 s : * Vectorising space-time by 8
Grid : Message : 72.822640 s : * VComplexF size is 64 B
Grid : Message : 72.822641 s : * SINGLE precision
Grid : Message : 72.822644 s : * Using Overlapped Comms/Compute
Grid : Message : 72.822645 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 72.822646 s : *****************************************************************
Grid : Message : 74.927617 s : Called warmup
Grid : Message : 359.291971 s : Called Dw 30000 times in 2.84363e+08 us
Grid : Message : 359.292022 s : mflop/s = 1.49528e+08
Grid : Message : 359.292025 s : mflop/s per rank = 2.33637e+06
Grid : Message : 359.292032 s : mflop/s per node = 9.34548e+06
Grid : Message : 359.292035 s : RF GiB/s (base 2) = 303837
Grid : Message : 359.292038 s : mem GiB/s (base 2) = 189898
Grid : Message : 359.295542 s : norm diff 1.06407e-13
Grid : Message : 359.345144 s : #### Dhop calls report
Grid : Message : 359.345152 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 359.345158 s : WilsonFermion5D TotalTime /Calls : 4743.11 us
Grid : Message : 359.345162 s : WilsonFermion5D CommTime /Calls : 3276.42 us
Grid : Message : 359.345165 s : WilsonFermion5D FaceTime /Calls : 480.328 us
Grid : Message : 359.345167 s : WilsonFermion5D ComputeTime1/Calls : 4.57995 us
Grid : Message : 359.345169 s : WilsonFermion5D ComputeTime2/Calls : 999.257 us
Grid : Message : 359.345237 s : Average mflops/s per call : 1.2427e+11
Grid : Message : 359.345241 s : Average mflops/s per call per rank : 1.94172e+09
Grid : Message : 359.345244 s : Average mflops/s per call per node : 7.7669e+09
Grid : Message : 359.345247 s : Average mflops/s per call (full) : 1.52127e+08
Grid : Message : 359.345251 s : Average mflops/s per call per rank (full): 2.37698e+06
Grid : Message : 359.345253 s : Average mflops/s per call per node (full): 9.50794e+06
Grid : Message : 359.345256 s : WilsonFermion5D Stencil
Grid : Message : 359.345258 s : WilsonFermion5D StencilEven
Grid : Message : 359.345260 s : WilsonFermion5D StencilOdd
Grid : Message : 359.345262 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 359.345264 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 359.345266 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 414.850960 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 414.850985 s : Called DwDag
Grid : Message : 414.850986 s : norm dag result 12.0421
Grid : Message : 414.876032 s : norm dag ref 12.0421
Grid : Message : 414.892131 s : norm dag diff 7.21924e-14
Grid : Message : 414.932833 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 415.336415 s : src_e0.499998
Grid : Message : 415.784915 s : src_o0.500002
Grid : Message : 415.911800 s : *********************************************************
Grid : Message : 415.911803 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 415.911804 s : * Vectorising space-time by 8
Grid : Message : 415.911805 s : * SINGLE precision
Grid : Message : 415.911806 s : * Using Overlapped Comms/Compute
Grid : Message : 415.911807 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 415.911808 s : *********************************************************
Grid : Message : 556.278711 s : Deo mflop/s = 1.51519e+08
Grid : Message : 556.278742 s : Deo mflop/s per rank 2.36748e+06
Grid : Message : 556.278745 s : Deo mflop/s per node 9.46991e+06
Grid : Message : 556.278751 s : #### Dhop calls report
Grid : Message : 556.278753 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 556.278757 s : WilsonFermion5D TotalTime /Calls : 4678.6 us
Grid : Message : 556.278761 s : WilsonFermion5D CommTime /Calls : 3123.43 us
Grid : Message : 556.278764 s : WilsonFermion5D FaceTime /Calls : 594.933 us
Grid : Message : 556.278768 s : WilsonFermion5D ComputeTime1/Calls : 6.01337 us
Grid : Message : 556.278771 s : WilsonFermion5D ComputeTime2/Calls : 985.509 us
Grid : Message : 556.278793 s : Average mflops/s per call : 1.02827e+11
Grid : Message : 556.278797 s : Average mflops/s per call per rank : 1.60668e+09
Grid : Message : 556.278799 s : Average mflops/s per call per node : 6.42671e+09
Grid : Message : 556.278801 s : Average mflops/s per call (full) : 1.54225e+08
Grid : Message : 556.278805 s : Average mflops/s per call per rank (full): 2.40976e+06
Grid : Message : 556.278809 s : Average mflops/s per call per node (full): 9.63904e+06
Grid : Message : 556.278812 s : WilsonFermion5D Stencil
Grid : Message : 556.278815 s : WilsonFermion5D StencilEven
Grid : Message : 556.278817 s : WilsonFermion5D StencilOdd
Grid : Message : 556.278820 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 556.278823 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 556.278825 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 556.353644 s : r_e6.02108
Grid : Message : 556.361051 s : r_o6.02101
Grid : Message : 556.367550 s : res12.0421
Grid : Message : 557.632190 s : norm diff 0
Grid : Message : 557.851410 s : norm diff even 0
Grid : Message : 558.237043 s : norm diff odd 0
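As a quick cross-check of the throughput figures above (a minimal sketch, not part of the original job: it assumes the usual 1320 flops per 4d site per s-slice and Ls=16, which is consistent with the numbers reported in this log):

awk 'BEGIN {
  flops_per_call = 1320 * 16 * 64*64*64*256    # one Dw call on the 64.64.64.256 grid, Ls=16
  mflops = flops_per_call * 30000 / 2.84363e8  # 30000 calls in 2.84363e+08 us; flop/us == Mflop/s
  printf "mflop/s          = %.5e\n", mflops   # ~1.49528e+08, matching the log
  printf "mflop/s per rank = %.5e\n", mflops/64  # 64 MPI ranks
  printf "mflop/s per node = %.5e\n", mflops/16  # 16 nodes
}'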

@@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]
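This is a SLURM bracketed host list covering the 16 nodes used by the jobs below; it can be expanded with, e.g.:

scontrol show hostnames 'tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]'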

@@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-16A-1065
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
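# (the literal 16 in the test above is presumably the node count substituted in
#  when this script is generated from a template, hence the constant condition
#  and the SC2050/SC2170 shellcheck disables at the top)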
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1065
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
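# nvidia-smi -ac takes <memory,graphics> application clocks in MHz, so this pins
# the graphics clock at ${freq} MHz with the memory clock fixed at 1215 MHz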
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
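# dmon's -o DT flag prepends date and time columns to each sample, presumably
# what dmon-to-db.sh uses for timestamps when building the database below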
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 64.64.64.256 \
--shm 2048 &> "${job_info_dir}/log"
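# the 2.2.2.8 MPI decomposition gives each of the 2*2*2*8 = 64 ranks a local
# 32.32.32.32 volume of the 64.64.64.256 grid, matching "size-loc32" in the paths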
# if we reach this point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################

@@ -0,0 +1,2 @@
Tue Aug 23 02:45:31 BST 2022
epoch 1661219131

@@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large

@@ -0,0 +1,2 @@
Tue Aug 23 03:07:25 BST 2022
epoch 1661220445

File diff suppressed because one or more lines are too long

@@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffe74893000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014d458e71000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014d458aa9000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014d4585b7000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014d45828d000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014d457fac000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014d457d4b000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014d458df8000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014d45796b000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014d45620f000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014d455e3f000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014d455b9e000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014d455a73000)
libm.so.6 => /lib64/libm.so.6 (0x000014d4556f1000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014d4554ba000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014d4552a2000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014d455082000)
libc.so.6 => /lib64/libc.so.6 (0x000014d454cbd000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014d454ab9000)
/lib64/ld-linux-x86-64.so.2 (0x000014d458cc1000)
librt.so.1 => /lib64/librt.so.1 (0x000014d4548b1000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014d458d2c000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014d458d27000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014d4547a5000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014d45459b000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014d454397000)

@@ -0,0 +1,286 @@
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 1 device 0 bus id: 0000:44:00.0
local rank 3 device 0 bus id: 0000:C4:00.0
local rank 2 device 0 bus id: 0000:84:00.0
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14e280000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommitted changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.460224 s : Grid Layout
Grid : Message : 1.460230 s : Global lattice size : 64 64 64 256
Grid : Message : 1.460237 s : OpenMP threads : 4
Grid : Message : 1.460239 s : MPI tasks : 2 2 2 8
Grid : Message : 1.501038 s : Making s innermost grids
Grid : Message : 1.548447 s : Initialising 4d RNG
Grid : Message : 1.641350 s : Initialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.641379 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 3.415745 s : Initialising 5d RNG
Grid : Message : 4.858093 s : Initialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 4.858127 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 38.377226 s : Initialised RNGs
Grid : Message : 42.241334 s : Drawing gauge field
Grid : Message : 43.676130 s : Random gauge initialised
Grid : Message : 43.793380 s : Setting up Cshift based reference
Grid : Message : 72.567410 s : *****************************************************************
Grid : Message : 72.567680 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 72.567700 s : *****************************************************************
Grid : Message : 72.567710 s : *****************************************************************
Grid : Message : 72.567720 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 72.567730 s : * Vectorising space-time by 8
Grid : Message : 72.567740 s : * VComplexF size is 64 B
Grid : Message : 72.567750 s : * SINGLE precision
Grid : Message : 72.567780 s : * Using Overlapped Comms/Compute
Grid : Message : 72.567790 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 72.567800 s : *****************************************************************
Grid : Message : 74.181282 s : Called warmup
Grid : Message : 356.575277 s : Called Dw 30000 times in 2.82393e+08 us
Grid : Message : 356.575346 s : mflop/s = 1.50571e+08
Grid : Message : 356.575348 s : mflop/s per rank = 2.35267e+06
Grid : Message : 356.575350 s : mflop/s per node = 9.41067e+06
Grid : Message : 356.575352 s : RF GiB/s (base 2) = 305956
Grid : Message : 356.575354 s : mem GiB/s (base 2) = 191223
Grid : Message : 356.578856 s : norm diff 1.06407e-13
Grid : Message : 356.627867 s : #### Dhop calls report
Grid : Message : 356.627875 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 356.627879 s : WilsonFermion5D TotalTime /Calls : 4710.82 us
Grid : Message : 356.627881 s : WilsonFermion5D CommTime /Calls : 3226.37 us
Grid : Message : 356.627883 s : WilsonFermion5D FaceTime /Calls : 477.831 us
Grid : Message : 356.627885 s : WilsonFermion5D ComputeTime1/Calls : 5.85034 us
Grid : Message : 356.627887 s : WilsonFermion5D ComputeTime2/Calls : 1019.91 us
Grid : Message : 356.627914 s : Average mflops/s per call : 1.229e+11
Grid : Message : 356.627918 s : Average mflops/s per call per rank : 1.92031e+09
Grid : Message : 356.627920 s : Average mflops/s per call per node : 7.68122e+09
Grid : Message : 356.627922 s : Average mflops/s per call (full) : 1.5317e+08
Grid : Message : 356.627927 s : Average mflops/s per call per rank (full): 2.39328e+06
Grid : Message : 356.627930 s : Average mflops/s per call per node (full): 9.5731e+06
Grid : Message : 356.627932 s : WilsonFermion5D Stencil
Grid : Message : 356.627934 s : WilsonFermion5D StencilEven
Grid : Message : 356.627935 s : WilsonFermion5D StencilOdd
Grid : Message : 356.627936 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 356.627938 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 356.627940 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 411.993212 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 411.993238 s : Called DwDag
Grid : Message : 411.993240 s : norm dag result 12.0421
Grid : Message : 412.893700 s : norm dag ref 12.0421
Grid : Message : 412.249000 s : norm dag diff 7.21924e-14
Grid : Message : 412.655740 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 412.459794 s : src_e0.499998
Grid : Message : 412.869558 s : src_o0.500002
Grid : Message : 412.998877 s : *********************************************************
Grid : Message : 412.998881 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 412.998883 s : * Vectorising space-time by 8
Grid : Message : 412.998887 s : * SINGLE precision
Grid : Message : 412.998889 s : * Using Overlapped Comms/Compute
Grid : Message : 412.998892 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 412.998894 s : *********************************************************
Grid : Message : 552.599242 s : Deo mflop/s = 1.52349e+08
Grid : Message : 552.599275 s : Deo mflop/s per rank 2.38045e+06
Grid : Message : 552.599277 s : Deo mflop/s per node 9.52178e+06
Grid : Message : 552.599280 s : #### Dhop calls report
Grid : Message : 552.599282 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 552.599284 s : WilsonFermion5D TotalTime /Calls : 4653.08 us
Grid : Message : 552.599286 s : WilsonFermion5D CommTime /Calls : 3098.34 us
Grid : Message : 552.599288 s : WilsonFermion5D FaceTime /Calls : 594.442 us
Grid : Message : 552.599290 s : WilsonFermion5D ComputeTime1/Calls : 5.8861 us
Grid : Message : 552.599292 s : WilsonFermion5D ComputeTime2/Calls : 985.791 us
Grid : Message : 552.599320 s : Average mflops/s per call : 1.0387e+11
Grid : Message : 552.599324 s : Average mflops/s per call per rank : 1.62297e+09
Grid : Message : 552.599327 s : Average mflops/s per call per node : 6.49188e+09
Grid : Message : 552.599330 s : Average mflops/s per call (full) : 1.5507e+08
Grid : Message : 552.599333 s : Average mflops/s per call per rank (full): 2.42297e+06
Grid : Message : 552.599336 s : Average mflops/s per call per node (full): 9.69189e+06
Grid : Message : 552.599340 s : WilsonFermion5D Stencil
Grid : Message : 552.599341 s : WilsonFermion5D StencilEven
Grid : Message : 552.599344 s : WilsonFermion5D StencilOdd
Grid : Message : 552.599345 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 552.599347 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 552.599350 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 552.670528 s : r_e6.02108
Grid : Message : 552.680935 s : r_o6.02101
Grid : Message : 552.687380 s : res12.0421
Grid : Message : 553.325113 s : norm diff 0
Grid : Message : 554.100564 s : norm diff even 0
Grid : Message : 554.561181 s : norm diff odd 0

@@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]

@@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-16A-1080
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1080
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 64.64.64.256 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################

@@ -0,0 +1,2 @@
Tue Aug 23 02:58:01 BST 2022
epoch 1661219881

@@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large

@@ -0,0 +1,2 @@
Tue Aug 23 03:19:51 BST 2022
epoch 1661221191

File diff suppressed because one or more lines are too long

@@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffcebffa000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014f17865e000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014f178296000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014f177da4000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014f177a7a000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014f177799000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014f177538000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014f1785e5000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014f177158000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014f1759fc000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014f17562c000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014f17538b000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014f175260000)
libm.so.6 => /lib64/libm.so.6 (0x000014f174ede000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014f174ca7000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014f174a8f000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014f17486f000)
libc.so.6 => /lib64/libc.so.6 (0x000014f1744aa000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014f1742a6000)
/lib64/ld-linux-x86-64.so.2 (0x000014f1784ae000)
librt.so.1 => /lib64/librt.so.1 (0x000014f17409e000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014f178519000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014f178514000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014f173f92000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014f173d88000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014f173b84000)

@@ -0,0 +1,286 @@
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
local rank 1 device 0 bus id: 0000:44:00.0
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x148ce0000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommitted changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.503426 s : Grid Layout
Grid : Message : 1.503430 s : Global lattice size : 64 64 64 256
Grid : Message : 1.503436 s : OpenMP threads : 4
Grid : Message : 1.503438 s : MPI tasks : 2 2 2 8
Grid : Message : 1.545288 s : Making s innermost grids
Grid : Message : 1.600811 s : Initialising 4d RNG
Grid : Message : 1.693389 s : Initialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.693424 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 3.322964 s : Initialising 5d RNG
Grid : Message : 4.759541 s : Initialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 4.759582 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 37.877117 s : Initialised RNGs
Grid : Message : 42.920224 s : Drawing gauge field
Grid : Message : 43.530915 s : Random gauge initialised
Grid : Message : 43.542723 s : Setting up Cshift based reference
Grid : Message : 72.809099 s : *****************************************************************
Grid : Message : 72.809129 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 72.809131 s : *****************************************************************
Grid : Message : 72.809132 s : *****************************************************************
Grid : Message : 72.809133 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 72.809134 s : * Vectorising space-time by 8
Grid : Message : 72.809135 s : * VComplexF size is 64 B
Grid : Message : 72.809136 s : * SINGLE precision
Grid : Message : 72.809139 s : * Using Overlapped Comms/Compute
Grid : Message : 72.809140 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 72.809141 s : *****************************************************************
Grid : Message : 74.811618 s : Called warmup
Grid : Message : 356.437280 s : Called Dw 30000 times in 2.81231e+08 us
Grid : Message : 356.437910 s : mflop/s = 1.51193e+08
Grid : Message : 356.437940 s : mflop/s per rank = 2.36239e+06
Grid : Message : 356.437990 s : mflop/s per node = 9.44956e+06
Grid : Message : 356.438020 s : RF GiB/s (base 2) = 307220
Grid : Message : 356.438050 s : mem GiB/s (base 2) = 192013
Grid : Message : 356.473120 s : norm diff 1.06407e-13
Grid : Message : 356.964660 s : #### Dhop calls report
Grid : Message : 356.964730 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 356.964770 s : WilsonFermion5D TotalTime /Calls : 4690.91 us
Grid : Message : 356.964790 s : WilsonFermion5D CommTime /Calls : 3217.42 us
Grid : Message : 356.964810 s : WilsonFermion5D FaceTime /Calls : 478.163 us
Grid : Message : 356.964830 s : WilsonFermion5D ComputeTime1/Calls : 4.80376 us
Grid : Message : 356.964850 s : WilsonFermion5D ComputeTime2/Calls : 1009.04 us
Grid : Message : 356.965410 s : Average mflops/s per call : 1.24517e+11
Grid : Message : 356.965450 s : Average mflops/s per call per rank : 1.94558e+09
Grid : Message : 356.965470 s : Average mflops/s per call per node : 7.78233e+09
Grid : Message : 356.965490 s : Average mflops/s per call (full) : 1.5382e+08
Grid : Message : 356.965510 s : Average mflops/s per call per rank (full): 2.40343e+06
Grid : Message : 356.965530 s : Average mflops/s per call per node (full): 9.61373e+06
Grid : Message : 356.965550 s : WilsonFermion5D Stencil
Grid : Message : 356.965560 s : WilsonFermion5D StencilEven
Grid : Message : 356.965570 s : WilsonFermion5D StencilOdd
Grid : Message : 356.965580 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 356.965590 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 356.965600 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 411.545363 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 411.545389 s : Called DwDag
Grid : Message : 411.545390 s : norm dag result 12.0421
Grid : Message : 411.583980 s : norm dag ref 12.0421
Grid : Message : 411.599853 s : norm dag diff 7.21924e-14
Grid : Message : 411.641431 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 412.849100 s : src_e0.499998
Grid : Message : 412.456953 s : src_o0.500002
Grid : Message : 412.561709 s : *********************************************************
Grid : Message : 412.561712 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 412.561713 s : * Vectorising space-time by 8
Grid : Message : 412.561714 s : * SINGLE precision
Grid : Message : 412.561715 s : * Using Overlapped Comms/Compute
Grid : Message : 412.561716 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 412.561717 s : *********************************************************
Grid : Message : 551.425534 s : Deo mflop/s = 1.53159e+08
Grid : Message : 551.425571 s : Deo mflop/s per rank 2.39311e+06
Grid : Message : 551.425573 s : Deo mflop/s per node 9.57245e+06
Grid : Message : 551.425576 s : #### Dhop calls report
Grid : Message : 551.425578 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 551.425580 s : WilsonFermion5D TotalTime /Calls : 4628.49 us
Grid : Message : 551.425582 s : WilsonFermion5D CommTime /Calls : 3073.6 us
Grid : Message : 551.425584 s : WilsonFermion5D FaceTime /Calls : 589.193 us
Grid : Message : 551.425586 s : WilsonFermion5D ComputeTime1/Calls : 5.908 us
Grid : Message : 551.425588 s : WilsonFermion5D ComputeTime2/Calls : 991.151 us
Grid : Message : 551.425610 s : Average mflops/s per call : 1.04262e+11
Grid : Message : 551.425614 s : Average mflops/s per call per rank : 1.6291e+09
Grid : Message : 551.425616 s : Average mflops/s per call per node : 6.5164e+09
Grid : Message : 551.425618 s : Average mflops/s per call (full) : 1.55894e+08
Grid : Message : 551.425625 s : Average mflops/s per call per rank (full): 2.43585e+06
Grid : Message : 551.425630 s : Average mflops/s per call per node (full): 9.74338e+06
Grid : Message : 551.425633 s : WilsonFermion5D Stencil
Grid : Message : 551.425635 s : WilsonFermion5D StencilEven
Grid : Message : 551.425638 s : WilsonFermion5D StencilOdd
Grid : Message : 551.425639 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 551.425641 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 551.425643 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 551.498308 s : r_e6.02108
Grid : Message : 551.506233 s : r_o6.02101
Grid : Message : 551.512628 s : res12.0421
Grid : Message : 552.147704 s : norm diff 0
Grid : Message : 553.522450 s : norm diff even 0
Grid : Message : 553.479623 s : norm diff odd 0

@@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]

@@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-16A-1095
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1095
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 64.64.64.256 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################

@@ -0,0 +1,2 @@
Tue Aug 23 03:10:28 BST 2022
epoch 1661220628

@@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large

@@ -0,0 +1,2 @@
Tue Aug 23 03:32:16 BST 2022
epoch 1661221936

File diff suppressed because one or more lines are too long

@@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffe30aa7000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000015253eefc000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000015253eb34000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000015253e642000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000015253e318000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000015253e037000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000015253ddd6000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000015253ee83000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000015253d9f6000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000015253c29a000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000015253beca000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000015253bc29000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000015253bafe000)
libm.so.6 => /lib64/libm.so.6 (0x000015253b77c000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000015253b545000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000015253b32d000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000015253b10d000)
libc.so.6 => /lib64/libc.so.6 (0x000015253ad48000)
libdl.so.2 => /lib64/libdl.so.2 (0x000015253ab44000)
/lib64/ld-linux-x86-64.so.2 (0x000015253ed4c000)
librt.so.1 => /lib64/librt.so.1 (0x000015253a93c000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000015253edb7000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000015253edb2000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000015253a830000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000015253a626000)
libutil.so.1 => /lib64/libutil.so.1 (0x000015253a422000)

View File

@ -0,0 +1,286 @@
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
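The per-rank lines above are printed by gpu-mpi-wrapper.sh, which is invoked by mpirun in the batch scripts but is not included in this commit. A minimal sketch consistent with this output (hypothetical; assumes Open MPI's OMPI_COMM_WORLD_LOCAL_RANK variable and numactl) would be:

#!/usr/bin/env bash
# hypothetical gpu-mpi-wrapper.sh: give each local MPI rank one GPU and
# interleave its memory over the matching pair of NUMA nodes
lrank="${OMPI_COMM_WORLD_LOCAL_RANK}"
numa="$((2 * lrank)),$((2 * lrank + 1))"
export CUDA_VISIBLE_DEVICES="${lrank}"
echo "$(hostname -s) - ${lrank} device=${lrank} binding=--interleave=${numa}"
exec numactl --interleave="${numa}" "$@"

Masking with CUDA_VISIBLE_DEVICES is consistent with every rank later reporting "device 0" but a different PCI bus id.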
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
local rank 2 device 0 bus id: 0000:84:00.0
local rank 1 device 0 bus id: 0000:44:00.0
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14fe00000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
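As a side note (an observation from the numbers in this log, not a documented guarantee): the MemoryManager cache size above matches 80% of the totalGlobalMem reported during CUDA initialisation,

0.8 * 42505273344 bytes = 34004218675.2 ≈ 34004218675 bytes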
Grid : Message : 1.604389 s : Grid Layout
Grid : Message : 1.604394 s : Global lattice size : 64 64 64 256
Grid : Message : 1.604403 s : OpenMP threads : 4
Grid : Message : 1.604406 s : MPI tasks : 2 2 2 8
Grid : Message : 1.643570 s : Making s innermost grids
Grid : Message : 1.694630 s : Initialising 4d RNG
Grid : Message : 1.787050 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.787079 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 3.366084 s : Initialising 5d RNG
Grid : Message : 4.819386 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 4.819425 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 38.300660 s : Initialised RNGs
Grid : Message : 43.255423 s : Drawing gauge field
Grid : Message : 44.115774 s : Random gauge initialised
Grid : Message : 44.127168 s : Setting up Cshift based reference
Grid : Message : 72.933387 s : *****************************************************************
Grid : Message : 72.933414 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 72.933416 s : *****************************************************************
Grid : Message : 72.933417 s : *****************************************************************
Grid : Message : 72.933418 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 72.933419 s : * Vectorising space-time by 8
Grid : Message : 72.933420 s : * VComplexF size is 64 B
Grid : Message : 72.933422 s : * SINGLE precision
Grid : Message : 72.933425 s : * Using Overlapped Comms/Compute
Grid : Message : 72.933426 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 72.933427 s : *****************************************************************
Grid : Message : 74.920872 s : Called warmup
Grid : Message : 355.775111 s : Called Dw 30000 times in 2.80853e+08 us
Grid : Message : 355.775156 s : mflop/s = 1.51396e+08
Grid : Message : 355.775158 s : mflop/s per rank = 2.36557e+06
Grid : Message : 355.775160 s : mflop/s per node = 9.46227e+06
Grid : Message : 355.775162 s : RF GiB/s (base 2) = 307634
Grid : Message : 355.775164 s : mem GiB/s (base 2) = 192271
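Sanity check (not part of the original log): with the 64 ranks on 16 nodes set in the batch script below,

1.51396e+08 mflop/s / 64 ranks ≈ 2.366e+06 mflop/s per rank
1.51396e+08 mflop/s / 16 nodes ≈ 9.462e+06 mflop/s per node

matching the per-rank and per-node figures above.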
Grid : Message : 355.778673 s : norm diff 1.06407e-13
Grid : Message : 355.827430 s : #### Dhop calls report
Grid : Message : 355.827437 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 355.827440 s : WilsonFermion5D TotalTime /Calls : 4684.11 us
Grid : Message : 355.827442 s : WilsonFermion5D CommTime /Calls : 3200.33 us
Grid : Message : 355.827444 s : WilsonFermion5D FaceTime /Calls : 479.567 us
Grid : Message : 355.827446 s : WilsonFermion5D ComputeTime1/Calls : 4.82936 us
Grid : Message : 355.827448 s : WilsonFermion5D ComputeTime2/Calls : 1018.33 us
Grid : Message : 355.827545 s : Average mflops/s per call : 1.24896e+11
Grid : Message : 355.827549 s : Average mflops/s per call per rank : 1.9515e+09
Grid : Message : 355.827551 s : Average mflops/s per call per node : 7.80601e+09
Grid : Message : 355.827553 s : Average mflops/s per call (full) : 1.54043e+08
Grid : Message : 355.827555 s : Average mflops/s per call per rank (full): 2.40692e+06
Grid : Message : 355.827559 s : Average mflops/s per call per node (full): 9.6277e+06
Grid : Message : 355.827561 s : WilsonFermion5D Stencil
Grid : Message : 355.827563 s : WilsonFermion5D StencilEven
Grid : Message : 355.827564 s : WilsonFermion5D StencilOdd
Grid : Message : 355.827569 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 355.827571 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 355.827573 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 411.449084 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 411.449109 s : Called DwDag
Grid : Message : 411.449110 s : norm dag result 12.0421
Grid : Message : 411.469399 s : norm dag ref 12.0421
Grid : Message : 411.485218 s : norm dag diff 7.21924e-14
Grid : Message : 411.525702 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 411.934554 s : src_e0.499998
Grid : Message : 412.308595 s : src_o0.500002
Grid : Message : 412.416600 s : *********************************************************
Grid : Message : 412.416604 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 412.416605 s : * Vectorising space-time by 8
Grid : Message : 412.416606 s : * SINGLE precision
Grid : Message : 412.416607 s : * Using Overlapped Comms/Compute
Grid : Message : 412.416608 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 412.416609 s : *********************************************************
Grid : Message : 550.472923 s : Deo mflop/s = 1.54056e+08
Grid : Message : 550.472960 s : Deo mflop/s per rank 2.40712e+06
Grid : Message : 550.472962 s : Deo mflop/s per node 9.6285e+06
Grid : Message : 550.472965 s : #### Dhop calls report
Grid : Message : 550.472967 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 550.472969 s : WilsonFermion5D TotalTime /Calls : 4601.58 us
Grid : Message : 550.472972 s : WilsonFermion5D CommTime /Calls : 3044.3 us
Grid : Message : 550.472974 s : WilsonFermion5D FaceTime /Calls : 582.772 us
Grid : Message : 550.472976 s : WilsonFermion5D ComputeTime1/Calls : 6.03287 us
Grid : Message : 550.472979 s : WilsonFermion5D ComputeTime2/Calls : 1000.92 us
Grid : Message : 550.473006 s : Average mflops/s per call : 1.03512e+11
Grid : Message : 550.473012 s : Average mflops/s per call per rank : 1.61737e+09
Grid : Message : 550.473015 s : Average mflops/s per call per node : 6.46949e+09
Grid : Message : 550.473017 s : Average mflops/s per call (full) : 1.56806e+08
Grid : Message : 550.473023 s : Average mflops/s per call per rank (full): 2.45009e+06
Grid : Message : 550.473026 s : Average mflops/s per call per node (full): 9.80036e+06
Grid : Message : 550.473030 s : WilsonFermion5D Stencil
Grid : Message : 550.473033 s : WilsonFermion5D StencilEven
Grid : Message : 550.473035 s : WilsonFermion5D StencilOdd
Grid : Message : 550.473038 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 550.473040 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 550.473043 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 550.546695 s : r_e6.02108
Grid : Message : 550.553572 s : r_o6.02101
Grid : Message : 550.559924 s : res12.0421
Grid : Message : 551.230295 s : norm diff 0
Grid : Message : 552.839070 s : norm diff even 0
Grid : Message : 552.554967 s : norm diff odd 0

View File

@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-16A-1110
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1110
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 64.64.64.256 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUS ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
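dmon-to-db.sh is referenced above but not included in this commit. A minimal sketch of what such a helper could do (hypothetical schema, assuming the default "nvidia-smi dmon" columns plus the date/time fields added by "-o DT", and sqlite3 on PATH):

#!/usr/bin/env bash
# hypothetical dmon-to-db.sh: load an nvidia-smi dmon log into an SQLite table
tmp="$1"; db="$2"; table="$3"
# drop '#' header lines, squeeze runs of spaces into commas for CSV import
grep -v '^#' "${tmp}" | tr -s ' ' ',' | sed 's/^,//' > "${tmp}.csv"
sqlite3 "${db}" <<EOF
CREATE TABLE IF NOT EXISTS ${table} (date TEXT, time TEXT, gpu INT, pwr INT,
  gtemp INT, mtemp INT, sm INT, mem INT, enc INT, dec INT, mclk INT, pclk INT);
.mode csv
.import ${tmp}.csv ${table}
EOF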

View File

@ -0,0 +1,2 @@
Tue Aug 23 03:22:53 BST 2022
epoch 1661221373

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Tue Aug 23 03:44:38 BST 2022
epoch 1661222678

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffd3170b000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014ac04bcc000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014ac04804000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014ac04312000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014ac03fe8000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014ac03d07000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014ac03aa6000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014ac04b53000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014ac036c6000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014ac01f6a000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014ac01b9a000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014ac018f9000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014ac017ce000)
libm.so.6 => /lib64/libm.so.6 (0x000014ac0144c000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014ac01215000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014ac00ffd000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014ac00ddd000)
libc.so.6 => /lib64/libc.so.6 (0x000014ac00a18000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014ac00814000)
/lib64/ld-linux-x86-64.so.2 (0x000014ac04a1c000)
librt.so.1 => /lib64/librt.so.1 (0x000014ac0060c000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014ac04a87000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014ac04a82000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014ac00500000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014ac002f6000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014ac000f2000)

View File

@ -0,0 +1,286 @@
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
local rank 1 device 0 bus id: 0000:44:00.0
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14bc00000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.741388 s : Grid Layout
Grid : Message : 1.741393 s : Global lattice size : 64 64 64 256
Grid : Message : 1.741398 s : OpenMP threads : 4
Grid : Message : 1.741399 s : MPI tasks : 2 2 2 8
Grid : Message : 1.779973 s : Making s innermost grids
Grid : Message : 1.841238 s : Initialising 4d RNG
Grid : Message : 1.936538 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.936565 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 3.514090 s : Initialising 5d RNG
Grid : Message : 4.953826 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 4.953867 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 38.142286 s : Initialised RNGs
Grid : Message : 42.283705 s : Drawing gauge field
Grid : Message : 43.103984 s : Random gauge initialised
Grid : Message : 43.120000 s : Setting up Cshift based reference
Grid : Message : 71.998584 s : *****************************************************************
Grid : Message : 71.998611 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 71.998613 s : *****************************************************************
Grid : Message : 71.998614 s : *****************************************************************
Grid : Message : 71.998615 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 71.998616 s : * Vectorising space-time by 8
Grid : Message : 71.998617 s : * VComplexF size is 64 B
Grid : Message : 71.998618 s : * SINGLE precision
Grid : Message : 71.998621 s : * Using Overlapped Comms/Compute
Grid : Message : 71.998622 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 71.998623 s : *****************************************************************
Grid : Message : 74.860610 s : Called warmup
Grid : Message : 354.501585 s : Called Dw 30000 times in 2.80414e+08 us
Grid : Message : 354.501646 s : mflop/s = 1.51633e+08
Grid : Message : 354.501649 s : mflop/s per rank = 2.36927e+06
Grid : Message : 354.501651 s : mflop/s per node = 9.47708e+06
Grid : Message : 354.501653 s : RF GiB/s (base 2) = 308115
Grid : Message : 354.501655 s : mem GiB/s (base 2) = 192572
Grid : Message : 354.505161 s : norm diff 1.06407e-13
Grid : Message : 354.553410 s : #### Dhop calls report
Grid : Message : 354.553418 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 354.553422 s : WilsonFermion5D TotalTime /Calls : 4677.53 us
Grid : Message : 354.553424 s : WilsonFermion5D CommTime /Calls : 3194.52 us
Grid : Message : 354.553426 s : WilsonFermion5D FaceTime /Calls : 480.197 us
Grid : Message : 354.553428 s : WilsonFermion5D ComputeTime1/Calls : 5.0421 us
Grid : Message : 354.553430 s : WilsonFermion5D ComputeTime2/Calls : 1015.76 us
Grid : Message : 354.553528 s : Average mflops/s per call : 1.2287e+11
Grid : Message : 354.553532 s : Average mflops/s per call per rank : 1.91984e+09
Grid : Message : 354.553534 s : Average mflops/s per call per node : 7.67937e+09
Grid : Message : 354.553536 s : Average mflops/s per call (full) : 1.5426e+08
Grid : Message : 354.553538 s : Average mflops/s per call per rank (full): 2.41031e+06
Grid : Message : 354.553540 s : Average mflops/s per call per node (full): 9.64123e+06
Grid : Message : 354.553542 s : WilsonFermion5D Stencil
Grid : Message : 354.553543 s : WilsonFermion5D StencilEven
Grid : Message : 354.553544 s : WilsonFermion5D StencilOdd
Grid : Message : 354.553545 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 354.553546 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 354.553547 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 409.963064 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 409.963090 s : Called DwDag
Grid : Message : 409.963091 s : norm dag result 12.0421
Grid : Message : 409.997480 s : norm dag ref 12.0421
Grid : Message : 410.132270 s : norm dag diff 7.21924e-14
Grid : Message : 410.545350 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 410.417445 s : src_e0.499998
Grid : Message : 410.871395 s : src_o0.500002
Grid : Message : 411.107600 s : *********************************************************
Grid : Message : 411.107650 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 411.107660 s : * Vectorising space-time by 8
Grid : Message : 411.107670 s : * SINGLE precision
Grid : Message : 411.107680 s : * Using Overlapped Comms/Compute
Grid : Message : 411.107690 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 411.107700 s : *********************************************************
Grid : Message : 548.471947 s : Deo mflop/s = 1.5473e+08
Grid : Message : 548.471984 s : Deo mflop/s per rank 2.41766e+06
Grid : Message : 548.471986 s : Deo mflop/s per node 9.67065e+06
Grid : Message : 548.471989 s : #### Dhop calls report
Grid : Message : 548.471991 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 548.471993 s : WilsonFermion5D TotalTime /Calls : 4581.78 us
Grid : Message : 548.471995 s : WilsonFermion5D CommTime /Calls : 3018.83 us
Grid : Message : 548.471997 s : WilsonFermion5D FaceTime /Calls : 580 us
Grid : Message : 548.471999 s : WilsonFermion5D ComputeTime1/Calls : 6.02087 us
Grid : Message : 548.472001 s : WilsonFermion5D ComputeTime2/Calls : 1008.22 us
Grid : Message : 548.472021 s : Average mflops/s per call : 1.02165e+11
Grid : Message : 548.472025 s : Average mflops/s per call per rank : 1.59633e+09
Grid : Message : 548.472027 s : Average mflops/s per call per node : 6.38532e+09
Grid : Message : 548.472030 s : Average mflops/s per call (full) : 1.57483e+08
Grid : Message : 548.472032 s : Average mflops/s per call per rank (full): 2.46068e+06
Grid : Message : 548.472035 s : Average mflops/s per call per node (full): 9.84271e+06
Grid : Message : 548.472037 s : WilsonFermion5D Stencil
Grid : Message : 548.472038 s : WilsonFermion5D StencilEven
Grid : Message : 548.472039 s : WilsonFermion5D StencilOdd
Grid : Message : 548.472041 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 548.472042 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 548.472045 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 548.546943 s : r_e6.02108
Grid : Message : 548.553846 s : r_o6.02101
Grid : Message : 548.560197 s : res12.0421
Grid : Message : 549.240929 s : norm diff 0
Grid : Message : 550.799580 s : norm diff even 0
Grid : Message : 550.551633 s : norm diff odd 0

View File

@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-16A-1125
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1125
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 64.64.64.256 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUS ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
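Note on the run geometry: "--mpi 2.2.2.8" decomposes the "--grid 64.64.64.256" global lattice over 2*2*2*8 = 64 ranks (matching --ntasks=64 across the 16 nodes), giving a local volume of 64/2 x 64/2 x 64/2 x 256/8 = 32.32.32.32 per GPU, which is the "loc32" appearing in the job names.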

View File

@ -0,0 +1,2 @@
Tue Aug 23 03:35:17 BST 2022
epoch 1661222117

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Tue Aug 23 03:56:59 BST 2022
epoch 1661223419

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffe4a10b000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014e1f7395000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014e1f6fcd000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014e1f6adb000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014e1f67b1000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014e1f64d0000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014e1f626f000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014e1f731c000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014e1f5e8f000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014e1f4733000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014e1f4363000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014e1f40c2000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014e1f3f97000)
libm.so.6 => /lib64/libm.so.6 (0x000014e1f3c15000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014e1f39de000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014e1f37c6000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014e1f35a6000)
libc.so.6 => /lib64/libc.so.6 (0x000014e1f31e1000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014e1f2fdd000)
/lib64/ld-linux-x86-64.so.2 (0x000014e1f71e5000)
librt.so.1 => /lib64/librt.so.1 (0x000014e1f2dd5000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014e1f7250000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014e1f724b000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014e1f2cc9000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014e1f2abf000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014e1f28bb000)

View File

@ -0,0 +1,286 @@
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
local rank 1 device 0 bus id: 0000:44:00.0
local rank 2 device 0 bus id: 0000:84:00.0
local rank 3 device 0 bus id: 0000:C4:00.0
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x151da0000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommitted changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.624935 s : Grid Layout
Grid : Message : 1.624939 s : Global lattice size : 64 64 64 256
Grid : Message : 1.624944 s : OpenMP threads : 4
Grid : Message : 1.624946 s : MPI tasks : 2 2 2 8
Grid : Message : 1.665490 s : Making s innermost grids
Grid : Message : 1.724722 s : Initialising 4d RNG
Grid : Message : 1.820577 s : Initialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.820601 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 3.397501 s : Initialising 5d RNG
Grid : Message : 4.840410 s : Initialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 4.840450 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 38.264300 s : Initialised RNGs
Grid : Message : 42.777074 s : Drawing gauge field
Grid : Message : 43.619715 s : Random gauge initialised
Grid : Message : 43.632921 s : Setting up Cshift based reference
Grid : Message : 72.511474 s : *****************************************************************
Grid : Message : 72.511512 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 72.511514 s : *****************************************************************
Grid : Message : 72.511515 s : *****************************************************************
Grid : Message : 72.511516 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 72.511518 s : * Vectorising space-time by 8
Grid : Message : 72.511520 s : * VComplexF size is 64 B
Grid : Message : 72.511522 s : * SINGLE precision
Grid : Message : 72.511525 s : * Using Overlapped Comms/Compute
Grid : Message : 72.511529 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 72.511531 s : *****************************************************************
Grid : Message : 74.532358 s : Called warmup
Grid : Message : 354.723946 s : Called Dw 30000 times in 2.8019e+08 us
Grid : Message : 354.723995 s : mflop/s = 1.51755e+08
Grid : Message : 354.723997 s : mflop/s per rank = 2.37116e+06
Grid : Message : 354.723999 s : mflop/s per node = 9.48466e+06
Grid : Message : 354.724001 s : RF GiB/s (base 2) = 308362
Grid : Message : 354.724003 s : mem GiB/s (base 2) = 192726
Grid : Message : 354.727509 s : norm diff 1.06407e-13
Grid : Message : 354.777181 s : #### Dhop calls report
Grid : Message : 354.777188 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 354.777192 s : WilsonFermion5D TotalTime /Calls : 4673.52 us
Grid : Message : 354.777194 s : WilsonFermion5D CommTime /Calls : 3186.13 us
Grid : Message : 354.777196 s : WilsonFermion5D FaceTime /Calls : 478.836 us
Grid : Message : 354.777198 s : WilsonFermion5D ComputeTime1/Calls : 4.98882 us
Grid : Message : 354.777200 s : WilsonFermion5D ComputeTime2/Calls : 1022.19 us
Grid : Message : 354.777269 s : Average mflops/s per call : 1.252e+11
Grid : Message : 354.777272 s : Average mflops/s per call per rank : 1.95624e+09
Grid : Message : 354.777274 s : Average mflops/s per call per node : 7.82497e+09
Grid : Message : 354.777276 s : Average mflops/s per call (full) : 1.54392e+08
Grid : Message : 354.777279 s : Average mflops/s per call per rank (full): 2.41237e+06
Grid : Message : 354.777281 s : Average mflops/s per call per node (full): 9.6495e+06
Grid : Message : 354.777283 s : WilsonFermion5D Stencil
Grid : Message : 354.777284 s : WilsonFermion5D StencilEven
Grid : Message : 354.777286 s : WilsonFermion5D StencilOdd
Grid : Message : 354.777287 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 354.777289 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 354.777290 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 410.641840 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 410.642230 s : Called DwDag
Grid : Message : 410.642250 s : norm dag result 12.0421
Grid : Message : 410.776270 s : norm dag ref 12.0421
Grid : Message : 410.933470 s : norm dag diff 7.21924e-14
Grid : Message : 410.141942 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 410.561423 s : src_e0.499998
Grid : Message : 410.986491 s : src_o0.500002
Grid : Message : 411.130524 s : *********************************************************
Grid : Message : 411.130528 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 411.130530 s : * Vectorising space-time by 8
Grid : Message : 411.130532 s : * SINGLE precision
Grid : Message : 411.130534 s : * Using Overlapped Comms/Compute
Grid : Message : 411.130536 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 411.130537 s : *********************************************************
Grid : Message : 548.109968 s : Deo mflop/s = 1.55267e+08
Grid : Message : 548.110008 s : Deo mflop/s per rank 2.42604e+06
Grid : Message : 548.110010 s : Deo mflop/s per node 9.70418e+06
Grid : Message : 548.110013 s : #### Dhop calls report
Grid : Message : 548.110015 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 548.110018 s : WilsonFermion5D TotalTime /Calls : 4565.66 us
Grid : Message : 548.110020 s : WilsonFermion5D CommTime /Calls : 2996.68 us
Grid : Message : 548.110022 s : WilsonFermion5D FaceTime /Calls : 574.72 us
Grid : Message : 548.110024 s : WilsonFermion5D ComputeTime1/Calls : 6.0247 us
Grid : Message : 548.110026 s : WilsonFermion5D ComputeTime2/Calls : 1020.13 us
Grid : Message : 548.110050 s : Average mflops/s per call : 1.03225e+11
Grid : Message : 548.110054 s : Average mflops/s per call per rank : 1.6129e+09
Grid : Message : 548.110056 s : Average mflops/s per call per node : 6.45159e+09
Grid : Message : 548.110060 s : Average mflops/s per call (full) : 1.58039e+08
Grid : Message : 548.110063 s : Average mflops/s per call per rank (full): 2.46937e+06
Grid : Message : 548.110067 s : Average mflops/s per call per node (full): 9.87746e+06
Grid : Message : 548.110072 s : WilsonFermion5D Stencil
Grid : Message : 548.110074 s : WilsonFermion5D StencilEven
Grid : Message : 548.110076 s : WilsonFermion5D StencilOdd
Grid : Message : 548.110078 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 548.110080 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 548.110087 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 548.182456 s : r_e6.02108
Grid : Message : 548.190125 s : r_o6.02101
Grid : Message : 548.196449 s : res12.0421
Grid : Message : 548.875188 s : norm diff 0
Grid : Message : 549.798793 s : norm diff even 0
Grid : Message : 550.237927 s : norm diff odd 0
@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]
@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-16A-1140
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
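# abort on the first failing command so the 'success' marker below is only
# created after a clean run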
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
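# UCX transports: GPUDirect copy (gdr_copy), InfiniBand RC verbs (rc, rc_x),
# shared memory (sm), and CUDA copy/IPC paths for device-resident buffers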
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
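# disabling the UCX memory-type cache is the commonly recommended workaround
# when CUDA device buffers are misdetected as host memory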
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
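# the literal 16 below is presumably the node count substituted in by a job
# template (hence the SC2050 disable above): single-node jobs would use OMPIO,
# multi-node jobs ROMIO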
if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
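# optional parameter file path; empty for this run, so nothing is archived
# below and no extra argument reaches the binary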
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd "${app}" > "${job_info_dir}/ldd"
md5sum "${app}" > "${job_info_dir}/app-hash"
readelf -a "${app}" > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1140
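# 'nvidia-smi -ac' sets <memory,graphics> application clocks in MHz: 1215 MHz
# is the A100 memory clock, 1140 MHz the capped SM clock for this run (clocks
# are reset to the stock 1410 MHz at the end of the script)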
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
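# sample GPU metrics in the background; '-o DT' prefixes each dmon sample with
# date and time so the rows can be aligned with the job log afterwards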
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
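# one rank per GPU: --mpi 2.2.2.8 is a 64-rank decomposition matching
# --ntasks=64 and the 64.64.64.256 lattice; --shm 2048 requests the 2 GiB
# stencil comms buffer reported in the log; gpu-mpi-wrapper.sh presumably sets
# CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and NUMA binding per local rank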
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
    ./gpu-mpi-wrapper.sh \
    "${app}" ${par:+"${par}"} "${opt[@]}" \
    --mpi 2.2.2.8 \
    --accelerator-threads 8 \
    --grid 64.64.64.256 \
    --shm 2048 &> "${job_info_dir}/log"
# if we reach this point the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUS ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
@ -0,0 +1,2 @@
Tue Aug 23 03:47:39 BST 2022
epoch 1661222859
Some files were not shown because too many files have changed in this diff.