Initial commit

2022-09-07 17:31:28 +01:00
commit ade190016a
8502 changed files with 4552538 additions and 0 deletions


@@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large


@@ -0,0 +1,2 @@
Sat Aug 20 20:25:04 BST 2022
epoch 1661023504

File diff suppressed because one or more lines are too long


@@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffc4036a000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x0000147189068000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x0000147188ca0000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x00001471887ae000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x0000147188484000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x00001471881a3000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x0000147187f42000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x0000147188fef000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x0000147187b62000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x0000147186406000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x0000147186036000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x0000147185d95000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x0000147185c6a000)
libm.so.6 => /lib64/libm.so.6 (0x00001471858e8000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x00001471856b1000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x0000147185499000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x0000147185279000)
libc.so.6 => /lib64/libc.so.6 (0x0000147184eb4000)
libdl.so.2 => /lib64/libdl.so.2 (0x0000147184cb0000)
/lib64/ld-linux-x86-64.so.2 (0x0000147188eb8000)
librt.so.1 => /lib64/librt.so.1 (0x0000147184aa8000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x0000147188f23000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x0000147188f1e000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014718499c000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x0000147184792000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014718458e000)


@@ -0,0 +1,254 @@
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 1 device 0 bus id: 0000:44:00.0
local rank 3 device 0 bus id: 0000:C4:00.0
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14cfc0000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.291243 s : Grid Layout
Grid : Message : 1.291247 s : Global lattice size : 48 48 48 48
Grid : Message : 1.291254 s : OpenMP threads : 4
Grid : Message : 1.291258 s : MPI tasks : 2 2 2 4
Grid : Message : 1.303822 s : Making s innermost grids
Grid : Message : 1.320388 s : Initialising 4d RNG
Grid : Message : 1.336702 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.336725 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.536145 s : Initialising 5d RNG
Grid : Message : 1.776849 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.776873 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 4.482939 s : Initialised RNGs
Grid : Message : 5.341477 s : Drawing gauge field
Grid : Message : 5.450363 s : Random gauge initialised
Grid : Message : 5.454302 s : Setting up Cshift based reference
Grid : Message : 10.483446 s : *****************************************************************
Grid : Message : 10.483466 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 10.483468 s : *****************************************************************
Grid : Message : 10.483469 s : *****************************************************************
Grid : Message : 10.483470 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 10.483471 s : * Vectorising space-time by 8
Grid : Message : 10.483472 s : * VComplexF size is 64 B
Grid : Message : 10.483473 s : * SINGLE precision
Grid : Message : 10.483474 s : * Using Overlapped Comms/Compute
Grid : Message : 10.483475 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 10.483476 s : *****************************************************************
Grid : Message : 11.249590 s : Called warmup
Grid : Message : 100.539489 s : Called Dw 30000 times in 8.95142e+07 us
Grid : Message : 100.539542 s : mflop/s = 3.75741e+07
Grid : Message : 100.539544 s : mflop/s per rank = 1.17419e+06
Grid : Message : 100.539546 s : mflop/s per node = 4.69676e+06
Grid : Message : 100.539548 s : RF GiB/s (base 2) = 76349.6
Grid : Message : 100.539550 s : mem GiB/s (base 2) = 47718.5
Grid : Message : 100.540119 s : norm diff 1.05759e-13
Grid : Message : 100.549682 s : #### Dhop calls report
Grid : Message : 100.549689 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 100.549693 s : WilsonFermion5D TotalTime /Calls : 1493.26 us
Grid : Message : 100.549695 s : WilsonFermion5D CommTime /Calls : 1049.79 us
Grid : Message : 100.549697 s : WilsonFermion5D FaceTime /Calls : 219.441 us
Grid : Message : 100.549699 s : WilsonFermion5D ComputeTime1/Calls : 2.73708 us
Grid : Message : 100.549701 s : WilsonFermion5D ComputeTime2/Calls : 236.764 us
Grid : Message : 100.549730 s : Average mflops/s per call : 1.77575e+10
Grid : Message : 100.549734 s : Average mflops/s per call per rank : 5.54921e+08
Grid : Message : 100.549736 s : Average mflops/s per call per node : 2.21968e+09
Grid : Message : 100.549738 s : Average mflops/s per call (full) : 3.82224e+07
Grid : Message : 100.549741 s : Average mflops/s per call per rank (full): 1.19445e+06
Grid : Message : 100.549743 s : Average mflops/s per call per node (full): 4.7778e+06
Grid : Message : 100.549745 s : WilsonFermion5D Stencil
Grid : Message : 100.549746 s : WilsonFermion5D StencilEven
Grid : Message : 100.549749 s : WilsonFermion5D StencilOdd
Grid : Message : 100.549750 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 100.549754 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 100.549757 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 109.252306 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 109.252327 s : Called DwDag
Grid : Message : 109.252328 s : norm dag result 12.0422
Grid : Message : 109.255491 s : norm dag ref 12.0422
Grid : Message : 109.258528 s : norm dag diff 7.13141e-14
Grid : Message : 109.270823 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 109.336420 s : src_e0.499992
Grid : Message : 109.408759 s : src_o0.500008
Grid : Message : 109.425239 s : *********************************************************
Grid : Message : 109.425244 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 109.425246 s : * Vectorising space-time by 8
Grid : Message : 109.425247 s : * SINGLE precision
Grid : Message : 109.425249 s : * Using Overlapped Comms/Compute
Grid : Message : 109.425251 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 109.425252 s : *********************************************************
Grid : Message : 157.753385 s : Deo mflop/s = 3.4805e+07
Grid : Message : 157.753416 s : Deo mflop/s per rank 1.08766e+06
Grid : Message : 157.753418 s : Deo mflop/s per node 4.35063e+06
Grid : Message : 157.753421 s : #### Dhop calls report
Grid : Message : 157.753423 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 157.753426 s : WilsonFermion5D TotalTime /Calls : 1610.74 us
Grid : Message : 157.753428 s : WilsonFermion5D CommTime /Calls : 1101.75 us
Grid : Message : 157.753430 s : WilsonFermion5D FaceTime /Calls : 290.394 us
Grid : Message : 157.753432 s : WilsonFermion5D ComputeTime1/Calls : 4.75421 us
Grid : Message : 157.753434 s : WilsonFermion5D ComputeTime2/Calls : 242.784 us
Grid : Message : 157.753456 s : Average mflops/s per call : 1.02081e+10
Grid : Message : 157.753460 s : Average mflops/s per call per rank : 3.19003e+08
Grid : Message : 157.753462 s : Average mflops/s per call per node : 1.27601e+09
Grid : Message : 157.753464 s : Average mflops/s per call (full) : 3.54347e+07
Grid : Message : 157.753467 s : Average mflops/s per call per rank (full): 1.10733e+06
Grid : Message : 157.753469 s : Average mflops/s per call per node (full): 4.42934e+06
Grid : Message : 157.753472 s : WilsonFermion5D Stencil
Grid : Message : 157.753473 s : WilsonFermion5D StencilEven
Grid : Message : 157.753476 s : WilsonFermion5D StencilOdd
Grid : Message : 157.753478 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 157.753479 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 157.753481 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 157.773486 s : r_e6.02129
Grid : Message : 157.775479 s : r_o6.02097
Grid : Message : 157.776926 s : res12.0423
Grid : Message : 157.891008 s : norm diff 0
Grid : Message : 158.245750 s : norm diff even 0
Grid : Message : 158.961270 s : norm diff odd 0
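
A quick cross-check of the headline Dw figure above: 30000 Dw calls in 8.95142e+07 us are reported as 3.75741e+07 mflop/s. That is consistent with the usual Wilson Dhop count of 1320 flops per 4d site per s-slice on a 48^4 lattice with Ls=16; neither constant appears in the log, so both are assumptions here. A minimal sketch of the arithmetic:

awk 'BEGIN {
  flops_per_call = 1320 * 16 * 48^4         # assumed flops/site * assumed Ls * 4d volume
  total_flops    = flops_per_call * 30000   # 30000 Dw calls, as reported above
  printf "mflop/s = %.5e\n", total_flops / 8.95142e7   # flops per microsecond == Mflop/s
}'
# prints mflop/s = 3.75741e+07, matching the log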


@@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]


@@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-8A-1005
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1005
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 48.48.48.48 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUS ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
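
The mpirun command above launches every rank through ./gpu-mpi-wrapper.sh, which is not part of this diff. Judging from the "device=N binding=--interleave=..." lines at the head of the log, the wrapper appears to pin each local rank to one GPU and to an interleaved pair of NUMA nodes before exec-ing the benchmark. A hypothetical sketch of such a wrapper, not the actual script (the local-rank variable and the NUMA mapping are assumptions):

#!/usr/bin/env bash
# illustrative only -- reconstructs the binding pattern reported in the log
lrank="${OMPI_COMM_WORLD_LOCAL_RANK}"         # local rank on the node, 0-3 on a 4-GPU Tursa node
numa="$(( 2 * lrank )),$(( 2 * lrank + 1 ))"  # e.g. rank 2 -> --interleave=4,5
export CUDA_VISIBLE_DEVICES="${lrank}"        # expose one A100 per rank
echo "$(hostname) - ${lrank} device=${lrank} binding=--interleave=${numa}"
exec numactl --interleave="${numa}" "$@"      # run the benchmark under this binding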


@@ -0,0 +1,2 @@
Sat Aug 20 20:22:20 BST 2022
epoch 1661023340


@@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large


@@ -0,0 +1,2 @@
Sat Aug 20 20:37:28 BST 2022
epoch 1661024248

File diff suppressed because one or more lines are too long


@@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffc9ffef000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014ec1aeaa000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014ec1aae2000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014ec1a5f0000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014ec1a2c6000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014ec19fe5000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014ec19d84000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014ec1ae31000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014ec199a4000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014ec18248000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014ec17e78000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014ec17bd7000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014ec17aac000)
libm.so.6 => /lib64/libm.so.6 (0x000014ec1772a000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014ec174f3000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014ec172db000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014ec170bb000)
libc.so.6 => /lib64/libc.so.6 (0x000014ec16cf6000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014ec16af2000)
/lib64/ld-linux-x86-64.so.2 (0x000014ec1acfa000)
librt.so.1 => /lib64/librt.so.1 (0x000014ec168ea000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014ec1ad65000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014ec1ad60000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014ec167de000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014ec165d4000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014ec163d0000)


@@ -0,0 +1,254 @@
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
local rank 1 device 0 bus id: 0000:44:00.0
local rank 2 device 0 bus id: 0000:84:00.0
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14f0c0000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.274727 s : Grid Layout
Grid : Message : 1.274731 s : Global lattice size : 48 48 48 48
Grid : Message : 1.274738 s : OpenMP threads : 4
Grid : Message : 1.274742 s : MPI tasks : 2 2 2 4
Grid : Message : 1.286239 s : Making s innermost grids
Grid : Message : 1.296640 s : Initialising 4d RNG
Grid : Message : 1.313085 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.313104 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.438915 s : Initialising 5d RNG
Grid : Message : 1.670684 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.670710 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 4.517605 s : Initialised RNGs
Grid : Message : 5.165082 s : Drawing gauge field
Grid : Message : 5.272845 s : Random gauge initialised
Grid : Message : 5.287691 s : Setting up Cshift based reference
Grid : Message : 10.356424 s : *****************************************************************
Grid : Message : 10.356441 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 10.356442 s : *****************************************************************
Grid : Message : 10.356443 s : *****************************************************************
Grid : Message : 10.356444 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 10.356445 s : * Vectorising space-time by 8
Grid : Message : 10.356447 s : * VComplexF size is 64 B
Grid : Message : 10.356448 s : * SINGLE precision
Grid : Message : 10.356449 s : * Using Overlapped Comms/Compute
Grid : Message : 10.356450 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 10.356451 s : *****************************************************************
Grid : Message : 10.894078 s : Called warmup
Grid : Message : 99.665065 s : Called Dw 30000 times in 8.8771e+07 us
Grid : Message : 99.665118 s : mflop/s = 3.78887e+07
Grid : Message : 99.665120 s : mflop/s per rank = 1.18402e+06
Grid : Message : 99.665122 s : mflop/s per node = 4.73608e+06
Grid : Message : 99.665124 s : RF GiB/s (base 2) = 76988.9
Grid : Message : 99.665126 s : mem GiB/s (base 2) = 48118
Grid : Message : 99.665697 s : norm diff 1.05759e-13
Grid : Message : 99.675870 s : #### Dhop calls report
Grid : Message : 99.675877 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 99.675880 s : WilsonFermion5D TotalTime /Calls : 1480.69 us
Grid : Message : 99.675882 s : WilsonFermion5D CommTime /Calls : 1038.97 us
Grid : Message : 99.675885 s : WilsonFermion5D FaceTime /Calls : 219.112 us
Grid : Message : 99.675887 s : WilsonFermion5D ComputeTime1/Calls : 2.79427 us
Grid : Message : 99.675889 s : WilsonFermion5D ComputeTime2/Calls : 235.635 us
Grid : Message : 99.675899 s : Average mflops/s per call : 1.78613e+10
Grid : Message : 99.675906 s : Average mflops/s per call per rank : 5.58166e+08
Grid : Message : 99.675909 s : Average mflops/s per call per node : 2.23266e+09
Grid : Message : 99.675911 s : Average mflops/s per call (full) : 3.85468e+07
Grid : Message : 99.675914 s : Average mflops/s per call per rank (full): 1.20459e+06
Grid : Message : 99.675917 s : Average mflops/s per call per node (full): 4.81836e+06
Grid : Message : 99.675920 s : WilsonFermion5D Stencil
Grid : Message : 99.675921 s : WilsonFermion5D StencilEven
Grid : Message : 99.675922 s : WilsonFermion5D StencilOdd
Grid : Message : 99.675924 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 99.675929 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 99.675930 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 108.331185 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 108.331205 s : Called DwDag
Grid : Message : 108.331206 s : norm dag result 12.0422
Grid : Message : 108.333524 s : norm dag ref 12.0422
Grid : Message : 108.336555 s : norm dag diff 7.13141e-14
Grid : Message : 108.347667 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 108.409420 s : src_e0.499992
Grid : Message : 108.483354 s : src_o0.500008
Grid : Message : 108.500169 s : *********************************************************
Grid : Message : 108.500173 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 108.500175 s : * Vectorising space-time by 8
Grid : Message : 108.500176 s : * SINGLE precision
Grid : Message : 108.500177 s : * Using Overlapped Comms/Compute
Grid : Message : 108.500178 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 108.500179 s : *********************************************************
Grid : Message : 156.376888 s : Deo mflop/s = 3.51332e+07
Grid : Message : 156.376919 s : Deo mflop/s per rank 1.09791e+06
Grid : Message : 156.376921 s : Deo mflop/s per node 4.39165e+06
Grid : Message : 156.376924 s : #### Dhop calls report
Grid : Message : 156.376926 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 156.376929 s : WilsonFermion5D TotalTime /Calls : 1595.69 us
Grid : Message : 156.376931 s : WilsonFermion5D CommTime /Calls : 1087.54 us
Grid : Message : 156.376933 s : WilsonFermion5D FaceTime /Calls : 292.342 us
Grid : Message : 156.376935 s : WilsonFermion5D ComputeTime1/Calls : 4.75321 us
Grid : Message : 156.376937 s : WilsonFermion5D ComputeTime2/Calls : 240.424 us
Grid : Message : 156.376963 s : Average mflops/s per call : 1.02133e+10
Grid : Message : 156.376967 s : Average mflops/s per call per rank : 3.19165e+08
Grid : Message : 156.376970 s : Average mflops/s per call per node : 1.27666e+09
Grid : Message : 156.376975 s : Average mflops/s per call (full) : 3.57688e+07
Grid : Message : 156.376979 s : Average mflops/s per call per rank (full): 1.11778e+06
Grid : Message : 156.376984 s : Average mflops/s per call per node (full): 4.4711e+06
Grid : Message : 156.376988 s : WilsonFermion5D Stencil
Grid : Message : 156.376990 s : WilsonFermion5D StencilEven
Grid : Message : 156.376991 s : WilsonFermion5D StencilOdd
Grid : Message : 156.376994 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 156.376996 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 156.376998 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 156.396805 s : r_e6.02129
Grid : Message : 156.398572 s : r_o6.02097
Grid : Message : 156.400042 s : res12.0423
Grid : Message : 156.511360 s : norm diff 0
Grid : Message : 156.646367 s : norm diff even 0
Grid : Message : 156.715079 s : norm diff odd 0


@@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]


@@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-8A-1020
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1020
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 48.48.48.48 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUS ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
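
This job is identical to the previous one apart from the job name and the target graphics clock (freq=1020 instead of 1005). The nvidia-smi -ac calls set the applications clocks pair, memory clock 1215 MHz and graphics clock ${freq} MHz, and the final loop resets them to 1215,1410 afterwards. If needed, the clocks actually applied on a node can be confirmed with a standard query (a usage hint, not part of the job script):

nvidia-smi --query-gpu=clocks.applications.memory,clocks.applications.graphics --format=csv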


@@ -0,0 +1,2 @@
Sat Aug 20 20:34:46 BST 2022
epoch 1661024086


@@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large


@@ -0,0 +1,2 @@
Sat Aug 20 20:43:19 BST 2022
epoch 1661024599

File diff suppressed because one or more lines are too long


@@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffda5149000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x0000147cdc012000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x0000147cdbc4a000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x0000147cdb758000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x0000147cdb42e000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x0000147cdb14d000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x0000147cdaeec000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x0000147cdbf99000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x0000147cdab0c000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x0000147cd93b0000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x0000147cd8fe0000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x0000147cd8d3f000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x0000147cd8c14000)
libm.so.6 => /lib64/libm.so.6 (0x0000147cd8892000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x0000147cd865b000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x0000147cd8443000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x0000147cd8223000)
libc.so.6 => /lib64/libc.so.6 (0x0000147cd7e5e000)
libdl.so.2 => /lib64/libdl.so.2 (0x0000147cd7c5a000)
/lib64/ld-linux-x86-64.so.2 (0x0000147cdbe62000)
librt.so.1 => /lib64/librt.so.1 (0x0000147cd7a52000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x0000147cdbecd000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x0000147cdbec8000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x0000147cd7946000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x0000147cd773c000)
libutil.so.1 => /lib64/libutil.so.1 (0x0000147cd7538000)


@@ -0,0 +1,254 @@
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
local rank 1 device 0 bus id: 0000:44:00.0
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x150480000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.488326 s : Grid Layout
Grid : Message : 1.488330 s : Global lattice size : 48 48 48 48
Grid : Message : 1.488336 s : OpenMP threads : 4
Grid : Message : 1.488339 s : MPI tasks : 2 2 2 4
Grid : Message : 1.502272 s : Making s innermost grids
Grid : Message : 1.518383 s : Initialising 4d RNG
Grid : Message : 1.534282 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.534304 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.779780 s : Initialising 5d RNG
Grid : Message : 2.102130 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 2.102560 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 4.823411 s : Initialised RNGs
Grid : Message : 5.679533 s : Drawing gauge field
Grid : Message : 5.765020 s : Random gauge initialised
Grid : Message : 5.769069 s : Setting up Cshift based reference
Grid : Message : 10.830431 s : *****************************************************************
Grid : Message : 10.830449 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 10.830451 s : *****************************************************************
Grid : Message : 10.830452 s : *****************************************************************
Grid : Message : 10.830453 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 10.830454 s : * Vectorising space-time by 8
Grid : Message : 10.830455 s : * VComplexF size is 64 B
Grid : Message : 10.830456 s : * SINGLE precision
Grid : Message : 10.830457 s : * Using Overlapped Comms/Compute
Grid : Message : 10.830458 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 10.830459 s : *****************************************************************
Grid : Message : 11.332763 s : Called warmup
Grid : Message : 99.153092 s : Called Dw 30000 times in 8.78201e+07 us
Grid : Message : 99.153144 s : mflop/s = 3.82989e+07
Grid : Message : 99.153146 s : mflop/s per rank = 1.19684e+06
Grid : Message : 99.153148 s : mflop/s per node = 4.78736e+06
Grid : Message : 99.153150 s : RF GiB/s (base 2) = 77822.4
Grid : Message : 99.153152 s : mem GiB/s (base 2) = 48639
Grid : Message : 99.153722 s : norm diff 1.05759e-13
Grid : Message : 99.164069 s : #### Dhop calls report
Grid : Message : 99.164076 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 99.164079 s : WilsonFermion5D TotalTime /Calls : 1464.53 us
Grid : Message : 99.164081 s : WilsonFermion5D CommTime /Calls : 1021.47 us
Grid : Message : 99.164083 s : WilsonFermion5D FaceTime /Calls : 219.776 us
Grid : Message : 99.164085 s : WilsonFermion5D ComputeTime1/Calls : 2.8622 us
Grid : Message : 99.164087 s : WilsonFermion5D ComputeTime2/Calls : 235.73 us
Grid : Message : 99.164105 s : Average mflops/s per call : 1.77625e+10
Grid : Message : 99.164108 s : Average mflops/s per call per rank : 5.55077e+08
Grid : Message : 99.164110 s : Average mflops/s per call per node : 2.22031e+09
Grid : Message : 99.164116 s : Average mflops/s per call (full) : 3.89722e+07
Grid : Message : 99.164119 s : Average mflops/s per call per rank (full): 1.21788e+06
Grid : Message : 99.164121 s : Average mflops/s per call per node (full): 4.87153e+06
Grid : Message : 99.164123 s : WilsonFermion5D Stencil
Grid : Message : 99.164126 s : WilsonFermion5D StencilEven
Grid : Message : 99.164127 s : WilsonFermion5D StencilOdd
Grid : Message : 99.164129 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 99.164131 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 99.164132 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 107.831263 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 107.831285 s : Called DwDag
Grid : Message : 107.831286 s : norm dag result 12.0422
Grid : Message : 107.843943 s : norm dag ref 12.0422
Grid : Message : 107.846918 s : norm dag diff 7.13141e-14
Grid : Message : 107.859773 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 107.920803 s : src_e0.499992
Grid : Message : 107.999399 s : src_o0.500008
Grid : Message : 108.158950 s : *********************************************************
Grid : Message : 108.158990 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 108.159010 s : * Vectorising space-time by 8
Grid : Message : 108.159020 s : * SINGLE precision
Grid : Message : 108.159030 s : * Using Overlapped Comms/Compute
Grid : Message : 108.159040 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 108.159050 s : *********************************************************
Grid : Message : 155.299677 s : Deo mflop/s = 3.55746e+07
Grid : Message : 155.299707 s : Deo mflop/s per rank 1.11171e+06
Grid : Message : 155.299709 s : Deo mflop/s per node 4.44682e+06
Grid : Message : 155.299712 s : #### Dhop calls report
Grid : Message : 155.299714 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 155.299716 s : WilsonFermion5D TotalTime /Calls : 1575.96 us
Grid : Message : 155.299718 s : WilsonFermion5D CommTime /Calls : 1069.71 us
Grid : Message : 155.299720 s : WilsonFermion5D FaceTime /Calls : 290.248 us
Grid : Message : 155.299722 s : WilsonFermion5D ComputeTime1/Calls : 5.07466 us
Grid : Message : 155.299724 s : WilsonFermion5D ComputeTime2/Calls : 240.561 us
Grid : Message : 155.299743 s : Average mflops/s per call : 1.0125e+10
Grid : Message : 155.299747 s : Average mflops/s per call per rank : 3.16406e+08
Grid : Message : 155.299749 s : Average mflops/s per call per node : 1.26562e+09
Grid : Message : 155.299751 s : Average mflops/s per call (full) : 3.62167e+07
Grid : Message : 155.299755 s : Average mflops/s per call per rank (full): 1.13177e+06
Grid : Message : 155.299757 s : Average mflops/s per call per node (full): 4.52709e+06
Grid : Message : 155.299760 s : WilsonFermion5D Stencil
Grid : Message : 155.299761 s : WilsonFermion5D StencilEven
Grid : Message : 155.299764 s : WilsonFermion5D StencilOdd
Grid : Message : 155.299765 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 155.299769 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 155.299771 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 155.318224 s : r_e6.02129
Grid : Message : 155.320491 s : r_o6.02097
Grid : Message : 155.321893 s : res12.0423
Grid : Message : 155.423019 s : norm diff 0
Grid : Message : 155.571243 s : norm diff even 0
Grid : Message : 155.646003 s : norm diff odd 0

View File

@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-8A-1035
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1035
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 48.48.48.48 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point, the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
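Note: the batch script above launches the benchmark through ./gpu-mpi-wrapper.sh, which is referenced but not included in this commit. The per-rank lines later in the logs (e.g. "tu-c0r1n72 - 0 device=0 binding=--interleave=0,1") and the AcceleratorCudaInit hint about "a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding" suggest what it does. The following is only a minimal hypothetical sketch of such a wrapper, assuming Open MPI's OMPI_COMM_WORLD_LOCAL_RANK and one GPU plus two NUMA domains per local rank; it is not the actual gpu-mpi-wrapper.sh.

#!/usr/bin/env bash
# Hypothetical sketch of a per-rank GPU/NUMA binding wrapper; the real
# gpu-mpi-wrapper.sh invoked by the batch script above is not part of this commit.
lrank="${OMPI_COMM_WORLD_LOCAL_RANK:?not running under Open MPI}"
export CUDA_VISIBLE_DEVICES="${lrank}"              # one GPU per local rank
numa="$((2 * lrank)),$((2 * lrank + 1))"            # rank 0 -> 0,1 ... rank 3 -> 6,7
# a production wrapper would typically also pick UCX_NET_DEVICES per NUMA domain
echo "$(hostname) - ${lrank} device=${lrank} binding=--interleave=${numa}"
exec numactl --interleave="${numa}" "$@"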

View File

@ -0,0 +1,2 @@
Sat Aug 20 20:40:37 BST 2022
epoch 1661024437

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,2 @@
Sat Aug 20 20:49:07 BST 2022
epoch 1661024947

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffc42f61000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014983c344000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014983bf7c000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014983ba8a000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014983b760000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014983b47f000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014983b21e000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014983c2cb000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014983ae3e000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x00001498396e2000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x0000149839312000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x0000149839071000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x0000149838f46000)
libm.so.6 => /lib64/libm.so.6 (0x0000149838bc4000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014983898d000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x0000149838775000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x0000149838555000)
libc.so.6 => /lib64/libc.so.6 (0x0000149838190000)
libdl.so.2 => /lib64/libdl.so.2 (0x0000149837f8c000)
/lib64/ld-linux-x86-64.so.2 (0x000014983c194000)
librt.so.1 => /lib64/librt.so.1 (0x0000149837d84000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014983c1ff000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014983c1fa000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x0000149837c78000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x0000149837a6e000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014983786a000)

View File

@ -0,0 +1,254 @@
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 3 device 0 bus id: 0000:C4:00.0
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
local rank 1 device 0 bus id: 0000:44:00.0
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x146d20000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.296325 s : Grid Layout
Grid : Message : 1.296329 s : Global lattice size : 48 48 48 48
Grid : Message : 1.296334 s : OpenMP threads : 4
Grid : Message : 1.296336 s : MPI tasks : 2 2 2 4
Grid : Message : 1.308991 s : Making s innermost grids
Grid : Message : 1.325119 s : Initialising 4d RNG
Grid : Message : 1.341243 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.341264 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.572667 s : Initialising 5d RNG
Grid : Message : 1.806486 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.806513 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 4.457170 s : Initialised RNGs
Grid : Message : 5.379782 s : Drawing gauge field
Grid : Message : 5.475278 s : Random gauge initialised
Grid : Message : 5.480285 s : Setting up Cshift based reference
Grid : Message : 10.637374 s : *****************************************************************
Grid : Message : 10.637392 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 10.637393 s : *****************************************************************
Grid : Message : 10.637394 s : *****************************************************************
Grid : Message : 10.637395 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 10.637396 s : * Vectorising space-time by 8
Grid : Message : 10.637397 s : * VComplexF size is 64 B
Grid : Message : 10.637398 s : * SINGLE precision
Grid : Message : 10.637399 s : * Using Overlapped Comms/Compute
Grid : Message : 10.637400 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 10.637401 s : *****************************************************************
Grid : Message : 11.209877 s : Called warmup
Grid : Message : 98.239599 s : Called Dw 30000 times in 8.70295e+07 us
Grid : Message : 98.239671 s : mflop/s = 3.86468e+07
Grid : Message : 98.239673 s : mflop/s per rank = 1.20771e+06
Grid : Message : 98.239675 s : mflop/s per node = 4.83085e+06
Grid : Message : 98.239677 s : RF GiB/s (base 2) = 78529.4
Grid : Message : 98.239679 s : mem GiB/s (base 2) = 49080.9
Grid : Message : 98.240251 s : norm diff 1.05759e-13
Grid : Message : 98.250051 s : #### Dhop calls report
Grid : Message : 98.250058 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 98.250061 s : WilsonFermion5D TotalTime /Calls : 1451.95 us
Grid : Message : 98.250063 s : WilsonFermion5D CommTime /Calls : 1009.67 us
Grid : Message : 98.250065 s : WilsonFermion5D FaceTime /Calls : 219.662 us
Grid : Message : 98.250067 s : WilsonFermion5D ComputeTime1/Calls : 2.86259 us
Grid : Message : 98.250069 s : WilsonFermion5D ComputeTime2/Calls : 235.372 us
Grid : Message : 98.250147 s : Average mflops/s per call : 1.76785e+10
Grid : Message : 98.250151 s : Average mflops/s per call per rank : 5.52452e+08
Grid : Message : 98.250153 s : Average mflops/s per call per node : 2.20981e+09
Grid : Message : 98.250155 s : Average mflops/s per call (full) : 3.93098e+07
Grid : Message : 98.250157 s : Average mflops/s per call per rank (full): 1.22843e+06
Grid : Message : 98.250159 s : Average mflops/s per call per node (full): 4.91373e+06
Grid : Message : 98.250161 s : WilsonFermion5D Stencil
Grid : Message : 98.250162 s : WilsonFermion5D StencilEven
Grid : Message : 98.250163 s : WilsonFermion5D StencilOdd
Grid : Message : 98.250164 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 98.250165 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 98.250166 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 106.979591 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 106.979614 s : Called DwDag
Grid : Message : 106.979615 s : norm dag result 12.0422
Grid : Message : 106.986186 s : norm dag ref 12.0422
Grid : Message : 106.989233 s : norm dag diff 7.13141e-14
Grid : Message : 107.267400 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 107.684690 s : src_e0.499992
Grid : Message : 107.131208 s : src_o0.500008
Grid : Message : 107.147828 s : *********************************************************
Grid : Message : 107.147833 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 107.147834 s : * Vectorising space-time by 8
Grid : Message : 107.147836 s : * SINGLE precision
Grid : Message : 107.147837 s : * Using Overlapped Comms/Compute
Grid : Message : 107.147839 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 107.147840 s : *********************************************************
Grid : Message : 154.983680 s : Deo mflop/s = 3.58274e+07
Grid : Message : 154.984010 s : Deo mflop/s per rank 1.11961e+06
Grid : Message : 154.984030 s : Deo mflop/s per node 4.47843e+06
Grid : Message : 154.984060 s : #### Dhop calls report
Grid : Message : 154.984080 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 154.984100 s : WilsonFermion5D TotalTime /Calls : 1564.83 us
Grid : Message : 154.984120 s : WilsonFermion5D CommTime /Calls : 1057.75 us
Grid : Message : 154.984140 s : WilsonFermion5D FaceTime /Calls : 290.957 us
Grid : Message : 154.984160 s : WilsonFermion5D ComputeTime1/Calls : 5.01747 us
Grid : Message : 154.984180 s : WilsonFermion5D ComputeTime2/Calls : 240.039 us
Grid : Message : 154.984370 s : Average mflops/s per call : 1.01412e+10
Grid : Message : 154.984410 s : Average mflops/s per call per rank : 3.16914e+08
Grid : Message : 154.984430 s : Average mflops/s per call per node : 1.26766e+09
Grid : Message : 154.984450 s : Average mflops/s per call (full) : 3.64742e+07
Grid : Message : 154.984490 s : Average mflops/s per call per rank (full): 1.13982e+06
Grid : Message : 154.984510 s : Average mflops/s per call per node (full): 4.55927e+06
Grid : Message : 154.984530 s : WilsonFermion5D Stencil
Grid : Message : 154.984540 s : WilsonFermion5D StencilEven
Grid : Message : 154.984570 s : WilsonFermion5D StencilOdd
Grid : Message : 154.984590 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 154.984630 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 154.984660 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 154.116284 s : r_e6.02129
Grid : Message : 154.118064 s : r_o6.02097
Grid : Message : 154.119490 s : res12.0423
Grid : Message : 154.225189 s : norm diff 0
Grid : Message : 154.355387 s : norm diff even 0
Grid : Message : 154.439041 s : norm diff odd 0

View File

@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-8A-1050
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1050
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 48.48.48.48 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point, the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
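Note: the script above pins the GPU application clocks with "nvidia-smi -ac 1215,${freq}", captures an "nvidia-smi dmon -o DT" trace into a temporary file, and hands that trace to dmon-to-db.sh, which is not included in this commit. A minimal hypothetical sketch of flattening such a trace into CSV rows tagged with the applied clock limit is shown below; it assumes the default dmon column layout and stands in for, but is not, the actual dmon-to-db.sh.

#!/usr/bin/env bash
# Hypothetical sketch: flatten an `nvidia-smi dmon -o DT` trace into CSV rows
# tagged with the applied clock limit. The real dmon-to-db.sh used above
# (which writes smi-dmon-8A.db) is not part of this commit.
trace="$1"   # file captured via: coproc nvidia-smi dmon -o DT &> "${tmp}"
label="$2"   # e.g. clock_limit_1050
awk -v label="${label}" 'BEGIN { OFS = "," }
  /^#/ { next }                    # dmon repeats header lines prefixed with "#"
  { $1 = $1; print label, $0 }' "${trace}"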

View File

@ -0,0 +1,2 @@
Sat Aug 20 20:46:26 BST 2022
epoch 1661024786

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,2 @@
Sat Aug 20 20:54:56 BST 2022
epoch 1661025296

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007fff1dbee000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x0000146b8752d000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x0000146b87165000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x0000146b86c73000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x0000146b86949000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x0000146b86668000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x0000146b86407000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x0000146b874b4000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x0000146b86027000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x0000146b848cb000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x0000146b844fb000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x0000146b8425a000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x0000146b8412f000)
libm.so.6 => /lib64/libm.so.6 (0x0000146b83dad000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x0000146b83b76000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x0000146b8395e000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x0000146b8373e000)
libc.so.6 => /lib64/libc.so.6 (0x0000146b83379000)
libdl.so.2 => /lib64/libdl.so.2 (0x0000146b83175000)
/lib64/ld-linux-x86-64.so.2 (0x0000146b8737d000)
librt.so.1 => /lib64/librt.so.1 (0x0000146b82f6d000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x0000146b873e8000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x0000146b873e3000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x0000146b82e61000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x0000146b82c57000)
libutil.so.1 => /lib64/libutil.so.1 (0x0000146b82a53000)

View File

@ -0,0 +1,254 @@
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
local rank 1 device 0 bus id: 0000:44:00.0
local rank 2 device 0 bus id: 0000:84:00.0
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x146980000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.339201 s : Grid Layout
Grid : Message : 1.339206 s : Global lattice size : 48 48 48 48
Grid : Message : 1.339210 s : OpenMP threads : 4
Grid : Message : 1.339212 s : MPI tasks : 2 2 2 4
Grid : Message : 1.351308 s : Making s innermost grids
Grid : Message : 1.363723 s : Initialising 4d RNG
Grid : Message : 1.381317 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.381342 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.507048 s : Initialising 5d RNG
Grid : Message : 1.737129 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.737157 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 4.404513 s : Initialised RNGs
Grid : Message : 5.265286 s : Drawing gauge field
Grid : Message : 5.412925 s : Random gauge initialised
Grid : Message : 5.422103 s : Setting up Cshift based reference
Grid : Message : 10.470693 s : *****************************************************************
Grid : Message : 10.470713 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 10.470714 s : *****************************************************************
Grid : Message : 10.470715 s : *****************************************************************
Grid : Message : 10.470716 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 10.470717 s : * Vectorising space-time by 8
Grid : Message : 10.470718 s : * VComplexF size is 64 B
Grid : Message : 10.470719 s : * SINGLE precision
Grid : Message : 10.470720 s : * Using Overlapped Comms/Compute
Grid : Message : 10.470721 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 10.470722 s : *****************************************************************
Grid : Message : 10.906815 s : Called warmup
Grid : Message : 97.838247 s : Called Dw 30000 times in 8.69313e+07 us
Grid : Message : 97.838307 s : mflop/s = 3.86905e+07
Grid : Message : 97.838310 s : mflop/s per rank = 1.20908e+06
Grid : Message : 97.838315 s : mflop/s per node = 4.83631e+06
Grid : Message : 97.838318 s : RF GiB/s (base 2) = 78618.2
Grid : Message : 97.838320 s : mem GiB/s (base 2) = 49136.3
Grid : Message : 97.838895 s : norm diff 1.05759e-13
Grid : Message : 97.848190 s : #### Dhop calls report
Grid : Message : 97.848197 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 97.848205 s : WilsonFermion5D TotalTime /Calls : 1449.98 us
Grid : Message : 97.848209 s : WilsonFermion5D CommTime /Calls : 1010.05 us
Grid : Message : 97.848212 s : WilsonFermion5D FaceTime /Calls : 217.72 us
Grid : Message : 97.848214 s : WilsonFermion5D ComputeTime1/Calls : 2.71694 us
Grid : Message : 97.848216 s : WilsonFermion5D ComputeTime2/Calls : 235.209 us
Grid : Message : 97.848291 s : Average mflops/s per call : 1.77649e+10
Grid : Message : 97.848295 s : Average mflops/s per call per rank : 5.55152e+08
Grid : Message : 97.848297 s : Average mflops/s per call per node : 2.22061e+09
Grid : Message : 97.848300 s : Average mflops/s per call (full) : 3.93633e+07
Grid : Message : 97.848304 s : Average mflops/s per call per rank (full): 1.2301e+06
Grid : Message : 97.848307 s : Average mflops/s per call per node (full): 4.92041e+06
Grid : Message : 97.848310 s : WilsonFermion5D Stencil
Grid : Message : 97.848311 s : WilsonFermion5D StencilEven
Grid : Message : 97.848313 s : WilsonFermion5D StencilOdd
Grid : Message : 97.848316 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 97.848321 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 97.848324 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 106.574196 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 106.574219 s : Called DwDag
Grid : Message : 106.574220 s : norm dag result 12.0422
Grid : Message : 106.576572 s : norm dag ref 12.0422
Grid : Message : 106.579538 s : norm dag diff 7.13141e-14
Grid : Message : 106.590622 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 106.652704 s : src_e0.499992
Grid : Message : 106.718436 s : src_o0.500008
Grid : Message : 106.735418 s : *********************************************************
Grid : Message : 106.735421 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 106.735423 s : * Vectorising space-time by 8
Grid : Message : 106.735424 s : * SINGLE precision
Grid : Message : 106.735425 s : * Using Overlapped Comms/Compute
Grid : Message : 106.735426 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 106.735427 s : *********************************************************
Grid : Message : 153.564697 s : Deo mflop/s = 3.59196e+07
Grid : Message : 153.564730 s : Deo mflop/s per rank 1.12249e+06
Grid : Message : 153.564732 s : Deo mflop/s per node 4.48995e+06
Grid : Message : 153.564735 s : #### Dhop calls report
Grid : Message : 153.564737 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 153.564739 s : WilsonFermion5D TotalTime /Calls : 1560.8 us
Grid : Message : 153.564741 s : WilsonFermion5D CommTime /Calls : 1055.83 us
Grid : Message : 153.564743 s : WilsonFermion5D FaceTime /Calls : 290.091 us
Grid : Message : 153.564745 s : WilsonFermion5D ComputeTime1/Calls : 4.74968 us
Grid : Message : 153.564747 s : WilsonFermion5D ComputeTime2/Calls : 239.675 us
Grid : Message : 153.564770 s : Average mflops/s per call : 1.01665e+10
Grid : Message : 153.564774 s : Average mflops/s per call per rank : 3.17702e+08
Grid : Message : 153.564776 s : Average mflops/s per call per node : 1.27081e+09
Grid : Message : 153.564778 s : Average mflops/s per call (full) : 3.65685e+07
Grid : Message : 153.564782 s : Average mflops/s per call per rank (full): 1.14277e+06
Grid : Message : 153.564785 s : Average mflops/s per call per node (full): 4.57107e+06
Grid : Message : 153.564787 s : WilsonFermion5D Stencil
Grid : Message : 153.564789 s : WilsonFermion5D StencilEven
Grid : Message : 153.564792 s : WilsonFermion5D StencilOdd
Grid : Message : 153.564794 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 153.564795 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 153.564796 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 153.584150 s : r_e6.02129
Grid : Message : 153.586497 s : r_o6.02097
Grid : Message : 153.587837 s : res12.0423
Grid : Message : 153.699087 s : norm diff 0
Grid : Message : 153.830654 s : norm diff even 0
Grid : Message : 153.894387 s : norm diff odd 0

View File

@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-8A-1065
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1065
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 48.48.48.48 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point, the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
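Note: each run in this frequency sweep (power-8A-1035, -1050, -1065, ...) writes its Grid output to job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}/log, and the headline figure reported there is "mflop/s per node". A hypothetical sketch for comparing the sweep from those logs follows; it assumes only the job/<name>.<id>/ layout created by the scripts in this commit.

# Hypothetical sketch: compare runs of the frequency sweep by pulling the first
# "mflop/s per node" figure out of each job log, assuming the job/<name>.<id>/log
# layout created by the scripts in this commit.
for d in job/power-8A-*/; do
  printf '%s ' "${d%/}"
  grep -m1 'mflop/s per node' "${d}log" | awk '{ print $NF }'
done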

View File

@ -0,0 +1,2 @@
Sat Aug 20 20:52:16 BST 2022
epoch 1661025136

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,2 @@
Sat Aug 20 21:00:44 BST 2022
epoch 1661025644

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffe693be000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014a11518b000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014a114dc3000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014a1148d1000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014a1145a7000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014a1142c6000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014a114065000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014a115112000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014a113c85000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014a112529000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014a112159000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014a111eb8000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014a111d8d000)
libm.so.6 => /lib64/libm.so.6 (0x000014a111a0b000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014a1117d4000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014a1115bc000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014a11139c000)
libc.so.6 => /lib64/libc.so.6 (0x000014a110fd7000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014a110dd3000)
/lib64/ld-linux-x86-64.so.2 (0x000014a114fdb000)
librt.so.1 => /lib64/librt.so.1 (0x000014a110bcb000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014a115046000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014a115041000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014a110abf000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014a1108b5000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014a1106b1000)

View File

@ -0,0 +1,254 @@
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
local rank 1 device 0 bus id: 0000:44:00.0
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14ffa0000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.565327 s : Grid Layout
Grid : Message : 1.565331 s : Global lattice size : 48 48 48 48
Grid : Message : 1.565336 s : OpenMP threads : 4
Grid : Message : 1.565338 s : MPI tasks : 2 2 2 4
Grid : Message : 1.576732 s : Making s innermost grids
Grid : Message : 1.591292 s : Initialising 4d RNG
Grid : Message : 1.607386 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.607406 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.733296 s : Initialising 5d RNG
Grid : Message : 1.967786 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.967813 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 4.633889 s : Initialised RNGs
Grid : Message : 5.699185 s : Drawing gauge field
Grid : Message : 5.800869 s : Random gauge initialised
Grid : Message : 5.804955 s : Setting up Cshift based reference
Grid : Message : 10.808527 s : *****************************************************************
Grid : Message : 10.808549 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 10.808551 s : *****************************************************************
Grid : Message : 10.808553 s : *****************************************************************
Grid : Message : 10.808554 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 10.808562 s : * Vectorising space-time by 8
Grid : Message : 10.808564 s : * VComplexF size is 64 B
Grid : Message : 10.808566 s : * SINGLE precision
Grid : Message : 10.808568 s : * Using Overlapped Comms/Compute
Grid : Message : 10.808570 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 10.808572 s : *****************************************************************
Grid : Message : 11.365381 s : Called warmup
Grid : Message : 97.739052 s : Called Dw 30000 times in 8.63735e+07 us
Grid : Message : 97.739106 s : mflop/s = 3.89403e+07
Grid : Message : 97.739108 s : mflop/s per rank = 1.21688e+06
Grid : Message : 97.739110 s : mflop/s per node = 4.86754e+06
Grid : Message : 97.739112 s : RF GiB/s (base 2) = 79125.8
Grid : Message : 97.739114 s : mem GiB/s (base 2) = 49453.6
Grid : Message : 97.739684 s : norm diff 1.05759e-13
Grid : Message : 97.749608 s : #### Dhop calls report
Grid : Message : 97.749616 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 97.749620 s : WilsonFermion5D TotalTime /Calls : 1440.85 us
Grid : Message : 97.749622 s : WilsonFermion5D CommTime /Calls : 1003.73 us
Grid : Message : 97.749624 s : WilsonFermion5D FaceTime /Calls : 215.311 us
Grid : Message : 97.749626 s : WilsonFermion5D ComputeTime1/Calls : 3.08775 us
Grid : Message : 97.749628 s : WilsonFermion5D ComputeTime2/Calls : 234.501 us
Grid : Message : 97.749646 s : Average mflops/s per call : 1.78099e+10
Grid : Message : 97.749650 s : Average mflops/s per call per rank : 5.5656e+08
Grid : Message : 97.749652 s : Average mflops/s per call per node : 2.22624e+09
Grid : Message : 97.749656 s : Average mflops/s per call (full) : 3.96128e+07
Grid : Message : 97.749659 s : Average mflops/s per call per rank (full): 1.2379e+06
Grid : Message : 97.749661 s : Average mflops/s per call per node (full): 4.9516e+06
Grid : Message : 97.749663 s : WilsonFermion5D Stencil
Grid : Message : 97.749665 s : WilsonFermion5D StencilEven
Grid : Message : 97.749668 s : WilsonFermion5D StencilOdd
Grid : Message : 97.749670 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 97.749672 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 97.749675 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 106.415478 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 106.415502 s : Called DwDag
Grid : Message : 106.415503 s : norm dag result 12.0422
Grid : Message : 106.429244 s : norm dag ref 12.0422
Grid : Message : 106.432306 s : norm dag diff 7.13141e-14
Grid : Message : 106.447571 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 106.514419 s : src_e0.499992
Grid : Message : 106.579087 s : src_o0.500008
Grid : Message : 106.595293 s : *********************************************************
Grid : Message : 106.595296 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 106.595297 s : * Vectorising space-time by 8
Grid : Message : 106.595298 s : * SINGLE precision
Grid : Message : 106.595299 s : * Using Overlapped Comms/Compute
Grid : Message : 106.595300 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 106.595301 s : *********************************************************
Grid : Message : 153.296330 s : Deo mflop/s = 3.62241e+07
Grid : Message : 153.296850 s : Deo mflop/s per rank 1.132e+06
Grid : Message : 153.296870 s : Deo mflop/s per node 4.52801e+06
Grid : Message : 153.296900 s : #### Dhop calls report
Grid : Message : 153.296920 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 153.296940 s : WilsonFermion5D TotalTime /Calls : 1547.63 us
Grid : Message : 153.296960 s : WilsonFermion5D CommTime /Calls : 1046.74 us
Grid : Message : 153.296980 s : WilsonFermion5D FaceTime /Calls : 286.366 us
Grid : Message : 153.297020 s : WilsonFermion5D ComputeTime1/Calls : 4.8817 us
Grid : Message : 153.297040 s : WilsonFermion5D ComputeTime2/Calls : 238.437 us
Grid : Message : 153.297230 s : Average mflops/s per call : 1.02065e+10
Grid : Message : 153.297270 s : Average mflops/s per call per rank : 3.18952e+08
Grid : Message : 153.297290 s : Average mflops/s per call per node : 1.27581e+09
Grid : Message : 153.297320 s : Average mflops/s per call (full) : 3.68796e+07
Grid : Message : 153.297340 s : Average mflops/s per call per rank (full): 1.15249e+06
Grid : Message : 153.297360 s : Average mflops/s per call per node (full): 4.60995e+06
Grid : Message : 153.297400 s : WilsonFermion5D Stencil
Grid : Message : 153.297410 s : WilsonFermion5D StencilEven
Grid : Message : 153.297420 s : WilsonFermion5D StencilOdd
Grid : Message : 153.297430 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 153.297460 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 153.297490 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 153.487210 s : r_e6.02129
Grid : Message : 153.503240 s : r_o6.02097
Grid : Message : 153.516860 s : res12.0423
Grid : Message : 153.160184 s : norm diff 0
Grid : Message : 153.295561 s : norm diff even 0
Grid : Message : 153.362804 s : norm diff odd 0
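For the Dhop timing block above, the derived per-rank and per-node rates are simply the total mflop/s divided by the 32 ranks and 8 nodes of this job. A quick sketch (awk, not part of the benchmark output) that reproduces the reported values:

# Sketch: reproduce the derived Dhop rates quoted in the log above.
awk 'BEGIN {
  total = 3.89403e7          # "mflop/s =" line above
  printf "per rank: %.5e (log: 1.21688e+06)\n", total / 32
  printf "per node: %.5e (log: 4.86754e+06)\n", total / 8
}'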

View File

@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-8A-1080
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1080
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 48.48.48.48 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
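The script above pins the GPU application clocks with nvidia-smi -ac 1215,${freq} before the run (a memory,graphics MHz pair) and restores 1215,1410 at the end. A hedged verification sketch, not part of the original script and assuming the standard clocks.applications.* query fields of nvidia-smi, to confirm the limit is in effect on every node of the allocation:

# Sketch: report the application clocks currently applied on each node of the job.
srun --nodes="${SLURM_NNODES}" --ntasks-per-node=1 \
    nvidia-smi --query-gpu=clocks.applications.memory,clocks.applications.graphics \
               --format=csv,noheader
# expected while the limit is active: "1215 MHz, 1080 MHz" on every GPU of every node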

View File

@ -0,0 +1,2 @@
Sat Aug 20 20:58:05 BST 2022
epoch 1661025485

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Sat Aug 20 21:06:32 BST 2022
epoch 1661025992

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x000015143f705000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000015143f685000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000015143f2c3000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000015143edd1000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000015143eaa7000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000015143e7c6000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000015143e565000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000015143f60c000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000015143e185000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000015143ca29000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000015143c659000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000015143c3b8000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000015143c28d000)
libm.so.6 => /lib64/libm.so.6 (0x000015143bf0b000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000015143bcd4000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000015143babc000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000015143b89c000)
libc.so.6 => /lib64/libc.so.6 (0x000015143b4d7000)
libdl.so.2 => /lib64/libdl.so.2 (0x000015143b2d3000)
/lib64/ld-linux-x86-64.so.2 (0x000015143f4db000)
librt.so.1 => /lib64/librt.so.1 (0x000015143b0cb000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000015143f540000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000015143f53b000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000015143afbf000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000015143adb5000)
libutil.so.1 => /lib64/libutil.so.1 (0x000015143abb1000)

View File

@ -0,0 +1,254 @@
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 3 device 0 bus id: 0000:C4:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 1 device 0 bus id: 0000:44:00.0
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14b540000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.329290 s : Grid Layout
Grid : Message : 1.329294 s : Global lattice size : 48 48 48 48
Grid : Message : 1.329301 s : OpenMP threads : 4
Grid : Message : 1.329304 s : MPI tasks : 2 2 2 4
Grid : Message : 1.341902 s : Making s innermost grids
Grid : Message : 1.358246 s : Initialising 4d RNG
Grid : Message : 1.374403 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.374426 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.619784 s : Initialising 5d RNG
Grid : Message : 1.851516 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.851543 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 4.778347 s : Initialised RNGs
Grid : Message : 5.412229 s : Drawing gauge field
Grid : Message : 5.498501 s : Random gauge initialised
Grid : Message : 5.502681 s : Setting up Cshift based reference
Grid : Message : 10.568254 s : *****************************************************************
Grid : Message : 10.568272 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 10.568273 s : *****************************************************************
Grid : Message : 10.568274 s : *****************************************************************
Grid : Message : 10.568275 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 10.568276 s : * Vectorising space-time by 8
Grid : Message : 10.568277 s : * VComplexF size is 64 B
Grid : Message : 10.568278 s : * SINGLE precision
Grid : Message : 10.568279 s : * Using Overlapped Comms/Compute
Grid : Message : 10.568280 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 10.568281 s : *****************************************************************
Grid : Message : 11.130225 s : Called warmup
Grid : Message : 96.935259 s : Called Dw 30000 times in 8.58048e+07 us
Grid : Message : 96.935312 s : mflop/s = 3.91984e+07
Grid : Message : 96.935314 s : mflop/s per rank = 1.22495e+06
Grid : Message : 96.935316 s : mflop/s per node = 4.8998e+06
Grid : Message : 96.935318 s : RF GiB/s (base 2) = 79650.3
Grid : Message : 96.935320 s : mem GiB/s (base 2) = 49781.4
Grid : Message : 96.935891 s : norm diff 1.05759e-13
Grid : Message : 96.945419 s : #### Dhop calls report
Grid : Message : 96.945427 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 96.945430 s : WilsonFermion5D TotalTime /Calls : 1431.16 us
Grid : Message : 96.945432 s : WilsonFermion5D CommTime /Calls : 992.268 us
Grid : Message : 96.945434 s : WilsonFermion5D FaceTime /Calls : 217.135 us
Grid : Message : 96.945436 s : WilsonFermion5D ComputeTime1/Calls : 2.70928 us
Grid : Message : 96.945438 s : WilsonFermion5D ComputeTime2/Calls : 234.653 us
Grid : Message : 96.945454 s : Average mflops/s per call : 1.75489e+10
Grid : Message : 96.945461 s : Average mflops/s per call per rank : 5.48402e+08
Grid : Message : 96.945464 s : Average mflops/s per call per node : 2.19361e+09
Grid : Message : 96.945466 s : Average mflops/s per call (full) : 3.98811e+07
Grid : Message : 96.945469 s : Average mflops/s per call per rank (full): 1.24628e+06
Grid : Message : 96.945471 s : Average mflops/s per call per node (full): 4.98513e+06
Grid : Message : 96.945473 s : WilsonFermion5D Stencil
Grid : Message : 96.945475 s : WilsonFermion5D StencilEven
Grid : Message : 96.945477 s : WilsonFermion5D StencilOdd
Grid : Message : 96.945478 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 96.945479 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 96.945481 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 105.614164 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 105.614186 s : Called DwDag
Grid : Message : 105.614187 s : norm dag result 12.0422
Grid : Message : 105.616525 s : norm dag ref 12.0422
Grid : Message : 105.619641 s : norm dag diff 7.13141e-14
Grid : Message : 105.629645 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 105.695112 s : src_e0.499992
Grid : Message : 105.762145 s : src_o0.500008
Grid : Message : 105.778422 s : *********************************************************
Grid : Message : 105.778425 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 105.778429 s : * Vectorising space-time by 8
Grid : Message : 105.778431 s : * SINGLE precision
Grid : Message : 105.778432 s : * Using Overlapped Comms/Compute
Grid : Message : 105.778434 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 105.778436 s : *********************************************************
Grid : Message : 151.816932 s : Deo mflop/s = 3.6536e+07
Grid : Message : 151.816963 s : Deo mflop/s per rank 1.14175e+06
Grid : Message : 151.816965 s : Deo mflop/s per node 4.567e+06
Grid : Message : 151.816967 s : #### Dhop calls report
Grid : Message : 151.816969 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 151.816971 s : WilsonFermion5D TotalTime /Calls : 1534.48 us
Grid : Message : 151.816973 s : WilsonFermion5D CommTime /Calls : 1033.55 us
Grid : Message : 151.816975 s : WilsonFermion5D FaceTime /Calls : 286.448 us
Grid : Message : 151.816977 s : WilsonFermion5D ComputeTime1/Calls : 4.73748 us
Grid : Message : 151.816979 s : WilsonFermion5D ComputeTime2/Calls : 238.502 us
Grid : Message : 151.817000 s : Average mflops/s per call : 1.02127e+10
Grid : Message : 151.817004 s : Average mflops/s per call per rank : 3.19146e+08
Grid : Message : 151.817006 s : Average mflops/s per call per node : 1.27658e+09
Grid : Message : 151.817008 s : Average mflops/s per call (full) : 3.71958e+07
Grid : Message : 151.817013 s : Average mflops/s per call per rank (full): 1.16237e+06
Grid : Message : 151.817016 s : Average mflops/s per call per node (full): 4.64947e+06
Grid : Message : 151.817018 s : WilsonFermion5D Stencil
Grid : Message : 151.817020 s : WilsonFermion5D StencilEven
Grid : Message : 151.817022 s : WilsonFermion5D StencilOdd
Grid : Message : 151.817025 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 151.817026 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 151.817027 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 151.834960 s : r_e6.02129
Grid : Message : 151.836627 s : r_o6.02097
Grid : Message : 151.837999 s : res12.0423
Grid : Message : 151.953376 s : norm diff 0
Grid : Message : 152.791770 s : norm diff even 0
Grid : Message : 152.145659 s : norm diff odd 0
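The per-rank lines at the top of the log above ("device=N binding=--interleave=a,b") appear to be printed by ./gpu-mpi-wrapper.sh, which the job scripts place between mpirun and the benchmark binary; the wrapper itself is not included in this excerpt. Grid's startup messages state it expects such a wrapper to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES and the NUMA binding. A purely hypothetical sketch of a minimal wrapper consistent with those log lines (every name and mapping below is an assumption, not the actual script):

#!/usr/bin/env bash
# Hypothetical sketch only -- the real ./gpu-mpi-wrapper.sh is not part of this excerpt.
# One GPU per local rank, with NUMA interleaving over the node pair matching the
# "binding=--interleave=2k,2k+1" pattern seen in the logs.
lrank="${OMPI_COMM_WORLD_LOCAL_RANK:-0}"
export CUDA_VISIBLE_DEVICES="${lrank}"
binding="--interleave=$(( 2 * lrank )),$(( 2 * lrank + 1 ))"
# (the real wrapper presumably also selects UCX_NET_DEVICES per rank; omitted here)
echo "$(hostname) - ${lrank} device=${lrank} binding=${binding}"
exec numactl ${binding} "$@"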

View File

@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-8A-1095
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1095
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 48.48.48.48 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
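dmon-to-db.sh, called above with the dmon capture file, the target database smi-dmon-8A.db and the tag clock_limit_${freq}, is not included in this excerpt. As a rough, hypothetical sketch of the kind of conversion such a script could perform, assuming the default nvidia-smi dmon -o DT column layout (date, time, gpu, pwr, gtemp, mtemp, sm, mem, enc, dec, mclk, pclk) and an sqlite3 binary on the path:

#!/usr/bin/env bash
# Hypothetical sketch only -- the real dmon-to-db.sh is not part of this excerpt.
# Arguments mirror the call in the job script: <dmon capture> <sqlite db> <tag>.
dmon_file="$1"; db="$2"; tag="$3"
csv="$(mktemp)"
# drop the '#' header lines, re-delimit with commas and prefix every sample with the tag
awk -v tag="${tag}" '!/^#/ && NF { $1 = $1; print tag "," $0 }' OFS=',' "${dmon_file}" > "${csv}"
sqlite3 "${db}" <<EOF
CREATE TABLE IF NOT EXISTS dmon (tag, date, time, gpu, pwr, gtemp, mtemp, sm, mem, enc, dec, mclk, pclk);
.mode csv
.import ${csv} dmon
EOF
rm -f "${csv}"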

View File

@ -0,0 +1,2 @@
Sat Aug 20 21:03:54 BST 2022
epoch 1661025834

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Sat Aug 20 21:12:17 BST 2022
epoch 1661026337

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffdd3edf000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000015078a487000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000015078a0bf000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x0000150789bcd000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x00001507898a3000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x00001507895c2000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x0000150789361000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000015078a40e000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x0000150788f81000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x0000150787825000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x0000150787455000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x00001507871b4000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x0000150787089000)
libm.so.6 => /lib64/libm.so.6 (0x0000150786d07000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x0000150786ad0000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x00001507868b8000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x0000150786698000)
libc.so.6 => /lib64/libc.so.6 (0x00001507862d3000)
libdl.so.2 => /lib64/libdl.so.2 (0x00001507860cf000)
/lib64/ld-linux-x86-64.so.2 (0x000015078a2d7000)
librt.so.1 => /lib64/librt.so.1 (0x0000150785ec7000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000015078a342000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000015078a33d000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x0000150785dbb000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x0000150785bb1000)
libutil.so.1 => /lib64/libutil.so.1 (0x00001507859ad000)

View File

@ -0,0 +1,254 @@
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
local rank 2 device 0 bus id: 0000:84:00.0
local rank 1 device 0 bus id: 0000:44:00.0
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14bc20000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.459618 s : Grid Layout
Grid : Message : 1.459622 s : Global lattice size : 48 48 48 48
Grid : Message : 1.459626 s : OpenMP threads : 4
Grid : Message : 1.459627 s : MPI tasks : 2 2 2 4
Grid : Message : 1.477216 s : Making s innermost grids
Grid : Message : 1.489637 s : Initialising 4d RNG
Grid : Message : 1.507425 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.507447 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.702250 s : Initialising 5d RNG
Grid : Message : 1.945333 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.945362 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 4.556125 s : Initialised RNGs
Grid : Message : 5.465171 s : Drawing gauge field
Grid : Message : 5.580137 s : Random gauge initialised
Grid : Message : 5.588368 s : Setting up Cshift based reference
Grid : Message : 10.584296 s : *****************************************************************
Grid : Message : 10.584315 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 10.584317 s : *****************************************************************
Grid : Message : 10.584318 s : *****************************************************************
Grid : Message : 10.584319 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 10.584320 s : * Vectorising space-time by 8
Grid : Message : 10.584321 s : * VComplexF size is 64 B
Grid : Message : 10.584322 s : * SINGLE precision
Grid : Message : 10.584323 s : * Using Overlapped Comms/Compute
Grid : Message : 10.584324 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 10.584325 s : *****************************************************************
Grid : Message : 11.140229 s : Called warmup
Grid : Message : 95.842020 s : Called Dw 30000 times in 8.47016e+07 us
Grid : Message : 95.842070 s : mflop/s = 3.9709e+07
Grid : Message : 95.842072 s : mflop/s per rank = 1.24091e+06
Grid : Message : 95.842074 s : mflop/s per node = 4.96362e+06
Grid : Message : 95.842076 s : RF GiB/s (base 2) = 80687.7
Grid : Message : 95.842078 s : mem GiB/s (base 2) = 50429.8
Grid : Message : 95.842652 s : norm diff 1.05759e-13
Grid : Message : 95.852740 s : #### Dhop calls report
Grid : Message : 95.852747 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 95.852750 s : WilsonFermion5D TotalTime /Calls : 1412.93 us
Grid : Message : 95.852752 s : WilsonFermion5D CommTime /Calls : 973.117 us
Grid : Message : 95.852754 s : WilsonFermion5D FaceTime /Calls : 219.979 us
Grid : Message : 95.852756 s : WilsonFermion5D ComputeTime1/Calls : 2.81676 us
Grid : Message : 95.852758 s : WilsonFermion5D ComputeTime2/Calls : 233.384 us
Grid : Message : 95.852786 s : Average mflops/s per call : 1.78331e+10
Grid : Message : 95.852789 s : Average mflops/s per call per rank : 5.57284e+08
Grid : Message : 95.852791 s : Average mflops/s per call per node : 2.22914e+09
Grid : Message : 95.852793 s : Average mflops/s per call (full) : 4.03955e+07
Grid : Message : 95.852795 s : Average mflops/s per call per rank (full): 1.26236e+06
Grid : Message : 95.852797 s : Average mflops/s per call per node (full): 5.04944e+06
Grid : Message : 95.852801 s : WilsonFermion5D Stencil
Grid : Message : 95.852803 s : WilsonFermion5D StencilEven
Grid : Message : 95.852805 s : WilsonFermion5D StencilOdd
Grid : Message : 95.852809 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 95.852811 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 95.852814 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 104.522368 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 104.522390 s : Called DwDag
Grid : Message : 104.522391 s : norm dag result 12.0422
Grid : Message : 104.526025 s : norm dag ref 12.0422
Grid : Message : 104.528967 s : norm dag diff 7.13141e-14
Grid : Message : 104.538859 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 104.603735 s : src_e0.499992
Grid : Message : 104.679776 s : src_o0.500008
Grid : Message : 104.696490 s : *********************************************************
Grid : Message : 104.696495 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 104.696498 s : * Vectorising space-time by 8
Grid : Message : 104.696500 s : * SINGLE precision
Grid : Message : 104.696502 s : * Using Overlapped Comms/Compute
Grid : Message : 104.696504 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 104.696506 s : *********************************************************
Grid : Message : 150.182043 s : Deo mflop/s = 3.69801e+07
Grid : Message : 150.182074 s : Deo mflop/s per rank 1.15563e+06
Grid : Message : 150.182079 s : Deo mflop/s per node 4.62251e+06
Grid : Message : 150.182082 s : #### Dhop calls report
Grid : Message : 150.182085 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 150.182089 s : WilsonFermion5D TotalTime /Calls : 1516 us
Grid : Message : 150.182093 s : WilsonFermion5D CommTime /Calls : 1019.81 us
Grid : Message : 150.182096 s : WilsonFermion5D FaceTime /Calls : 284.065 us
Grid : Message : 150.182100 s : WilsonFermion5D ComputeTime1/Calls : 4.84424 us
Grid : Message : 150.182103 s : WilsonFermion5D ComputeTime2/Calls : 236.64 us
Grid : Message : 150.182126 s : Average mflops/s per call : 1.01614e+10
Grid : Message : 150.182129 s : Average mflops/s per call per rank : 3.17542e+08
Grid : Message : 150.182131 s : Average mflops/s per call per node : 1.27017e+09
Grid : Message : 150.182133 s : Average mflops/s per call (full) : 3.76491e+07
Grid : Message : 150.182135 s : Average mflops/s per call per rank (full): 1.17653e+06
Grid : Message : 150.182139 s : Average mflops/s per call per node (full): 4.70614e+06
Grid : Message : 150.182141 s : WilsonFermion5D Stencil
Grid : Message : 150.182142 s : WilsonFermion5D StencilEven
Grid : Message : 150.182143 s : WilsonFermion5D StencilOdd
Grid : Message : 150.182144 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 150.182145 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 150.182146 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 150.201118 s : r_e6.02129
Grid : Message : 150.202964 s : r_o6.02097
Grid : Message : 150.204336 s : res12.0423
Grid : Message : 150.311999 s : norm diff 0
Grid : Message : 150.449845 s : norm diff even 0
Grid : Message : 150.531177 s : norm diff odd 0

View File

@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-8A-1110
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1110
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 48.48.48.48 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point, the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
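For reference, a minimal sketch, not taken from the job files above, of the rank and local-volume arithmetic implied by the --grid 48.48.48.48 and --mpi 2.2.2.4 flags passed to the benchmark (the variable names are illustrative only):
#!/usr/bin/env bash
# illustrative only: recompute the MPI rank count and the per-rank local volume
# implied by the Grid flags used in the job script above
grid=(48 48 48 48)   # --grid 48.48.48.48 (global lattice)
mpi=(2 2 2 4)        # --mpi 2.2.2.4 (process decomposition)
ranks=1
local_vol=1
for i in 0 1 2 3; do
  ranks=$(( ranks * mpi[i] ))
  local_vol=$(( local_vol * grid[i] / mpi[i] ))
done
echo "ranks=${ranks} (matches --ntasks=32: 8 nodes x 4 tasks per node)"
echo "local 4d volume per rank=${local_vol} sites (24 x 24 x 24 x 12)"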

View File

@ -0,0 +1,2 @@
Sat Aug 20 21:09:40 BST 2022
epoch 1661026180

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Sat Aug 20 21:18:02 BST 2022
epoch 1661026682

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x0000145d92144000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x0000145d920c4000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x0000145d91d02000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x0000145d91810000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x0000145d914e6000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x0000145d91205000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x0000145d90fa4000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x0000145d9204b000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x0000145d90bc4000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x0000145d8f468000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x0000145d8f098000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x0000145d8edf7000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x0000145d8eccc000)
libm.so.6 => /lib64/libm.so.6 (0x0000145d8e94a000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x0000145d8e713000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x0000145d8e4fb000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x0000145d8e2db000)
libc.so.6 => /lib64/libc.so.6 (0x0000145d8df16000)
libdl.so.2 => /lib64/libdl.so.2 (0x0000145d8dd12000)
/lib64/ld-linux-x86-64.so.2 (0x0000145d91f1a000)
librt.so.1 => /lib64/librt.so.1 (0x0000145d8db0a000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x0000145d91f7f000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x0000145d91f7a000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x0000145d8d9fe000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x0000145d8d7f4000)
libutil.so.1 => /lib64/libutil.so.1 (0x0000145d8d5f0000)

View File

@ -0,0 +1,254 @@
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 1 device 0 bus id: 0000:44:00.0
local rank 3 device 0 bus id: 0000:C4:00.0
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x149500000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommitted changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.406631 s : Grid Layout
Grid : Message : 1.406636 s : Global lattice size : 48 48 48 48
Grid : Message : 1.406643 s : OpenMP threads : 4
Grid : Message : 1.406646 s : MPI tasks : 2 2 2 4
Grid : Message : 1.417988 s : Making s innermost grids
Grid : Message : 1.428288 s : Initialising 4d RNG
Grid : Message : 1.444040 s : Initialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.444063 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.681123 s : Initialising 5d RNG
Grid : Message : 1.913861 s : Initialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.913892 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 4.748405 s : Initialised RNGs
Grid : Message : 5.494987 s : Drawing gauge field
Grid : Message : 5.593455 s : Random gauge initialised
Grid : Message : 5.600656 s : Setting up Cshift based reference
Grid : Message : 10.608322 s : *****************************************************************
Grid : Message : 10.608341 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 10.608342 s : *****************************************************************
Grid : Message : 10.608343 s : *****************************************************************
Grid : Message : 10.608344 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 10.608345 s : * Vectorising space-time by 8
Grid : Message : 10.608346 s : * VComplexF size is 64 B
Grid : Message : 10.608347 s : * SINGLE precision
Grid : Message : 10.608348 s : * Using Overlapped Comms/Compute
Grid : Message : 10.608349 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 10.608350 s : *****************************************************************
Grid : Message : 11.168504 s : Called warmup
Grid : Message : 94.732886 s : Called Dw 30000 times in 8.35639e+07 us
Grid : Message : 94.732942 s : mflop/s = 4.02496e+07
Grid : Message : 94.732944 s : mflop/s per rank = 1.2578e+06
Grid : Message : 94.732946 s : mflop/s per node = 5.0312e+06
Grid : Message : 94.732950 s : RF GiB/s (base 2) = 81786.2
Grid : Message : 94.732952 s : mem GiB/s (base 2) = 51116.4
Grid : Message : 94.733524 s : norm diff 1.05759e-13
Grid : Message : 94.743143 s : #### Dhop calls report
Grid : Message : 94.743150 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 94.743153 s : WilsonFermion5D TotalTime /Calls : 1394.23 us
Grid : Message : 94.743155 s : WilsonFermion5D CommTime /Calls : 949.994 us
Grid : Message : 94.743157 s : WilsonFermion5D FaceTime /Calls : 223.263 us
Grid : Message : 94.743159 s : WilsonFermion5D ComputeTime1/Calls : 2.79139 us
Grid : Message : 94.743161 s : WilsonFermion5D ComputeTime2/Calls : 233.597 us
Grid : Message : 94.743253 s : Average mflops/s per call : 1.76088e+10
Grid : Message : 94.743257 s : Average mflops/s per call per rank : 5.50276e+08
Grid : Message : 94.743259 s : Average mflops/s per call per node : 2.20111e+09
Grid : Message : 94.743261 s : Average mflops/s per call (full) : 4.09375e+07
Grid : Message : 94.743263 s : Average mflops/s per call per rank (full): 1.2793e+06
Grid : Message : 94.743266 s : Average mflops/s per call per node (full): 5.11718e+06
Grid : Message : 94.743269 s : WilsonFermion5D Stencil
Grid : Message : 94.743270 s : WilsonFermion5D StencilEven
Grid : Message : 94.743272 s : WilsonFermion5D StencilOdd
Grid : Message : 94.743275 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 94.743276 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 94.743279 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 103.414014 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 103.414035 s : Called DwDag
Grid : Message : 103.414036 s : norm dag result 12.0422
Grid : Message : 103.421887 s : norm dag ref 12.0422
Grid : Message : 103.424914 s : norm dag diff 7.13141e-14
Grid : Message : 103.435780 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 103.497971 s : src_e0.499992
Grid : Message : 103.565487 s : src_o0.500008
Grid : Message : 103.581935 s : *********************************************************
Grid : Message : 103.581939 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 103.581946 s : * Vectorising space-time by 8
Grid : Message : 103.581949 s : * SINGLE precision
Grid : Message : 103.581950 s : * Using Overlapped Comms/Compute
Grid : Message : 103.581954 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 103.581955 s : *********************************************************
Grid : Message : 148.394945 s : Deo mflop/s = 3.75373e+07
Grid : Message : 148.394972 s : Deo mflop/s per rank 1.17304e+06
Grid : Message : 148.394974 s : Deo mflop/s per node 4.69217e+06
Grid : Message : 148.394977 s : #### Dhop calls report
Grid : Message : 148.394979 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 148.394981 s : WilsonFermion5D TotalTime /Calls : 1493.62 us
Grid : Message : 148.394983 s : WilsonFermion5D CommTime /Calls : 994.118 us
Grid : Message : 148.394985 s : WilsonFermion5D FaceTime /Calls : 286.093 us
Grid : Message : 148.394987 s : WilsonFermion5D ComputeTime1/Calls : 4.89217 us
Grid : Message : 148.394989 s : WilsonFermion5D ComputeTime2/Calls : 236.395 us
Grid : Message : 148.395023 s : Average mflops/s per call : 1.01808e+10
Grid : Message : 148.395027 s : Average mflops/s per call per rank : 3.18151e+08
Grid : Message : 148.395030 s : Average mflops/s per call per node : 1.2726e+09
Grid : Message : 148.395032 s : Average mflops/s per call (full) : 3.82132e+07
Grid : Message : 148.395037 s : Average mflops/s per call per rank (full): 1.19416e+06
Grid : Message : 148.395041 s : Average mflops/s per call per node (full): 4.77665e+06
Grid : Message : 148.395044 s : WilsonFermion5D Stencil
Grid : Message : 148.395046 s : WilsonFermion5D StencilEven
Grid : Message : 148.395049 s : WilsonFermion5D StencilOdd
Grid : Message : 148.395051 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 148.395054 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 148.395056 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 148.413425 s : r_e6.02129
Grid : Message : 148.415423 s : r_o6.02097
Grid : Message : 148.416796 s : res12.0423
Grid : Message : 148.517046 s : norm diff 0
Grid : Message : 148.660878 s : norm diff even 0
Grid : Message : 148.726888 s : norm diff odd 0

View File

@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-8A-1125
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1125
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 48.48.48.48 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point, the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################

View File

@ -0,0 +1,2 @@
Sat Aug 20 21:15:27 BST 2022
epoch 1661026527

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Sat Aug 20 21:23:44 BST 2022
epoch 1661027024

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffccdae5000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x00001492ab336000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x00001492aaf6e000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x00001492aaa7c000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x00001492aa752000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x00001492aa471000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x00001492aa210000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x00001492ab2bd000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x00001492a9e30000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x00001492a86d4000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x00001492a8304000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x00001492a8063000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x00001492a7f38000)
libm.so.6 => /lib64/libm.so.6 (0x00001492a7bb6000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x00001492a797f000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x00001492a7767000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x00001492a7547000)
libc.so.6 => /lib64/libc.so.6 (0x00001492a7182000)
libdl.so.2 => /lib64/libdl.so.2 (0x00001492a6f7e000)
/lib64/ld-linux-x86-64.so.2 (0x00001492ab186000)
librt.so.1 => /lib64/librt.so.1 (0x00001492a6d76000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x00001492ab1f1000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x00001492ab1ec000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x00001492a6c6a000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x00001492a6a60000)
libutil.so.1 => /lib64/libutil.so.1 (0x00001492a685c000)

View File

@ -0,0 +1,254 @@
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
AcceleratorCudaInit: ================================================
local rank 1 device 0 bus id: 0000:44:00.0
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14ad80000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommitted changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.503383 s : Grid Layout
Grid : Message : 1.503386 s : Global lattice size : 48 48 48 48
Grid : Message : 1.503391 s : OpenMP threads : 4
Grid : Message : 1.503393 s : MPI tasks : 2 2 2 4
Grid : Message : 1.516942 s : Making s innermost grids
Grid : Message : 1.527235 s : Initialising 4d RNG
Grid : Message : 1.544084 s : Initialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.544106 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.726178 s : Initialising 5d RNG
Grid : Message : 1.956255 s : Initialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.956288 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 4.571228 s : Initialised RNGs
Grid : Message : 5.568412 s : Drawing gauge field
Grid : Message : 5.727363 s : Random gauge initialised
Grid : Message : 5.741177 s : Setting up Cshift based reference
Grid : Message : 10.690571 s : *****************************************************************
Grid : Message : 10.690591 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 10.690592 s : *****************************************************************
Grid : Message : 10.690593 s : *****************************************************************
Grid : Message : 10.690594 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 10.690595 s : * Vectorising space-time by 8
Grid : Message : 10.690596 s : * VComplexF size is 64 B
Grid : Message : 10.690597 s : * SINGLE precision
Grid : Message : 10.690600 s : * Using Overlapped Comms/Compute
Grid : Message : 10.690606 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 10.690607 s : *****************************************************************
Grid : Message : 11.155061 s : Called warmup
Grid : Message : 93.920472 s : Called Dw 30000 times in 8.27652e+07 us
Grid : Message : 93.920540 s : mflop/s = 4.0638e+07
Grid : Message : 93.920542 s : mflop/s per rank = 1.26994e+06
Grid : Message : 93.920544 s : mflop/s per node = 5.07975e+06
Grid : Message : 93.920546 s : RF GiB/s (base 2) = 82575.4
Grid : Message : 93.920548 s : mem GiB/s (base 2) = 51609.6
Grid : Message : 93.921119 s : norm diff 1.05759e-13
Grid : Message : 93.930750 s : #### Dhop calls report
Grid : Message : 93.930758 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 93.930761 s : WilsonFermion5D TotalTime /Calls : 1380.46 us
Grid : Message : 93.930763 s : WilsonFermion5D CommTime /Calls : 935.406 us
Grid : Message : 93.930765 s : WilsonFermion5D FaceTime /Calls : 223.911 us
Grid : Message : 93.930767 s : WilsonFermion5D ComputeTime1/Calls : 2.84526 us
Grid : Message : 93.930769 s : WilsonFermion5D ComputeTime2/Calls : 233.719 us
Grid : Message : 93.930799 s : Average mflops/s per call : 1.7744e+10
Grid : Message : 93.930802 s : Average mflops/s per call per rank : 5.54499e+08
Grid : Message : 93.930804 s : Average mflops/s per call per node : 2.218e+09
Grid : Message : 93.930806 s : Average mflops/s per call (full) : 4.13456e+07
Grid : Message : 93.930810 s : Average mflops/s per call per rank (full): 1.29205e+06
Grid : Message : 93.930812 s : Average mflops/s per call per node (full): 5.1682e+06
Grid : Message : 93.930814 s : WilsonFermion5D Stencil
Grid : Message : 93.930815 s : WilsonFermion5D StencilEven
Grid : Message : 93.930817 s : WilsonFermion5D StencilOdd
Grid : Message : 93.930818 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 93.930819 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 93.930820 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 102.631972 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 102.631993 s : Called DwDag
Grid : Message : 102.631994 s : norm dag result 12.0422
Grid : Message : 102.634228 s : norm dag ref 12.0422
Grid : Message : 102.637138 s : norm dag diff 7.13141e-14
Grid : Message : 102.646956 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 102.713481 s : src_e0.499992
Grid : Message : 102.788976 s : src_o0.500008
Grid : Message : 102.805384 s : *********************************************************
Grid : Message : 102.805388 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 102.805391 s : * Vectorising space-time by 8
Grid : Message : 102.805393 s : * SINGLE precision
Grid : Message : 102.805396 s : * Using Overlapped Comms/Compute
Grid : Message : 102.805398 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 102.805400 s : *********************************************************
Grid : Message : 147.202877 s : Deo mflop/s = 3.78867e+07
Grid : Message : 147.202909 s : Deo mflop/s per rank 1.18396e+06
Grid : Message : 147.202914 s : Deo mflop/s per node 4.73584e+06
Grid : Message : 147.202918 s : #### Dhop calls report
Grid : Message : 147.202920 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 147.202923 s : WilsonFermion5D TotalTime /Calls : 1479.75 us
Grid : Message : 147.202927 s : WilsonFermion5D CommTime /Calls : 975.318 us
Grid : Message : 147.202929 s : WilsonFermion5D FaceTime /Calls : 293.474 us
Grid : Message : 147.202932 s : WilsonFermion5D ComputeTime1/Calls : 4.93714 us
Grid : Message : 147.202935 s : WilsonFermion5D ComputeTime2/Calls : 236.494 us
Grid : Message : 147.202962 s : Average mflops/s per call : 1.02376e+10
Grid : Message : 147.202965 s : Average mflops/s per call per rank : 3.19924e+08
Grid : Message : 147.202967 s : Average mflops/s per call per node : 1.2797e+09
Grid : Message : 147.202969 s : Average mflops/s per call (full) : 3.85713e+07
Grid : Message : 147.202971 s : Average mflops/s per call per rank (full): 1.20535e+06
Grid : Message : 147.202973 s : Average mflops/s per call per node (full): 4.82142e+06
Grid : Message : 147.202976 s : WilsonFermion5D Stencil
Grid : Message : 147.202978 s : WilsonFermion5D StencilEven
Grid : Message : 147.202980 s : WilsonFermion5D StencilOdd
Grid : Message : 147.202982 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 147.202985 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 147.202988 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 147.219874 s : r_e6.02129
Grid : Message : 147.221715 s : r_o6.02097
Grid : Message : 147.223077 s : res12.0423
Grid : Message : 147.332465 s : norm diff 0
Grid : Message : 147.471882 s : norm diff even 0
Grid : Message : 147.546548 s : norm diff odd 0

View File

@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-8A-1140
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1140
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 48.48.48.48 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point, the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################

View File

@ -0,0 +1,2 @@
Sat Aug 20 21:21:10 BST 2022
epoch 1661026870

Some files were not shown because too many files have changed in this diff