Initial commit

Antonin Portelli 2022-09-07 17:31:28 +01:00
commit ade190016a
8502 changed files with 4552538 additions and 0 deletions

.gitignore (vendored, new file)

@@ -0,0 +1 @@
*.code-workspace

2-racks/rack-power.db (binary, new file; contents not shown)


@@ -0,0 +1,5 @@
nnodes : 16
ntasks : 64
partition : gpu
mpi-geom : 2.2.2.8
grid-geom : 48.48.48.96
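
The geometry above is internally consistent: the MPI decomposition 2.2.2.8 has
64 ranks (matching ntasks, i.e. 16 nodes x 4 ranks per node) and divides the
global lattice 48.48.48.96 into a 24x24x24x12 local volume per rank. A quick
consistency check, for illustration only and not part of the committed files:

#!/usr/bin/env bash
mpi=(2 2 2 8); grid=(48 48 48 96); ntasks=1
for i in 0 1 2 3; do
    # each MPI dimension must divide the corresponding lattice extent
    (( grid[i] % mpi[i] == 0 )) || { echo "dim $i does not divide"; exit 1; }
    echo "dim $i: local extent $(( grid[i] / mpi[i] ))"
    (( ntasks *= mpi[i] ))
done
echo "total ranks: ${ntasks}"  # prints 64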


@@ -0,0 +1,13 @@
#!/usr/bin/env bash
# Bind each local MPI rank to its own 16-core block and the matching NUMA node.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
numa=${lrank}
# cores lrank*16 .. lrank*16+15 (e.g. rank 1 -> cores 16-31)
cpus="$(( lrank*16 ))-$(( (lrank+1)*16-1 ))"
places="$(( lrank*16 )):$(( (lrank+1)*16 ))"
BINDING="taskset -c ${cpus} numactl -m ${numa}"
export OMP_PLACES=${places}
echo "$(hostname) - ${lrank} binding='${BINDING}'"
${BINDING} "$@"
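
For illustration (not part of the committed script), the binding each local
rank receives under this wrapper with four ranks per node:

for lrank in 0 1 2 3; do
    echo "rank ${lrank}: cores $(( lrank*16 ))-$(( (lrank+1)*16-1 )), numa node ${lrank}"
done
# rank 0: cores 0-15,  numa node 0
# rank 1: cores 16-31, numa node 1
# ...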


@@ -0,0 +1 @@
../dwf_fp32.tok


@@ -0,0 +1,14 @@
#!/usr/bin/env bash
# Per-rank binding: local rank l uses GPU l, NIC mlx5_l, and NUMA nodes 2l, 2l+1.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
numa1=$(( 2 * lrank ))
numa2=$(( 2 * lrank + 1 ))
netdev=mlx5_${lrank}:1
export CUDA_VISIBLE_DEVICES=$OMPI_COMM_WORLD_LOCAL_RANK
export UCX_NET_DEVICES=${netdev}
# interleave host allocations across the two NUMA nodes local to the GPU
BINDING="--interleave=$numa1,$numa2"
echo "$(hostname) - $lrank device=$CUDA_VISIBLE_DEVICES binding=$BINDING"
numactl ${BINDING} "$@"
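
This is the wrapper the job scripts below invoke as ./gpu-mpi-wrapper.sh. With
four ranks per node it reproduces the mapping echoed in the logs; a sketch:

for lrank in 0 1 2 3; do
    echo "rank ${lrank}: GPU ${lrank}, NIC mlx5_${lrank}:1, numactl --interleave=$(( 2*lrank )),$(( 2*lrank + 1 ))"
done
# rank 0: GPU 0, NIC mlx5_0:1, numactl --interleave=0,1
# ...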


@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large.


@@ -0,0 +1,2 @@
Sat Aug 20 20:25:12 BST 2022
epoch 1661023512

File diff suppressed because one or more lines are too long


@@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffef5f3f000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000015459e0bd000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000015459dcf5000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000015459d803000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000015459d4d9000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000015459d1f8000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000015459cf97000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000015459e044000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000015459cbb7000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000015459b45b000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000015459b08b000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000015459adea000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000015459acbf000)
libm.so.6 => /lib64/libm.so.6 (0x000015459a93d000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000015459a706000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000015459a4ee000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000015459a2ce000)
libc.so.6 => /lib64/libc.so.6 (0x0000154599f09000)
libdl.so.2 => /lib64/libdl.so.2 (0x0000154599d05000)
/lib64/ld-linux-x86-64.so.2 (0x000015459df0d000)
librt.so.1 => /lib64/librt.so.1 (0x0000154599afd000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000015459df78000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000015459df73000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x00001545999f1000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x00001545997e7000)
libutil.so.1 => /lib64/libutil.so.1 (0x00001545995e3000)


@@ -0,0 +1,286 @@
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 1 device 0 bus id: 0000:44:00.0
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14ea00000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.499143 s : Grid Layout
Grid : Message : 1.499148 s : Global lattice size : 48 48 48 96
Grid : Message : 1.499155 s : OpenMP threads : 4
Grid : Message : 1.499157 s : MPI tasks : 2 2 2 8
Grid : Message : 1.515541 s : Making s innermost grids
Grid : Message : 1.532470 s : Initialising 4d RNG
Grid : Message : 1.550455 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.550491 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.937366 s : Initialising 5d RNG
Grid : Message : 2.163040 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 2.163078 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 7.467109 s : Initialised RNGs
Grid : Message : 8.261272 s : Drawing gauge field
Grid : Message : 8.380110 s : Random gauge initialised
Grid : Message : 8.388989 s : Setting up Cshift based reference
Grid : Message : 13.599668 s : *****************************************************************
Grid : Message : 13.599694 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 13.599696 s : *****************************************************************
Grid : Message : 13.599700 s : *****************************************************************
Grid : Message : 13.599702 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 13.599705 s : * Vectorising space-time by 8
Grid : Message : 13.599708 s : * VComplexF size is 64 B
Grid : Message : 13.599710 s : * SINGLE precision
Grid : Message : 13.599712 s : * Using Overlapped Comms/Compute
Grid : Message : 13.599716 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 13.599719 s : *****************************************************************
Grid : Message : 14.992290 s : Called warmup
Grid : Message : 104.236264 s : Called Dw 30000 times in 9.01365e+07 us
Grid : Message : 104.236329 s : mflop/s = 7.46293e+07
Grid : Message : 104.236331 s : mflop/s per rank = 1.16608e+06
Grid : Message : 104.236333 s : mflop/s per node = 4.66433e+06
Grid : Message : 104.236335 s : RF GiB/s (base 2) = 151645
Grid : Message : 104.236337 s : mem GiB/s (base 2) = 94778.1
Grid : Message : 104.236908 s : norm diff 1.05775e-13
Grid : Message : 104.247209 s : #### Dhop calls report
Grid : Message : 104.247215 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 104.247219 s : WilsonFermion5D TotalTime /Calls : 1503.52 us
Grid : Message : 104.247221 s : WilsonFermion5D CommTime /Calls : 1054.2 us
Grid : Message : 104.247223 s : WilsonFermion5D FaceTime /Calls : 225.375 us
Grid : Message : 104.247225 s : WilsonFermion5D ComputeTime1/Calls : 3.01152 us
Grid : Message : 104.247227 s : WilsonFermion5D ComputeTime2/Calls : 236.377 us
Grid : Message : 104.247294 s : Average mflops/s per call : 3.59587e+10
Grid : Message : 104.247300 s : Average mflops/s per call per rank : 5.61855e+08
Grid : Message : 104.247303 s : Average mflops/s per call per node : 2.24742e+09
Grid : Message : 104.247305 s : Average mflops/s per call (full) : 7.59233e+07
Grid : Message : 104.247307 s : Average mflops/s per call per rank (full): 1.1863e+06
Grid : Message : 104.247309 s : Average mflops/s per call per node (full): 4.7452e+06
Grid : Message : 104.247311 s : WilsonFermion5D Stencil
Grid : Message : 104.247312 s : WilsonFermion5D StencilEven
Grid : Message : 104.247313 s : WilsonFermion5D StencilOdd
Grid : Message : 104.247314 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 104.247315 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 104.247316 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 112.998074 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 112.998099 s : Called DwDag
Grid : Message : 112.998100 s : norm dag result 12.0422
Grid : Message : 113.585000 s : norm dag ref 12.0422
Grid : Message : 113.380300 s : norm dag diff 7.28899e-14
Grid : Message : 113.140290 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 113.790730 s : src_e0.5
Grid : Message : 113.153215 s : src_o0.5
Grid : Message : 113.170341 s : *********************************************************
Grid : Message : 113.170346 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 113.170347 s : * Vectorising space-time by 8
Grid : Message : 113.170353 s : * SINGLE precision
Grid : Message : 113.170356 s : * Using Overlapped Comms/Compute
Grid : Message : 113.170357 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 113.170361 s : *********************************************************
Grid : Message : 161.702832 s : Deo mflop/s = 6.93159e+07
Grid : Message : 161.702861 s : Deo mflop/s per rank 1.08306e+06
Grid : Message : 161.702863 s : Deo mflop/s per node 4.33224e+06
Grid : Message : 161.702866 s : #### Dhop calls report
Grid : Message : 161.702868 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 161.702870 s : WilsonFermion5D TotalTime /Calls : 1617.57 us
Grid : Message : 161.702872 s : WilsonFermion5D CommTime /Calls : 1105.14 us
Grid : Message : 161.702874 s : WilsonFermion5D FaceTime /Calls : 294.218 us
Grid : Message : 161.702876 s : WilsonFermion5D ComputeTime1/Calls : 4.85114 us
Grid : Message : 161.702878 s : WilsonFermion5D ComputeTime2/Calls : 241.569 us
Grid : Message : 161.702900 s : Average mflops/s per call : 2.0686e+10
Grid : Message : 161.702904 s : Average mflops/s per call per rank : 3.23219e+08
Grid : Message : 161.702906 s : Average mflops/s per call per node : 1.29288e+09
Grid : Message : 161.702908 s : Average mflops/s per call (full) : 7.05701e+07
Grid : Message : 161.702912 s : Average mflops/s per call per rank (full): 1.10266e+06
Grid : Message : 161.702914 s : Average mflops/s per call per node (full): 4.41063e+06
Grid : Message : 161.702920 s : WilsonFermion5D Stencil
Grid : Message : 161.702922 s : WilsonFermion5D StencilEven
Grid : Message : 161.702923 s : WilsonFermion5D StencilOdd
Grid : Message : 161.702926 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 161.702927 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 161.702928 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 161.722751 s : r_e6.02106
Grid : Message : 161.724439 s : r_o6.0211
Grid : Message : 161.725861 s : res12.0422
Grid : Message : 161.827558 s : norm diff 0
Grid : Message : 161.972191 s : norm diff even 0
Grid : Message : 162.433730 s : norm diff odd 0


@@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]
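
This is the compact SLURM hostlist for the 16 nodes of the job; it can be
expanded to one hostname per line the same way the job script does:

scontrol show hostnames 'tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]'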


@@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-16A-1005
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1005
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 48.48.48.96 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUS ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
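
For reference, a sketch of how this job would be submitted and inspected; the
script filename is an assumption (only the job name power-16A-1005 appears
above), and the output file follows the --output=%x.%j.out pattern:

sbatch power-16A-1005.sh              # hypothetical filename
squeue -u "$USER"                     # wait for the job to run
less power-16A-1005.<jobid>.out       # job name . job id . out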


@@ -0,0 +1,2 @@
Sat Aug 20 20:22:21 BST 2022
epoch 1661023341


@@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large.


@@ -0,0 +1,2 @@
Sat Aug 20 20:37:35 BST 2022
epoch 1661024255

File diff suppressed because one or more lines are too long


@@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffff456d000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x0000154c9a375000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x0000154c99fad000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x0000154c99abb000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x0000154c99791000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x0000154c994b0000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x0000154c9924f000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x0000154c9a2fc000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x0000154c98e6f000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x0000154c97713000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x0000154c97343000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x0000154c970a2000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x0000154c96f77000)
libm.so.6 => /lib64/libm.so.6 (0x0000154c96bf5000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x0000154c969be000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x0000154c967a6000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x0000154c96586000)
libc.so.6 => /lib64/libc.so.6 (0x0000154c961c1000)
libdl.so.2 => /lib64/libdl.so.2 (0x0000154c95fbd000)
/lib64/ld-linux-x86-64.so.2 (0x0000154c9a1c5000)
librt.so.1 => /lib64/librt.so.1 (0x0000154c95db5000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x0000154c9a230000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x0000154c9a22b000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x0000154c95ca9000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x0000154c95a9f000)
libutil.so.1 => /lib64/libutil.so.1 (0x0000154c9589b000)


@@ -0,0 +1,286 @@
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 1 device 0 bus id: 0000:44:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14d8e0000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.312638 s : Grid Layout
Grid : Message : 1.312643 s : Global lattice size : 48 48 48 96
Grid : Message : 1.312650 s : OpenMP threads : 4
Grid : Message : 1.312652 s : MPI tasks : 2 2 2 8
Grid : Message : 1.327971 s : Making s innermost grids
Grid : Message : 1.344471 s : Initialising 4d RNG
Grid : Message : 1.361018 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.361045 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.837887 s : Initialising 5d RNG
Grid : Message : 2.844490 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 2.845110 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 7.428202 s : Initialised RNGs
Grid : Message : 8.439960 s : Drawing gauge field
Grid : Message : 8.560999 s : Random gauge initialised
Grid : Message : 8.573339 s : Setting up Cshift based reference
Grid : Message : 13.695651 s : *****************************************************************
Grid : Message : 13.695676 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 13.695677 s : *****************************************************************
Grid : Message : 13.695678 s : *****************************************************************
Grid : Message : 13.695679 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 13.695680 s : * Vectorising space-time by 8
Grid : Message : 13.695681 s : * VComplexF size is 64 B
Grid : Message : 13.695682 s : * SINGLE precision
Grid : Message : 13.695684 s : * Using Overlapped Comms/Compute
Grid : Message : 13.695685 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 13.695686 s : *****************************************************************
Grid : Message : 14.234933 s : Called warmup
Grid : Message : 103.428452 s : Called Dw 30000 times in 8.91932e+07 us
Grid : Message : 103.428517 s : mflop/s = 7.54186e+07
Grid : Message : 103.428519 s : mflop/s per rank = 1.17842e+06
Grid : Message : 103.428521 s : mflop/s per node = 4.71366e+06
Grid : Message : 103.428523 s : RF GiB/s (base 2) = 153249
Grid : Message : 103.428525 s : mem GiB/s (base 2) = 95780.5
Grid : Message : 103.429097 s : norm diff 1.05775e-13
Grid : Message : 103.439111 s : #### Dhop calls report
Grid : Message : 103.439118 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 103.439122 s : WilsonFermion5D TotalTime /Calls : 1487.69 us
Grid : Message : 103.439124 s : WilsonFermion5D CommTime /Calls : 1041.46 us
Grid : Message : 103.439126 s : WilsonFermion5D FaceTime /Calls : 222.459 us
Grid : Message : 103.439128 s : WilsonFermion5D ComputeTime1/Calls : 2.85969 us
Grid : Message : 103.439130 s : WilsonFermion5D ComputeTime2/Calls : 236.325 us
Grid : Message : 103.439201 s : Average mflops/s per call : 3.60313e+10
Grid : Message : 103.439207 s : Average mflops/s per call per rank : 5.62989e+08
Grid : Message : 103.439209 s : Average mflops/s per call per node : 2.25196e+09
Grid : Message : 103.439211 s : Average mflops/s per call (full) : 7.67311e+07
Grid : Message : 103.439213 s : Average mflops/s per call per rank (full): 1.19892e+06
Grid : Message : 103.439215 s : Average mflops/s per call per node (full): 4.7957e+06
Grid : Message : 103.439217 s : WilsonFermion5D Stencil
Grid : Message : 103.439218 s : WilsonFermion5D StencilEven
Grid : Message : 103.439219 s : WilsonFermion5D StencilOdd
Grid : Message : 103.439220 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 103.439221 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 103.439222 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 112.177904 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 112.177939 s : Called DwDag
Grid : Message : 112.177940 s : norm dag result 12.0422
Grid : Message : 112.186235 s : norm dag ref 12.0422
Grid : Message : 112.189309 s : norm dag diff 7.28899e-14
Grid : Message : 112.200523 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 112.263704 s : src_e0.5
Grid : Message : 112.335429 s : src_o0.5
Grid : Message : 112.352238 s : *********************************************************
Grid : Message : 112.352244 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 112.352246 s : * Vectorising space-time by 8
Grid : Message : 112.352248 s : * SINGLE precision
Grid : Message : 112.352250 s : * Using Overlapped Comms/Compute
Grid : Message : 112.352253 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 112.352254 s : *********************************************************
Grid : Message : 160.328889 s : Deo mflop/s = 7.01193e+07
Grid : Message : 160.328922 s : Deo mflop/s per rank 1.09561e+06
Grid : Message : 160.328924 s : Deo mflop/s per node 4.38246e+06
Grid : Message : 160.328927 s : #### Dhop calls report
Grid : Message : 160.328929 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 160.328931 s : WilsonFermion5D TotalTime /Calls : 1599.04 us
Grid : Message : 160.328933 s : WilsonFermion5D CommTime /Calls : 1088.05 us
Grid : Message : 160.328935 s : WilsonFermion5D FaceTime /Calls : 294.436 us
Grid : Message : 160.328937 s : WilsonFermion5D ComputeTime1/Calls : 4.78577 us
Grid : Message : 160.328939 s : WilsonFermion5D ComputeTime2/Calls : 241.411 us
Grid : Message : 160.328966 s : Average mflops/s per call : 2.07599e+10
Grid : Message : 160.328971 s : Average mflops/s per call per rank : 3.24373e+08
Grid : Message : 160.328975 s : Average mflops/s per call per node : 1.29749e+09
Grid : Message : 160.328980 s : Average mflops/s per call (full) : 7.13878e+07
Grid : Message : 160.328983 s : Average mflops/s per call per rank (full): 1.11543e+06
Grid : Message : 160.328987 s : Average mflops/s per call per node (full): 4.46174e+06
Grid : Message : 160.328989 s : WilsonFermion5D Stencil
Grid : Message : 160.328990 s : WilsonFermion5D StencilEven
Grid : Message : 160.328992 s : WilsonFermion5D StencilOdd
Grid : Message : 160.328995 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 160.328997 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 160.329000 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 160.348014 s : r_e6.02106
Grid : Message : 160.350033 s : r_o6.0211
Grid : Message : 160.351497 s : res12.0422
Grid : Message : 160.466811 s : norm diff 0
Grid : Message : 160.599190 s : norm diff even 0
Grid : Message : 160.669838 s : norm diff odd 0


@@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]


@@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-16A-1020
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1020
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 48.48.48.96 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUS ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
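
This script is identical to the previous one except for the job name and
freq=1020 instead of 1005, i.e. the commit captures a GPU clock-frequency
sweep. A sketch of how such a family of scripts could be generated (the
template name and placeholder are assumptions, not part of the commit):

for freq in 1005 1020; do
    # hypothetical template with @FREQ@ in the job name and the freq= line
    sed "s/@FREQ@/${freq}/g" power-16A.sh.template > "power-16A-${freq}.sh"
done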


@@ -0,0 +1,2 @@
Sat Aug 20 20:34:46 BST 2022
epoch 1661024086


@@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large.


@@ -0,0 +1,2 @@
Sat Aug 20 20:43:25 BST 2022
epoch 1661024605

File diff suppressed because one or more lines are too long


@@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffd625a8000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014ff21a6a000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014ff216a2000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014ff211b0000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014ff20e86000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014ff20ba5000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014ff20944000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014ff219f1000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014ff20564000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014ff1ee08000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014ff1ea38000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014ff1e797000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014ff1e66c000)
libm.so.6 => /lib64/libm.so.6 (0x000014ff1e2ea000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014ff1e0b3000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014ff1de9b000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014ff1dc7b000)
libc.so.6 => /lib64/libc.so.6 (0x000014ff1d8b6000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014ff1d6b2000)
/lib64/ld-linux-x86-64.so.2 (0x000014ff218ba000)
librt.so.1 => /lib64/librt.so.1 (0x000014ff1d4aa000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014ff21925000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014ff21920000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014ff1d39e000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014ff1d194000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014ff1cf90000)


@@ -0,0 +1,286 @@
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 1 device 0 bus id: 0000:44:00.0
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
local rank 3 device 0 bus id: 0000:C4:00.0
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x146a80000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.623478 s : Grid Layout
Grid : Message : 1.623482 s : Global lattice size : 48 48 48 96
Grid : Message : 1.623486 s : OpenMP threads : 4
Grid : Message : 1.623488 s : MPI tasks : 2 2 2 8
Grid : Message : 1.637678 s : Making s innermost grids
Grid : Message : 1.654638 s : Initialising 4d RNG
Grid : Message : 1.670417 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.670443 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 2.165386 s : Initialising 5d RNG
Grid : Message : 2.399472 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 2.399504 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 7.787095 s : Initialised RNGs
Grid : Message : 8.568006 s : Drawing gauge field
Grid : Message : 8.661012 s : Random gauge initialised
Grid : Message : 8.665024 s : Setting up Cshift based reference
Grid : Message : 13.760660 s : *****************************************************************
Grid : Message : 13.760685 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 13.760687 s : *****************************************************************
Grid : Message : 13.760690 s : *****************************************************************
Grid : Message : 13.760691 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 13.760692 s : * Vectorising space-time by 8
Grid : Message : 13.760694 s : * VComplexF size is 64 B
Grid : Message : 13.760696 s : * SINGLE precision
Grid : Message : 13.760697 s : * Using Overlapped Comms/Compute
Grid : Message : 13.760698 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 13.760700 s : *****************************************************************
Grid : Message : 14.326353 s : Called warmup
Grid : Message : 102.469231 s : Called Dw 30000 times in 8.81428e+07 us
Grid : Message : 102.469296 s : mflop/s = 7.63173e+07
Grid : Message : 102.469299 s : mflop/s per rank = 1.19246e+06
Grid : Message : 102.469307 s : mflop/s per node = 4.76983e+06
Grid : Message : 102.469310 s : RF GiB/s (base 2) = 155075
Grid : Message : 102.469313 s : mem GiB/s (base 2) = 96921.9
Grid : Message : 102.469886 s : norm diff 1.05775e-13
Grid : Message : 102.480527 s : #### Dhop calls report
Grid : Message : 102.480534 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 102.480538 s : WilsonFermion5D TotalTime /Calls : 1470.47 us
Grid : Message : 102.480540 s : WilsonFermion5D CommTime /Calls : 1029.89 us
Grid : Message : 102.480542 s : WilsonFermion5D FaceTime /Calls : 217.938 us
Grid : Message : 102.480544 s : WilsonFermion5D ComputeTime1/Calls : 3.09645 us
Grid : Message : 102.480546 s : WilsonFermion5D ComputeTime2/Calls : 235.402 us
Grid : Message : 102.480575 s : Average mflops/s per call : 3.61099e+10
Grid : Message : 102.480579 s : Average mflops/s per call per rank : 5.64217e+08
Grid : Message : 102.480581 s : Average mflops/s per call per node : 2.25687e+09
Grid : Message : 102.480583 s : Average mflops/s per call (full) : 7.76299e+07
Grid : Message : 102.480587 s : Average mflops/s per call per rank (full): 1.21297e+06
Grid : Message : 102.480590 s : Average mflops/s per call per node (full): 4.85187e+06
Grid : Message : 102.480593 s : WilsonFermion5D Stencil
Grid : Message : 102.480596 s : WilsonFermion5D StencilEven
Grid : Message : 102.480598 s : WilsonFermion5D StencilOdd
Grid : Message : 102.480600 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 102.480603 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 102.480605 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 111.202302 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 111.202331 s : Called DwDag
Grid : Message : 111.202332 s : norm dag result 12.0422
Grid : Message : 111.204652 s : norm dag ref 12.0422
Grid : Message : 111.207748 s : norm dag diff 7.28899e-14
Grid : Message : 111.218376 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 111.273653 s : src_e0.5
Grid : Message : 111.352934 s : src_o0.5
Grid : Message : 111.369965 s : *********************************************************
Grid : Message : 111.369970 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 111.369974 s : * Vectorising space-time by 8
Grid : Message : 111.369976 s : * SINGLE precision
Grid : Message : 111.369977 s : * Using Overlapped Comms/Compute
Grid : Message : 111.369981 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 111.369983 s : *********************************************************
Grid : Message : 158.806725 s : Deo mflop/s = 7.09164e+07
Grid : Message : 158.806755 s : Deo mflop/s per rank 1.10807e+06
Grid : Message : 158.806757 s : Deo mflop/s per node 4.43227e+06
Grid : Message : 158.806760 s : #### Dhop calls report
Grid : Message : 158.806762 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 158.806764 s : WilsonFermion5D TotalTime /Calls : 1581.06 us
Grid : Message : 158.806766 s : WilsonFermion5D CommTime /Calls : 1077.77 us
Grid : Message : 158.806768 s : WilsonFermion5D FaceTime /Calls : 286.721 us
Grid : Message : 158.806770 s : WilsonFermion5D ComputeTime1/Calls : 4.98297 us
Grid : Message : 158.806772 s : WilsonFermion5D ComputeTime2/Calls : 240.035 us
Grid : Message : 158.806792 s : Average mflops/s per call : 2.0753e+10
Grid : Message : 158.806796 s : Average mflops/s per call per rank : 3.24266e+08
Grid : Message : 158.806798 s : Average mflops/s per call per node : 1.29706e+09
Grid : Message : 158.806800 s : Average mflops/s per call (full) : 7.21996e+07
Grid : Message : 158.806804 s : Average mflops/s per call per rank (full): 1.12812e+06
Grid : Message : 158.806807 s : Average mflops/s per call per node (full): 4.51247e+06
Grid : Message : 158.806809 s : WilsonFermion5D Stencil
Grid : Message : 158.806810 s : WilsonFermion5D StencilEven
Grid : Message : 158.806812 s : WilsonFermion5D StencilOdd
Grid : Message : 158.806814 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 158.806816 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 158.806818 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 158.823821 s : r_e6.02106
Grid : Message : 158.827207 s : r_o6.0211
Grid : Message : 158.828617 s : res12.0422
Grid : Message : 158.938772 s : norm diff 0
Grid : Message : 159.724700 s : norm diff even 0
Grid : Message : 159.148761 s : norm diff odd 0
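A quick cross-check of the throughput bookkeeping in the log above: the aggregate mflop/s is consistent with 1320 flops per 4d site per Dw application and Ls = 16 (both inferred from the reported figures, neither printed in the log), and the per-rank and per-node numbers are simply the aggregate divided by the 64 MPI ranks and 16 nodes. A minimal sketch under those assumptions:

#!/usr/bin/env bash
# Reproduce the Dw throughput figures from the log above. The flop
# count model (1320 flops/site, Ls=16) is an inferred assumption,
# not something the benchmark prints.
awk 'BEGIN {
    vol    = 48*48*48*96        # global 4d lattice volume
    Ls     = 16                 # assumed 5th-dimension extent
    flops  = 1320 * vol * Ls    # assumed flops per Dw call
    ncall  = 30000              # "Called Dw 30000 times"
    t_us   = 8.81428e7          # "... in 8.81428e+07 us"
    mflops = flops * ncall / t_us
    printf "mflop/s          = %.6g\n", mflops        # log: 7.63173e+07
    printf "mflop/s per rank = %.6g\n", mflops / 64   # log: 1.19246e+06
    printf "mflop/s per node = %.6g\n", mflops / 16   # log: 4.76983e+06
}'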
View File
@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]
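The nodes file above stores the Slurm compact hostlist verbatim; it expands to the sixteen individual hostnames with scontrol, the same call the job script below uses for per-node clock control:

# Expand the compact hostlist into one hostname per line (assumes a
# host with Slurm client tools available, e.g. a cluster login node).
scontrol show hostnames 'tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]'
# -> tu-c0r1n00, tu-c0r1n03, ..., tu-c0r2n21 (16 hosts)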
View File
@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-16A-1035
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1035
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 48.48.48.96 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUS ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
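For reference, a script like the one above is submitted with sbatch, and each run records its provenance under job/<job-name>.<job-id> as set up in its "collect job information" section; the files that follow in this commit are exactly those artifacts. A hedged sketch of that workflow (the .sh file name is illustrative):

# Submit the benchmark and, after completion, inspect the provenance
# directory the script creates (the script file name is hypothetical).
jobid=$(sbatch --parsable power-16A-1035.sh)
ls "job/power-16A-1035.${jobid}"
# expected contents: app-hash  elf  end-date  env  ldd  log  nodes
#                    script  start-date  success
cat "job/power-16A-1035.${jobid}/log"   # the Grid benchmark output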
View File
@ -0,0 +1,2 @@
Sat Aug 20 20:40:36 BST 2022
epoch 1661024436
View File
@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32
File diff suppressed because it is too large
View File
@ -0,0 +1,2 @@
Sat Aug 20 20:49:15 BST 2022
epoch 1661024955
File diff suppressed because one or more lines are too long
View File
@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffe2b5fb000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x00001470cbce5000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x00001470cb91d000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x00001470cb42b000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x00001470cb101000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x00001470cae20000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x00001470cabbf000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x00001470cbc6c000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x00001470ca7df000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x00001470c9083000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x00001470c8cb3000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x00001470c8a12000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x00001470c88e7000)
libm.so.6 => /lib64/libm.so.6 (0x00001470c8565000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x00001470c832e000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x00001470c8116000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x00001470c7ef6000)
libc.so.6 => /lib64/libc.so.6 (0x00001470c7b31000)
libdl.so.2 => /lib64/libdl.so.2 (0x00001470c792d000)
/lib64/ld-linux-x86-64.so.2 (0x00001470cbb35000)
librt.so.1 => /lib64/librt.so.1 (0x00001470c7725000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x00001470cbba0000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x00001470cbb9b000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x00001470c7619000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x00001470c740f000)
libutil.so.1 => /lib64/libutil.so.1 (0x00001470c720b000)
View File
@ -0,0 +1,286 @@
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 3 device 0 bus id: 0000:C4:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
local rank 1 device 0 bus id: 0000:44:00.0
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14f600000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.720184 s : Grid Layout
Grid : Message : 1.720188 s : Global lattice size : 48 48 48 96
Grid : Message : 1.720196 s : OpenMP threads : 4
Grid : Message : 1.720199 s : MPI tasks : 2 2 2 8
Grid : Message : 1.735275 s : Making s innermost grids
Grid : Message : 1.752323 s : Initialising 4d RNG
Grid : Message : 1.768478 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.768504 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 2.201838 s : Initialising 5d RNG
Grid : Message : 2.438683 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 2.438714 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 7.906459 s : Initialised RNGs
Grid : Message : 8.718015 s : Drawing gauge field
Grid : Message : 8.851801 s : Random gauge initialised
Grid : Message : 8.862438 s : Setting up Cshift based reference
Grid : Message : 13.896599 s : *****************************************************************
Grid : Message : 13.896621 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 13.896622 s : *****************************************************************
Grid : Message : 13.896623 s : *****************************************************************
Grid : Message : 13.896624 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 13.896625 s : * Vectorising space-time by 8
Grid : Message : 13.896626 s : * VComplexF size is 64 B
Grid : Message : 13.896627 s : * SINGLE precision
Grid : Message : 13.896628 s : * Using Overlapped Comms/Compute
Grid : Message : 13.896629 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 13.896630 s : *****************************************************************
Grid : Message : 14.428387 s : Called warmup
Grid : Message : 101.915473 s : Called Dw 30000 times in 8.74869e+07 us
Grid : Message : 101.915527 s : mflop/s = 7.68895e+07
Grid : Message : 101.915529 s : mflop/s per rank = 1.2014e+06
Grid : Message : 101.915531 s : mflop/s per node = 4.80559e+06
Grid : Message : 101.915533 s : RF GiB/s (base 2) = 156238
Grid : Message : 101.915535 s : mem GiB/s (base 2) = 97648.5
Grid : Message : 101.916107 s : norm diff 1.05775e-13
Grid : Message : 101.926218 s : #### Dhop calls report
Grid : Message : 101.926225 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 101.926228 s : WilsonFermion5D TotalTime /Calls : 1459.21 us
Grid : Message : 101.926230 s : WilsonFermion5D CommTime /Calls : 1016.78 us
Grid : Message : 101.926232 s : WilsonFermion5D FaceTime /Calls : 219.506 us
Grid : Message : 101.926234 s : WilsonFermion5D ComputeTime1/Calls : 2.78512 us
Grid : Message : 101.926236 s : WilsonFermion5D ComputeTime2/Calls : 235.25 us
Grid : Message : 101.926330 s : Average mflops/s per call : 3.60206e+10
Grid : Message : 101.926334 s : Average mflops/s per call per rank : 5.62822e+08
Grid : Message : 101.926336 s : Average mflops/s per call per node : 2.25129e+09
Grid : Message : 101.926338 s : Average mflops/s per call (full) : 7.82287e+07
Grid : Message : 101.926340 s : Average mflops/s per call per rank (full): 1.22232e+06
Grid : Message : 101.926342 s : Average mflops/s per call per node (full): 4.88929e+06
Grid : Message : 101.926344 s : WilsonFermion5D Stencil
Grid : Message : 101.926345 s : WilsonFermion5D StencilEven
Grid : Message : 101.926346 s : WilsonFermion5D StencilOdd
Grid : Message : 101.926347 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 101.926348 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 101.926349 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 110.616405 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 110.616430 s : Called DwDag
Grid : Message : 110.616431 s : norm dag result 12.0422
Grid : Message : 110.621134 s : norm dag ref 12.0422
Grid : Message : 110.624323 s : norm dag diff 7.28899e-14
Grid : Message : 110.637247 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 110.698940 s : src_e0.5
Grid : Message : 110.766761 s : src_o0.5
Grid : Message : 110.783307 s : *********************************************************
Grid : Message : 110.783311 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 110.783313 s : * Vectorising space-time by 8
Grid : Message : 110.783315 s : * SINGLE precision
Grid : Message : 110.783316 s : * Using Overlapped Comms/Compute
Grid : Message : 110.783317 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 110.783318 s : *********************************************************
Grid : Message : 157.764942 s : Deo mflop/s = 7.16075e+07
Grid : Message : 157.764976 s : Deo mflop/s per rank 1.11887e+06
Grid : Message : 157.764978 s : Deo mflop/s per node 4.47547e+06
Grid : Message : 157.764981 s : #### Dhop calls report
Grid : Message : 157.764983 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 157.764985 s : WilsonFermion5D TotalTime /Calls : 1565.89 us
Grid : Message : 157.764987 s : WilsonFermion5D CommTime /Calls : 1058.27 us
Grid : Message : 157.764989 s : WilsonFermion5D FaceTime /Calls : 292.487 us
Grid : Message : 157.764991 s : WilsonFermion5D ComputeTime1/Calls : 4.72584 us
Grid : Message : 157.764993 s : WilsonFermion5D ComputeTime2/Calls : 239.678 us
Grid : Message : 157.765020 s : Average mflops/s per call : 2.07994e+10
Grid : Message : 157.765024 s : Average mflops/s per call per rank : 3.2499e+08
Grid : Message : 157.765027 s : Average mflops/s per call per node : 1.29996e+09
Grid : Message : 157.765031 s : Average mflops/s per call (full) : 7.28994e+07
Grid : Message : 157.765035 s : Average mflops/s per call per rank (full): 1.13905e+06
Grid : Message : 157.765039 s : Average mflops/s per call per node (full): 4.55621e+06
Grid : Message : 157.765042 s : WilsonFermion5D Stencil
Grid : Message : 157.765044 s : WilsonFermion5D StencilEven
Grid : Message : 157.765046 s : WilsonFermion5D StencilOdd
Grid : Message : 157.765049 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 157.765051 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 157.765053 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 157.783731 s : r_e6.02106
Grid : Message : 157.786036 s : r_o6.0211
Grid : Message : 157.787470 s : res12.0422
Grid : Message : 157.905573 s : norm diff 0
Grid : Message : 158.337590 s : norm diff even 0
Grid : Message : 158.959010 s : norm diff odd 0
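Since the binary (same md5sum), node set, and lattice geometry are identical between this run and the previous one, the only controlled variable is the GPU clock limit (1035 MHz before, 1050 MHz here). A quick check of the relative Dw throughput change, with values copied from the two logs:

# Relative change in aggregate Dw throughput between the 1035 MHz
# and 1050 MHz clock-limited runs (figures copied from the logs).
awk 'BEGIN {
    a = 7.63173e7   # mflop/s at clock limit 1035 MHz
    b = 7.68895e7   # mflop/s at clock limit 1050 MHz
    printf "+%.2f%% throughput for +%.2f%% clock\n",
           100*(b/a - 1), 100*(1050.0/1035.0 - 1)
}'
# -> +0.75% for +1.45% clock: the run is largely comms-bound (see the
#    CommTime/Calls figures), so scaling is well below linear.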
View File
@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]
View File
@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-16A-1050
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1050
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 48.48.48.96 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUS ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
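The job scripts in this series are identical except for the job name and the freq value (1035 above, 1050 here, 1065 next). One plausible way to generate such a sweep, sketched here as an assumption rather than as the tooling actually used for this repository, is a simple template substitution:

# Hypothetical sweep generator: clone a template containing a @FREQ@
# placeholder and submit one job per clock limit. The template file
# name and placeholder are illustrative, not part of this repository.
for freq in 1035 1050 1065; do
    sed "s/@FREQ@/${freq}/g" power-16A.sh.template > "power-16A-${freq}.sh"
    sbatch "power-16A-${freq}.sh"
done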
View File
@ -0,0 +1,2 @@
Sat Aug 20 20:46:27 BST 2022
epoch 1661024788
View File
@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32
File diff suppressed because it is too large
View File
@ -0,0 +1,2 @@
Sat Aug 20 20:55:03 BST 2022
epoch 1661025303
File diff suppressed because one or more lines are too long
View File
@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffd9b1d1000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014a2805dc000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014a280214000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014a27fd22000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014a27f9f8000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014a27f717000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014a27f4b6000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014a280563000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014a27f0d6000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014a27d97a000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014a27d5aa000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014a27d309000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014a27d1de000)
libm.so.6 => /lib64/libm.so.6 (0x000014a27ce5c000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014a27cc25000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014a27ca0d000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014a27c7ed000)
libc.so.6 => /lib64/libc.so.6 (0x000014a27c428000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014a27c224000)
/lib64/ld-linux-x86-64.so.2 (0x000014a28042c000)
librt.so.1 => /lib64/librt.so.1 (0x000014a27c01c000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014a280497000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014a280492000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014a27bf10000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014a27bd06000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014a27bb02000)
View File
@ -0,0 +1,286 @@
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 2 device 0 bus id: 0000:84:00.0
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 1 device 0 bus id: 0000:44:00.0
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x150120000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.428183 s : Grid Layout
Grid : Message : 1.428187 s : Global lattice size : 48 48 48 96
Grid : Message : 1.428193 s : OpenMP threads : 4
Grid : Message : 1.428196 s : MPI tasks : 2 2 2 8
Grid : Message : 1.443217 s : Making s innermost grids
Grid : Message : 1.455165 s : Initialising 4d RNG
Grid : Message : 1.471981 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.472007 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.853366 s : Initialising 5d RNG
Grid : Message : 2.875960 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 2.876470 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 7.305707 s : Initialised RNGs
Grid : Message : 8.397843 s : Drawing gauge field
Grid : Message : 8.484443 s : Random gauge initialised
Grid : Message : 8.488387 s : Setting up Cshift based reference
Grid : Message : 13.563627 s : *****************************************************************
Grid : Message : 13.563653 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 13.563655 s : *****************************************************************
Grid : Message : 13.563658 s : *****************************************************************
Grid : Message : 13.563659 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 13.563660 s : * Vectorising space-time by 8
Grid : Message : 13.563663 s : * VComplexF size is 64 B
Grid : Message : 13.563665 s : * SINGLE precision
Grid : Message : 13.563667 s : * Using Overlapped Comms/Compute
Grid : Message : 13.563668 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 13.563669 s : *****************************************************************
Grid : Message : 14.958310 s : Called warmup
Grid : Message : 101.445133 s : Called Dw 30000 times in 8.73489e+07 us
Grid : Message : 101.445198 s : mflop/s = 7.7011e+07
Grid : Message : 101.445200 s : mflop/s per rank = 1.2033e+06
Grid : Message : 101.445202 s : mflop/s per node = 4.81319e+06
Grid : Message : 101.445204 s : RF GiB/s (base 2) = 156485
Grid : Message : 101.445206 s : mem GiB/s (base 2) = 97802.9
Grid : Message : 101.445777 s : norm diff 1.05775e-13
Grid : Message : 101.455931 s : #### Dhop calls report
Grid : Message : 101.455939 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 101.455943 s : WilsonFermion5D TotalTime /Calls : 1457.12 us
Grid : Message : 101.455945 s : WilsonFermion5D CommTime /Calls : 1014.92 us
Grid : Message : 101.455947 s : WilsonFermion5D FaceTime /Calls : 219.441 us
Grid : Message : 101.455949 s : WilsonFermion5D ComputeTime1/Calls : 2.84344 us
Grid : Message : 101.455951 s : WilsonFermion5D ComputeTime2/Calls : 235.367 us
Grid : Message : 101.455978 s : Average mflops/s per call : 3.61947e+10
Grid : Message : 101.455982 s : Average mflops/s per call per rank : 5.65543e+08
Grid : Message : 101.455984 s : Average mflops/s per call per node : 2.26217e+09
Grid : Message : 101.455986 s : Average mflops/s per call (full) : 7.83407e+07
Grid : Message : 101.455990 s : Average mflops/s per call per rank (full): 1.22407e+06
Grid : Message : 101.455992 s : Average mflops/s per call per node (full): 4.8963e+06
Grid : Message : 101.455994 s : WilsonFermion5D Stencil
Grid : Message : 101.455995 s : WilsonFermion5D StencilEven
Grid : Message : 101.455999 s : WilsonFermion5D StencilOdd
Grid : Message : 101.456001 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 101.456002 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 101.456004 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 110.188024 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 110.188051 s : Called DwDag
Grid : Message : 110.188052 s : norm dag result 12.0422
Grid : Message : 110.200211 s : norm dag ref 12.0422
Grid : Message : 110.203215 s : norm dag diff 7.28899e-14
Grid : Message : 110.213199 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 110.281787 s : src_e0.5
Grid : Message : 110.353808 s : src_o0.5
Grid : Message : 110.370985 s : *********************************************************
Grid : Message : 110.370991 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 110.370992 s : * Vectorising space-time by 8
Grid : Message : 110.370995 s : * SINGLE precision
Grid : Message : 110.370997 s : * Using Overlapped Comms/Compute
Grid : Message : 110.370998 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 110.371000 s : *********************************************************
Grid : Message : 157.314519 s : Deo mflop/s = 7.16631e+07
Grid : Message : 157.314545 s : Deo mflop/s per rank 1.11974e+06
Grid : Message : 157.314547 s : Deo mflop/s per node 4.47894e+06
Grid : Message : 157.314550 s : #### Dhop calls report
Grid : Message : 157.314552 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 157.314554 s : WilsonFermion5D TotalTime /Calls : 1564.64 us
Grid : Message : 157.314556 s : WilsonFermion5D CommTime /Calls : 1060.37 us
Grid : Message : 157.314558 s : WilsonFermion5D FaceTime /Calls : 287.98 us
Grid : Message : 157.314560 s : WilsonFermion5D ComputeTime1/Calls : 4.91794 us
Grid : Message : 157.314562 s : WilsonFermion5D ComputeTime2/Calls : 239.551 us
Grid : Message : 157.314587 s : Average mflops/s per call : 2.07265e+10
Grid : Message : 157.314591 s : Average mflops/s per call per rank : 3.23852e+08
Grid : Message : 157.314593 s : Average mflops/s per call per node : 1.29541e+09
Grid : Message : 157.314596 s : Average mflops/s per call (full) : 7.29577e+07
Grid : Message : 157.314600 s : Average mflops/s per call per rank (full): 1.13996e+06
Grid : Message : 157.314602 s : Average mflops/s per call per node (full): 4.55985e+06
Grid : Message : 157.314605 s : WilsonFermion5D Stencil
Grid : Message : 157.314606 s : WilsonFermion5D StencilEven
Grid : Message : 157.314608 s : WilsonFermion5D StencilOdd
Grid : Message : 157.314610 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 157.314613 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 157.314614 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 157.334523 s : r_e6.02106
Grid : Message : 157.336050 s : r_o6.0211
Grid : Message : 157.337424 s : res12.0422
Grid : Message : 157.450236 s : norm diff 0
Grid : Message : 157.586163 s : norm diff even 0
Grid : Message : 157.657558 s : norm diff odd 0
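The headline figures in a log like the one above line up arithmetically: the total of 7.7011e+07 mflop/s divided by the 16 nodes gives the reported 4.81319e+06 mflop/s per node. A minimal sketch for pulling these summary lines out of such a log (hypothetical helper, not part of this commit; note the logs spell run totals "mflop/s" and per-call averages "mflops/s"):

#!/usr/bin/env bash
# extract-headline.sh (hypothetical): print the Dhop throughput summary
# of a Benchmark_dwf_fp32 log passed as the first argument.
# 'mflop/s' only matches the summary lines; the per-call averages are
# spelled 'mflops/s' and are skipped.
log="$1"
grep -o 'mflop/s.*' "$log"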


@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]


@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-16A-1065
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1065
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 48.48.48.96 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point, the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
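Only the 1065 MHz instance of this script is recorded here; the job names power-16A-1065, power-16A-1080, power-16A-1095 indicate one copy per tested SM clock limit, with the memory clock pinned at 1215 MHz throughout. A hypothetical driver for such a sweep, assuming a template file with a @FREQ@ placeholder (neither the driver nor the template is part of this commit):

#!/usr/bin/env bash
# submit-sweep.sh (hypothetical): generate and submit one SLURM job per
# SM application-clock limit, mirroring the `nvidia-smi -ac 1215,${freq}`
# calls in the recorded scripts.
for freq in 1065 1080 1095; do
    sed "s/@FREQ@/${freq}/g" power-16A.sh.template > "power-16A-${freq}.sh"
    sbatch "power-16A-${freq}.sh"
done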


@ -0,0 +1,2 @@
Sat Aug 20 20:52:16 BST 2022
epoch 1661025136


@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large


@ -0,0 +1,2 @@
Sat Aug 20 21:00:52 BST 2022
epoch 1661025652

File diff suppressed because one or more lines are too long


@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffceffcb000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014c73048f000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014c7300c7000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014c72fbd5000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014c72f8ab000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014c72f5ca000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014c72f369000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014c730416000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014c72ef89000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014c72d82d000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014c72d45d000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014c72d1bc000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014c72d091000)
libm.so.6 => /lib64/libm.so.6 (0x000014c72cd0f000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014c72cad8000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014c72c8c0000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014c72c6a0000)
libc.so.6 => /lib64/libc.so.6 (0x000014c72c2db000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014c72c0d7000)
/lib64/ld-linux-x86-64.so.2 (0x000014c7302df000)
librt.so.1 => /lib64/librt.so.1 (0x000014c72becf000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014c73034a000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014c730345000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014c72bdc3000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014c72bbb9000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014c72b9b5000)


@ -0,0 +1,286 @@
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 1 device 0 bus id: 0000:44:00.0
local rank 2 device 0 bus id: 0000:84:00.0
local rank 3 device 0 bus id: 0000:C4:00.0
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x1548a0000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.498999 s : Grid Layout
Grid : Message : 1.499003 s : Global lattice size : 48 48 48 96
Grid : Message : 1.499009 s : OpenMP threads : 4
Grid : Message : 1.499010 s : MPI tasks : 2 2 2 8
Grid : Message : 1.516697 s : Making s innermost grids
Grid : Message : 1.528026 s : Initialising 4d RNG
Grid : Message : 1.543296 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.543322 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.803104 s : Initialising 5d RNG
Grid : Message : 2.280210 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 2.280810 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 7.463560 s : Initialised RNGs
Grid : Message : 8.316566 s : Drawing gauge field
Grid : Message : 8.441882 s : Random gauge initialised
Grid : Message : 8.454498 s : Setting up Cshift based reference
Grid : Message : 13.615874 s : *****************************************************************
Grid : Message : 13.615901 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 13.615903 s : *****************************************************************
Grid : Message : 13.615904 s : *****************************************************************
Grid : Message : 13.615905 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 13.615906 s : * Vectorising space-time by 8
Grid : Message : 13.615910 s : * VComplexF size is 64 B
Grid : Message : 13.615912 s : * SINGLE precision
Grid : Message : 13.615914 s : * Using Overlapped Comms/Compute
Grid : Message : 13.615916 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 13.615918 s : *****************************************************************
Grid : Message : 14.175758 s : Called warmup
Grid : Message : 100.948265 s : Called Dw 30000 times in 8.67724e+07 us
Grid : Message : 100.948328 s : mflop/s = 7.75226e+07
Grid : Message : 100.948330 s : mflop/s per rank = 1.21129e+06
Grid : Message : 100.948332 s : mflop/s per node = 4.84516e+06
Grid : Message : 100.948334 s : RF GiB/s (base 2) = 157524
Grid : Message : 100.948336 s : mem GiB/s (base 2) = 98452.5
Grid : Message : 100.948912 s : norm diff 1.05775e-13
Grid : Message : 100.958922 s : #### Dhop calls report
Grid : Message : 100.958930 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 100.958934 s : WilsonFermion5D TotalTime /Calls : 1447.35 us
Grid : Message : 100.958936 s : WilsonFermion5D CommTime /Calls : 1006.18 us
Grid : Message : 100.958938 s : WilsonFermion5D FaceTime /Calls : 218.625 us
Grid : Message : 100.958940 s : WilsonFermion5D ComputeTime1/Calls : 2.6472 us
Grid : Message : 100.958942 s : WilsonFermion5D ComputeTime2/Calls : 235.108 us
Grid : Message : 100.958970 s : Average mflops/s per call : 3.6261e+10
Grid : Message : 100.958974 s : Average mflops/s per call per rank : 5.66578e+08
Grid : Message : 100.958976 s : Average mflops/s per call per node : 2.26631e+09
Grid : Message : 100.958978 s : Average mflops/s per call (full) : 7.88698e+07
Grid : Message : 100.958981 s : Average mflops/s per call per rank (full): 1.23234e+06
Grid : Message : 100.958983 s : Average mflops/s per call per node (full): 4.92936e+06
Grid : Message : 100.958986 s : WilsonFermion5D Stencil
Grid : Message : 100.958987 s : WilsonFermion5D StencilEven
Grid : Message : 100.958988 s : WilsonFermion5D StencilOdd
Grid : Message : 100.958991 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 100.958992 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 100.958995 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 109.635912 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 109.635940 s : Called DwDag
Grid : Message : 109.635941 s : norm dag result 12.0422
Grid : Message : 109.641498 s : norm dag ref 12.0422
Grid : Message : 109.644623 s : norm dag diff 7.28899e-14
Grid : Message : 109.654599 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 109.718075 s : src_e0.5
Grid : Message : 109.790285 s : src_o0.5
Grid : Message : 109.807211 s : *********************************************************
Grid : Message : 109.807217 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 109.807219 s : * Vectorising space-time by 8
Grid : Message : 109.807221 s : * SINGLE precision
Grid : Message : 109.807224 s : * Using Overlapped Comms/Compute
Grid : Message : 109.807225 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 109.807226 s : *********************************************************
Grid : Message : 156.357075 s : Deo mflop/s = 7.22704e+07
Grid : Message : 156.357109 s : Deo mflop/s per rank 1.12923e+06
Grid : Message : 156.357111 s : Deo mflop/s per node 4.5169e+06
Grid : Message : 156.357114 s : #### Dhop calls report
Grid : Message : 156.357116 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 156.357118 s : WilsonFermion5D TotalTime /Calls : 1551.51 us
Grid : Message : 156.357120 s : WilsonFermion5D CommTime /Calls : 1049.38 us
Grid : Message : 156.357122 s : WilsonFermion5D FaceTime /Calls : 285.792 us
Grid : Message : 156.357124 s : WilsonFermion5D ComputeTime1/Calls : 4.81357 us
Grid : Message : 156.357126 s : WilsonFermion5D ComputeTime2/Calls : 239.16 us
Grid : Message : 156.357146 s : Average mflops/s per call : 2.07719e+10
Grid : Message : 156.357150 s : Average mflops/s per call per rank : 3.24561e+08
Grid : Message : 156.357152 s : Average mflops/s per call per node : 1.29824e+09
Grid : Message : 156.357154 s : Average mflops/s per call (full) : 7.35747e+07
Grid : Message : 156.357158 s : Average mflops/s per call per rank (full): 1.1496e+06
Grid : Message : 156.357161 s : Average mflops/s per call per node (full): 4.59842e+06
Grid : Message : 156.357163 s : WilsonFermion5D Stencil
Grid : Message : 156.357165 s : WilsonFermion5D StencilEven
Grid : Message : 156.357166 s : WilsonFermion5D StencilOdd
Grid : Message : 156.357168 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 156.357175 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 156.357176 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 156.375718 s : r_e6.02106
Grid : Message : 156.378883 s : r_o6.0211
Grid : Message : 156.380335 s : res12.0422
Grid : Message : 156.489162 s : norm diff 0
Grid : Message : 156.617774 s : norm diff even 0
Grid : Message : 156.694536 s : norm diff odd 0
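Each run writes its output under job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}/log, so the sweep reduces to a clock/throughput table; a sketch assuming that directory layout (hypothetical helper, not in this commit):

#!/usr/bin/env bash
# tabulate-sweep.sh (hypothetical): one row per job, SM clock limit
# against the per-node Dhop throughput reported in its log.
for d in job/power-16A-*/; do
    name=$(basename "$d")                      # e.g. power-16A-1080.123456
    freq=${name#power-16A-}; freq=${freq%%.*}  # -> 1080
    mflops=$(awk '/mflop\/s per node = /{print $NF; exit}' "$d/log")
    printf '%s MHz  %s mflop/s per node\n' "$freq" "$mflops"
done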


@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]


@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-16A-1080
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1080
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 48.48.48.96 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point, the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
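remote-sudo.sh itself does not appear in this part of the commit, so how the clock request reaches the node is not shown. Assuming ordinary ssh access to the allocated nodes (an assumption; the recorded scripts go through remote-sudo.sh), the applied clocks can be verified directly:

#!/usr/bin/env bash
# check-clocks.sh (hypothetical): query the applied application clocks on
# every node of the allocation; run from inside the job.
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
    printf '%s: ' "$h"
    ssh "$h" nvidia-smi \
        --query-gpu=clocks.applications.graphics,clocks.applications.memory \
        --format=csv,noheader
done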


@ -0,0 +1,2 @@
Sat Aug 20 20:58:06 BST 2022
epoch 1661025486


@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large


@ -0,0 +1,2 @@
Sat Aug 20 21:06:38 BST 2022
epoch 1661025998

File diff suppressed because one or more lines are too long


@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffc219f0000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014aa89605000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014aa8923d000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014aa88d4b000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014aa88a21000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014aa88740000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014aa884df000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014aa8958c000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014aa880ff000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014aa869a3000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014aa865d3000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014aa86332000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014aa86207000)
libm.so.6 => /lib64/libm.so.6 (0x000014aa85e85000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014aa85c4e000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014aa85a36000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014aa85816000)
libc.so.6 => /lib64/libc.so.6 (0x000014aa85451000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014aa8524d000)
/lib64/ld-linux-x86-64.so.2 (0x000014aa89455000)
librt.so.1 => /lib64/librt.so.1 (0x000014aa85045000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014aa894c0000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014aa894bb000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014aa84f39000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014aa84d2f000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014aa84b2b000)


@ -0,0 +1,286 @@
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
local rank 3 device 0 bus id: 0000:C4:00.0
local rank 1 device 0 bus id: 0000:44:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x146d00000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.412895 s : Grid Layout
Grid : Message : 1.412899 s : Global lattice size : 48 48 48 96
Grid : Message : 1.412905 s : OpenMP threads : 4
Grid : Message : 1.412909 s : MPI tasks : 2 2 2 8
Grid : Message : 1.428319 s : Making s innermost grids
Grid : Message : 1.445373 s : Initialising 4d RNG
Grid : Message : 1.461658 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.461680 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.902912 s : Initialising 5d RNG
Grid : Message : 2.141255 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 2.141291 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 7.353326 s : Initialised RNGs
Grid : Message : 8.518633 s : Drawing gauge field
Grid : Message : 8.626652 s : Random gauge initialised
Grid : Message : 8.630634 s : Setting up Cshift based reference
Grid : Message : 13.722925 s : *****************************************************************
Grid : Message : 13.722949 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 13.722950 s : *****************************************************************
Grid : Message : 13.722951 s : *****************************************************************
Grid : Message : 13.722952 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 13.722953 s : * Vectorising space-time by 8
Grid : Message : 13.722954 s : * VComplexF size is 64 B
Grid : Message : 13.722955 s : * SINGLE precision
Grid : Message : 13.722956 s : * Using Overlapped Comms/Compute
Grid : Message : 13.722957 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 13.722958 s : *****************************************************************
Grid : Message : 14.254628 s : Called warmup
Grid : Message : 100.327406 s : Called Dw 30000 times in 8.60725e+07 us
Grid : Message : 100.327470 s : mflop/s = 7.8153e+07
Grid : Message : 100.327472 s : mflop/s per rank = 1.22114e+06
Grid : Message : 100.327474 s : mflop/s per node = 4.88456e+06
Grid : Message : 100.327476 s : RF GiB/s (base 2) = 158805
Grid : Message : 100.327478 s : mem GiB/s (base 2) = 99253.2
Grid : Message : 100.328051 s : norm diff 1.05775e-13
Grid : Message : 100.337927 s : #### Dhop calls report
Grid : Message : 100.337935 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 100.337943 s : WilsonFermion5D TotalTime /Calls : 1435.69 us
Grid : Message : 100.337946 s : WilsonFermion5D CommTime /Calls : 996.547 us
Grid : Message : 100.337949 s : WilsonFermion5D FaceTime /Calls : 217.079 us
Grid : Message : 100.337953 s : WilsonFermion5D ComputeTime1/Calls : 2.78067 us
Grid : Message : 100.337955 s : WilsonFermion5D ComputeTime2/Calls : 234.472 us
Grid : Message : 100.337971 s : Average mflops/s per call : 3.63872e+10
Grid : Message : 100.337974 s : Average mflops/s per call per rank : 5.68549e+08
Grid : Message : 100.337976 s : Average mflops/s per call per node : 2.2742e+09
Grid : Message : 100.337980 s : Average mflops/s per call (full) : 7.95104e+07
Grid : Message : 100.337982 s : Average mflops/s per call per rank (full): 1.24235e+06
Grid : Message : 100.337986 s : Average mflops/s per call per node (full): 4.9694e+06
Grid : Message : 100.337988 s : WilsonFermion5D Stencil
Grid : Message : 100.337990 s : WilsonFermion5D StencilEven
Grid : Message : 100.337992 s : WilsonFermion5D StencilOdd
Grid : Message : 100.337995 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 100.337998 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 100.338000 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 109.354730 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 109.355200 s : Called DwDag
Grid : Message : 109.355210 s : norm dag result 12.0422
Grid : Message : 109.404420 s : norm dag ref 12.0422
Grid : Message : 109.435430 s : norm dag diff 7.28899e-14
Grid : Message : 109.565940 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 109.123204 s : src_e0.5
Grid : Message : 109.194082 s : src_o0.5
Grid : Message : 109.211743 s : *********************************************************
Grid : Message : 109.211749 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 109.211751 s : * Vectorising space-time by 8
Grid : Message : 109.211754 s : * SINGLE precision
Grid : Message : 109.211756 s : * Using Overlapped Comms/Compute
Grid : Message : 109.211759 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 109.211761 s : *********************************************************
Grid : Message : 155.351395 s : Deo mflop/s = 7.29132e+07
Grid : Message : 155.351424 s : Deo mflop/s per rank 1.13927e+06
Grid : Message : 155.351427 s : Deo mflop/s per node 4.55708e+06
Grid : Message : 155.351433 s : #### Dhop calls report
Grid : Message : 155.351436 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 155.351440 s : WilsonFermion5D TotalTime /Calls : 1537.8 us
Grid : Message : 155.351445 s : WilsonFermion5D CommTime /Calls : 1037.77 us
Grid : Message : 155.351449 s : WilsonFermion5D FaceTime /Calls : 285.044 us
Grid : Message : 155.351453 s : WilsonFermion5D ComputeTime1/Calls : 4.8771 us
Grid : Message : 155.351457 s : WilsonFermion5D ComputeTime2/Calls : 237.861 us
Grid : Message : 155.351481 s : Average mflops/s per call : 2.07287e+10
Grid : Message : 155.351485 s : Average mflops/s per call per rank : 3.23886e+08
Grid : Message : 155.351488 s : Average mflops/s per call per node : 1.29554e+09
Grid : Message : 155.351492 s : Average mflops/s per call (full) : 7.42306e+07
Grid : Message : 155.351496 s : Average mflops/s per call per rank (full): 1.15985e+06
Grid : Message : 155.351500 s : Average mflops/s per call per node (full): 4.63942e+06
Grid : Message : 155.351504 s : WilsonFermion5D Stencil
Grid : Message : 155.351506 s : WilsonFermion5D StencilEven
Grid : Message : 155.351508 s : WilsonFermion5D StencilOdd
Grid : Message : 155.351511 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 155.351513 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 155.351515 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 155.370290 s : r_e6.02106
Grid : Message : 155.372244 s : r_o6.0211
Grid : Message : 155.373660 s : res12.0422
Grid : Message : 155.495172 s : norm diff 0
Grid : Message : 155.622362 s : norm diff even 0
Grid : Message : 155.695812 s : norm diff odd 0
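Throughput aside, each log ends with the correctness checks: the even/odd (Deo/Doe) decomposition must reproduce the unpreconditioned operator, so the final three norm diff values are required to be exactly zero, while the earlier norm diff 1.05775e-13 against the Cshift reference is only expected to sit at round-off level. A small acceptance gate over those lines (a sketch, assuming the log layout above):

#!/usr/bin/env bash
# log-sanity.sh (hypothetical): accept a run only if the three final
# checks (norm diff, norm diff even, norm diff odd) are exactly zero.
log="$1"
if [ "$(grep -cE 'norm diff (even |odd )?0$' "$log")" -eq 3 ]; then
    echo "PASS: $log"
else
    echo "FAIL: $log" >&2
    exit 1
fi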


@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]


@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-16A-1095
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1095
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 48.48.48.96 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point, the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
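dmon-to-db.sh is not shown here, but the shape of its input is fixed by nvidia-smi dmon -o DT, which prefixes every sample with date and time before the default metric columns. A minimal reduction in the same spirit (the column order is an assumption based on common nvidia-smi versions):

#!/usr/bin/env bash
# dmon-mean-power.sh (hypothetical): average the power-draw column of an
# `nvidia-smi dmon -o DT` capture. With -o DT the sample columns are
# assumed to be: Date Time gpu pwr gtemp mtemp sm mem enc dec mclk pclk,
# so power draw (W) is field 4; header lines start with '#'.
awk '!/^#/ && NF { pwr += $4; n++ }
     END { if (n) printf "mean power: %.1f W over %d samples\n", pwr/n, n }' "$1"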


@ -0,0 +1,2 @@
Sat Aug 20 21:03:53 BST 2022
epoch 1661025833


@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large


@ -0,0 +1,2 @@
Sat Aug 20 21:12:23 BST 2022
epoch 1661026343

File diff suppressed because one or more lines are too long


@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffdef5db000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x0000152bce209000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x0000152bcde41000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x0000152bcd94f000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x0000152bcd625000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x0000152bcd344000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x0000152bcd0e3000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x0000152bce190000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x0000152bccd03000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x0000152bcb5a7000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x0000152bcb1d7000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x0000152bcaf36000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x0000152bcae0b000)
libm.so.6 => /lib64/libm.so.6 (0x0000152bcaa89000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x0000152bca852000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x0000152bca63a000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x0000152bca41a000)
libc.so.6 => /lib64/libc.so.6 (0x0000152bca055000)
libdl.so.2 => /lib64/libdl.so.2 (0x0000152bc9e51000)
/lib64/ld-linux-x86-64.so.2 (0x0000152bce059000)
librt.so.1 => /lib64/librt.so.1 (0x0000152bc9c49000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x0000152bce0c4000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x0000152bce0bf000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x0000152bc9b3d000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x0000152bc9933000)
libutil.so.1 => /lib64/libutil.so.1 (0x0000152bc972f000)

View File

@ -0,0 +1,286 @@
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 1 device 0 bus id: 0000:44:00.0
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 2 device 0 bus id: 0000:84:00.0
local rank 3 device 0 bus id: 0000:C4:00.0
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x147320000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.574553 s : Grid Layout
Grid : Message : 1.574555 s : Global lattice size : 48 48 48 96
Grid : Message : 1.574559 s : OpenMP threads : 4
Grid : Message : 1.574561 s : MPI tasks : 2 2 2 8
Grid : Message : 1.590560 s : Making s innermost grids
Grid : Message : 1.602336 s : Initialising 4d RNG
Grid : Message : 1.619266 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.619291 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.883640 s : Initialising 5d RNG
Grid : Message : 2.117383 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 2.117419 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 7.594282 s : Initialised RNGs
Grid : Message : 8.809615 s : Drawing gauge field
Grid : Message : 8.954788 s : Random gauge initialised
Grid : Message : 8.965668 s : Setting up Cshift based reference
Grid : Message : 13.965128 s : *****************************************************************
Grid : Message : 13.965152 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 13.965153 s : *****************************************************************
Grid : Message : 13.965154 s : *****************************************************************
Grid : Message : 13.965155 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 13.965156 s : * Vectorising space-time by 8
Grid : Message : 13.965157 s : * VComplexF size is 64 B
Grid : Message : 13.965159 s : * SINGLE precision
Grid : Message : 13.965160 s : * Using Overlapped Comms/Compute
Grid : Message : 13.965161 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 13.965162 s : *****************************************************************
Grid : Message : 14.515202 s : Called warmup
Grid : Message : 99.730150 s : Called Dw 30000 times in 8.52149e+07 us
Grid : Message : 99.730204 s : mflop/s = 7.89395e+07
Grid : Message : 99.730206 s : mflop/s per rank = 1.23343e+06
Grid : Message : 99.730208 s : mflop/s per node = 4.93372e+06
Grid : Message : 99.730210 s : RF GiB/s (base 2) = 160403
Grid : Message : 99.730212 s : mem GiB/s (base 2) = 100252
Grid : Message : 99.730784 s : norm diff 1.05775e-13
Grid : Message : 99.740621 s : #### Dhop calls report
Grid : Message : 99.740628 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 99.740631 s : WilsonFermion5D TotalTime /Calls : 1421.72 us
Grid : Message : 99.740633 s : WilsonFermion5D CommTime /Calls : 984.801 us
Grid : Message : 99.740635 s : WilsonFermion5D FaceTime /Calls : 215.72 us
Grid : Message : 99.740637 s : WilsonFermion5D ComputeTime1/Calls : 2.65594 us
Grid : Message : 99.740639 s : WilsonFermion5D ComputeTime2/Calls : 233.727 us
Grid : Message : 99.740655 s : Average mflops/s per call : 3.59268e+10
Grid : Message : 99.740658 s : Average mflops/s per call per rank : 5.61356e+08
Grid : Message : 99.740660 s : Average mflops/s per call per node : 2.24542e+09
Grid : Message : 99.740662 s : Average mflops/s per call (full) : 8.02916e+07
Grid : Message : 99.740665 s : Average mflops/s per call per rank (full): 1.25456e+06
Grid : Message : 99.740667 s : Average mflops/s per call per node (full): 5.01823e+06
Grid : Message : 99.740669 s : WilsonFermion5D Stencil
Grid : Message : 99.740670 s : WilsonFermion5D StencilEven
Grid : Message : 99.740672 s : WilsonFermion5D StencilOdd
Grid : Message : 99.740673 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 99.740675 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 99.740679 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 108.466783 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 108.466816 s : Called DwDag
Grid : Message : 108.466817 s : norm dag result 12.0422
Grid : Message : 108.470193 s : norm dag ref 12.0422
Grid : Message : 108.473428 s : norm dag diff 7.28899e-14
Grid : Message : 108.486838 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 108.550312 s : src_e0.5
Grid : Message : 108.623836 s : src_o0.5
Grid : Message : 108.640541 s : *********************************************************
Grid : Message : 108.640545 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 108.640546 s : * Vectorising space-time by 8
Grid : Message : 108.640548 s : * SINGLE precision
Grid : Message : 108.640553 s : * Using Overlapped Comms/Compute
Grid : Message : 108.640555 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 108.640556 s : *********************************************************
Grid : Message : 154.233908 s : Deo mflop/s = 7.37872e+07
Grid : Message : 154.233941 s : Deo mflop/s per rank 1.15293e+06
Grid : Message : 154.233943 s : Deo mflop/s per node 4.6117e+06
Grid : Message : 154.233946 s : #### Dhop calls report
Grid : Message : 154.233948 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 154.233950 s : WilsonFermion5D TotalTime /Calls : 1519.59 us
Grid : Message : 154.233952 s : WilsonFermion5D CommTime /Calls : 1019.64 us
Grid : Message : 154.233954 s : WilsonFermion5D FaceTime /Calls : 288.201 us
Grid : Message : 154.233956 s : WilsonFermion5D ComputeTime1/Calls : 4.91837 us
Grid : Message : 154.233958 s : WilsonFermion5D ComputeTime2/Calls : 236.348 us
Grid : Message : 154.233977 s : Average mflops/s per call : 2.07539e+10
Grid : Message : 154.233980 s : Average mflops/s per call per rank : 3.24279e+08
Grid : Message : 154.233982 s : Average mflops/s per call per node : 1.29712e+09
Grid : Message : 154.233984 s : Average mflops/s per call (full) : 7.51203e+07
Grid : Message : 154.233986 s : Average mflops/s per call per rank (full): 1.17375e+06
Grid : Message : 154.233988 s : Average mflops/s per call per node (full): 4.69502e+06
Grid : Message : 154.233991 s : WilsonFermion5D Stencil
Grid : Message : 154.233992 s : WilsonFermion5D StencilEven
Grid : Message : 154.233993 s : WilsonFermion5D StencilOdd
Grid : Message : 154.233994 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 154.233995 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 154.233996 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 154.253979 s : r_e6.02106
Grid : Message : 154.255883 s : r_o6.0211
Grid : Message : 154.257289 s : res12.0422
Grid : Message : 154.364123 s : norm diff 0
Grid : Message : 154.496590 s : norm diff even 0
Grid : Message : 154.572879 s : norm diff odd 0

View File

@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-16A-1110
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1110
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 48.48.48.96 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point, the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
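The --mpi 2.2.2.8 and --grid 48.48.48.96 options above decompose the 48x48x48x96 global lattice over 2x2x2x8 = 64 ranks, i.e. a 24x24x24x12 local volume per GPU. A minimal stand-alone check of such a decomposition (not part of the original scripts, just an illustrative sketch) could look like:

#!/usr/bin/env bash
# Sanity-check sketch for the decomposition used above: every grid dimension
# must divide evenly by the matching MPI dimension, and the MPI geometry must
# multiply out to the rank count requested from SLURM.
mpi=(2 2 2 8); grid=(48 48 48 96); ranks=1
for i in 0 1 2 3; do
  (( grid[i] % mpi[i] == 0 )) || { echo "dim $i: ${grid[i]} % ${mpi[i]} != 0" >&2; exit 1; }
  (( ranks *= mpi[i] ))
done
echo "ranks=${ranks} local=$((grid[0]/mpi[0])).$((grid[1]/mpi[1])).$((grid[2]/mpi[2])).$((grid[3]/mpi[3]))"

With the values above this prints ranks=64 local=24.24.24.12, matching --ntasks=64 in the SBATCH header.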

View File

@ -0,0 +1,2 @@
Sat Aug 20 21:09:39 BST 2022
epoch 1661026179

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Sat Aug 20 21:18:10 BST 2022
epoch 1661026690

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffe04b26000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014ffbc78a000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014ffbc3c2000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014ffbbed0000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014ffbbba6000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014ffbb8c5000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014ffbb664000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014ffbc711000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014ffbb284000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014ffb9b28000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014ffb9758000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014ffb94b7000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014ffb938c000)
libm.so.6 => /lib64/libm.so.6 (0x000014ffb900a000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014ffb8dd3000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014ffb8bbb000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014ffb899b000)
libc.so.6 => /lib64/libc.so.6 (0x000014ffb85d6000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014ffb83d2000)
/lib64/ld-linux-x86-64.so.2 (0x000014ffbc5da000)
librt.so.1 => /lib64/librt.so.1 (0x000014ffb81ca000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014ffbc645000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014ffbc640000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014ffb80be000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014ffb7eb4000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014ffb7cb0000)

View File

@ -0,0 +1,286 @@
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
local rank 1 device 0 bus id: 0000:44:00.0
local rank 3 device 0 bus id: 0000:C4:00.0
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x146500000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.503072 s : Grid Layout
Grid : Message : 1.503076 s : Global lattice size : 48 48 48 96
Grid : Message : 1.503081 s : OpenMP threads : 4
Grid : Message : 1.503083 s : MPI tasks : 2 2 2 8
Grid : Message : 1.518479 s : Making s innermost grids
Grid : Message : 1.535611 s : Initialising 4d RNG
Grid : Message : 1.551229 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.551252 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.805667 s : Initialising 5d RNG
Grid : Message : 2.356490 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 2.357030 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 7.303785 s : Initialised RNGs
Grid : Message : 8.385261 s : Drawing gauge field
Grid : Message : 8.496485 s : Random gauge initialised
Grid : Message : 8.509783 s : Setting up Cshift based reference
Grid : Message : 13.609539 s : *****************************************************************
Grid : Message : 13.609564 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 13.609566 s : *****************************************************************
Grid : Message : 13.609568 s : *****************************************************************
Grid : Message : 13.609573 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 13.609575 s : * Vectorising space-time by 8
Grid : Message : 13.609577 s : * VComplexF size is 64 B
Grid : Message : 13.609579 s : * SINGLE precision
Grid : Message : 13.609582 s : * Using Overlapped Comms/Compute
Grid : Message : 13.609584 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 13.609586 s : *****************************************************************
Grid : Message : 14.155991 s : Called warmup
Grid : Message : 98.420612 s : Called Dw 30000 times in 8.42644e+07 us
Grid : Message : 98.420675 s : mflop/s = 7.983e+07
Grid : Message : 98.420677 s : mflop/s per rank = 1.24734e+06
Grid : Message : 98.420679 s : mflop/s per node = 4.98937e+06
Grid : Message : 98.420681 s : RF GiB/s (base 2) = 162213
Grid : Message : 98.420683 s : mem GiB/s (base 2) = 101383
Grid : Message : 98.421254 s : norm diff 1.05775e-13
Grid : Message : 98.431170 s : #### Dhop calls report
Grid : Message : 98.431178 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 98.431182 s : WilsonFermion5D TotalTime /Calls : 1405.63 us
Grid : Message : 98.431184 s : WilsonFermion5D CommTime /Calls : 961.451 us
Grid : Message : 98.431186 s : WilsonFermion5D FaceTime /Calls : 222.433 us
Grid : Message : 98.431188 s : WilsonFermion5D ComputeTime1/Calls : 2.80214 us
Grid : Message : 98.431190 s : WilsonFermion5D ComputeTime2/Calls : 234.1 us
Grid : Message : 98.431212 s : Average mflops/s per call : 3.60793e+10
Grid : Message : 98.431216 s : Average mflops/s per call per rank : 5.63738e+08
Grid : Message : 98.431218 s : Average mflops/s per call per node : 2.25495e+09
Grid : Message : 98.431220 s : Average mflops/s per call (full) : 8.12107e+07
Grid : Message : 98.431224 s : Average mflops/s per call per rank (full): 1.26892e+06
Grid : Message : 98.431226 s : Average mflops/s per call per node (full): 5.07567e+06
Grid : Message : 98.431229 s : WilsonFermion5D Stencil
Grid : Message : 98.431230 s : WilsonFermion5D StencilEven
Grid : Message : 98.431235 s : WilsonFermion5D StencilOdd
Grid : Message : 98.431239 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 98.431240 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 98.431241 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 107.161203 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 107.161230 s : Called DwDag
Grid : Message : 107.161231 s : norm dag result 12.0422
Grid : Message : 107.163717 s : norm dag ref 12.0422
Grid : Message : 107.166717 s : norm dag diff 7.28899e-14
Grid : Message : 107.181064 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 107.248613 s : src_e0.5
Grid : Message : 107.314227 s : src_o0.5
Grid : Message : 107.331787 s : *********************************************************
Grid : Message : 107.331790 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 107.331792 s : * Vectorising space-time by 8
Grid : Message : 107.331794 s : * SINGLE precision
Grid : Message : 107.331795 s : * Using Overlapped Comms/Compute
Grid : Message : 107.331796 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 107.331797 s : *********************************************************
Grid : Message : 152.337360 s : Deo mflop/s = 7.47496e+07
Grid : Message : 152.337387 s : Deo mflop/s per rank 1.16796e+06
Grid : Message : 152.337390 s : Deo mflop/s per node 4.67185e+06
Grid : Message : 152.337396 s : #### Dhop calls report
Grid : Message : 152.337399 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 152.337402 s : WilsonFermion5D TotalTime /Calls : 1500 us
Grid : Message : 152.337405 s : WilsonFermion5D CommTime /Calls : 1002.91 us
Grid : Message : 152.337408 s : WilsonFermion5D FaceTime /Calls : 282.963 us
Grid : Message : 152.337410 s : WilsonFermion5D ComputeTime1/Calls : 4.71911 us
Grid : Message : 152.337412 s : WilsonFermion5D ComputeTime2/Calls : 237.647 us
Grid : Message : 152.337435 s : Average mflops/s per call : 2.07759e+10
Grid : Message : 152.337439 s : Average mflops/s per call per rank : 3.24624e+08
Grid : Message : 152.337441 s : Average mflops/s per call per node : 1.29849e+09
Grid : Message : 152.337445 s : Average mflops/s per call (full) : 7.61013e+07
Grid : Message : 152.337448 s : Average mflops/s per call per rank (full): 1.18908e+06
Grid : Message : 152.337451 s : Average mflops/s per call per node (full): 4.75633e+06
Grid : Message : 152.337453 s : WilsonFermion5D Stencil
Grid : Message : 152.337456 s : WilsonFermion5D StencilEven
Grid : Message : 152.337457 s : WilsonFermion5D StencilOdd
Grid : Message : 152.337459 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 152.337462 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 152.337463 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 152.358219 s : r_e6.02106
Grid : Message : 152.359968 s : r_o6.0211
Grid : Message : 152.361373 s : res12.0422
Grid : Message : 152.467780 s : norm diff 0
Grid : Message : 152.609427 s : norm diff even 0
Grid : Message : 152.675745 s : norm diff odd 0

View File

@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-16A-1125
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1125
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 48.48.48.96 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point, the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
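The two job scripts in this section are identical except for the SM clock cap (freq=1110 above, freq=1125 here) and the matching -J job name, which suggests they were generated from a common template. The driver below is a hypothetical reconstruction of such a sweep; the template file name and its @FREQ@ placeholder are assumptions, not part of this commit.

#!/usr/bin/env bash
# Hypothetical sweep driver: generate and submit one job script per SM clock
# cap. power-16A-template.sh and the @FREQ@ placeholder are assumed names.
for freq in 1110 1125; do
  sed "s/@FREQ@/${freq}/g" power-16A-template.sh > "power-16A-${freq}.sh"
  sbatch "power-16A-${freq}.sh"
done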

View File

@ -0,0 +1,2 @@
Sat Aug 20 21:15:27 BST 2022
epoch 1661026527

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Sat Aug 20 21:23:53 BST 2022
epoch 1661027033

File diff suppressed because one or more lines are too long

Some files were not shown because too many files have changed in this diff