Initial commit

commit ade190016a
2022-09-07 17:31:28 +01:00

8502 changed files with 4552538 additions and 0 deletions


@@ -0,0 +1,13 @@
#!/usr/bin/env bash
# Per-rank CPU binding wrapper for OpenMPI: pin each local rank to its own
# 16-core block and the matching NUMA domain.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
numa=${lrank}
# cores lrank*16 .. (lrank+1)*16-1, used for taskset and as an OMP_PLACES interval
cpus="$(( lrank*16 ))-$(( (lrank+1)*16-1 ))"
places="$(( lrank*16 )):$(( (lrank+1)*16 ))"
BINDING="taskset -c ${cpus} numactl -m ${numa}"
export OMP_PLACES=${places}
echo "$(hostname) - ${lrank} binding='${BINDING}'"
${BINDING} "$@"
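A minimal usage sketch (assuming the wrapper above is saved as cpu-mpi-wrapper.sh; the filename, rank count, and application arguments are illustrative): mpirun's own binding is disabled so that the taskset/numactl prefix in the wrapper controls placement.

# hypothetical launch: 4 ranks on one 64-core node, one 16-core block per rank
mpirun -np 4 --bind-to none ./cpu-mpi-wrapper.sh ./Benchmark_dwf_fp32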


@@ -0,0 +1 @@
../dwf_fp32.tok


@@ -0,0 +1,14 @@
#!/usr/bin/env bash
# Per-rank GPU binding wrapper for OpenMPI: give each local rank its own GPU
# and mlx5 NIC, and interleave its memory across the two nearest NUMA domains.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
numa1=$(( 2 * lrank ))
numa2=$(( 2 * lrank + 1 ))
netdev=mlx5_${lrank}:1
export CUDA_VISIBLE_DEVICES=$OMPI_COMM_WORLD_LOCAL_RANK
export UCX_NET_DEVICES=${netdev}
BINDING="--interleave=$numa1,$numa2"
echo "$(hostname) - $lrank device=$CUDA_VISIBLE_DEVICES binding=$BINDING"
numactl ${BINDING} "$@"
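This appears to be the gpu-mpi-wrapper.sh invoked by the job scripts further down; a minimal sketch of that launch pattern (rank count taken from the 8-node, 4-GPU-per-node jobs), with mpirun binding disabled so the wrapper controls placement:

# one rank per GPU; CUDA device, NIC, and NUMA binding come from the wrapper
mpirun -np 32 -x LD_LIBRARY_PATH --bind-to none \
    ./gpu-mpi-wrapper.sh ./Benchmark_dwf_fp32 --mpi 2.2.2.4 --grid 64.64.64.128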


@@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large


@@ -0,0 +1,2 @@
Tue Aug 23 02:04:27 BST 2022
epoch 1661216667

File diff suppressed because one or more lines are too long


@@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffc276d7000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000015208b908000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000015208b540000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000015208b04e000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000015208ad24000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000015208aa43000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000015208a7e2000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000015208b88f000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000015208a402000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x0000152088ca6000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x00001520888d6000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x0000152088635000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000015208850a000)
libm.so.6 => /lib64/libm.so.6 (0x0000152088188000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x0000152087f51000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x0000152087d39000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x0000152087b19000)
libc.so.6 => /lib64/libc.so.6 (0x0000152087754000)
libdl.so.2 => /lib64/libdl.so.2 (0x0000152087550000)
/lib64/ld-linux-x86-64.so.2 (0x000015208b758000)
librt.so.1 => /lib64/librt.so.1 (0x0000152087348000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000015208b7c3000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000015208b7be000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000015208723c000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x0000152087032000)
libutil.so.1 => /lib64/libutil.so.1 (0x0000152086e2e000)


@@ -0,0 +1,254 @@
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 1 device 0 bus id: 0000:44:00.0
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x1520a0000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.451132 s : Grid Layout
Grid : Message : 1.451136 s : Global lattice size : 64 64 64 128
Grid : Message : 1.451141 s : OpenMP threads : 4
Grid : Message : 1.451143 s : MPI tasks : 2 2 2 4
Grid : Message : 1.490207 s : Making s innermost grids
Grid : Message : 1.546698 s : Initialising 4d RNG
Grid : Message : 1.639951 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.639978 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 2.448983 s : Initialising 5d RNG
Grid : Message : 3.857910 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 3.857941 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 20.528379 s : Initialised RNGs
Grid : Message : 24.642198 s : Drawing gauge field
Grid : Message : 25.370279 s : Random gauge initialised
Grid : Message : 25.386364 s : Setting up Cshift based reference
Grid : Message : 54.680530 s : *****************************************************************
Grid : Message : 54.680554 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 54.680555 s : *****************************************************************
Grid : Message : 54.680556 s : *****************************************************************
Grid : Message : 54.680557 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 54.680558 s : * Vectorising space-time by 8
Grid : Message : 54.680559 s : * VComplexF size is 64 B
Grid : Message : 54.680560 s : * SINGLE precision
Grid : Message : 54.680563 s : * Using Overlapped Comms/Compute
Grid : Message : 54.680564 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 54.680565 s : *****************************************************************
Grid : Message : 56.720636 s : Called warmup
Grid : Message : 341.354661 s : Called Dw 30000 times in 2.84633e+08 us
Grid : Message : 341.354717 s : mflop/s = 7.46929e+07
Grid : Message : 341.354719 s : mflop/s per rank = 2.33415e+06
Grid : Message : 341.354721 s : mflop/s per node = 9.33662e+06
Grid : Message : 341.354723 s : RF GiB/s (base 2) = 151774
Grid : Message : 341.354725 s : mem GiB/s (base 2) = 94858.9
Grid : Message : 341.358222 s : norm diff 1.07359e-13
Grid : Message : 341.408574 s : #### Dhop calls report
Grid : Message : 341.408581 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 341.408584 s : WilsonFermion5D TotalTime /Calls : 4747.94 us
Grid : Message : 341.408586 s : WilsonFermion5D CommTime /Calls : 3238.54 us
Grid : Message : 341.408588 s : WilsonFermion5D FaceTime /Calls : 477.179 us
Grid : Message : 341.408590 s : WilsonFermion5D ComputeTime1/Calls : 5.20729 us
Grid : Message : 341.408592 s : WilsonFermion5D ComputeTime2/Calls : 1046.27 us
Grid : Message : 341.408654 s : Average mflops/s per call : 6.22503e+10
Grid : Message : 341.408657 s : Average mflops/s per call per rank : 1.94532e+09
Grid : Message : 341.408659 s : Average mflops/s per call per node : 7.78129e+09
Grid : Message : 341.408661 s : Average mflops/s per call (full) : 7.59861e+07
Grid : Message : 341.408664 s : Average mflops/s per call per rank (full): 2.37457e+06
Grid : Message : 341.408666 s : Average mflops/s per call per node (full): 9.49826e+06
Grid : Message : 341.408668 s : WilsonFermion5D Stencil
Grid : Message : 341.408669 s : WilsonFermion5D StencilEven
Grid : Message : 341.408672 s : WilsonFermion5D StencilOdd
Grid : Message : 341.408674 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 341.408676 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 341.408678 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 396.742581 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 396.742602 s : Called DwDag
Grid : Message : 396.742603 s : norm dag result 12.0421
Grid : Message : 396.756893 s : norm dag ref 12.0421
Grid : Message : 396.773260 s : norm dag diff 7.28475e-14
Grid : Message : 396.816075 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 397.193717 s : src_e0.499997
Grid : Message : 397.577696 s : src_o0.500003
Grid : Message : 397.675628 s : *********************************************************
Grid : Message : 397.675631 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 397.675632 s : * Vectorising space-time by 8
Grid : Message : 397.675633 s : * SINGLE precision
Grid : Message : 397.675634 s : * Using Overlapped Comms/Compute
Grid : Message : 397.675635 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 397.675636 s : *********************************************************
Grid : Message : 540.232805 s : Deo mflop/s = 7.45971e+07
Grid : Message : 540.232832 s : Deo mflop/s per rank 2.33116e+06
Grid : Message : 540.232834 s : Deo mflop/s per node 9.32463e+06
Grid : Message : 540.232837 s : #### Dhop calls report
Grid : Message : 540.232839 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 540.232841 s : WilsonFermion5D TotalTime /Calls : 4751.62 us
Grid : Message : 540.232843 s : WilsonFermion5D CommTime /Calls : 3173.44 us
Grid : Message : 540.232845 s : WilsonFermion5D FaceTime /Calls : 604.695 us
Grid : Message : 540.232847 s : WilsonFermion5D ComputeTime1/Calls : 6.28629 us
Grid : Message : 540.232849 s : WilsonFermion5D ComputeTime2/Calls : 999.947 us
Grid : Message : 540.232870 s : Average mflops/s per call : 5.14652e+10
Grid : Message : 540.232874 s : Average mflops/s per call per rank : 1.60829e+09
Grid : Message : 540.232878 s : Average mflops/s per call per node : 6.43315e+09
Grid : Message : 540.232881 s : Average mflops/s per call (full) : 7.59272e+07
Grid : Message : 540.232884 s : Average mflops/s per call per rank (full): 2.37273e+06
Grid : Message : 540.232887 s : Average mflops/s per call per node (full): 9.49091e+06
Grid : Message : 540.232890 s : WilsonFermion5D Stencil
Grid : Message : 540.232892 s : WilsonFermion5D StencilEven
Grid : Message : 540.232893 s : WilsonFermion5D StencilOdd
Grid : Message : 540.232896 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 540.232897 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 540.232900 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 540.304917 s : r_e6.02113
Grid : Message : 540.311850 s : r_o6.02101
Grid : Message : 540.318515 s : res12.0421
Grid : Message : 540.994922 s : norm diff 0
Grid : Message : 541.747359 s : norm diff even 0
Grid : Message : 542.139558 s : norm diff odd 0


@@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]


@@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-8A-1005
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1005
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 64.64.64.128 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUS ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################


@@ -0,0 +1,2 @@
Tue Aug 23 01:55:19 BST 2022
epoch 1661216119


@@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large


@@ -0,0 +1,2 @@
Tue Aug 23 02:17:01 BST 2022
epoch 1661217421

File diff suppressed because one or more lines are too long


@@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffe20397000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014e4e5e5f000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014e4e5a97000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014e4e55a5000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014e4e527b000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014e4e4f9a000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014e4e4d39000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014e4e5de6000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014e4e4959000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014e4e31fd000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014e4e2e2d000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014e4e2b8c000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014e4e2a61000)
libm.so.6 => /lib64/libm.so.6 (0x000014e4e26df000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014e4e24a8000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014e4e2290000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014e4e2070000)
libc.so.6 => /lib64/libc.so.6 (0x000014e4e1cab000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014e4e1aa7000)
/lib64/ld-linux-x86-64.so.2 (0x000014e4e5caf000)
librt.so.1 => /lib64/librt.so.1 (0x000014e4e189f000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014e4e5d1a000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014e4e5d15000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014e4e1793000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014e4e1589000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014e4e1385000)


@@ -0,0 +1,254 @@
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
local rank 2 device 0 bus id: 0000:84:00.0
local rank 1 device 0 bus id: 0000:44:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x152d40000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.411137 s : Grid Layout
Grid : Message : 1.411142 s : Global lattice size : 64 64 64 128
Grid : Message : 1.411149 s : OpenMP threads : 4
Grid : Message : 1.411152 s : MPI tasks : 2 2 2 4
Grid : Message : 1.450334 s : Making s innermost grids
Grid : Message : 1.501343 s : Initialising 4d RNG
Grid : Message : 1.598884 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.598907 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 2.629236 s : Initialising 5d RNG
Grid : Message : 4.714710 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 4.715320 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 20.751504 s : Initialised RNGs
Grid : Message : 24.602581 s : Drawing gauge field
Grid : Message : 25.485290 s : Random gauge initialised
Grid : Message : 25.497324 s : Setting up Cshift based reference
Grid : Message : 54.590031 s : *****************************************************************
Grid : Message : 54.590055 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 54.590056 s : *****************************************************************
Grid : Message : 54.590057 s : *****************************************************************
Grid : Message : 54.590058 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 54.590059 s : * Vectorising space-time by 8
Grid : Message : 54.590060 s : * VComplexF size is 64 B
Grid : Message : 54.590061 s : * SINGLE precision
Grid : Message : 54.590063 s : * Using Overlapped Comms/Compute
Grid : Message : 54.590064 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 54.590065 s : *****************************************************************
Grid : Message : 56.600017 s : Called warmup
Grid : Message : 340.439124 s : Called Dw 30000 times in 2.83839e+08 us
Grid : Message : 340.439168 s : mflop/s = 7.4902e+07
Grid : Message : 340.439170 s : mflop/s per rank = 2.34069e+06
Grid : Message : 340.439172 s : mflop/s per node = 9.36276e+06
Grid : Message : 340.439174 s : RF GiB/s (base 2) = 152199
Grid : Message : 340.439176 s : mem GiB/s (base 2) = 95124.5
Grid : Message : 340.442672 s : norm diff 1.07359e-13
Grid : Message : 340.492982 s : #### Dhop calls report
Grid : Message : 340.492989 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 340.492992 s : WilsonFermion5D TotalTime /Calls : 4734.47 us
Grid : Message : 340.492994 s : WilsonFermion5D CommTime /Calls : 3215.16 us
Grid : Message : 340.492996 s : WilsonFermion5D FaceTime /Calls : 476.312 us
Grid : Message : 340.492998 s : WilsonFermion5D ComputeTime1/Calls : 4.61805 us
Grid : Message : 340.493000 s : WilsonFermion5D ComputeTime2/Calls : 1057.09 us
Grid : Message : 340.493034 s : Average mflops/s per call : 6.20648e+10
Grid : Message : 340.493039 s : Average mflops/s per call per rank : 1.93952e+09
Grid : Message : 340.493041 s : Average mflops/s per call per node : 7.75809e+09
Grid : Message : 340.493043 s : Average mflops/s per call (full) : 7.62022e+07
Grid : Message : 340.493050 s : Average mflops/s per call per rank (full): 2.38132e+06
Grid : Message : 340.493054 s : Average mflops/s per call per node (full): 9.52528e+06
Grid : Message : 340.493057 s : WilsonFermion5D Stencil
Grid : Message : 340.493059 s : WilsonFermion5D StencilEven
Grid : Message : 340.493061 s : WilsonFermion5D StencilOdd
Grid : Message : 340.493064 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 340.493066 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 340.493068 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 395.685600 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 395.685621 s : Called DwDag
Grid : Message : 395.685622 s : norm dag result 12.0421
Grid : Message : 395.711061 s : norm dag ref 12.0421
Grid : Message : 395.727365 s : norm dag diff 7.28475e-14
Grid : Message : 395.777073 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 396.215583 s : src_e0.499997
Grid : Message : 396.523749 s : src_o0.500003
Grid : Message : 396.640132 s : *********************************************************
Grid : Message : 396.640135 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 396.640136 s : * Vectorising space-time by 8
Grid : Message : 396.640137 s : * SINGLE precision
Grid : Message : 396.640138 s : * Using Overlapped Comms/Compute
Grid : Message : 396.640139 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 396.640140 s : *********************************************************
Grid : Message : 538.477450 s : Deo mflop/s = 7.52017e+07
Grid : Message : 538.477760 s : Deo mflop/s per rank 2.35005e+06
Grid : Message : 538.477780 s : Deo mflop/s per node 9.40022e+06
Grid : Message : 538.477810 s : #### Dhop calls report
Grid : Message : 538.477830 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 538.477850 s : WilsonFermion5D TotalTime /Calls : 4713.31 us
Grid : Message : 538.477870 s : WilsonFermion5D CommTime /Calls : 3138.8 us
Grid : Message : 538.477890 s : WilsonFermion5D FaceTime /Calls : 592.51 us
Grid : Message : 538.477910 s : WilsonFermion5D ComputeTime1/Calls : 5.73034 us
Grid : Message : 538.477930 s : WilsonFermion5D ComputeTime2/Calls : 1007.86 us
Grid : Message : 538.478120 s : Average mflops/s per call : 5.12899e+10
Grid : Message : 538.478160 s : Average mflops/s per call per rank : 1.60281e+09
Grid : Message : 538.478180 s : Average mflops/s per call per node : 6.41124e+09
Grid : Message : 538.478200 s : Average mflops/s per call (full) : 7.65444e+07
Grid : Message : 538.478240 s : Average mflops/s per call per rank (full): 2.39201e+06
Grid : Message : 538.478260 s : Average mflops/s per call per node (full): 9.56805e+06
Grid : Message : 538.478290 s : WilsonFermion5D Stencil
Grid : Message : 538.478300 s : WilsonFermion5D StencilEven
Grid : Message : 538.478320 s : WilsonFermion5D StencilOdd
Grid : Message : 538.478330 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 538.478350 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 538.478360 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 538.119186 s : r_e6.02113
Grid : Message : 538.127316 s : r_o6.02101
Grid : Message : 538.133936 s : res12.0421
Grid : Message : 538.790970 s : norm diff 0
Grid : Message : 539.605836 s : norm diff even 0
Grid : Message : 539.988598 s : norm diff odd 0


@@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]


@@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-8A-1020
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1020
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 64.64.64.128 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUS ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################


@@ -0,0 +1,2 @@
Tue Aug 23 02:07:55 BST 2022
epoch 1661216875


@@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large


@@ -0,0 +1,2 @@
Tue Aug 23 02:29:38 BST 2022
epoch 1661218178

File diff suppressed because one or more lines are too long


@@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffe2c559000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x0000154b69193000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x0000154b68dcb000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x0000154b688d9000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x0000154b685af000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x0000154b682ce000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x0000154b6806d000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x0000154b6911a000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x0000154b67c8d000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x0000154b66531000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x0000154b66161000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x0000154b65ec0000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x0000154b65d95000)
libm.so.6 => /lib64/libm.so.6 (0x0000154b65a13000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x0000154b657dc000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x0000154b655c4000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x0000154b653a4000)
libc.so.6 => /lib64/libc.so.6 (0x0000154b64fdf000)
libdl.so.2 => /lib64/libdl.so.2 (0x0000154b64ddb000)
/lib64/ld-linux-x86-64.so.2 (0x0000154b68fe3000)
librt.so.1 => /lib64/librt.so.1 (0x0000154b64bd3000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x0000154b6904e000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x0000154b69049000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x0000154b64ac7000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x0000154b648bd000)
libutil.so.1 => /lib64/libutil.so.1 (0x0000154b646b9000)


@@ -0,0 +1,254 @@
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 1 device 0 bus id: 0000:44:00.0
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14e380000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.487038 s : Grid Layout
Grid : Message : 1.487042 s : Global lattice size : 64 64 64 128
Grid : Message : 1.487049 s : OpenMP threads : 4
Grid : Message : 1.487051 s : MPI tasks : 2 2 2 4
Grid : Message : 1.526579 s : Making s innermost grids
Grid : Message : 1.575050 s : Initialising 4d RNG
Grid : Message : 1.668970 s : Initialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.668993 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 2.690908 s : Initialising 5d RNG
Grid : Message : 4.186060 s : Initialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 4.186090 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 21.108778 s : Initialised RNGs
Grid : Message : 24.628384 s : Drawing gauge field
Grid : Message : 25.244935 s : Random gauge initialised
Grid : Message : 25.260871 s : Setting up Cshift based reference
Grid : Message : 54.297463 s : *****************************************************************
Grid : Message : 54.297490 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 54.297492 s : *****************************************************************
Grid : Message : 54.297493 s : *****************************************************************
Grid : Message : 54.297494 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 54.297495 s : * Vectorising space-time by 8
Grid : Message : 54.297496 s : * VComplexF size is 64 B
Grid : Message : 54.297498 s : * SINGLE precision
Grid : Message : 54.297500 s : * Using Overlapped Comms/Compute
Grid : Message : 54.297501 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 54.297502 s : *****************************************************************
Grid : Message : 56.289649 s : Called warmup
Grid : Message : 339.475576 s : Called Dw 30000 times in 2.83186e+08 us
Grid : Message : 339.475624 s : mflop/s = 7.50747e+07
Grid : Message : 339.475626 s : mflop/s per rank = 2.34608e+06
Grid : Message : 339.475628 s : mflop/s per node = 9.38434e+06
Grid : Message : 339.475630 s : RF GiB/s (base 2) = 152550
Grid : Message : 339.475632 s : mem GiB/s (base 2) = 95343.7
Grid : Message : 339.479133 s : norm diff 1.07359e-13
Grid : Message : 339.528508 s : #### Dhop calls report
Grid : Message : 339.528515 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 339.528519 s : WilsonFermion5D TotalTime /Calls : 4723.23 us
Grid : Message : 339.528521 s : WilsonFermion5D CommTime /Calls : 3196.3 us
Grid : Message : 339.528523 s : WilsonFermion5D FaceTime /Calls : 478.284 us
Grid : Message : 339.528525 s : WilsonFermion5D ComputeTime1/Calls : 4.58175 us
Grid : Message : 339.528527 s : WilsonFermion5D ComputeTime2/Calls : 1062.24 us
Grid : Message : 339.528552 s : Average mflops/s per call : 6.12426e+10
Grid : Message : 339.528556 s : Average mflops/s per call per rank : 1.91383e+09
Grid : Message : 339.528558 s : Average mflops/s per call per node : 7.65533e+09
Grid : Message : 339.528560 s : Average mflops/s per call (full) : 7.63836e+07
Grid : Message : 339.528564 s : Average mflops/s per call per rank (full): 2.38699e+06
Grid : Message : 339.528567 s : Average mflops/s per call per node (full): 9.54795e+06
Grid : Message : 339.528569 s : WilsonFermion5D Stencil
Grid : Message : 339.528572 s : WilsonFermion5D StencilEven
Grid : Message : 339.528575 s : WilsonFermion5D StencilOdd
Grid : Message : 339.528576 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 339.528578 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 339.528580 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 394.933228 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 394.933253 s : Called DwDag
Grid : Message : 394.933254 s : norm dag result 12.0421
Grid : Message : 394.953559 s : norm dag ref 12.0421
Grid : Message : 394.969769 s : norm dag diff 7.28475e-14
Grid : Message : 395.189670 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 395.356222 s : src_e0.499997
Grid : Message : 395.800392 s : src_o0.500003
Grid : Message : 395.896090 s : *********************************************************
Grid : Message : 395.896093 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 395.896094 s : * Vectorising space-time by 8
Grid : Message : 395.896096 s : * SINGLE precision
Grid : Message : 395.896097 s : * Using Overlapped Comms/Compute
Grid : Message : 395.896098 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 395.896099 s : *********************************************************
Grid : Message : 536.452166 s : Deo mflop/s = 7.56569e+07
Grid : Message : 536.452197 s : Deo mflop/s per rank 2.36428e+06
Grid : Message : 536.452199 s : Deo mflop/s per node 9.45711e+06
Grid : Message : 536.452202 s : #### Dhop calls report
Grid : Message : 536.452204 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 536.452206 s : WilsonFermion5D TotalTime /Calls : 4684.93 us
Grid : Message : 536.452208 s : WilsonFermion5D CommTime /Calls : 3112.34 us
Grid : Message : 536.452210 s : WilsonFermion5D FaceTime /Calls : 591.04 us
Grid : Message : 536.452212 s : WilsonFermion5D ComputeTime1/Calls : 5.70331 us
Grid : Message : 536.452214 s : WilsonFermion5D ComputeTime2/Calls : 1007.07 us
Grid : Message : 536.452239 s : Average mflops/s per call : 5.15026e+10
Grid : Message : 536.452243 s : Average mflops/s per call per rank : 1.60946e+09
Grid : Message : 536.452245 s : Average mflops/s per call per node : 6.43783e+09
Grid : Message : 536.452247 s : Average mflops/s per call (full) : 7.70081e+07
Grid : Message : 536.452252 s : Average mflops/s per call per rank (full): 2.4065e+06
Grid : Message : 536.452256 s : Average mflops/s per call per node (full): 9.62601e+06
Grid : Message : 536.452259 s : WilsonFermion5D Stencil
Grid : Message : 536.452262 s : WilsonFermion5D StencilEven
Grid : Message : 536.452264 s : WilsonFermion5D StencilOdd
Grid : Message : 536.452267 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 536.452270 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 536.452271 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 536.525206 s : r_e6.02113
Grid : Message : 536.532317 s : r_o6.02101
Grid : Message : 536.538894 s : res12.0421
Grid : Message : 537.137938 s : norm diff 0
Grid : Message : 537.903953 s : norm diff even 0
Grid : Message : 538.313669 s : norm diff odd 0
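A quick cross-check of the headline figure above (not part of the original log): assuming Grid's usual 1320 flops per 4d site per Dhop application and Ls=16 (both consistent with, but not stated in, the log), the reported rate follows directly from the call count and wall time:

#!/usr/bin/env bash
# Sanity check; 1320 flops/site and Ls=16 are assumptions, not log output.
v4=$(( 64*64*64*128 ))   # global 4d volume from "Global lattice size"
awk -v v4="$v4" 'BEGIN {
    flops = 1320 * 16 * v4 * 30000               # flops/site * Ls * V4 * ncall
    printf "mflop/s = %.5e\n", flops / 2.83186e8 # flop/us is the same as Mflop/s
}'
# prints mflop/s = 7.50746e+07, in agreement with the 7.50747e+07 reported
# above (the small difference comes from the rounded wall time in the log)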

View File

@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-8A-1035
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
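# (the literal 8 in the test below is presumably the node count substituted by
#  the job template, cf. --nodes=8 above, so ompio would only be selected for
#  single-node runs; hence the SC2050 shellcheck exemption at the top)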
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd "${app}" > "${job_info_dir}/ldd"
md5sum "${app}" > "${job_info_dir}/app-hash"
readelf -a "${app}" > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1035
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
    "${app}" ${par:+"${par}"} "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 64.64.64.128 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUS ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
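For reference, the --mpi 2.2.2.4 decomposition in the run line above factors the 32 ranks (4 per node over 8 nodes) across the --grid 64.64.64.128 global lattice, leaving a 32.32.32.32 local lattice per GPU; the size-loc32 path component presumably refers to this local extent. A quick standalone check, not part of the original job:

#!/usr/bin/env bash
grid=(64 64 64 128); mpi=(2 2 2 4)
ranks=1
for i in 0 1 2 3; do
    echo "dim ${i}: local extent $(( grid[i] / mpi[i] ))"   # prints 32 each time
    ranks=$(( ranks * mpi[i] ))
done
echo "total ranks: ${ranks}"   # prints 32, matching #SBATCH --ntasks=32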

View File

@ -0,0 +1,2 @@
Tue Aug 23 02:20:33 BST 2022
epoch 1661217633

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Tue Aug 23 02:42:07 BST 2022
epoch 1661218927

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffc35509000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x00001501d8950000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x00001501d8588000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x00001501d8096000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x00001501d7d6c000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x00001501d7a8b000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x00001501d782a000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x00001501d88d7000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x00001501d744a000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x00001501d5cee000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x00001501d591e000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x00001501d567d000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x00001501d5552000)
libm.so.6 => /lib64/libm.so.6 (0x00001501d51d0000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x00001501d4f99000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x00001501d4d81000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x00001501d4b61000)
libc.so.6 => /lib64/libc.so.6 (0x00001501d479c000)
libdl.so.2 => /lib64/libdl.so.2 (0x00001501d4598000)
/lib64/ld-linux-x86-64.so.2 (0x00001501d87a0000)
librt.so.1 => /lib64/librt.so.1 (0x00001501d4390000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x00001501d880b000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x00001501d8806000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x00001501d4284000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x00001501d407a000)
libutil.so.1 => /lib64/libutil.so.1 (0x00001501d3e76000)

View File

@ -0,0 +1,254 @@
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
local rank 1 device 0 bus id: 0000:44:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
local rank 2 device 0 bus id: 0000:84:00.0
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648 bytes at 0x14aa80000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommitted changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.383615 s : Grid Layout
Grid : Message : 1.383619 s : Global lattice size : 64 64 64 128
Grid : Message : 1.383627 s : OpenMP threads : 4
Grid : Message : 1.383630 s : MPI tasks : 2 2 2 4
Grid : Message : 1.426416 s : Making s innermost grids
Grid : Message : 1.472587 s : Initialising 4d RNG
Grid : Message : 1.567580 s : Initialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.567607 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 2.379949 s : Initialising 5d RNG
Grid : Message : 3.819686 s : Initialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 3.819712 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 20.471267 s : Initialised RNGs
Grid : Message : 25.497600 s : Drawing gauge field
Grid : Message : 25.826925 s : Random gauge initialised
Grid : Message : 25.842484 s : Setting up Cshift based reference
Grid : Message : 54.870795 s : *****************************************************************
Grid : Message : 54.870825 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 54.870827 s : *****************************************************************
Grid : Message : 54.870829 s : *****************************************************************
Grid : Message : 54.870830 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 54.870839 s : * Vectorising space-time by 8
Grid : Message : 54.870841 s : * VComplexF size is 64 B
Grid : Message : 54.870843 s : * SINGLE precision
Grid : Message : 54.870846 s : * Using Overlapped Comms/Compute
Grid : Message : 54.870848 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 54.870850 s : *****************************************************************
Grid : Message : 56.943818 s : Called warmup
Grid : Message : 339.527765 s : Called Dw 30000 times in 2.82583e+08 us
Grid : Message : 339.527813 s : mflop/s = 7.52349e+07
Grid : Message : 339.527816 s : mflop/s per rank = 2.35109e+06
Grid : Message : 339.527823 s : mflop/s per node = 9.40436e+06
Grid : Message : 339.527826 s : RF GiB/s (base 2) = 152876
Grid : Message : 339.527830 s : mem GiB/s (base 2) = 95547.2
Grid : Message : 339.531335 s : norm diff 1.07359e-13
Grid : Message : 339.580818 s : #### Dhop calls report
Grid : Message : 339.580824 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 339.580827 s : WilsonFermion5D TotalTime /Calls : 4713.67 us
Grid : Message : 339.580829 s : WilsonFermion5D CommTime /Calls : 3193.06 us
Grid : Message : 339.580831 s : WilsonFermion5D FaceTime /Calls : 475.013 us
Grid : Message : 339.580833 s : WilsonFermion5D ComputeTime1/Calls : 4.81574 us
Grid : Message : 339.580835 s : WilsonFermion5D ComputeTime2/Calls : 1059.46 us
Grid : Message : 339.580923 s : Average mflops/s per call : 6.07786e+10
Grid : Message : 339.580927 s : Average mflops/s per call per rank : 1.89933e+09
Grid : Message : 339.580929 s : Average mflops/s per call per node : 7.59733e+09
Grid : Message : 339.580931 s : Average mflops/s per call (full) : 7.65385e+07
Grid : Message : 339.580933 s : Average mflops/s per call per rank (full): 2.39183e+06
Grid : Message : 339.580937 s : Average mflops/s per call per node (full): 9.56731e+06
Grid : Message : 339.580940 s : WilsonFermion5D Stencil
Grid : Message : 339.580942 s : WilsonFermion5D StencilEven
Grid : Message : 339.580944 s : WilsonFermion5D StencilOdd
Grid : Message : 339.580945 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 339.580947 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 339.580949 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 394.987790 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 394.987814 s : Called DwDag
Grid : Message : 394.987815 s : norm dag result 12.0421
Grid : Message : 395.185510 s : norm dag ref 12.0421
Grid : Message : 395.346940 s : norm dag diff 7.28475e-14
Grid : Message : 395.773530 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 395.465746 s : src_e0.499997
Grid : Message : 395.917171 s : src_o0.500003
Grid : Message : 396.574200 s : *********************************************************
Grid : Message : 396.574240 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 396.574250 s : * Vectorising space-time by 8
Grid : Message : 396.574260 s : * SINGLE precision
Grid : Message : 396.574270 s : * Using Overlapped Comms/Compute
Grid : Message : 396.574280 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 396.574290 s : *********************************************************
Grid : Message : 535.834881 s : Deo mflop/s = 7.60799e+07
Grid : Message : 535.834913 s : Deo mflop/s per rank 2.3775e+06
Grid : Message : 535.834915 s : Deo mflop/s per node 9.50999e+06
Grid : Message : 535.834918 s : #### Dhop calls report
Grid : Message : 535.834920 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 535.834922 s : WilsonFermion5D TotalTime /Calls : 4658.97 us
Grid : Message : 535.834924 s : WilsonFermion5D CommTime /Calls : 3090.81 us
Grid : Message : 535.834930 s : WilsonFermion5D FaceTime /Calls : 583.833 us
Grid : Message : 535.834935 s : WilsonFermion5D ComputeTime1/Calls : 5.88087 us
Grid : Message : 535.834939 s : WilsonFermion5D ComputeTime2/Calls : 1011.38 us
Grid : Message : 535.834960 s : Average mflops/s per call : 5.08454e+10
Grid : Message : 535.834964 s : Average mflops/s per call per rank : 1.58892e+09
Grid : Message : 535.834966 s : Average mflops/s per call per node : 6.35567e+09
Grid : Message : 535.834969 s : Average mflops/s per call (full) : 7.74371e+07
Grid : Message : 535.834973 s : Average mflops/s per call per rank (full): 2.41991e+06
Grid : Message : 535.834975 s : Average mflops/s per call per node (full): 9.67963e+06
Grid : Message : 535.834978 s : WilsonFermion5D Stencil
Grid : Message : 535.834979 s : WilsonFermion5D StencilEven
Grid : Message : 535.834981 s : WilsonFermion5D StencilOdd
Grid : Message : 535.834983 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 535.834985 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 535.834988 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 535.907590 s : r_e6.02113
Grid : Message : 535.918975 s : r_o6.02101
Grid : Message : 535.925515 s : res12.0421
Grid : Message : 536.576844 s : norm diff 0
Grid : Message : 537.448681 s : norm diff even 0
Grid : Message : 537.774321 s : norm diff odd 0
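The r_e/r_o/res lines above are a checkerboard consistency check: Grid's norm2 is a sum of |psi|^2 over sites, so the even- and odd-site contributions must add up to the norm of the full unpreconditioned result. Checking with the values printed above:

awk 'BEGIN { printf "r_e + r_o = %.5f\n", 6.02113 + 6.02101 }'
# prints r_e + r_o = 12.04214, consistent with "res12.0421" and the
# "Deo+Doe == Dunprec" assertion mentioned earlier in the log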

View File

@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-8A-1050
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd "${app}" > "${job_info_dir}/ldd"
md5sum "${app}" > "${job_info_dir}/app-hash"
readelf -a "${app}" > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1050
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
    "${app}" ${par:+"${par}"} "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 64.64.64.128 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUS ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
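These job scripts stop their GPU monitor with kill -INT "${COPROC_PID}": bash's coproc runs the dmon process in the background and exposes its PID in COPROC_PID. A minimal standalone sketch of that pattern, with sleep standing in for both nvidia-smi dmon and the mpirun step:

#!/usr/bin/env bash
tmp=$(mktemp)
coproc sleep 3600 &> "${tmp}"     # background monitor (dmon in the real script)
echo "monitor running as PID ${COPROC_PID}"
sleep 2                           # stand-in for the actual workload
kill -INT "${COPROC_PID}"         # interrupt the monitor once the run is done
wait "${COPROC_PID}" 2>/dev/null || true
echo "monitor output (if any) captured in ${tmp}"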

View File

@ -0,0 +1,2 @@
Tue Aug 23 02:33:03 BST 2022
epoch 1661218383

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Tue Aug 23 02:54:33 BST 2022
epoch 1661219673

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffc084f3000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014e7b474a000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014e7b4382000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014e7b3e90000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014e7b3b66000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014e7b3885000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014e7b3624000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014e7b46d1000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014e7b3244000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014e7b1ae8000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014e7b1718000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014e7b1477000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014e7b134c000)
libm.so.6 => /lib64/libm.so.6 (0x000014e7b0fca000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014e7b0d93000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014e7b0b7b000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014e7b095b000)
libc.so.6 => /lib64/libc.so.6 (0x000014e7b0596000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014e7b0392000)
/lib64/ld-linux-x86-64.so.2 (0x000014e7b459a000)
librt.so.1 => /lib64/librt.so.1 (0x000014e7b018a000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014e7b4605000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014e7b4600000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014e7b007e000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014e7afe74000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014e7afc70000)

View File

@ -0,0 +1,254 @@
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
local rank 1 device 0 bus id: 0000:44:00.0
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648 bytes at 0x14eea0000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommitted changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.366393 s : Grid Layout
Grid : Message : 1.366397 s : Global lattice size : 64 64 64 128
Grid : Message : 1.366402 s : OpenMP threads : 4
Grid : Message : 1.366404 s : MPI tasks : 2 2 2 4
Grid : Message : 1.407148 s : Making s innermost grids
Grid : Message : 1.464257 s : Initialising 4d RNG
Grid : Message : 1.557730 s : Initialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.557758 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 2.356467 s : Initialising 5d RNG
Grid : Message : 3.801979 s : Initialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 3.802012 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 20.483524 s : Initialised RNGs
Grid : Message : 24.230918 s : Drawing gauge field
Grid : Message : 25.177490 s : Random gauge initialised
Grid : Message : 25.295480 s : Setting up Cshift based reference
Grid : Message : 54.973180 s : *****************************************************************
Grid : Message : 54.973410 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 54.973430 s : *****************************************************************
Grid : Message : 54.973440 s : *****************************************************************
Grid : Message : 54.973490 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 54.973510 s : * Vectorising space-time by 8
Grid : Message : 54.973530 s : * VComplexF size is 64 B
Grid : Message : 54.973560 s : * SINGLE precision
Grid : Message : 54.973580 s : * Using Overlapped Comms/Compute
Grid : Message : 54.973600 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 54.973630 s : *****************************************************************
Grid : Message : 56.188907 s : Called warmup
Grid : Message : 338.221994 s : Called Dw 30000 times in 2.82032e+08 us
Grid : Message : 338.222041 s : mflop/s = 7.53818e+07
Grid : Message : 338.222043 s : mflop/s per rank = 2.35568e+06
Grid : Message : 338.222045 s : mflop/s per node = 9.42273e+06
Grid : Message : 338.222047 s : RF GiB/s (base 2) = 153174
Grid : Message : 338.222049 s : mem GiB/s (base 2) = 95733.8
Grid : Message : 338.225548 s : norm diff 1.07359e-13
Grid : Message : 338.275111 s : #### Dhop calls report
Grid : Message : 338.275118 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 338.275121 s : WilsonFermion5D TotalTime /Calls : 4704.45 us
Grid : Message : 338.275123 s : WilsonFermion5D CommTime /Calls : 3197.7 us
Grid : Message : 338.275125 s : WilsonFermion5D FaceTime /Calls : 471.81 us
Grid : Message : 338.275127 s : WilsonFermion5D ComputeTime1/Calls : 5.0956 us
Grid : Message : 338.275129 s : WilsonFermion5D ComputeTime2/Calls : 1048.58 us
Grid : Message : 338.275196 s : Average mflops/s per call : 6.11343e+10
Grid : Message : 338.275200 s : Average mflops/s per call per rank : 1.91045e+09
Grid : Message : 338.275202 s : Average mflops/s per call per node : 7.64179e+09
Grid : Message : 338.275204 s : Average mflops/s per call (full) : 7.66885e+07
Grid : Message : 338.275206 s : Average mflops/s per call per rank (full): 2.39652e+06
Grid : Message : 338.275208 s : Average mflops/s per call per node (full): 9.58606e+06
Grid : Message : 338.275211 s : WilsonFermion5D Stencil
Grid : Message : 338.275212 s : WilsonFermion5D StencilEven
Grid : Message : 338.275213 s : WilsonFermion5D StencilOdd
Grid : Message : 338.275214 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 338.275215 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 338.275216 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 393.586448 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 393.586467 s : Called DwDag
Grid : Message : 393.586468 s : norm dag result 12.0421
Grid : Message : 393.600340 s : norm dag ref 12.0421
Grid : Message : 393.616373 s : norm dag diff 7.28475e-14
Grid : Message : 393.662063 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 394.592240 s : src_e0.499997
Grid : Message : 394.431906 s : src_o0.500003
Grid : Message : 394.530690 s : *********************************************************
Grid : Message : 394.530693 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 394.530694 s : * Vectorising space-time by 8
Grid : Message : 394.530695 s : * SINGLE precision
Grid : Message : 394.530696 s : * Using Overlapped Comms/Compute
Grid : Message : 394.530697 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 394.530698 s : *********************************************************
Grid : Message : 534.917450 s : Deo mflop/s = 7.61957e+07
Grid : Message : 534.917770 s : Deo mflop/s per rank 2.38112e+06
Grid : Message : 534.917790 s : Deo mflop/s per node 9.52446e+06
Grid : Message : 534.917860 s : #### Dhop calls report
Grid : Message : 534.917900 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 534.917930 s : WilsonFermion5D TotalTime /Calls : 4651.72 us
Grid : Message : 534.917970 s : WilsonFermion5D CommTime /Calls : 3091.21 us
Grid : Message : 534.918000 s : WilsonFermion5D FaceTime /Calls : 583.213 us
Grid : Message : 534.918030 s : WilsonFermion5D ComputeTime1/Calls : 6.04677 us
Grid : Message : 534.918070 s : WilsonFermion5D ComputeTime2/Calls : 1003.12 us
Grid : Message : 534.918280 s : Average mflops/s per call : 5.15468e+10
Grid : Message : 534.918320 s : Average mflops/s per call per rank : 1.61084e+09
Grid : Message : 534.918350 s : Average mflops/s per call per node : 6.44335e+09
Grid : Message : 534.918380 s : Average mflops/s per call (full) : 7.75578e+07
Grid : Message : 534.918410 s : Average mflops/s per call per rank (full): 2.42368e+06
Grid : Message : 534.918450 s : Average mflops/s per call per node (full): 9.69473e+06
Grid : Message : 534.918480 s : WilsonFermion5D Stencil
Grid : Message : 534.918510 s : WilsonFermion5D StencilEven
Grid : Message : 534.918520 s : WilsonFermion5D StencilOdd
Grid : Message : 534.918540 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 534.918560 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 534.918590 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 534.162791 s : r_e6.02113
Grid : Message : 534.171110 s : r_o6.02101
Grid : Message : 534.177848 s : res12.0421
Grid : Message : 534.858243 s : norm diff 0
Grid : Message : 535.620756 s : norm diff even 0
Grid : Message : 536.317800 s : norm diff odd 0

View File

@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-8A-1065
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd "${app}" > "${job_info_dir}/ldd"
md5sum "${app}" > "${job_info_dir}/app-hash"
readelf -a "${app}" > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1065
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
    "${app}" ${par:+"${par}"} "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 64.64.64.128 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUS ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
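Taken together, the three job files above form a clock sweep: they are identical except for freq (1035, 1050, 1065 MHz). nvidia-smi -ac pins the <memory,graphics> application clocks, and the final 1215,1410 presumably restores the defaults for these A100s. Illustrative only, the sweep condensed to one loop:

#!/usr/bin/env bash
for freq in 1035 1050 1065; do
    echo "would pin clocks with: nvidia-smi -ac 1215,${freq}"
done
echo "and reset afterwards with: nvidia-smi -ac 1215,1410"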

View File

@ -0,0 +1,2 @@
Tue Aug 23 02:45:31 BST 2022
epoch 1661219131

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Tue Aug 23 03:07:02 BST 2022
epoch 1661220422

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffcea739000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x0000155068944000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000015506857c000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000015506808a000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x0000155067d60000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x0000155067a7f000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000015506781e000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x00001550688cb000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000015506743e000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x0000155065ce2000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x0000155065912000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x0000155065671000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x0000155065546000)
libm.so.6 => /lib64/libm.so.6 (0x00001550651c4000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x0000155064f8d000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x0000155064d75000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x0000155064b55000)
libc.so.6 => /lib64/libc.so.6 (0x0000155064790000)
libdl.so.2 => /lib64/libdl.so.2 (0x000015506458c000)
/lib64/ld-linux-x86-64.so.2 (0x0000155068794000)
librt.so.1 => /lib64/librt.so.1 (0x0000155064384000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x00001550687ff000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x00001550687fa000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x0000155064278000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000015506406e000)
libutil.so.1 => /lib64/libutil.so.1 (0x0000155063e6a000)

View File

@ -0,0 +1,254 @@
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
local rank 1 device 0 bus id: 0000:44:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x1508e0000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.278711 s : Grid Layout
Grid : Message : 1.278716 s : Global lattice size : 64 64 64 128
Grid : Message : 1.278724 s : OpenMP threads : 4
Grid : Message : 1.278728 s : MPI tasks : 2 2 2 4
Grid : Message : 1.317967 s : Making s innermost grids
Grid : Message : 1.383230 s : Initialising 4d RNG
Grid : Message : 1.475617 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.475643 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 2.275104 s : Initialising 5d RNG
Grid : Message : 3.714759 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 3.714789 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 20.388702 s : Initialised RNGs
Grid : Message : 24.670402 s : Drawing gauge field
Grid : Message : 25.467328 s : Random gauge initialised
Grid : Message : 25.482764 s : Setting up Cshift based reference
Grid : Message : 54.598912 s : *****************************************************************
Grid : Message : 54.598935 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 54.598936 s : *****************************************************************
Grid : Message : 54.598937 s : *****************************************************************
Grid : Message : 54.598938 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 54.598939 s : * Vectorising space-time by 8
Grid : Message : 54.598940 s : * VComplexF size is 64 B
Grid : Message : 54.598941 s : * SINGLE precision
Grid : Message : 54.598943 s : * Using Overlapped Comms/Compute
Grid : Message : 54.598944 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 54.598945 s : *****************************************************************
Grid : Message : 56.670371 s : Called warmup
Grid : Message : 338.287712 s : Called Dw 30000 times in 2.81616e+08 us
Grid : Message : 338.287768 s : mflop/s = 7.54932e+07
Grid : Message : 338.287770 s : mflop/s per rank = 2.35916e+06
Grid : Message : 338.287772 s : mflop/s per node = 9.43665e+06
Grid : Message : 338.287774 s : RF GiB/s (base 2) = 153400
Grid : Message : 338.287776 s : mem GiB/s (base 2) = 95875.3
Grid : Message : 338.291283 s : norm diff 1.07359e-13
Grid : Message : 338.340583 s : #### Dhop calls report
Grid : Message : 338.340590 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 338.340593 s : WilsonFermion5D TotalTime /Calls : 4697.65 us
Grid : Message : 338.340597 s : WilsonFermion5D CommTime /Calls : 3173.49 us
Grid : Message : 338.340601 s : WilsonFermion5D FaceTime /Calls : 473.12 us
Grid : Message : 338.340603 s : WilsonFermion5D ComputeTime1/Calls : 5.07821 us
Grid : Message : 338.340606 s : WilsonFermion5D ComputeTime2/Calls : 1065.26 us
Grid : Message : 338.340704 s : Average mflops/s per call : 6.05431e+10
Grid : Message : 338.340708 s : Average mflops/s per call per rank : 1.89197e+09
Grid : Message : 338.340710 s : Average mflops/s per call per node : 7.56789e+09
Grid : Message : 338.340713 s : Average mflops/s per call (full) : 7.67995e+07
Grid : Message : 338.340716 s : Average mflops/s per call per rank (full): 2.39998e+06
Grid : Message : 338.340719 s : Average mflops/s per call per node (full): 9.59993e+06
Grid : Message : 338.340722 s : WilsonFermion5D Stencil
Grid : Message : 338.340723 s : WilsonFermion5D StencilEven
Grid : Message : 338.340724 s : WilsonFermion5D StencilOdd
Grid : Message : 338.340725 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 338.340729 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 338.340730 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 393.531951 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 393.531972 s : Called DwDag
Grid : Message : 393.531973 s : norm dag result 12.0421
Grid : Message : 393.550274 s : norm dag ref 12.0421
Grid : Message : 393.566206 s : norm dag diff 7.28475e-14
Grid : Message : 393.614226 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 393.987152 s : src_e0.499997
Grid : Message : 394.411352 s : src_o0.500003
Grid : Message : 394.510104 s : *********************************************************
Grid : Message : 394.510107 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 394.510108 s : * Vectorising space-time by 8
Grid : Message : 394.510109 s : * SINGLE precision
Grid : Message : 394.510110 s : * Using Overlapped Comms/Compute
Grid : Message : 394.510111 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 394.510112 s : *********************************************************
Grid : Message : 533.445236 s : Deo mflop/s = 7.65411e+07
Grid : Message : 533.445265 s : Deo mflop/s per rank 2.39191e+06
Grid : Message : 533.445267 s : Deo mflop/s per node 9.56764e+06
Grid : Message : 533.445270 s : #### Dhop calls report
Grid : Message : 533.445272 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 533.445274 s : WilsonFermion5D TotalTime /Calls : 4630.89 us
Grid : Message : 533.445276 s : WilsonFermion5D CommTime /Calls : 3066.01 us
Grid : Message : 533.445278 s : WilsonFermion5D FaceTime /Calls : 580.865 us
Grid : Message : 533.445280 s : WilsonFermion5D ComputeTime1/Calls : 6.1052 us
Grid : Message : 533.445282 s : WilsonFermion5D ComputeTime2/Calls : 1011.57 us
Grid : Message : 533.445307 s : Average mflops/s per call : 5.14748e+10
Grid : Message : 533.445313 s : Average mflops/s per call per rank : 1.60859e+09
Grid : Message : 533.445316 s : Average mflops/s per call per node : 6.43435e+09
Grid : Message : 533.445319 s : Average mflops/s per call (full) : 7.79067e+07
Grid : Message : 533.445323 s : Average mflops/s per call per rank (full): 2.43458e+06
Grid : Message : 533.445326 s : Average mflops/s per call per node (full): 9.73833e+06
Grid : Message : 533.445328 s : WilsonFermion5D Stencil
Grid : Message : 533.445330 s : WilsonFermion5D StencilEven
Grid : Message : 533.445332 s : WilsonFermion5D StencilOdd
Grid : Message : 533.445334 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 533.445336 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 533.445337 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 533.517980 s : r_e6.02113
Grid : Message : 533.525139 s : r_o6.02101
Grid : Message : 533.531555 s : res12.0421
Grid : Message : 534.208348 s : norm diff 0
Grid : Message : 534.958399 s : norm diff even 0
Grid : Message : 535.407067 s : norm diff odd 0

View File

@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-8A-1080
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd "${app}" > "${job_info_dir}/ldd"
md5sum "${app}" > "${job_info_dir}/app-hash"
readelf -a "${app}" > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1080
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 64.64.64.128 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point, the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
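dmon-to-db.sh is referenced above but not included in this diff. For illustration only: the '-o DT' samples it consumes carry a date and a time column in front of the usual numeric columns, with '#'-prefixed header and unit lines, so they can be flattened into tagged CSV rows by a short awk pass. The script name and the tag handling below are assumptions, not the real tool:

#!/usr/bin/env bash
# dmon-to-csv.sh -- hypothetical stand-in, *not* the dmon-to-db.sh used
# above: flatten 'nvidia-smi dmon -o DT' samples into tagged CSV on stdout
in="${1:?usage: $0 <dmon-output> [tag]}"
tag="${2:-untagged}"
awk -v tag="${tag}" '
    /^#/ { next }                       # skip header and unit lines
    NF   {                              # Date Time gpu pwr gtemp mtemp ...
        row = tag
        for (i = 1; i <= NF; i++) row = row "," $i
        print row
    }
' "${in}"

A call mirroring the job script would be: ./dmon-to-csv.sh "${tmp}" "clock_limit_${freq}" > samples.csv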

View File

@ -0,0 +1,2 @@
Tue Aug 23 02:58:01 BST 2022
epoch 1661219881

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Tue Aug 23 03:19:28 BST 2022
epoch 1661221168

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffe10fbb000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014e051810000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014e051448000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014e050f56000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014e050c2c000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014e05094b000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014e0506ea000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014e051797000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014e05030a000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014e04ebae000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014e04e7de000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014e04e53d000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014e04e412000)
libm.so.6 => /lib64/libm.so.6 (0x000014e04e090000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014e04de59000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014e04dc41000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014e04da21000)
libc.so.6 => /lib64/libc.so.6 (0x000014e04d65c000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014e04d458000)
/lib64/ld-linux-x86-64.so.2 (0x000014e051660000)
librt.so.1 => /lib64/librt.so.1 (0x000014e04d250000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014e0516cb000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014e0516c6000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014e04d144000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014e04cf3a000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014e04cd36000)

View File

@ -0,0 +1,254 @@
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
local rank 1 device 0 bus id: 0000:44:00.0
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x145b60000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.417949 s : Grid Layout
Grid : Message : 1.417957 s : Global lattice size : 64 64 64 128
Grid : Message : 1.417963 s : OpenMP threads : 4
Grid : Message : 1.417965 s : MPI tasks : 2 2 2 4
Grid : Message : 1.456030 s : Making s innermost grids
Grid : Message : 1.519833 s : Initialising 4d RNG
Grid : Message : 1.609461 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.609488 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 2.407737 s : Initialising 5d RNG
Grid : Message : 3.807194 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 3.807228 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 20.500197 s : Initialised RNGs
Grid : Message : 25.140001 s : Drawing gauge field
Grid : Message : 25.625310 s : Random gauge initialised
Grid : Message : 25.637123 s : Setting up Cshift based reference
Grid : Message : 54.900199 s : *****************************************************************
Grid : Message : 54.900217 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 54.900219 s : *****************************************************************
Grid : Message : 54.900220 s : *****************************************************************
Grid : Message : 54.900221 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 54.900222 s : * Vectorising space-time by 8
Grid : Message : 54.900223 s : * VComplexF size is 64 B
Grid : Message : 54.900224 s : * SINGLE precision
Grid : Message : 54.900226 s : * Using Overlapped Comms/Compute
Grid : Message : 54.900227 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 54.900228 s : *****************************************************************
Grid : Message : 56.902046 s : Called warmup
Grid : Message : 338.493870 s : Called Dw 30000 times in 2.81147e+08 us
Grid : Message : 338.494560 s : mflop/s = 7.56192e+07
Grid : Message : 338.494630 s : mflop/s per rank = 2.3631e+06
Grid : Message : 338.494660 s : mflop/s per node = 9.4524e+06
Grid : Message : 338.494690 s : RF GiB/s (base 2) = 153656
Grid : Message : 338.494720 s : mem GiB/s (base 2) = 96035.3
Grid : Message : 338.529840 s : norm diff 1.07359e-13
Grid : Message : 338.101662 s : #### Dhop calls report
Grid : Message : 338.101668 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 338.101671 s : WilsonFermion5D TotalTime /Calls : 4689.71 us
Grid : Message : 338.101673 s : WilsonFermion5D CommTime /Calls : 3173.53 us
Grid : Message : 338.101675 s : WilsonFermion5D FaceTime /Calls : 471.412 us
Grid : Message : 338.101677 s : WilsonFermion5D ComputeTime1/Calls : 4.96082 us
Grid : Message : 338.101679 s : WilsonFermion5D ComputeTime2/Calls : 1058.91 us
Grid : Message : 338.101768 s : Average mflops/s per call : 6.14534e+10
Grid : Message : 338.101771 s : Average mflops/s per call per rank : 1.92042e+09
Grid : Message : 338.101773 s : Average mflops/s per call per node : 7.68168e+09
Grid : Message : 338.101775 s : Average mflops/s per call (full) : 7.69295e+07
Grid : Message : 338.101777 s : Average mflops/s per call per rank (full): 2.40405e+06
Grid : Message : 338.101779 s : Average mflops/s per call per node (full): 9.61619e+06
Grid : Message : 338.101781 s : WilsonFermion5D Stencil
Grid : Message : 338.101782 s : WilsonFermion5D StencilEven
Grid : Message : 338.101783 s : WilsonFermion5D StencilOdd
Grid : Message : 338.101784 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 338.101785 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 338.101786 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 393.332960 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 393.333200 s : Called DwDag
Grid : Message : 393.333210 s : norm dag result 12.0421
Grid : Message : 393.535800 s : norm dag ref 12.0421
Grid : Message : 393.694460 s : norm dag diff 7.28475e-14
Grid : Message : 393.117660 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 393.554222 s : src_e0.499997
Grid : Message : 393.883369 s : src_o0.500003
Grid : Message : 393.981150 s : *********************************************************
Grid : Message : 393.981152 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 393.981154 s : * Vectorising space-time by 8
Grid : Message : 393.981155 s : * SINGLE precision
Grid : Message : 393.981156 s : * Using Overlapped Comms/Compute
Grid : Message : 393.981157 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 393.981158 s : *********************************************************
Grid : Message : 532.261495 s : Deo mflop/s = 7.69025e+07
Grid : Message : 532.261527 s : Deo mflop/s per rank 2.4032e+06
Grid : Message : 532.261529 s : Deo mflop/s per node 9.61281e+06
Grid : Message : 532.261532 s : #### Dhop calls report
Grid : Message : 532.261534 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 532.261536 s : WilsonFermion5D TotalTime /Calls : 4609.05 us
Grid : Message : 532.261538 s : WilsonFermion5D CommTime /Calls : 3043.36 us
Grid : Message : 532.261540 s : WilsonFermion5D FaceTime /Calls : 576.704 us
Grid : Message : 532.261542 s : WilsonFermion5D ComputeTime1/Calls : 6.1045 us
Grid : Message : 532.261544 s : WilsonFermion5D ComputeTime2/Calls : 1016.01 us
Grid : Message : 532.261569 s : Average mflops/s per call : 5.12445e+10
Grid : Message : 532.261572 s : Average mflops/s per call per rank : 1.60139e+09
Grid : Message : 532.261574 s : Average mflops/s per call per node : 6.40556e+09
Grid : Message : 532.261576 s : Average mflops/s per call (full) : 7.82759e+07
Grid : Message : 532.261578 s : Average mflops/s per call per rank (full): 2.44612e+06
Grid : Message : 532.261580 s : Average mflops/s per call per node (full): 9.78449e+06
Grid : Message : 532.261582 s : WilsonFermion5D Stencil
Grid : Message : 532.261583 s : WilsonFermion5D StencilEven
Grid : Message : 532.261585 s : WilsonFermion5D StencilOdd
Grid : Message : 532.261586 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 532.261587 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 532.261588 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 532.334163 s : r_e6.02113
Grid : Message : 532.341225 s : r_o6.02101
Grid : Message : 532.347608 s : res12.0421
Grid : Message : 533.303030 s : norm diff 0
Grid : Message : 533.802608 s : norm diff even 0
Grid : Message : 534.170331 s : norm diff odd 0

View File

@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-8A-1095
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd "${app}" > "${job_info_dir}/ldd"
md5sum "${app}" > "${job_info_dir}/app-hash"
readelf -a "${app}" > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1095
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 64.64.64.128 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point, the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
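remote-sudo.sh is likewise not part of this diff. Given that the job scripts run 'spack load sshpass' before calling it, a plausible stand-in is sketched below; the password file path, the ssh options, and the assumption of passwordless sudo for nvidia-smi on the compute nodes are all guesses, not the real helper:

#!/usr/bin/env bash
# remote-sudo.sh -- plausible stand-in (the real helper is not in this
# diff): run a privileged command on a single node via sshpass + ssh;
# assumes a password file and passwordless sudo for the target command
host="${1:?usage: $0 <host> <command...>}"
shift
sshpass -f "${HOME}/.power-bench-pass" \
    ssh -o StrictHostKeyChecking=no "${host}" "sudo $*"

Invoked as in the clock-reset loop above: ./remote-sudo.sh tu-c0r1n72 'nvidia-smi -ac 1215,1410'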

View File

@ -0,0 +1,2 @@
Tue Aug 23 03:10:28 BST 2022
epoch 1661220628

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Tue Aug 23 03:31:53 BST 2022
epoch 1661221913

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffe747d5000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014a1749b3000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014a1745eb000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014a1740f9000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014a173dcf000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014a173aee000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014a17388d000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014a17493a000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014a1734ad000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014a171d51000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014a171981000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014a1716e0000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014a1715b5000)
libm.so.6 => /lib64/libm.so.6 (0x000014a171233000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014a170ffc000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014a170de4000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014a170bc4000)
libc.so.6 => /lib64/libc.so.6 (0x000014a1707ff000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014a1705fb000)
/lib64/ld-linux-x86-64.so.2 (0x000014a174803000)
librt.so.1 => /lib64/librt.so.1 (0x000014a1703f3000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014a17486e000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014a174869000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014a1702e7000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014a1700dd000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014a16fed9000)

View File

@ -0,0 +1,254 @@
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
local rank 1 device 0 bus id: 0000:44:00.0
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x150400000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.553137 s : Grid Layout
Grid : Message : 1.553140 s : Global lattice size : 64 64 64 128
Grid : Message : 1.553144 s : OpenMP threads : 4
Grid : Message : 1.553146 s : MPI tasks : 2 2 2 4
Grid : Message : 1.591415 s : Making s innermost grids
Grid : Message : 1.643527 s : Initialising 4d RNG
Grid : Message : 1.733769 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.733793 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 2.687944 s : Initialising 5d RNG
Grid : Message : 4.932540 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 4.933190 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 21.773430 s : Initialised RNGs
Grid : Message : 24.775839 s : Drawing gauge field
Grid : Message : 25.506673 s : Random gauge initialised
Grid : Message : 25.516620 s : Setting up Cshift based reference
Grid : Message : 54.420958 s : *****************************************************************
Grid : Message : 54.420980 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 54.420981 s : *****************************************************************
Grid : Message : 54.420982 s : *****************************************************************
Grid : Message : 54.420983 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 54.420984 s : * Vectorising space-time by 8
Grid : Message : 54.420985 s : * VComplexF size is 64 B
Grid : Message : 54.420986 s : * SINGLE precision
Grid : Message : 54.420987 s : * Using Overlapped Comms/Compute
Grid : Message : 54.420988 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 54.420989 s : *****************************************************************
Grid : Message : 56.418238 s : Called warmup
Grid : Message : 337.944860 s : Called Dw 30000 times in 2.80676e+08 us
Grid : Message : 337.945460 s : mflop/s = 7.5746e+07
Grid : Message : 337.945480 s : mflop/s per rank = 2.36706e+06
Grid : Message : 337.945500 s : mflop/s per node = 9.46825e+06
Grid : Message : 337.945520 s : RF GiB/s (base 2) = 153914
Grid : Message : 337.945540 s : mem GiB/s (base 2) = 96196.3
Grid : Message : 337.980600 s : norm diff 1.07359e-13
Grid : Message : 337.146303 s : #### Dhop calls report
Grid : Message : 337.146310 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 337.146313 s : WilsonFermion5D TotalTime /Calls : 4682.02 us
Grid : Message : 337.146315 s : WilsonFermion5D CommTime /Calls : 3157.4 us
Grid : Message : 337.146317 s : WilsonFermion5D FaceTime /Calls : 471.742 us
Grid : Message : 337.146319 s : WilsonFermion5D ComputeTime1/Calls : 4.66553 us
Grid : Message : 337.146324 s : WilsonFermion5D ComputeTime2/Calls : 1066.42 us
Grid : Message : 337.146339 s : Average mflops/s per call : 6.05546e+10
Grid : Message : 337.146342 s : Average mflops/s per call per rank : 1.89233e+09
Grid : Message : 337.146344 s : Average mflops/s per call per node : 7.56933e+09
Grid : Message : 337.146347 s : Average mflops/s per call (full) : 7.70559e+07
Grid : Message : 337.146349 s : Average mflops/s per call per rank (full): 2.408e+06
Grid : Message : 337.146352 s : Average mflops/s per call per node (full): 9.63198e+06
Grid : Message : 337.146354 s : WilsonFermion5D Stencil
Grid : Message : 337.146355 s : WilsonFermion5D StencilEven
Grid : Message : 337.146356 s : WilsonFermion5D StencilOdd
Grid : Message : 337.146357 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 337.146358 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 337.146361 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 392.570148 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 392.570167 s : Called DwDag
Grid : Message : 392.570168 s : norm dag result 12.0421
Grid : Message : 392.597817 s : norm dag ref 12.0421
Grid : Message : 392.613608 s : norm dag diff 7.28475e-14
Grid : Message : 392.655240 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 393.337360 s : src_e 0.499997
Grid : Message : 393.428755 s : src_o 0.500003
Grid : Message : 393.551540 s : *********************************************************
Grid : Message : 393.551542 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 393.551543 s : * Vectorising space-time by 8
Grid : Message : 393.551544 s : * SINGLE precision
Grid : Message : 393.551545 s : * Using Overlapped Comms/Compute
Grid : Message : 393.551546 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 393.551547 s : *********************************************************
Grid : Message : 531.331972 s : Deo mflop/s = 7.71814e+07
Grid : Message : 531.331998 s : Deo mflop/s per rank 2.41192e+06
Grid : Message : 531.332000 s : Deo mflop/s per node 9.64768e+06
Grid : Message : 531.332003 s : #### Dhop calls report
Grid : Message : 531.332008 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 531.332011 s : WilsonFermion5D TotalTime /Calls : 4592.41 us
Grid : Message : 531.332015 s : WilsonFermion5D CommTime /Calls : 3024.25 us
Grid : Message : 531.332017 s : WilsonFermion5D FaceTime /Calls : 573.717 us
Grid : Message : 531.332020 s : WilsonFermion5D ComputeTime1/Calls : 5.83554 us
Grid : Message : 531.332023 s : WilsonFermion5D ComputeTime2/Calls : 1021.88 us
Grid : Message : 531.332042 s : Average mflops/s per call : 5.06597e+10
Grid : Message : 531.332046 s : Average mflops/s per call per rank : 1.58312e+09
Grid : Message : 531.332048 s : Average mflops/s per call per node : 6.33246e+09
Grid : Message : 531.332050 s : Average mflops/s per call (full) : 7.85594e+07
Grid : Message : 531.332053 s : Average mflops/s per call per rank (full): 2.45498e+06
Grid : Message : 531.332055 s : Average mflops/s per call per node (full): 9.81993e+06
Grid : Message : 531.332057 s : WilsonFermion5D Stencil
Grid : Message : 531.332059 s : WilsonFermion5D StencilEven
Grid : Message : 531.332060 s : WilsonFermion5D StencilOdd
Grid : Message : 531.332063 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 531.332064 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 531.332065 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 531.403155 s : r_e 6.02113
Grid : Message : 531.411033 s : r_o 6.02101
Grid : Message : 531.417417 s : res 12.0421
Grid : Message : 532.503110 s : norm diff 0
Grid : Message : 532.830118 s : norm diff even 0
Grid : Message : 533.297755 s : norm diff odd 0
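The headline throughput can be cross-checked from the figures quoted above. A minimal sketch, assuming the conventional 1320 flops per Wilson dslash site per s-slice and Ls=16 (neither the flop convention nor the fifth-dimension extent is printed in this log):

awk 'BEGIN {
  vol  = 64*64*64*128      # global 4d lattice volume from the log
  Ls   = 16                # assumed fifth-dimension extent
  flop = 1320 * Ls * vol   # assumed flop count per Dw application
  # total flop / total microseconds = Mflop/s
  printf "%.4e mflop/s\n", flop * 30000 / 2.80676e8
}'

This prints 7.5746e+07, reproducing the reported mflop/s figure to the printed precision.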

View File

@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-8A-1110
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1110
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 64.64.64.128 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
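nvidia-smi -ac only sets application clocks, so before trusting a power sweep it can be worth spot-checking that every node accepted the limit. A hypothetical check along these lines (plain ssh stands in for the remote-sudo.sh helper, whose contents are not part of this diff):

for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
    echo "== $h =="
    ssh "$h" nvidia-smi --query-gpu=clocks.applications.graphics --format=csv,noheader
done

Each GPU should report the requested frequency (here 1110 MHz) until the reset at the end of the job.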

View File

@ -0,0 +1,2 @@
Tue Aug 23 03:22:53 BST 2022
epoch 1661221373

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32
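Because app-hash is written in md5sum's native two-column format, the recorded binary can be re-verified later straight from the job directory, e.g. (assuming the job_info_dir layout created by the script above):

md5sum -c "${job_info_dir}/app-hash"   # prints '... Benchmark_dwf_fp32: OK' if the binary is unchanged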

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Tue Aug 23 03:44:16 BST 2022
epoch 1661222656

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffc85c3e000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x00001496f34e3000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x00001496f311b000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x00001496f2c29000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x00001496f28ff000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x00001496f261e000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x00001496f23bd000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x00001496f346a000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x00001496f1fdd000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x00001496f0881000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x00001496f04b1000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x00001496f0210000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x00001496f00e5000)
libm.so.6 => /lib64/libm.so.6 (0x00001496efd63000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x00001496efb2c000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x00001496ef914000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x00001496ef6f4000)
libc.so.6 => /lib64/libc.so.6 (0x00001496ef32f000)
libdl.so.2 => /lib64/libdl.so.2 (0x00001496ef12b000)
/lib64/ld-linux-x86-64.so.2 (0x00001496f3333000)
librt.so.1 => /lib64/librt.so.1 (0x00001496eef23000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x00001496f339e000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x00001496f3399000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x00001496eee17000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x00001496eec0d000)
libutil.so.1 => /lib64/libutil.so.1 (0x00001496eea09000)

View File

@ -0,0 +1,254 @@
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
local rank 1 device 0 bus id: 0000:44:00.0
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0 SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648 bytes at 0x153200000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommitted changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.477679 s : Grid Layout
Grid : Message : 1.477683 s : Global lattice size : 64 64 64 128
Grid : Message : 1.477687 s : OpenMP threads : 4
Grid : Message : 1.477689 s : MPI tasks : 2 2 2 4
Grid : Message : 1.518431 s : Making s innermost grids
Grid : Message : 1.578744 s : Initialising 4d RNG
Grid : Message : 1.670962 s : Initialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.670985 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 2.479199 s : Initialising 5d RNG
Grid : Message : 3.928882 s : Initialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 3.928918 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 20.621348 s : Initialised RNGs
Grid : Message : 24.791943 s : Drawing gauge field
Grid : Message : 25.611789 s : Random gauge initialised
Grid : Message : 25.623532 s : Setting up Cshift based reference
Grid : Message : 54.459836 s : *****************************************************************
Grid : Message : 54.459859 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 54.459860 s : *****************************************************************
Grid : Message : 54.459861 s : *****************************************************************
Grid : Message : 54.459862 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 54.459863 s : * Vectorising space-time by 8
Grid : Message : 54.459864 s : * VComplexF size is 64 B
Grid : Message : 54.459866 s : * SINGLE precision
Grid : Message : 54.459868 s : * Using Overlapped Comms/Compute
Grid : Message : 54.459869 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 54.459870 s : *****************************************************************
Grid : Message : 56.472989 s : Called warmup
Grid : Message : 336.723650 s : Called Dw 30000 times in 2.8025e+08 us
Grid : Message : 336.723702 s : mflop/s = 7.58611e+07
Grid : Message : 336.723705 s : mflop/s per rank = 2.37066e+06
Grid : Message : 336.723710 s : mflop/s per node = 9.48264e+06
Grid : Message : 336.723713 s : RF GiB/s (base 2) = 154148
Grid : Message : 336.723716 s : mem GiB/s (base 2) = 96342.5
Grid : Message : 336.727230 s : norm diff 1.07359e-13
Grid : Message : 336.775672 s : #### Dhop calls report
Grid : Message : 336.775679 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 336.775682 s : WilsonFermion5D TotalTime /Calls : 4674.86 us
Grid : Message : 336.775684 s : WilsonFermion5D CommTime /Calls : 3140.23 us
Grid : Message : 336.775686 s : WilsonFermion5D FaceTime /Calls : 472.444 us
Grid : Message : 336.775688 s : WilsonFermion5D ComputeTime1/Calls : 5.04048 us
Grid : Message : 336.775690 s : WilsonFermion5D ComputeTime2/Calls : 1076.29 us
Grid : Message : 336.775779 s : Average mflops/s per call : 6.27024e+10
Grid : Message : 336.775783 s : Average mflops/s per call per rank : 1.95945e+09
Grid : Message : 336.775785 s : Average mflops/s per call per node : 7.8378e+09
Grid : Message : 336.775787 s : Average mflops/s per call (full) : 7.71738e+07
Grid : Message : 336.775789 s : Average mflops/s per call per rank (full): 2.41168e+06
Grid : Message : 336.775791 s : Average mflops/s per call per node (full): 9.64673e+06
Grid : Message : 336.775793 s : WilsonFermion5D Stencil
Grid : Message : 336.775794 s : WilsonFermion5D StencilEven
Grid : Message : 336.775795 s : WilsonFermion5D StencilOdd
Grid : Message : 336.775796 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 336.775797 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 336.775798 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 391.954992 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 391.955015 s : Called DwDag
Grid : Message : 391.955016 s : norm dag result 12.0421
Grid : Message : 391.968492 s : norm dag ref 12.0421
Grid : Message : 391.984269 s : norm dag diff 7.28475e-14
Grid : Message : 392.244680 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 392.446039 s : src_e 0.499997
Grid : Message : 392.821692 s : src_o 0.500003
Grid : Message : 392.939557 s : *********************************************************
Grid : Message : 392.939563 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 392.939566 s : * Vectorising space-time by 8
Grid : Message : 392.939568 s : * SINGLE precision
Grid : Message : 392.939570 s : * Using Overlapped Comms/Compute
Grid : Message : 392.939573 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 392.939575 s : *********************************************************
Grid : Message : 530.161337 s : Deo mflop/s = 7.74985e+07
Grid : Message : 530.161369 s : Deo mflop/s per rank 2.42183e+06
Grid : Message : 530.161371 s : Deo mflop/s per node 9.68731e+06
Grid : Message : 530.161374 s : #### Dhop calls report
Grid : Message : 530.161376 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 530.161378 s : WilsonFermion5D TotalTime /Calls : 4573.78 us
Grid : Message : 530.161380 s : WilsonFermion5D CommTime /Calls : 3002.97 us
Grid : Message : 530.161382 s : WilsonFermion5D FaceTime /Calls : 567.519 us
Grid : Message : 530.161384 s : WilsonFermion5D ComputeTime1/Calls : 6.14496 us
Grid : Message : 530.161386 s : WilsonFermion5D ComputeTime2/Calls : 1030.26 us
Grid : Message : 530.161413 s : Average mflops/s per call : 5.0739e+10
Grid : Message : 530.161418 s : Average mflops/s per call per rank : 1.58559e+09
Grid : Message : 530.161420 s : Average mflops/s per call per node : 6.34237e+09
Grid : Message : 530.161424 s : Average mflops/s per call (full) : 7.88794e+07
Grid : Message : 530.161428 s : Average mflops/s per call per rank (full): 2.46498e+06
Grid : Message : 530.161432 s : Average mflops/s per call per node (full): 9.85993e+06
Grid : Message : 530.161436 s : WilsonFermion5D Stencil
Grid : Message : 530.161438 s : WilsonFermion5D StencilEven
Grid : Message : 530.161440 s : WilsonFermion5D StencilOdd
Grid : Message : 530.161442 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 530.161445 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 530.161448 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 530.232671 s : r_e 6.02113
Grid : Message : 530.240193 s : r_o 6.02101
Grid : Message : 530.246528 s : res 12.0421
Grid : Message : 530.976149 s : norm diff 0
Grid : Message : 531.780007 s : norm diff even 0
Grid : Message : 532.298753 s : norm diff odd 0
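The per-call breakdown above makes the bottleneck explicit: CommTime/Calls is 3140.23 us out of a 4674.86 us TotalTime, so even with overlapped comms roughly two thirds of each Dhop call is communication. Pulling the fraction from the quoted numbers:

awk 'BEGIN { printf "comm fraction: %.1f%%\n", 100 * 3140.23 / 4674.86 }'   # 67.2%

This also explains the gap between the compute-only 'Average mflops/s per call' and the much lower '(full)' figures, which divide by total time including communication.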

View File

@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-8A-1125
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1125
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 64.64.64.128 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
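dmon-to-db.sh is not included in this diff, but its job is easy to sketch: nvidia-smi dmon -o DT emits whitespace-separated samples prefixed by date and time columns, which map onto one SQLite table per clock limit. A hypothetical stand-in, assuming sqlite3 on PATH, the default dmon column order (date, time, gpu, pwr, gtemp, mtemp, sm, mem, enc, dec, mclk, pclk), and the ${tmp} and ${freq} variables from the script above:

db='smi-dmon-8A.db'
table="clock_limit_${freq}"
sqlite3 "${db}" "CREATE TABLE IF NOT EXISTS ${table}
    (date TEXT, time TEXT, gpu INT, pwr REAL, sm INT, mem INT, mclk INT, pclk INT);"
grep -v '^#' "${tmp}" \
    | awk -v t="${table}" '{ printf "INSERT INTO %s VALUES (\047%s\047,\047%s\047,%s,%s,%s,%s,%s,%s);\n",
                             t, $1, $2, $3, $4, $7, $8, $11, $12 }' \
    | sqlite3 "${db}"   # \047 is an awk octal escape for the single quote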

View File

@ -0,0 +1,2 @@
Tue Aug 23 03:35:17 BST 2022
epoch 1661222117

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Tue Aug 23 03:56:38 BST 2022
epoch 1661223398

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffc77ffc000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000015363287b000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x00001536324b3000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x0000153631fc1000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x0000153631c97000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x00001536319b6000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x0000153631755000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x0000153632802000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x0000153631375000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000015362fc19000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000015362f849000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000015362f5a8000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000015362f47d000)
libm.so.6 => /lib64/libm.so.6 (0x000015362f0fb000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000015362eec4000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000015362ecac000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000015362ea8c000)
libc.so.6 => /lib64/libc.so.6 (0x000015362e6c7000)
libdl.so.2 => /lib64/libdl.so.2 (0x000015362e4c3000)
/lib64/ld-linux-x86-64.so.2 (0x00001536326cb000)
librt.so.1 => /lib64/librt.so.1 (0x000015362e2bb000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x0000153632736000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x0000153632731000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000015362e1af000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000015362dfa5000)
libutil.so.1 => /lib64/libutil.so.1 (0x000015362dda1000)

View File

@ -0,0 +1,254 @@
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 1 device 0 bus id: 0000:44:00.0
local rank 2 device 0 bus id: 0000:84:00.0
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0 SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648 bytes at 0x150380000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommitted changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.525814 s : Grid Layout
Grid : Message : 1.525817 s : Global lattice size : 64 64 64 128
Grid : Message : 1.525823 s : OpenMP threads : 4
Grid : Message : 1.525825 s : MPI tasks : 2 2 2 4
Grid : Message : 1.564141 s : Making s innermost grids
Grid : Message : 1.612317 s : Initialising 4d RNG
Grid : Message : 1.709482 s : Initialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.709506 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 2.629751 s : Initialising 5d RNG
Grid : Message : 4.980840 s : Initialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 4.981520 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 20.874034 s : Initialised RNGs
Grid : Message : 25.403080 s : Drawing gauge field
Grid : Message : 25.963343 s : Random gauge initialised
Grid : Message : 25.975665 s : Setting up Cshift based reference
Grid : Message : 55.107124 s : *****************************************************************
Grid : Message : 55.107147 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 55.107149 s : *****************************************************************
Grid : Message : 55.107150 s : *****************************************************************
Grid : Message : 55.107151 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 55.107152 s : * Vectorising space-time by 8
Grid : Message : 55.107153 s : * VComplexF size is 64 B
Grid : Message : 55.107154 s : * SINGLE precision
Grid : Message : 55.107157 s : * Using Overlapped Comms/Compute
Grid : Message : 55.107158 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 55.107159 s : *****************************************************************
Grid : Message : 57.115222 s : Called warmup
Grid : Message : 337.196879 s : Called Dw 30000 times in 2.80081e+08 us
Grid : Message : 337.196925 s : mflop/s = 7.5907e+07
Grid : Message : 337.196927 s : mflop/s per rank = 2.37209e+06
Grid : Message : 337.196929 s : mflop/s per node = 9.48837e+06
Grid : Message : 337.196931 s : RF GiB/s (base 2) = 154241
Grid : Message : 337.196933 s : mem GiB/s (base 2) = 96400.8
Grid : Message : 337.200448 s : norm diff 1.07359e-13
Grid : Message : 337.249056 s : #### Dhop calls report
Grid : Message : 337.249062 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 337.249069 s : WilsonFermion5D TotalTime /Calls : 4671.91 us
Grid : Message : 337.249072 s : WilsonFermion5D CommTime /Calls : 3138.67 us
Grid : Message : 337.249075 s : WilsonFermion5D FaceTime /Calls : 467.933 us
Grid : Message : 337.249078 s : WilsonFermion5D ComputeTime1/Calls : 4.97537 us
Grid : Message : 337.249081 s : WilsonFermion5D ComputeTime2/Calls : 1078.84 us
Grid : Message : 337.249093 s : Average mflops/s per call : 6.2791e+10
Grid : Message : 337.249096 s : Average mflops/s per call per rank : 1.96222e+09
Grid : Message : 337.249099 s : Average mflops/s per call per node : 7.84887e+09
Grid : Message : 337.249103 s : Average mflops/s per call (full) : 7.72227e+07
Grid : Message : 337.249106 s : Average mflops/s per call per rank (full): 2.41321e+06
Grid : Message : 337.249109 s : Average mflops/s per call per node (full): 9.65284e+06
Grid : Message : 337.249111 s : WilsonFermion5D Stencil
Grid : Message : 337.249113 s : WilsonFermion5D StencilEven
Grid : Message : 337.249115 s : WilsonFermion5D StencilOdd
Grid : Message : 337.249116 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 337.249118 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 337.249119 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 392.546037 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 392.546061 s : Called DwDag
Grid : Message : 392.546062 s : norm dag result 12.0421
Grid : Message : 392.593558 s : norm dag ref 12.0421
Grid : Message : 392.609258 s : norm dag diff 7.28475e-14
Grid : Message : 392.657672 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 393.911450 s : src_e 0.499997
Grid : Message : 393.412726 s : src_o 0.500003
Grid : Message : 393.510751 s : *********************************************************
Grid : Message : 393.510754 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 393.510755 s : * Vectorising space-time by 8
Grid : Message : 393.510756 s : * SINGLE precision
Grid : Message : 393.510757 s : * Using Overlapped Comms/Compute
Grid : Message : 393.510758 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 393.510759 s : *********************************************************
Grid : Message : 530.311860 s : Deo mflop/s = 7.77338e+07
Grid : Message : 530.311887 s : Deo mflop/s per rank 2.42918e+06
Grid : Message : 530.311889 s : Deo mflop/s per node 9.71673e+06
Grid : Message : 530.311892 s : #### Dhop calls report
Grid : Message : 530.311894 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 530.311896 s : WilsonFermion5D TotalTime /Calls : 4559.74 us
Grid : Message : 530.311898 s : WilsonFermion5D CommTime /Calls : 2983.56 us
Grid : Message : 530.311900 s : WilsonFermion5D FaceTime /Calls : 561.612 us
Grid : Message : 530.311902 s : WilsonFermion5D ComputeTime1/Calls : 6.06806 us
Grid : Message : 530.311904 s : WilsonFermion5D ComputeTime2/Calls : 1041.48 us
Grid : Message : 530.311932 s : Average mflops/s per call : 5.13843e+10
Grid : Message : 530.311937 s : Average mflops/s per call per rank : 1.60576e+09
Grid : Message : 530.311940 s : Average mflops/s per call per node : 6.42304e+09
Grid : Message : 530.311944 s : Average mflops/s per call (full) : 7.91223e+07
Grid : Message : 530.311947 s : Average mflops/s per call per rank (full): 2.47257e+06
Grid : Message : 530.311951 s : Average mflops/s per call per node (full): 9.89028e+06
Grid : Message : 530.311954 s : WilsonFermion5D Stencil
Grid : Message : 530.311957 s : WilsonFermion5D StencilEven
Grid : Message : 530.311958 s : WilsonFermion5D StencilOdd
Grid : Message : 530.311961 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 530.311963 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 530.311965 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 530.389174 s : r_e 6.02113
Grid : Message : 530.397070 s : r_o 6.02101
Grid : Message : 530.403387 s : res 12.0421
Grid : Message : 531.146771 s : norm diff 0
Grid : Message : 531.837346 s : norm diff even 0
Grid : Message : 532.217730 s : norm diff odd 0

View File

@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]

Some files were not shown because too many files have changed in this diff