Initial commit

2022-09-07 17:31:28 +01:00
commit ade190016a
8502 changed files with 4552538 additions and 0 deletions


@@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large


@@ -0,0 +1,2 @@
Sat Aug 20 20:25:04 BST 2022
epoch 1661023504

File diff suppressed because one or more lines are too long


@@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffc4036a000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x0000147189068000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x0000147188ca0000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x00001471887ae000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x0000147188484000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x00001471881a3000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x0000147187f42000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x0000147188fef000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x0000147187b62000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x0000147186406000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x0000147186036000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x0000147185d95000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x0000147185c6a000)
libm.so.6 => /lib64/libm.so.6 (0x00001471858e8000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x00001471856b1000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x0000147185499000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x0000147185279000)
libc.so.6 => /lib64/libc.so.6 (0x0000147184eb4000)
libdl.so.2 => /lib64/libdl.so.2 (0x0000147184cb0000)
/lib64/ld-linux-x86-64.so.2 (0x0000147188eb8000)
librt.so.1 => /lib64/librt.so.1 (0x0000147184aa8000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x0000147188f23000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x0000147188f1e000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014718499c000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x0000147184792000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014718458e000)


@@ -0,0 +1,254 @@
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 1 device 0 bus id: 0000:44:00.0
local rank 3 device 0 bus id: 0000:C4:00.0
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14cfc0000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.291243 s : Grid Layout
Grid : Message : 1.291247 s : Global lattice size : 48 48 48 48
Grid : Message : 1.291254 s : OpenMP threads : 4
Grid : Message : 1.291258 s : MPI tasks : 2 2 2 4
Grid : Message : 1.303822 s : Making s innermost grids
Grid : Message : 1.320388 s : Initialising 4d RNG
Grid : Message : 1.336702 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.336725 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.536145 s : Initialising 5d RNG
Grid : Message : 1.776849 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.776873 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 4.482939 s : Initialised RNGs
Grid : Message : 5.341477 s : Drawing gauge field
Grid : Message : 5.450363 s : Random gauge initialised
Grid : Message : 5.454302 s : Setting up Cshift based reference
Grid : Message : 10.483446 s : *****************************************************************
Grid : Message : 10.483466 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 10.483468 s : *****************************************************************
Grid : Message : 10.483469 s : *****************************************************************
Grid : Message : 10.483470 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 10.483471 s : * Vectorising space-time by 8
Grid : Message : 10.483472 s : * VComplexF size is 64 B
Grid : Message : 10.483473 s : * SINGLE precision
Grid : Message : 10.483474 s : * Using Overlapped Comms/Compute
Grid : Message : 10.483475 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 10.483476 s : *****************************************************************
Grid : Message : 11.249590 s : Called warmup
Grid : Message : 100.539489 s : Called Dw 30000 times in 8.95142e+07 us
Grid : Message : 100.539542 s : mflop/s = 3.75741e+07
Grid : Message : 100.539544 s : mflop/s per rank = 1.17419e+06
Grid : Message : 100.539546 s : mflop/s per node = 4.69676e+06
Grid : Message : 100.539548 s : RF GiB/s (base 2) = 76349.6
Grid : Message : 100.539550 s : mem GiB/s (base 2) = 47718.5
Grid : Message : 100.540119 s : norm diff 1.05759e-13
Grid : Message : 100.549682 s : #### Dhop calls report
Grid : Message : 100.549689 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 100.549693 s : WilsonFermion5D TotalTime /Calls : 1493.26 us
Grid : Message : 100.549695 s : WilsonFermion5D CommTime /Calls : 1049.79 us
Grid : Message : 100.549697 s : WilsonFermion5D FaceTime /Calls : 219.441 us
Grid : Message : 100.549699 s : WilsonFermion5D ComputeTime1/Calls : 2.73708 us
Grid : Message : 100.549701 s : WilsonFermion5D ComputeTime2/Calls : 236.764 us
Grid : Message : 100.549730 s : Average mflops/s per call : 1.77575e+10
Grid : Message : 100.549734 s : Average mflops/s per call per rank : 5.54921e+08
Grid : Message : 100.549736 s : Average mflops/s per call per node : 2.21968e+09
Grid : Message : 100.549738 s : Average mflops/s per call (full) : 3.82224e+07
Grid : Message : 100.549741 s : Average mflops/s per call per rank (full): 1.19445e+06
Grid : Message : 100.549743 s : Average mflops/s per call per node (full): 4.7778e+06
Grid : Message : 100.549745 s : WilsonFermion5D Stencil
Grid : Message : 100.549746 s : WilsonFermion5D StencilEven
Grid : Message : 100.549749 s : WilsonFermion5D StencilOdd
Grid : Message : 100.549750 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 100.549754 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 100.549757 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 109.252306 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 109.252327 s : Called DwDag
Grid : Message : 109.252328 s : norm dag result 12.0422
Grid : Message : 109.255491 s : norm dag ref 12.0422
Grid : Message : 109.258528 s : norm dag diff 7.13141e-14
Grid : Message : 109.270823 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 109.336420 s : src_e0.499992
Grid : Message : 109.408759 s : src_o0.500008
Grid : Message : 109.425239 s : *********************************************************
Grid : Message : 109.425244 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 109.425246 s : * Vectorising space-time by 8
Grid : Message : 109.425247 s : * SINGLE precision
Grid : Message : 109.425249 s : * Using Overlapped Comms/Compute
Grid : Message : 109.425251 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 109.425252 s : *********************************************************
Grid : Message : 157.753385 s : Deo mflop/s = 3.4805e+07
Grid : Message : 157.753416 s : Deo mflop/s per rank 1.08766e+06
Grid : Message : 157.753418 s : Deo mflop/s per node 4.35063e+06
Grid : Message : 157.753421 s : #### Dhop calls report
Grid : Message : 157.753423 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 157.753426 s : WilsonFermion5D TotalTime /Calls : 1610.74 us
Grid : Message : 157.753428 s : WilsonFermion5D CommTime /Calls : 1101.75 us
Grid : Message : 157.753430 s : WilsonFermion5D FaceTime /Calls : 290.394 us
Grid : Message : 157.753432 s : WilsonFermion5D ComputeTime1/Calls : 4.75421 us
Grid : Message : 157.753434 s : WilsonFermion5D ComputeTime2/Calls : 242.784 us
Grid : Message : 157.753456 s : Average mflops/s per call : 1.02081e+10
Grid : Message : 157.753460 s : Average mflops/s per call per rank : 3.19003e+08
Grid : Message : 157.753462 s : Average mflops/s per call per node : 1.27601e+09
Grid : Message : 157.753464 s : Average mflops/s per call (full) : 3.54347e+07
Grid : Message : 157.753467 s : Average mflops/s per call per rank (full): 1.10733e+06
Grid : Message : 157.753469 s : Average mflops/s per call per node (full): 4.42934e+06
Grid : Message : 157.753472 s : WilsonFermion5D Stencil
Grid : Message : 157.753473 s : WilsonFermion5D StencilEven
Grid : Message : 157.753476 s : WilsonFermion5D StencilOdd
Grid : Message : 157.753478 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 157.753479 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 157.753481 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 157.773486 s : r_e6.02129
Grid : Message : 157.775479 s : r_o6.02097
Grid : Message : 157.776926 s : res12.0423
Grid : Message : 157.891008 s : norm diff 0
Grid : Message : 158.245750 s : norm diff even 0
Grid : Message : 158.961270 s : norm diff odd 0
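
A quick cross-check of the headline Dw figure above: 30000 Dw calls in 8.95142e+07 us are reported as 3.75741e+07 mflop/s. That is consistent with the usual Wilson Dhop count of 1320 flops per 4d site per s-slice on a 48^4 lattice with Ls=16; neither constant appears in the log, so both are assumptions here. A minimal sketch of the arithmetic:

awk 'BEGIN {
  flops_per_call = 1320 * 16 * 48^4         # assumed flops/site * assumed Ls * 4d volume
  total_flops    = flops_per_call * 30000   # 30000 Dw calls, as reported above
  printf "mflop/s = %.5e\n", total_flops / 8.95142e7   # flops per microsecond == Mflop/s
}'
# prints mflop/s = 3.75741e+07, matching the log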


@@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]


@@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-8A-1005
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1005
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 48.48.48.48 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUS ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
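
The mpirun command above launches every rank through ./gpu-mpi-wrapper.sh, which is not part of this diff. Judging from the "device=N binding=--interleave=..." lines at the head of the log, the wrapper appears to pin each local rank to one GPU and to an interleaved pair of NUMA nodes before exec-ing the benchmark. A hypothetical sketch of such a wrapper, not the actual script (the local-rank variable and the NUMA mapping are assumptions):

#!/usr/bin/env bash
# illustrative only -- reconstructs the binding pattern reported in the log
lrank="${OMPI_COMM_WORLD_LOCAL_RANK}"         # local rank on the node, 0-3 on a 4-GPU Tursa node
numa="$(( 2 * lrank )),$(( 2 * lrank + 1 ))"  # e.g. rank 2 -> --interleave=4,5
export CUDA_VISIBLE_DEVICES="${lrank}"        # expose one A100 per rank
echo "$(hostname) - ${lrank} device=${lrank} binding=--interleave=${numa}"
exec numactl --interleave="${numa}" "$@"      # run the benchmark under this binding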


@@ -0,0 +1,2 @@
Sat Aug 20 20:22:20 BST 2022
epoch 1661023340


@@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large


@@ -0,0 +1,2 @@
Sat Aug 20 20:37:28 BST 2022
epoch 1661024248

File diff suppressed because one or more lines are too long


@@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffc9ffef000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014ec1aeaa000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014ec1aae2000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014ec1a5f0000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014ec1a2c6000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014ec19fe5000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014ec19d84000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014ec1ae31000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014ec199a4000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014ec18248000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014ec17e78000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014ec17bd7000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014ec17aac000)
libm.so.6 => /lib64/libm.so.6 (0x000014ec1772a000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014ec174f3000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014ec172db000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014ec170bb000)
libc.so.6 => /lib64/libc.so.6 (0x000014ec16cf6000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014ec16af2000)
/lib64/ld-linux-x86-64.so.2 (0x000014ec1acfa000)
librt.so.1 => /lib64/librt.so.1 (0x000014ec168ea000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014ec1ad65000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014ec1ad60000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014ec167de000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014ec165d4000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014ec163d0000)


@@ -0,0 +1,254 @@
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
local rank 1 device 0 bus id: 0000:44:00.0
local rank 2 device 0 bus id: 0000:84:00.0
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14f0c0000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.274727 s : Grid Layout
Grid : Message : 1.274731 s : Global lattice size : 48 48 48 48
Grid : Message : 1.274738 s : OpenMP threads : 4
Grid : Message : 1.274742 s : MPI tasks : 2 2 2 4
Grid : Message : 1.286239 s : Making s innermost grids
Grid : Message : 1.296640 s : Initialising 4d RNG
Grid : Message : 1.313085 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.313104 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.438915 s : Initialising 5d RNG
Grid : Message : 1.670684 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.670710 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 4.517605 s : Initialised RNGs
Grid : Message : 5.165082 s : Drawing gauge field
Grid : Message : 5.272845 s : Random gauge initialised
Grid : Message : 5.287691 s : Setting up Cshift based reference
Grid : Message : 10.356424 s : *****************************************************************
Grid : Message : 10.356441 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 10.356442 s : *****************************************************************
Grid : Message : 10.356443 s : *****************************************************************
Grid : Message : 10.356444 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 10.356445 s : * Vectorising space-time by 8
Grid : Message : 10.356447 s : * VComplexF size is 64 B
Grid : Message : 10.356448 s : * SINGLE precision
Grid : Message : 10.356449 s : * Using Overlapped Comms/Compute
Grid : Message : 10.356450 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 10.356451 s : *****************************************************************
Grid : Message : 10.894078 s : Called warmup
Grid : Message : 99.665065 s : Called Dw 30000 times in 8.8771e+07 us
Grid : Message : 99.665118 s : mflop/s = 3.78887e+07
Grid : Message : 99.665120 s : mflop/s per rank = 1.18402e+06
Grid : Message : 99.665122 s : mflop/s per node = 4.73608e+06
Grid : Message : 99.665124 s : RF GiB/s (base 2) = 76988.9
Grid : Message : 99.665126 s : mem GiB/s (base 2) = 48118
Grid : Message : 99.665697 s : norm diff 1.05759e-13
Grid : Message : 99.675870 s : #### Dhop calls report
Grid : Message : 99.675877 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 99.675880 s : WilsonFermion5D TotalTime /Calls : 1480.69 us
Grid : Message : 99.675882 s : WilsonFermion5D CommTime /Calls : 1038.97 us
Grid : Message : 99.675885 s : WilsonFermion5D FaceTime /Calls : 219.112 us
Grid : Message : 99.675887 s : WilsonFermion5D ComputeTime1/Calls : 2.79427 us
Grid : Message : 99.675889 s : WilsonFermion5D ComputeTime2/Calls : 235.635 us
Grid : Message : 99.675899 s : Average mflops/s per call : 1.78613e+10
Grid : Message : 99.675906 s : Average mflops/s per call per rank : 5.58166e+08
Grid : Message : 99.675909 s : Average mflops/s per call per node : 2.23266e+09
Grid : Message : 99.675911 s : Average mflops/s per call (full) : 3.85468e+07
Grid : Message : 99.675914 s : Average mflops/s per call per rank (full): 1.20459e+06
Grid : Message : 99.675917 s : Average mflops/s per call per node (full): 4.81836e+06
Grid : Message : 99.675920 s : WilsonFermion5D Stencil
Grid : Message : 99.675921 s : WilsonFermion5D StencilEven
Grid : Message : 99.675922 s : WilsonFermion5D StencilOdd
Grid : Message : 99.675924 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 99.675929 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 99.675930 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 108.331185 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 108.331205 s : Called DwDag
Grid : Message : 108.331206 s : norm dag result 12.0422
Grid : Message : 108.333524 s : norm dag ref 12.0422
Grid : Message : 108.336555 s : norm dag diff 7.13141e-14
Grid : Message : 108.347667 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 108.409420 s : src_e0.499992
Grid : Message : 108.483354 s : src_o0.500008
Grid : Message : 108.500169 s : *********************************************************
Grid : Message : 108.500173 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 108.500175 s : * Vectorising space-time by 8
Grid : Message : 108.500176 s : * SINGLE precision
Grid : Message : 108.500177 s : * Using Overlapped Comms/Compute
Grid : Message : 108.500178 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 108.500179 s : *********************************************************
Grid : Message : 156.376888 s : Deo mflop/s = 3.51332e+07
Grid : Message : 156.376919 s : Deo mflop/s per rank 1.09791e+06
Grid : Message : 156.376921 s : Deo mflop/s per node 4.39165e+06
Grid : Message : 156.376924 s : #### Dhop calls report
Grid : Message : 156.376926 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 156.376929 s : WilsonFermion5D TotalTime /Calls : 1595.69 us
Grid : Message : 156.376931 s : WilsonFermion5D CommTime /Calls : 1087.54 us
Grid : Message : 156.376933 s : WilsonFermion5D FaceTime /Calls : 292.342 us
Grid : Message : 156.376935 s : WilsonFermion5D ComputeTime1/Calls : 4.75321 us
Grid : Message : 156.376937 s : WilsonFermion5D ComputeTime2/Calls : 240.424 us
Grid : Message : 156.376963 s : Average mflops/s per call : 1.02133e+10
Grid : Message : 156.376967 s : Average mflops/s per call per rank : 3.19165e+08
Grid : Message : 156.376970 s : Average mflops/s per call per node : 1.27666e+09
Grid : Message : 156.376975 s : Average mflops/s per call (full) : 3.57688e+07
Grid : Message : 156.376979 s : Average mflops/s per call per rank (full): 1.11778e+06
Grid : Message : 156.376984 s : Average mflops/s per call per node (full): 4.4711e+06
Grid : Message : 156.376988 s : WilsonFermion5D Stencil
Grid : Message : 156.376990 s : WilsonFermion5D StencilEven
Grid : Message : 156.376991 s : WilsonFermion5D StencilOdd
Grid : Message : 156.376994 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 156.376996 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 156.376998 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 156.396805 s : r_e6.02129
Grid : Message : 156.398572 s : r_o6.02097
Grid : Message : 156.400042 s : res12.0423
Grid : Message : 156.511360 s : norm diff 0
Grid : Message : 156.646367 s : norm diff even 0
Grid : Message : 156.715079 s : norm diff odd 0


@@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]


@@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-8A-1020
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1020
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 48.48.48.48 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUS ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
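
This job is identical to the previous one apart from the job name and the target graphics clock (freq=1020 instead of 1005). The nvidia-smi -ac calls set the applications clocks pair, memory clock 1215 MHz and graphics clock ${freq} MHz, and the final loop resets them to 1215,1410 afterwards. If needed, the clocks actually applied on a node can be confirmed with a standard query (a usage hint, not part of the job script):

nvidia-smi --query-gpu=clocks.applications.memory,clocks.applications.graphics --format=csv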


@@ -0,0 +1,2 @@
Sat Aug 20 20:34:46 BST 2022
epoch 1661024086


@@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large


@@ -0,0 +1,2 @@
Sat Aug 20 20:43:19 BST 2022
epoch 1661024599

File diff suppressed because one or more lines are too long


@@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffda5149000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x0000147cdc012000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x0000147cdbc4a000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x0000147cdb758000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x0000147cdb42e000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x0000147cdb14d000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x0000147cdaeec000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x0000147cdbf99000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x0000147cdab0c000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x0000147cd93b0000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x0000147cd8fe0000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x0000147cd8d3f000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x0000147cd8c14000)
libm.so.6 => /lib64/libm.so.6 (0x0000147cd8892000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x0000147cd865b000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x0000147cd8443000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x0000147cd8223000)
libc.so.6 => /lib64/libc.so.6 (0x0000147cd7e5e000)
libdl.so.2 => /lib64/libdl.so.2 (0x0000147cd7c5a000)
/lib64/ld-linux-x86-64.so.2 (0x0000147cdbe62000)
librt.so.1 => /lib64/librt.so.1 (0x0000147cd7a52000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x0000147cdbecd000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x0000147cdbec8000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x0000147cd7946000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x0000147cd773c000)
libutil.so.1 => /lib64/libutil.so.1 (0x0000147cd7538000)


@@ -0,0 +1,254 @@
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
local rank 1 device 0 bus id: 0000:44:00.0
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x150480000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.488326 s : Grid Layout
Grid : Message : 1.488330 s : Global lattice size : 48 48 48 48
Grid : Message : 1.488336 s : OpenMP threads : 4
Grid : Message : 1.488339 s : MPI tasks : 2 2 2 4
Grid : Message : 1.502272 s : Making s innermost grids
Grid : Message : 1.518383 s : Initialising 4d RNG
Grid : Message : 1.534282 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.534304 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.779780 s : Initialising 5d RNG
Grid : Message : 2.102130 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 2.102560 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 4.823411 s : Initialised RNGs
Grid : Message : 5.679533 s : Drawing gauge field
Grid : Message : 5.765020 s : Random gauge initialised
Grid : Message : 5.769069 s : Setting up Cshift based reference
Grid : Message : 10.830431 s : *****************************************************************
Grid : Message : 10.830449 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 10.830451 s : *****************************************************************
Grid : Message : 10.830452 s : *****************************************************************
Grid : Message : 10.830453 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 10.830454 s : * Vectorising space-time by 8
Grid : Message : 10.830455 s : * VComplexF size is 64 B
Grid : Message : 10.830456 s : * SINGLE precision
Grid : Message : 10.830457 s : * Using Overlapped Comms/Compute
Grid : Message : 10.830458 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 10.830459 s : *****************************************************************
Grid : Message : 11.332763 s : Called warmup
Grid : Message : 99.153092 s : Called Dw 30000 times in 8.78201e+07 us
Grid : Message : 99.153144 s : mflop/s = 3.82989e+07
Grid : Message : 99.153146 s : mflop/s per rank = 1.19684e+06
Grid : Message : 99.153148 s : mflop/s per node = 4.78736e+06
Grid : Message : 99.153150 s : RF GiB/s (base 2) = 77822.4
Grid : Message : 99.153152 s : mem GiB/s (base 2) = 48639
Grid : Message : 99.153722 s : norm diff 1.05759e-13
Grid : Message : 99.164069 s : #### Dhop calls report
Grid : Message : 99.164076 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 99.164079 s : WilsonFermion5D TotalTime /Calls : 1464.53 us
Grid : Message : 99.164081 s : WilsonFermion5D CommTime /Calls : 1021.47 us
Grid : Message : 99.164083 s : WilsonFermion5D FaceTime /Calls : 219.776 us
Grid : Message : 99.164085 s : WilsonFermion5D ComputeTime1/Calls : 2.8622 us
Grid : Message : 99.164087 s : WilsonFermion5D ComputeTime2/Calls : 235.73 us
Grid : Message : 99.164105 s : Average mflops/s per call : 1.77625e+10
Grid : Message : 99.164108 s : Average mflops/s per call per rank : 5.55077e+08
Grid : Message : 99.164110 s : Average mflops/s per call per node : 2.22031e+09
Grid : Message : 99.164116 s : Average mflops/s per call (full) : 3.89722e+07
Grid : Message : 99.164119 s : Average mflops/s per call per rank (full): 1.21788e+06
Grid : Message : 99.164121 s : Average mflops/s per call per node (full): 4.87153e+06
Grid : Message : 99.164123 s : WilsonFermion5D Stencil
Grid : Message : 99.164126 s : WilsonFermion5D StencilEven
Grid : Message : 99.164127 s : WilsonFermion5D StencilOdd
Grid : Message : 99.164129 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 99.164131 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 99.164132 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 107.831263 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 107.831285 s : Called DwDag
Grid : Message : 107.831286 s : norm dag result 12.0422
Grid : Message : 107.843943 s : norm dag ref 12.0422
Grid : Message : 107.846918 s : norm dag diff 7.13141e-14
Grid : Message : 107.859773 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 107.920803 s : src_e0.499992
Grid : Message : 107.999399 s : src_o0.500008
Grid : Message : 108.158950 s : *********************************************************
Grid : Message : 108.158990 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 108.159010 s : * Vectorising space-time by 8
Grid : Message : 108.159020 s : * SINGLE precision
Grid : Message : 108.159030 s : * Using Overlapped Comms/Compute
Grid : Message : 108.159040 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 108.159050 s : *********************************************************
Grid : Message : 155.299677 s : Deo mflop/s = 3.55746e+07
Grid : Message : 155.299707 s : Deo mflop/s per rank 1.11171e+06
Grid : Message : 155.299709 s : Deo mflop/s per node 4.44682e+06
Grid : Message : 155.299712 s : #### Dhop calls report
Grid : Message : 155.299714 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 155.299716 s : WilsonFermion5D TotalTime /Calls : 1575.96 us
Grid : Message : 155.299718 s : WilsonFermion5D CommTime /Calls : 1069.71 us
Grid : Message : 155.299720 s : WilsonFermion5D FaceTime /Calls : 290.248 us
Grid : Message : 155.299722 s : WilsonFermion5D ComputeTime1/Calls : 5.07466 us
Grid : Message : 155.299724 s : WilsonFermion5D ComputeTime2/Calls : 240.561 us
Grid : Message : 155.299743 s : Average mflops/s per call : 1.0125e+10
Grid : Message : 155.299747 s : Average mflops/s per call per rank : 3.16406e+08
Grid : Message : 155.299749 s : Average mflops/s per call per node : 1.26562e+09
Grid : Message : 155.299751 s : Average mflops/s per call (full) : 3.62167e+07
Grid : Message : 155.299755 s : Average mflops/s per call per rank (full): 1.13177e+06
Grid : Message : 155.299757 s : Average mflops/s per call per node (full): 4.52709e+06
Grid : Message : 155.299760 s : WilsonFermion5D Stencil
Grid : Message : 155.299761 s : WilsonFermion5D StencilEven
Grid : Message : 155.299764 s : WilsonFermion5D StencilOdd
Grid : Message : 155.299765 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 155.299769 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 155.299771 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 155.318224 s : r_e6.02129
Grid : Message : 155.320491 s : r_o6.02097
Grid : Message : 155.321893 s : res12.0423
Grid : Message : 155.423019 s : norm diff 0
Grid : Message : 155.571243 s : norm diff even 0
Grid : Message : 155.646003 s : norm diff odd 0

View File

@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-8A-1035
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1035
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 48.48.48.48 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point, the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
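Note: the batch script above launches the benchmark through ./gpu-mpi-wrapper.sh, which is referenced but not included in this commit. The per-rank lines later in the logs (e.g. "tu-c0r1n72 - 0 device=0 binding=--interleave=0,1") and the AcceleratorCudaInit hint about "a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding" suggest what it does. The following is only a minimal hypothetical sketch of such a wrapper, assuming Open MPI's OMPI_COMM_WORLD_LOCAL_RANK and one GPU plus two NUMA domains per local rank; it is not the actual gpu-mpi-wrapper.sh.

#!/usr/bin/env bash
# Hypothetical sketch of a per-rank GPU/NUMA binding wrapper; the real
# gpu-mpi-wrapper.sh invoked by the batch script above is not part of this commit.
lrank="${OMPI_COMM_WORLD_LOCAL_RANK:?not running under Open MPI}"
export CUDA_VISIBLE_DEVICES="${lrank}"              # one GPU per local rank
numa="$((2 * lrank)),$((2 * lrank + 1))"            # rank 0 -> 0,1 ... rank 3 -> 6,7
# a production wrapper would typically also pick UCX_NET_DEVICES per NUMA domain
echo "$(hostname) - ${lrank} device=${lrank} binding=--interleave=${numa}"
exec numactl --interleave="${numa}" "$@"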

View File

@ -0,0 +1,2 @@
Sat Aug 20 20:40:37 BST 2022
epoch 1661024437

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,2 @@
Sat Aug 20 20:49:07 BST 2022
epoch 1661024947

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffc42f61000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014983c344000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014983bf7c000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014983ba8a000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014983b760000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014983b47f000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014983b21e000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014983c2cb000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014983ae3e000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x00001498396e2000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x0000149839312000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x0000149839071000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x0000149838f46000)
libm.so.6 => /lib64/libm.so.6 (0x0000149838bc4000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014983898d000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x0000149838775000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x0000149838555000)
libc.so.6 => /lib64/libc.so.6 (0x0000149838190000)
libdl.so.2 => /lib64/libdl.so.2 (0x0000149837f8c000)
/lib64/ld-linux-x86-64.so.2 (0x000014983c194000)
librt.so.1 => /lib64/librt.so.1 (0x0000149837d84000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014983c1ff000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014983c1fa000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x0000149837c78000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x0000149837a6e000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014983786a000)

View File

@ -0,0 +1,254 @@
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 3 device 0 bus id: 0000:C4:00.0
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
local rank 1 device 0 bus id: 0000:44:00.0
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x146d20000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.296325 s : Grid Layout
Grid : Message : 1.296329 s : Global lattice size : 48 48 48 48
Grid : Message : 1.296334 s : OpenMP threads : 4
Grid : Message : 1.296336 s : MPI tasks : 2 2 2 4
Grid : Message : 1.308991 s : Making s innermost grids
Grid : Message : 1.325119 s : Initialising 4d RNG
Grid : Message : 1.341243 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.341264 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.572667 s : Initialising 5d RNG
Grid : Message : 1.806486 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.806513 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 4.457170 s : Initialised RNGs
Grid : Message : 5.379782 s : Drawing gauge field
Grid : Message : 5.475278 s : Random gauge initialised
Grid : Message : 5.480285 s : Setting up Cshift based reference
Grid : Message : 10.637374 s : *****************************************************************
Grid : Message : 10.637392 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 10.637393 s : *****************************************************************
Grid : Message : 10.637394 s : *****************************************************************
Grid : Message : 10.637395 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 10.637396 s : * Vectorising space-time by 8
Grid : Message : 10.637397 s : * VComplexF size is 64 B
Grid : Message : 10.637398 s : * SINGLE precision
Grid : Message : 10.637399 s : * Using Overlapped Comms/Compute
Grid : Message : 10.637400 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 10.637401 s : *****************************************************************
Grid : Message : 11.209877 s : Called warmup
Grid : Message : 98.239599 s : Called Dw 30000 times in 8.70295e+07 us
Grid : Message : 98.239671 s : mflop/s = 3.86468e+07
Grid : Message : 98.239673 s : mflop/s per rank = 1.20771e+06
Grid : Message : 98.239675 s : mflop/s per node = 4.83085e+06
Grid : Message : 98.239677 s : RF GiB/s (base 2) = 78529.4
Grid : Message : 98.239679 s : mem GiB/s (base 2) = 49080.9
Grid : Message : 98.240251 s : norm diff 1.05759e-13
Grid : Message : 98.250051 s : #### Dhop calls report
Grid : Message : 98.250058 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 98.250061 s : WilsonFermion5D TotalTime /Calls : 1451.95 us
Grid : Message : 98.250063 s : WilsonFermion5D CommTime /Calls : 1009.67 us
Grid : Message : 98.250065 s : WilsonFermion5D FaceTime /Calls : 219.662 us
Grid : Message : 98.250067 s : WilsonFermion5D ComputeTime1/Calls : 2.86259 us
Grid : Message : 98.250069 s : WilsonFermion5D ComputeTime2/Calls : 235.372 us
Grid : Message : 98.250147 s : Average mflops/s per call : 1.76785e+10
Grid : Message : 98.250151 s : Average mflops/s per call per rank : 5.52452e+08
Grid : Message : 98.250153 s : Average mflops/s per call per node : 2.20981e+09
Grid : Message : 98.250155 s : Average mflops/s per call (full) : 3.93098e+07
Grid : Message : 98.250157 s : Average mflops/s per call per rank (full): 1.22843e+06
Grid : Message : 98.250159 s : Average mflops/s per call per node (full): 4.91373e+06
Grid : Message : 98.250161 s : WilsonFermion5D Stencil
Grid : Message : 98.250162 s : WilsonFermion5D StencilEven
Grid : Message : 98.250163 s : WilsonFermion5D StencilOdd
Grid : Message : 98.250164 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 98.250165 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 98.250166 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 106.979591 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 106.979614 s : Called DwDag
Grid : Message : 106.979615 s : norm dag result 12.0422
Grid : Message : 106.986186 s : norm dag ref 12.0422
Grid : Message : 106.989233 s : norm dag diff 7.13141e-14
Grid : Message : 107.267400 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 107.684690 s : src_e0.499992
Grid : Message : 107.131208 s : src_o0.500008
Grid : Message : 107.147828 s : *********************************************************
Grid : Message : 107.147833 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 107.147834 s : * Vectorising space-time by 8
Grid : Message : 107.147836 s : * SINGLE precision
Grid : Message : 107.147837 s : * Using Overlapped Comms/Compute
Grid : Message : 107.147839 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 107.147840 s : *********************************************************
Grid : Message : 154.983680 s : Deo mflop/s = 3.58274e+07
Grid : Message : 154.984010 s : Deo mflop/s per rank 1.11961e+06
Grid : Message : 154.984030 s : Deo mflop/s per node 4.47843e+06
Grid : Message : 154.984060 s : #### Dhop calls report
Grid : Message : 154.984080 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 154.984100 s : WilsonFermion5D TotalTime /Calls : 1564.83 us
Grid : Message : 154.984120 s : WilsonFermion5D CommTime /Calls : 1057.75 us
Grid : Message : 154.984140 s : WilsonFermion5D FaceTime /Calls : 290.957 us
Grid : Message : 154.984160 s : WilsonFermion5D ComputeTime1/Calls : 5.01747 us
Grid : Message : 154.984180 s : WilsonFermion5D ComputeTime2/Calls : 240.039 us
Grid : Message : 154.984370 s : Average mflops/s per call : 1.01412e+10
Grid : Message : 154.984410 s : Average mflops/s per call per rank : 3.16914e+08
Grid : Message : 154.984430 s : Average mflops/s per call per node : 1.26766e+09
Grid : Message : 154.984450 s : Average mflops/s per call (full) : 3.64742e+07
Grid : Message : 154.984490 s : Average mflops/s per call per rank (full): 1.13982e+06
Grid : Message : 154.984510 s : Average mflops/s per call per node (full): 4.55927e+06
Grid : Message : 154.984530 s : WilsonFermion5D Stencil
Grid : Message : 154.984540 s : WilsonFermion5D StencilEven
Grid : Message : 154.984570 s : WilsonFermion5D StencilOdd
Grid : Message : 154.984590 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 154.984630 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 154.984660 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 154.116284 s : r_e6.02129
Grid : Message : 154.118064 s : r_o6.02097
Grid : Message : 154.119490 s : res12.0423
Grid : Message : 154.225189 s : norm diff 0
Grid : Message : 154.355387 s : norm diff even 0
Grid : Message : 154.439041 s : norm diff odd 0

View File

@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-8A-1050
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1050
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 48.48.48.48 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point, the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
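Note: the script above pins the GPU application clocks with "nvidia-smi -ac 1215,${freq}", captures an "nvidia-smi dmon -o DT" trace into a temporary file, and hands that trace to dmon-to-db.sh, which is not included in this commit. A minimal hypothetical sketch of flattening such a trace into CSV rows tagged with the applied clock limit is shown below; it assumes the default dmon column layout and stands in for, but is not, the actual dmon-to-db.sh.

#!/usr/bin/env bash
# Hypothetical sketch: flatten an `nvidia-smi dmon -o DT` trace into CSV rows
# tagged with the applied clock limit. The real dmon-to-db.sh used above
# (which writes smi-dmon-8A.db) is not part of this commit.
trace="$1"   # file captured via: coproc nvidia-smi dmon -o DT &> "${tmp}"
label="$2"   # e.g. clock_limit_1050
awk -v label="${label}" 'BEGIN { OFS = "," }
  /^#/ { next }                    # dmon repeats header lines prefixed with "#"
  { $1 = $1; print label, $0 }' "${trace}"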

View File

@ -0,0 +1,2 @@
Sat Aug 20 20:46:26 BST 2022
epoch 1661024786

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,2 @@
Sat Aug 20 20:54:56 BST 2022
epoch 1661025296

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007fff1dbee000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x0000146b8752d000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x0000146b87165000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x0000146b86c73000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x0000146b86949000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x0000146b86668000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x0000146b86407000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x0000146b874b4000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x0000146b86027000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x0000146b848cb000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x0000146b844fb000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x0000146b8425a000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x0000146b8412f000)
libm.so.6 => /lib64/libm.so.6 (0x0000146b83dad000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x0000146b83b76000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x0000146b8395e000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x0000146b8373e000)
libc.so.6 => /lib64/libc.so.6 (0x0000146b83379000)
libdl.so.2 => /lib64/libdl.so.2 (0x0000146b83175000)
/lib64/ld-linux-x86-64.so.2 (0x0000146b8737d000)
librt.so.1 => /lib64/librt.so.1 (0x0000146b82f6d000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x0000146b873e8000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x0000146b873e3000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x0000146b82e61000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x0000146b82c57000)
libutil.so.1 => /lib64/libutil.so.1 (0x0000146b82a53000)

View File

@ -0,0 +1,254 @@
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
local rank 1 device 0 bus id: 0000:44:00.0
local rank 2 device 0 bus id: 0000:84:00.0
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x146980000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.339201 s : Grid Layout
Grid : Message : 1.339206 s : Global lattice size : 48 48 48 48
Grid : Message : 1.339210 s : OpenMP threads : 4
Grid : Message : 1.339212 s : MPI tasks : 2 2 2 4
Grid : Message : 1.351308 s : Making s innermost grids
Grid : Message : 1.363723 s : Initialising 4d RNG
Grid : Message : 1.381317 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.381342 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.507048 s : Initialising 5d RNG
Grid : Message : 1.737129 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.737157 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 4.404513 s : Initialised RNGs
Grid : Message : 5.265286 s : Drawing gauge field
Grid : Message : 5.412925 s : Random gauge initialised
Grid : Message : 5.422103 s : Setting up Cshift based reference
Grid : Message : 10.470693 s : *****************************************************************
Grid : Message : 10.470713 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 10.470714 s : *****************************************************************
Grid : Message : 10.470715 s : *****************************************************************
Grid : Message : 10.470716 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 10.470717 s : * Vectorising space-time by 8
Grid : Message : 10.470718 s : * VComplexF size is 64 B
Grid : Message : 10.470719 s : * SINGLE precision
Grid : Message : 10.470720 s : * Using Overlapped Comms/Compute
Grid : Message : 10.470721 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 10.470722 s : *****************************************************************
Grid : Message : 10.906815 s : Called warmup
Grid : Message : 97.838247 s : Called Dw 30000 times in 8.69313e+07 us
Grid : Message : 97.838307 s : mflop/s = 3.86905e+07
Grid : Message : 97.838310 s : mflop/s per rank = 1.20908e+06
Grid : Message : 97.838315 s : mflop/s per node = 4.83631e+06
Grid : Message : 97.838318 s : RF GiB/s (base 2) = 78618.2
Grid : Message : 97.838320 s : mem GiB/s (base 2) = 49136.3
Grid : Message : 97.838895 s : norm diff 1.05759e-13
Grid : Message : 97.848190 s : #### Dhop calls report
Grid : Message : 97.848197 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 97.848205 s : WilsonFermion5D TotalTime /Calls : 1449.98 us
Grid : Message : 97.848209 s : WilsonFermion5D CommTime /Calls : 1010.05 us
Grid : Message : 97.848212 s : WilsonFermion5D FaceTime /Calls : 217.72 us
Grid : Message : 97.848214 s : WilsonFermion5D ComputeTime1/Calls : 2.71694 us
Grid : Message : 97.848216 s : WilsonFermion5D ComputeTime2/Calls : 235.209 us
Grid : Message : 97.848291 s : Average mflops/s per call : 1.77649e+10
Grid : Message : 97.848295 s : Average mflops/s per call per rank : 5.55152e+08
Grid : Message : 97.848297 s : Average mflops/s per call per node : 2.22061e+09
Grid : Message : 97.848300 s : Average mflops/s per call (full) : 3.93633e+07
Grid : Message : 97.848304 s : Average mflops/s per call per rank (full): 1.2301e+06
Grid : Message : 97.848307 s : Average mflops/s per call per node (full): 4.92041e+06
Grid : Message : 97.848310 s : WilsonFermion5D Stencil
Grid : Message : 97.848311 s : WilsonFermion5D StencilEven
Grid : Message : 97.848313 s : WilsonFermion5D StencilOdd
Grid : Message : 97.848316 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 97.848321 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 97.848324 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 106.574196 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 106.574219 s : Called DwDag
Grid : Message : 106.574220 s : norm dag result 12.0422
Grid : Message : 106.576572 s : norm dag ref 12.0422
Grid : Message : 106.579538 s : norm dag diff 7.13141e-14
Grid : Message : 106.590622 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 106.652704 s : src_e0.499992
Grid : Message : 106.718436 s : src_o0.500008
Grid : Message : 106.735418 s : *********************************************************
Grid : Message : 106.735421 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 106.735423 s : * Vectorising space-time by 8
Grid : Message : 106.735424 s : * SINGLE precision
Grid : Message : 106.735425 s : * Using Overlapped Comms/Compute
Grid : Message : 106.735426 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 106.735427 s : *********************************************************
Grid : Message : 153.564697 s : Deo mflop/s = 3.59196e+07
Grid : Message : 153.564730 s : Deo mflop/s per rank 1.12249e+06
Grid : Message : 153.564732 s : Deo mflop/s per node 4.48995e+06
Grid : Message : 153.564735 s : #### Dhop calls report
Grid : Message : 153.564737 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 153.564739 s : WilsonFermion5D TotalTime /Calls : 1560.8 us
Grid : Message : 153.564741 s : WilsonFermion5D CommTime /Calls : 1055.83 us
Grid : Message : 153.564743 s : WilsonFermion5D FaceTime /Calls : 290.091 us
Grid : Message : 153.564745 s : WilsonFermion5D ComputeTime1/Calls : 4.74968 us
Grid : Message : 153.564747 s : WilsonFermion5D ComputeTime2/Calls : 239.675 us
Grid : Message : 153.564770 s : Average mflops/s per call : 1.01665e+10
Grid : Message : 153.564774 s : Average mflops/s per call per rank : 3.17702e+08
Grid : Message : 153.564776 s : Average mflops/s per call per node : 1.27081e+09
Grid : Message : 153.564778 s : Average mflops/s per call (full) : 3.65685e+07
Grid : Message : 153.564782 s : Average mflops/s per call per rank (full): 1.14277e+06
Grid : Message : 153.564785 s : Average mflops/s per call per node (full): 4.57107e+06
Grid : Message : 153.564787 s : WilsonFermion5D Stencil
Grid : Message : 153.564789 s : WilsonFermion5D StencilEven
Grid : Message : 153.564792 s : WilsonFermion5D StencilOdd
Grid : Message : 153.564794 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 153.564795 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 153.564796 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 153.584150 s : r_e6.02129
Grid : Message : 153.586497 s : r_o6.02097
Grid : Message : 153.587837 s : res12.0423
Grid : Message : 153.699087 s : norm diff 0
Grid : Message : 153.830654 s : norm diff even 0
Grid : Message : 153.894387 s : norm diff odd 0

View File

@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-8A-1065
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1065
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 48.48.48.48 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point, the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
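Note: each run in this frequency sweep (power-8A-1035, -1050, -1065, ...) writes its Grid output to job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}/log, and the headline figure reported there is "mflop/s per node". A hypothetical sketch for comparing the sweep from those logs follows; it assumes only the job/<name>.<id>/ layout created by the scripts in this commit.

# Hypothetical sketch: compare runs of the frequency sweep by pulling the first
# "mflop/s per node" figure out of each job log, assuming the job/<name>.<id>/log
# layout created by the scripts in this commit.
for d in job/power-8A-*/; do
  printf '%s ' "${d%/}"
  grep -m1 'mflop/s per node' "${d}log" | awk '{ print $NF }'
done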

View File

@ -0,0 +1,2 @@
Sat Aug 20 20:52:16 BST 2022
epoch 1661025136

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,2 @@
Sat Aug 20 21:00:44 BST 2022
epoch 1661025644

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffe693be000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014a11518b000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014a114dc3000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014a1148d1000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014a1145a7000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014a1142c6000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014a114065000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014a115112000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014a113c85000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014a112529000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014a112159000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014a111eb8000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014a111d8d000)
libm.so.6 => /lib64/libm.so.6 (0x000014a111a0b000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014a1117d4000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014a1115bc000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014a11139c000)
libc.so.6 => /lib64/libc.so.6 (0x000014a110fd7000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014a110dd3000)
/lib64/ld-linux-x86-64.so.2 (0x000014a114fdb000)
librt.so.1 => /lib64/librt.so.1 (0x000014a110bcb000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014a115046000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014a115041000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014a110abf000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014a1108b5000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014a1106b1000)

View File

@ -0,0 +1,254 @@
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
local rank 1 device 0 bus id: 0000:44:00.0
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14ffa0000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.565327 s : Grid Layout
Grid : Message : 1.565331 s : Global lattice size : 48 48 48 48
Grid : Message : 1.565336 s : OpenMP threads : 4
Grid : Message : 1.565338 s : MPI tasks : 2 2 2 4
Grid : Message : 1.576732 s : Making s innermost grids
Grid : Message : 1.591292 s : Initialising 4d RNG
Grid : Message : 1.607386 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.607406 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.733296 s : Initialising 5d RNG
Grid : Message : 1.967786 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.967813 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 4.633889 s : Initialised RNGs
Grid : Message : 5.699185 s : Drawing gauge field
Grid : Message : 5.800869 s : Random gauge initialised
Grid : Message : 5.804955 s : Setting up Cshift based reference
Grid : Message : 10.808527 s : *****************************************************************
Grid : Message : 10.808549 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 10.808551 s : *****************************************************************
Grid : Message : 10.808553 s : *****************************************************************
Grid : Message : 10.808554 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 10.808562 s : * Vectorising space-time by 8
Grid : Message : 10.808564 s : * VComplexF size is 64 B
Grid : Message : 10.808566 s : * SINGLE precision
Grid : Message : 10.808568 s : * Using Overlapped Comms/Compute
Grid : Message : 10.808570 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 10.808572 s : *****************************************************************
Grid : Message : 11.365381 s : Called warmup
Grid : Message : 97.739052 s : Called Dw 30000 times in 8.63735e+07 us
Grid : Message : 97.739106 s : mflop/s = 3.89403e+07
Grid : Message : 97.739108 s : mflop/s per rank = 1.21688e+06
Grid : Message : 97.739110 s : mflop/s per node = 4.86754e+06
Grid : Message : 97.739112 s : RF GiB/s (base 2) = 79125.8
Grid : Message : 97.739114 s : mem GiB/s (base 2) = 49453.6
Grid : Message : 97.739684 s : norm diff 1.05759e-13
Grid : Message : 97.749608 s : #### Dhop calls report
Grid : Message : 97.749616 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 97.749620 s : WilsonFermion5D TotalTime /Calls : 1440.85 us
Grid : Message : 97.749622 s : WilsonFermion5D CommTime /Calls : 1003.73 us
Grid : Message : 97.749624 s : WilsonFermion5D FaceTime /Calls : 215.311 us
Grid : Message : 97.749626 s : WilsonFermion5D ComputeTime1/Calls : 3.08775 us
Grid : Message : 97.749628 s : WilsonFermion5D ComputeTime2/Calls : 234.501 us
Grid : Message : 97.749646 s : Average mflops/s per call : 1.78099e+10
Grid : Message : 97.749650 s : Average mflops/s per call per rank : 5.5656e+08
Grid : Message : 97.749652 s : Average mflops/s per call per node : 2.22624e+09
Grid : Message : 97.749656 s : Average mflops/s per call (full) : 3.96128e+07
Grid : Message : 97.749659 s : Average mflops/s per call per rank (full): 1.2379e+06
Grid : Message : 97.749661 s : Average mflops/s per call per node (full): 4.9516e+06
Grid : Message : 97.749663 s : WilsonFermion5D Stencil
Grid : Message : 97.749665 s : WilsonFermion5D StencilEven
Grid : Message : 97.749668 s : WilsonFermion5D StencilOdd
Grid : Message : 97.749670 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 97.749672 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 97.749675 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 106.415478 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 106.415502 s : Called DwDag
Grid : Message : 106.415503 s : norm dag result 12.0422
Grid : Message : 106.429244 s : norm dag ref 12.0422
Grid : Message : 106.432306 s : norm dag diff 7.13141e-14
Grid : Message : 106.447571 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 106.514419 s : src_e0.499992
Grid : Message : 106.579087 s : src_o0.500008
Grid : Message : 106.595293 s : *********************************************************
Grid : Message : 106.595296 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 106.595297 s : * Vectorising space-time by 8
Grid : Message : 106.595298 s : * SINGLE precision
Grid : Message : 106.595299 s : * Using Overlapped Comms/Compute
Grid : Message : 106.595300 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 106.595301 s : *********************************************************
Grid : Message : 153.296330 s : Deo mflop/s = 3.62241e+07
Grid : Message : 153.296850 s : Deo mflop/s per rank 1.132e+06
Grid : Message : 153.296870 s : Deo mflop/s per node 4.52801e+06
Grid : Message : 153.296900 s : #### Dhop calls report
Grid : Message : 153.296920 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 153.296940 s : WilsonFermion5D TotalTime /Calls : 1547.63 us
Grid : Message : 153.296960 s : WilsonFermion5D CommTime /Calls : 1046.74 us
Grid : Message : 153.296980 s : WilsonFermion5D FaceTime /Calls : 286.366 us
Grid : Message : 153.297020 s : WilsonFermion5D ComputeTime1/Calls : 4.8817 us
Grid : Message : 153.297040 s : WilsonFermion5D ComputeTime2/Calls : 238.437 us
Grid : Message : 153.297230 s : Average mflops/s per call : 1.02065e+10
Grid : Message : 153.297270 s : Average mflops/s per call per rank : 3.18952e+08
Grid : Message : 153.297290 s : Average mflops/s per call per node : 1.27581e+09
Grid : Message : 153.297320 s : Average mflops/s per call (full) : 3.68796e+07
Grid : Message : 153.297340 s : Average mflops/s per call per rank (full): 1.15249e+06
Grid : Message : 153.297360 s : Average mflops/s per call per node (full): 4.60995e+06
Grid : Message : 153.297400 s : WilsonFermion5D Stencil
Grid : Message : 153.297410 s : WilsonFermion5D StencilEven
Grid : Message : 153.297420 s : WilsonFermion5D StencilOdd
Grid : Message : 153.297430 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 153.297460 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 153.297490 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 153.487210 s : r_e6.02129
Grid : Message : 153.503240 s : r_o6.02097
Grid : Message : 153.516860 s : res12.0423
Grid : Message : 153.160184 s : norm diff 0
Grid : Message : 153.295561 s : norm diff even 0
Grid : Message : 153.362804 s : norm diff odd 0
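For the Dhop timing block above, the derived per-rank and per-node rates are simply the total mflop/s divided by the 32 ranks and 8 nodes of this job. A quick sketch (awk, not part of the benchmark output) that reproduces the reported values:

# Sketch: reproduce the derived Dhop rates quoted in the log above.
awk 'BEGIN {
  total = 3.89403e7          # "mflop/s =" line above
  printf "per rank: %.5e (log: 1.21688e+06)\n", total / 32
  printf "per node: %.5e (log: 4.86754e+06)\n", total / 8
}'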

View File

@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-8A-1080
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1080
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 48.48.48.48 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
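The script above pins the GPU application clocks with nvidia-smi -ac 1215,${freq} before the run (a memory,graphics MHz pair) and restores 1215,1410 at the end. A hedged verification sketch, not part of the original script and assuming the standard clocks.applications.* query fields of nvidia-smi, to confirm the limit is in effect on every node of the allocation:

# Sketch: report the application clocks currently applied on each node of the job.
srun --nodes="${SLURM_NNODES}" --ntasks-per-node=1 \
    nvidia-smi --query-gpu=clocks.applications.memory,clocks.applications.graphics \
               --format=csv,noheader
# expected while the limit is active: "1215 MHz, 1080 MHz" on every GPU of every node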

View File

@ -0,0 +1,2 @@
Sat Aug 20 20:58:05 BST 2022
epoch 1661025485

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Sat Aug 20 21:06:32 BST 2022
epoch 1661025992

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x000015143f705000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000015143f685000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000015143f2c3000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000015143edd1000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000015143eaa7000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000015143e7c6000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000015143e565000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000015143f60c000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000015143e185000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000015143ca29000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000015143c659000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000015143c3b8000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000015143c28d000)
libm.so.6 => /lib64/libm.so.6 (0x000015143bf0b000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000015143bcd4000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000015143babc000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000015143b89c000)
libc.so.6 => /lib64/libc.so.6 (0x000015143b4d7000)
libdl.so.2 => /lib64/libdl.so.2 (0x000015143b2d3000)
/lib64/ld-linux-x86-64.so.2 (0x000015143f4db000)
librt.so.1 => /lib64/librt.so.1 (0x000015143b0cb000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000015143f540000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000015143f53b000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000015143afbf000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000015143adb5000)
libutil.so.1 => /lib64/libutil.so.1 (0x000015143abb1000)

View File

@ -0,0 +1,254 @@
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 3 device 0 bus id: 0000:C4:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 1 device 0 bus id: 0000:44:00.0
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14b540000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.329290 s : Grid Layout
Grid : Message : 1.329294 s : Global lattice size : 48 48 48 48
Grid : Message : 1.329301 s : OpenMP threads : 4
Grid : Message : 1.329304 s : MPI tasks : 2 2 2 4
Grid : Message : 1.341902 s : Making s innermost grids
Grid : Message : 1.358246 s : Initialising 4d RNG
Grid : Message : 1.374403 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.374426 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.619784 s : Initialising 5d RNG
Grid : Message : 1.851516 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.851543 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 4.778347 s : Initialised RNGs
Grid : Message : 5.412229 s : Drawing gauge field
Grid : Message : 5.498501 s : Random gauge initialised
Grid : Message : 5.502681 s : Setting up Cshift based reference
Grid : Message : 10.568254 s : *****************************************************************
Grid : Message : 10.568272 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 10.568273 s : *****************************************************************
Grid : Message : 10.568274 s : *****************************************************************
Grid : Message : 10.568275 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 10.568276 s : * Vectorising space-time by 8
Grid : Message : 10.568277 s : * VComplexF size is 64 B
Grid : Message : 10.568278 s : * SINGLE precision
Grid : Message : 10.568279 s : * Using Overlapped Comms/Compute
Grid : Message : 10.568280 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 10.568281 s : *****************************************************************
Grid : Message : 11.130225 s : Called warmup
Grid : Message : 96.935259 s : Called Dw 30000 times in 8.58048e+07 us
Grid : Message : 96.935312 s : mflop/s = 3.91984e+07
Grid : Message : 96.935314 s : mflop/s per rank = 1.22495e+06
Grid : Message : 96.935316 s : mflop/s per node = 4.8998e+06
Grid : Message : 96.935318 s : RF GiB/s (base 2) = 79650.3
Grid : Message : 96.935320 s : mem GiB/s (base 2) = 49781.4
Grid : Message : 96.935891 s : norm diff 1.05759e-13
Grid : Message : 96.945419 s : #### Dhop calls report
Grid : Message : 96.945427 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 96.945430 s : WilsonFermion5D TotalTime /Calls : 1431.16 us
Grid : Message : 96.945432 s : WilsonFermion5D CommTime /Calls : 992.268 us
Grid : Message : 96.945434 s : WilsonFermion5D FaceTime /Calls : 217.135 us
Grid : Message : 96.945436 s : WilsonFermion5D ComputeTime1/Calls : 2.70928 us
Grid : Message : 96.945438 s : WilsonFermion5D ComputeTime2/Calls : 234.653 us
Grid : Message : 96.945454 s : Average mflops/s per call : 1.75489e+10
Grid : Message : 96.945461 s : Average mflops/s per call per rank : 5.48402e+08
Grid : Message : 96.945464 s : Average mflops/s per call per node : 2.19361e+09
Grid : Message : 96.945466 s : Average mflops/s per call (full) : 3.98811e+07
Grid : Message : 96.945469 s : Average mflops/s per call per rank (full): 1.24628e+06
Grid : Message : 96.945471 s : Average mflops/s per call per node (full): 4.98513e+06
Grid : Message : 96.945473 s : WilsonFermion5D Stencil
Grid : Message : 96.945475 s : WilsonFermion5D StencilEven
Grid : Message : 96.945477 s : WilsonFermion5D StencilOdd
Grid : Message : 96.945478 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 96.945479 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 96.945481 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 105.614164 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 105.614186 s : Called DwDag
Grid : Message : 105.614187 s : norm dag result 12.0422
Grid : Message : 105.616525 s : norm dag ref 12.0422
Grid : Message : 105.619641 s : norm dag diff 7.13141e-14
Grid : Message : 105.629645 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 105.695112 s : src_e0.499992
Grid : Message : 105.762145 s : src_o0.500008
Grid : Message : 105.778422 s : *********************************************************
Grid : Message : 105.778425 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 105.778429 s : * Vectorising space-time by 8
Grid : Message : 105.778431 s : * SINGLE precision
Grid : Message : 105.778432 s : * Using Overlapped Comms/Compute
Grid : Message : 105.778434 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 105.778436 s : *********************************************************
Grid : Message : 151.816932 s : Deo mflop/s = 3.6536e+07
Grid : Message : 151.816963 s : Deo mflop/s per rank 1.14175e+06
Grid : Message : 151.816965 s : Deo mflop/s per node 4.567e+06
Grid : Message : 151.816967 s : #### Dhop calls report
Grid : Message : 151.816969 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 151.816971 s : WilsonFermion5D TotalTime /Calls : 1534.48 us
Grid : Message : 151.816973 s : WilsonFermion5D CommTime /Calls : 1033.55 us
Grid : Message : 151.816975 s : WilsonFermion5D FaceTime /Calls : 286.448 us
Grid : Message : 151.816977 s : WilsonFermion5D ComputeTime1/Calls : 4.73748 us
Grid : Message : 151.816979 s : WilsonFermion5D ComputeTime2/Calls : 238.502 us
Grid : Message : 151.817000 s : Average mflops/s per call : 1.02127e+10
Grid : Message : 151.817004 s : Average mflops/s per call per rank : 3.19146e+08
Grid : Message : 151.817006 s : Average mflops/s per call per node : 1.27658e+09
Grid : Message : 151.817008 s : Average mflops/s per call (full) : 3.71958e+07
Grid : Message : 151.817013 s : Average mflops/s per call per rank (full): 1.16237e+06
Grid : Message : 151.817016 s : Average mflops/s per call per node (full): 4.64947e+06
Grid : Message : 151.817018 s : WilsonFermion5D Stencil
Grid : Message : 151.817020 s : WilsonFermion5D StencilEven
Grid : Message : 151.817022 s : WilsonFermion5D StencilOdd
Grid : Message : 151.817025 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 151.817026 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 151.817027 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 151.834960 s : r_e6.02129
Grid : Message : 151.836627 s : r_o6.02097
Grid : Message : 151.837999 s : res12.0423
Grid : Message : 151.953376 s : norm diff 0
Grid : Message : 152.791770 s : norm diff even 0
Grid : Message : 152.145659 s : norm diff odd 0
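The per-rank lines at the top of the log above ("device=N binding=--interleave=a,b") appear to be printed by ./gpu-mpi-wrapper.sh, which the job scripts place between mpirun and the benchmark binary; the wrapper itself is not included in this excerpt. Grid's startup messages state it expects such a wrapper to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES and the NUMA binding. A purely hypothetical sketch of a minimal wrapper consistent with those log lines (every name and mapping below is an assumption, not the actual script):

#!/usr/bin/env bash
# Hypothetical sketch only -- the real ./gpu-mpi-wrapper.sh is not part of this excerpt.
# One GPU per local rank, with NUMA interleaving over the node pair matching the
# "binding=--interleave=2k,2k+1" pattern seen in the logs.
lrank="${OMPI_COMM_WORLD_LOCAL_RANK:-0}"
export CUDA_VISIBLE_DEVICES="${lrank}"
binding="--interleave=$(( 2 * lrank )),$(( 2 * lrank + 1 ))"
# (the real wrapper presumably also selects UCX_NET_DEVICES per rank; omitted here)
echo "$(hostname) - ${lrank} device=${lrank} binding=${binding}"
exec numactl ${binding} "$@"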

View File

@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-8A-1095
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1095
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 48.48.48.48 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
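dmon-to-db.sh, called above with the dmon capture file, the target database smi-dmon-8A.db and the tag clock_limit_${freq}, is not included in this excerpt. As a rough, hypothetical sketch of the kind of conversion such a script could perform, assuming the default nvidia-smi dmon -o DT column layout (date, time, gpu, pwr, gtemp, mtemp, sm, mem, enc, dec, mclk, pclk) and an sqlite3 binary on the path:

#!/usr/bin/env bash
# Hypothetical sketch only -- the real dmon-to-db.sh is not part of this excerpt.
# Arguments mirror the call in the job script: <dmon capture> <sqlite db> <tag>.
dmon_file="$1"; db="$2"; tag="$3"
csv="$(mktemp)"
# drop the '#' header lines, re-delimit with commas and prefix every sample with the tag
awk -v tag="${tag}" '!/^#/ && NF { $1 = $1; print tag "," $0 }' OFS=',' "${dmon_file}" > "${csv}"
sqlite3 "${db}" <<EOF
CREATE TABLE IF NOT EXISTS dmon (tag, date, time, gpu, pwr, gtemp, mtemp, sm, mem, enc, dec, mclk, pclk);
.mode csv
.import ${csv} dmon
EOF
rm -f "${csv}"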

View File

@ -0,0 +1,2 @@
Sat Aug 20 21:03:54 BST 2022
epoch 1661025834

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Sat Aug 20 21:12:17 BST 2022
epoch 1661026337

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffdd3edf000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000015078a487000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000015078a0bf000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x0000150789bcd000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x00001507898a3000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x00001507895c2000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x0000150789361000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000015078a40e000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x0000150788f81000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x0000150787825000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x0000150787455000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x00001507871b4000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x0000150787089000)
libm.so.6 => /lib64/libm.so.6 (0x0000150786d07000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x0000150786ad0000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x00001507868b8000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x0000150786698000)
libc.so.6 => /lib64/libc.so.6 (0x00001507862d3000)
libdl.so.2 => /lib64/libdl.so.2 (0x00001507860cf000)
/lib64/ld-linux-x86-64.so.2 (0x000015078a2d7000)
librt.so.1 => /lib64/librt.so.1 (0x0000150785ec7000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000015078a342000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000015078a33d000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x0000150785dbb000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x0000150785bb1000)
libutil.so.1 => /lib64/libutil.so.1 (0x00001507859ad000)

View File

@ -0,0 +1,254 @@
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
local rank 2 device 0 bus id: 0000:84:00.0
local rank 1 device 0 bus id: 0000:44:00.0
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14bc20000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.459618 s : Grid Layout
Grid : Message : 1.459622 s : Global lattice size : 48 48 48 48
Grid : Message : 1.459626 s : OpenMP threads : 4
Grid : Message : 1.459627 s : MPI tasks : 2 2 2 4
Grid : Message : 1.477216 s : Making s innermost grids
Grid : Message : 1.489637 s : Initialising 4d RNG
Grid : Message : 1.507425 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.507447 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.702250 s : Initialising 5d RNG
Grid : Message : 1.945333 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.945362 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 4.556125 s : Initialised RNGs
Grid : Message : 5.465171 s : Drawing gauge field
Grid : Message : 5.580137 s : Random gauge initialised
Grid : Message : 5.588368 s : Setting up Cshift based reference
Grid : Message : 10.584296 s : *****************************************************************
Grid : Message : 10.584315 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 10.584317 s : *****************************************************************
Grid : Message : 10.584318 s : *****************************************************************
Grid : Message : 10.584319 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 10.584320 s : * Vectorising space-time by 8
Grid : Message : 10.584321 s : * VComplexF size is 64 B
Grid : Message : 10.584322 s : * SINGLE precision
Grid : Message : 10.584323 s : * Using Overlapped Comms/Compute
Grid : Message : 10.584324 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 10.584325 s : *****************************************************************
Grid : Message : 11.140229 s : Called warmup
Grid : Message : 95.842020 s : Called Dw 30000 times in 8.47016e+07 us
Grid : Message : 95.842070 s : mflop/s = 3.9709e+07
Grid : Message : 95.842072 s : mflop/s per rank = 1.24091e+06
Grid : Message : 95.842074 s : mflop/s per node = 4.96362e+06
Grid : Message : 95.842076 s : RF GiB/s (base 2) = 80687.7
Grid : Message : 95.842078 s : mem GiB/s (base 2) = 50429.8
Grid : Message : 95.842652 s : norm diff 1.05759e-13
Grid : Message : 95.852740 s : #### Dhop calls report
Grid : Message : 95.852747 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 95.852750 s : WilsonFermion5D TotalTime /Calls : 1412.93 us
Grid : Message : 95.852752 s : WilsonFermion5D CommTime /Calls : 973.117 us
Grid : Message : 95.852754 s : WilsonFermion5D FaceTime /Calls : 219.979 us
Grid : Message : 95.852756 s : WilsonFermion5D ComputeTime1/Calls : 2.81676 us
Grid : Message : 95.852758 s : WilsonFermion5D ComputeTime2/Calls : 233.384 us
Grid : Message : 95.852786 s : Average mflops/s per call : 1.78331e+10
Grid : Message : 95.852789 s : Average mflops/s per call per rank : 5.57284e+08
Grid : Message : 95.852791 s : Average mflops/s per call per node : 2.22914e+09
Grid : Message : 95.852793 s : Average mflops/s per call (full) : 4.03955e+07
Grid : Message : 95.852795 s : Average mflops/s per call per rank (full): 1.26236e+06
Grid : Message : 95.852797 s : Average mflops/s per call per node (full): 5.04944e+06
Grid : Message : 95.852801 s : WilsonFermion5D Stencil
Grid : Message : 95.852803 s : WilsonFermion5D StencilEven
Grid : Message : 95.852805 s : WilsonFermion5D StencilOdd
Grid : Message : 95.852809 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 95.852811 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 95.852814 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 104.522368 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 104.522390 s : Called DwDag
Grid : Message : 104.522391 s : norm dag result 12.0422
Grid : Message : 104.526025 s : norm dag ref 12.0422
Grid : Message : 104.528967 s : norm dag diff 7.13141e-14
Grid : Message : 104.538859 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 104.603735 s : src_e0.499992
Grid : Message : 104.679776 s : src_o0.500008
Grid : Message : 104.696490 s : *********************************************************
Grid : Message : 104.696495 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 104.696498 s : * Vectorising space-time by 8
Grid : Message : 104.696500 s : * SINGLE precision
Grid : Message : 104.696502 s : * Using Overlapped Comms/Compute
Grid : Message : 104.696504 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 104.696506 s : *********************************************************
Grid : Message : 150.182043 s : Deo mflop/s = 3.69801e+07
Grid : Message : 150.182074 s : Deo mflop/s per rank 1.15563e+06
Grid : Message : 150.182079 s : Deo mflop/s per node 4.62251e+06
Grid : Message : 150.182082 s : #### Dhop calls report
Grid : Message : 150.182085 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 150.182089 s : WilsonFermion5D TotalTime /Calls : 1516 us
Grid : Message : 150.182093 s : WilsonFermion5D CommTime /Calls : 1019.81 us
Grid : Message : 150.182096 s : WilsonFermion5D FaceTime /Calls : 284.065 us
Grid : Message : 150.182100 s : WilsonFermion5D ComputeTime1/Calls : 4.84424 us
Grid : Message : 150.182103 s : WilsonFermion5D ComputeTime2/Calls : 236.64 us
Grid : Message : 150.182126 s : Average mflops/s per call : 1.01614e+10
Grid : Message : 150.182129 s : Average mflops/s per call per rank : 3.17542e+08
Grid : Message : 150.182131 s : Average mflops/s per call per node : 1.27017e+09
Grid : Message : 150.182133 s : Average mflops/s per call (full) : 3.76491e+07
Grid : Message : 150.182135 s : Average mflops/s per call per rank (full): 1.17653e+06
Grid : Message : 150.182139 s : Average mflops/s per call per node (full): 4.70614e+06
Grid : Message : 150.182141 s : WilsonFermion5D Stencil
Grid : Message : 150.182142 s : WilsonFermion5D StencilEven
Grid : Message : 150.182143 s : WilsonFermion5D StencilOdd
Grid : Message : 150.182144 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 150.182145 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 150.182146 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 150.201118 s : r_e6.02129
Grid : Message : 150.202964 s : r_o6.02097
Grid : Message : 150.204336 s : res12.0423
Grid : Message : 150.311999 s : norm diff 0
Grid : Message : 150.449845 s : norm diff even 0
Grid : Message : 150.531177 s : norm diff odd 0

View File

@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-8A-1110
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1110
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 48.48.48.48 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point, the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
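For reference, a minimal sketch, not taken from the job files above, of the rank and local-volume arithmetic implied by the --grid 48.48.48.48 and --mpi 2.2.2.4 flags passed to the benchmark (the variable names are illustrative only):
#!/usr/bin/env bash
# illustrative only: recompute the MPI rank count and the per-rank local volume
# implied by the Grid flags used in the job script above
grid=(48 48 48 48)   # --grid 48.48.48.48 (global lattice)
mpi=(2 2 2 4)        # --mpi 2.2.2.4 (process decomposition)
ranks=1
local_vol=1
for i in 0 1 2 3; do
  ranks=$(( ranks * mpi[i] ))
  local_vol=$(( local_vol * grid[i] / mpi[i] ))
done
echo "ranks=${ranks} (matches --ntasks=32: 8 nodes x 4 tasks per node)"
echo "local 4d volume per rank=${local_vol} sites (24 x 24 x 24 x 12)"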

View File

@ -0,0 +1,2 @@
Sat Aug 20 21:09:40 BST 2022
epoch 1661026180

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Sat Aug 20 21:18:02 BST 2022
epoch 1661026682

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x0000145d92144000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x0000145d920c4000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x0000145d91d02000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x0000145d91810000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x0000145d914e6000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x0000145d91205000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x0000145d90fa4000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x0000145d9204b000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x0000145d90bc4000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x0000145d8f468000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x0000145d8f098000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x0000145d8edf7000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x0000145d8eccc000)
libm.so.6 => /lib64/libm.so.6 (0x0000145d8e94a000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x0000145d8e713000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x0000145d8e4fb000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x0000145d8e2db000)
libc.so.6 => /lib64/libc.so.6 (0x0000145d8df16000)
libdl.so.2 => /lib64/libdl.so.2 (0x0000145d8dd12000)
/lib64/ld-linux-x86-64.so.2 (0x0000145d91f1a000)
librt.so.1 => /lib64/librt.so.1 (0x0000145d8db0a000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x0000145d91f7f000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x0000145d91f7a000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x0000145d8d9fe000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x0000145d8d7f4000)
libutil.so.1 => /lib64/libutil.so.1 (0x0000145d8d5f0000)

View File

@ -0,0 +1,254 @@
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 1 device 0 bus id: 0000:44:00.0
local rank 3 device 0 bus id: 0000:C4:00.0
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x149500000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommitted changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.406631 s : Grid Layout
Grid : Message : 1.406636 s : Global lattice size : 48 48 48 48
Grid : Message : 1.406643 s : OpenMP threads : 4
Grid : Message : 1.406646 s : MPI tasks : 2 2 2 4
Grid : Message : 1.417988 s : Making s innermost grids
Grid : Message : 1.428288 s : Initialising 4d RNG
Grid : Message : 1.444040 s : Initialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.444063 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.681123 s : Initialising 5d RNG
Grid : Message : 1.913861 s : Initialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.913892 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 4.748405 s : Initialised RNGs
Grid : Message : 5.494987 s : Drawing gauge field
Grid : Message : 5.593455 s : Random gauge initialised
Grid : Message : 5.600656 s : Setting up Cshift based reference
Grid : Message : 10.608322 s : *****************************************************************
Grid : Message : 10.608341 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 10.608342 s : *****************************************************************
Grid : Message : 10.608343 s : *****************************************************************
Grid : Message : 10.608344 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 10.608345 s : * Vectorising space-time by 8
Grid : Message : 10.608346 s : * VComplexF size is 64 B
Grid : Message : 10.608347 s : * SINGLE precision
Grid : Message : 10.608348 s : * Using Overlapped Comms/Compute
Grid : Message : 10.608349 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 10.608350 s : *****************************************************************
Grid : Message : 11.168504 s : Called warmup
Grid : Message : 94.732886 s : Called Dw 30000 times in 8.35639e+07 us
Grid : Message : 94.732942 s : mflop/s = 4.02496e+07
Grid : Message : 94.732944 s : mflop/s per rank = 1.2578e+06
Grid : Message : 94.732946 s : mflop/s per node = 5.0312e+06
Grid : Message : 94.732950 s : RF GiB/s (base 2) = 81786.2
Grid : Message : 94.732952 s : mem GiB/s (base 2) = 51116.4
Grid : Message : 94.733524 s : norm diff 1.05759e-13
Grid : Message : 94.743143 s : #### Dhop calls report
Grid : Message : 94.743150 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 94.743153 s : WilsonFermion5D TotalTime /Calls : 1394.23 us
Grid : Message : 94.743155 s : WilsonFermion5D CommTime /Calls : 949.994 us
Grid : Message : 94.743157 s : WilsonFermion5D FaceTime /Calls : 223.263 us
Grid : Message : 94.743159 s : WilsonFermion5D ComputeTime1/Calls : 2.79139 us
Grid : Message : 94.743161 s : WilsonFermion5D ComputeTime2/Calls : 233.597 us
Grid : Message : 94.743253 s : Average mflops/s per call : 1.76088e+10
Grid : Message : 94.743257 s : Average mflops/s per call per rank : 5.50276e+08
Grid : Message : 94.743259 s : Average mflops/s per call per node : 2.20111e+09
Grid : Message : 94.743261 s : Average mflops/s per call (full) : 4.09375e+07
Grid : Message : 94.743263 s : Average mflops/s per call per rank (full): 1.2793e+06
Grid : Message : 94.743266 s : Average mflops/s per call per node (full): 5.11718e+06
Grid : Message : 94.743269 s : WilsonFermion5D Stencil
Grid : Message : 94.743270 s : WilsonFermion5D StencilEven
Grid : Message : 94.743272 s : WilsonFermion5D StencilOdd
Grid : Message : 94.743275 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 94.743276 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 94.743279 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 103.414014 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 103.414035 s : Called DwDag
Grid : Message : 103.414036 s : norm dag result 12.0422
Grid : Message : 103.421887 s : norm dag ref 12.0422
Grid : Message : 103.424914 s : norm dag diff 7.13141e-14
Grid : Message : 103.435780 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 103.497971 s : src_e0.499992
Grid : Message : 103.565487 s : src_o0.500008
Grid : Message : 103.581935 s : *********************************************************
Grid : Message : 103.581939 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 103.581946 s : * Vectorising space-time by 8
Grid : Message : 103.581949 s : * SINGLE precision
Grid : Message : 103.581950 s : * Using Overlapped Comms/Compute
Grid : Message : 103.581954 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 103.581955 s : *********************************************************
Grid : Message : 148.394945 s : Deo mflop/s = 3.75373e+07
Grid : Message : 148.394972 s : Deo mflop/s per rank 1.17304e+06
Grid : Message : 148.394974 s : Deo mflop/s per node 4.69217e+06
Grid : Message : 148.394977 s : #### Dhop calls report
Grid : Message : 148.394979 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 148.394981 s : WilsonFermion5D TotalTime /Calls : 1493.62 us
Grid : Message : 148.394983 s : WilsonFermion5D CommTime /Calls : 994.118 us
Grid : Message : 148.394985 s : WilsonFermion5D FaceTime /Calls : 286.093 us
Grid : Message : 148.394987 s : WilsonFermion5D ComputeTime1/Calls : 4.89217 us
Grid : Message : 148.394989 s : WilsonFermion5D ComputeTime2/Calls : 236.395 us
Grid : Message : 148.395023 s : Average mflops/s per call : 1.01808e+10
Grid : Message : 148.395027 s : Average mflops/s per call per rank : 3.18151e+08
Grid : Message : 148.395030 s : Average mflops/s per call per node : 1.2726e+09
Grid : Message : 148.395032 s : Average mflops/s per call (full) : 3.82132e+07
Grid : Message : 148.395037 s : Average mflops/s per call per rank (full): 1.19416e+06
Grid : Message : 148.395041 s : Average mflops/s per call per node (full): 4.77665e+06
Grid : Message : 148.395044 s : WilsonFermion5D Stencil
Grid : Message : 148.395046 s : WilsonFermion5D StencilEven
Grid : Message : 148.395049 s : WilsonFermion5D StencilOdd
Grid : Message : 148.395051 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 148.395054 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 148.395056 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 148.413425 s : r_e6.02129
Grid : Message : 148.415423 s : r_o6.02097
Grid : Message : 148.416796 s : res12.0423
Grid : Message : 148.517046 s : norm diff 0
Grid : Message : 148.660878 s : norm diff even 0
Grid : Message : 148.726888 s : norm diff odd 0

View File

@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-8A-1125
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1125
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 48.48.48.48 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point, the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################

View File

@ -0,0 +1,2 @@
Sat Aug 20 21:15:27 BST 2022
epoch 1661026527

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Sat Aug 20 21:23:44 BST 2022
epoch 1661027024

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffccdae5000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x00001492ab336000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x00001492aaf6e000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x00001492aaa7c000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x00001492aa752000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x00001492aa471000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x00001492aa210000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x00001492ab2bd000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x00001492a9e30000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x00001492a86d4000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x00001492a8304000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x00001492a8063000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x00001492a7f38000)
libm.so.6 => /lib64/libm.so.6 (0x00001492a7bb6000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x00001492a797f000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x00001492a7767000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x00001492a7547000)
libc.so.6 => /lib64/libc.so.6 (0x00001492a7182000)
libdl.so.2 => /lib64/libdl.so.2 (0x00001492a6f7e000)
/lib64/ld-linux-x86-64.so.2 (0x00001492ab186000)
librt.so.1 => /lib64/librt.so.1 (0x00001492a6d76000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x00001492ab1f1000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x00001492ab1ec000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x00001492a6c6a000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x00001492a6a60000)
libutil.so.1 => /lib64/libutil.so.1 (0x00001492a685c000)

View File

@ -0,0 +1,254 @@
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
AcceleratorCudaInit: ================================================
local rank 1 device 0 bus id: 0000:44:00.0
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14ad80000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommitted changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.503383 s : Grid Layout
Grid : Message : 1.503386 s : Global lattice size : 48 48 48 48
Grid : Message : 1.503391 s : OpenMP threads : 4
Grid : Message : 1.503393 s : MPI tasks : 2 2 2 4
Grid : Message : 1.516942 s : Making s innermost grids
Grid : Message : 1.527235 s : Initialising 4d RNG
Grid : Message : 1.544084 s : Initialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.544106 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.726178 s : Initialising 5d RNG
Grid : Message : 1.956255 s : Initialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.956288 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 4.571228 s : Initialised RNGs
Grid : Message : 5.568412 s : Drawing gauge field
Grid : Message : 5.727363 s : Random gauge initialised
Grid : Message : 5.741177 s : Setting up Cshift based reference
Grid : Message : 10.690571 s : *****************************************************************
Grid : Message : 10.690591 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 10.690592 s : *****************************************************************
Grid : Message : 10.690593 s : *****************************************************************
Grid : Message : 10.690594 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 10.690595 s : * Vectorising space-time by 8
Grid : Message : 10.690596 s : * VComplexF size is 64 B
Grid : Message : 10.690597 s : * SINGLE precision
Grid : Message : 10.690600 s : * Using Overlapped Comms/Compute
Grid : Message : 10.690606 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 10.690607 s : *****************************************************************
Grid : Message : 11.155061 s : Called warmup
Grid : Message : 93.920472 s : Called Dw 30000 times in 8.27652e+07 us
Grid : Message : 93.920540 s : mflop/s = 4.0638e+07
Grid : Message : 93.920542 s : mflop/s per rank = 1.26994e+06
Grid : Message : 93.920544 s : mflop/s per node = 5.07975e+06
Grid : Message : 93.920546 s : RF GiB/s (base 2) = 82575.4
Grid : Message : 93.920548 s : mem GiB/s (base 2) = 51609.6
Grid : Message : 93.921119 s : norm diff 1.05759e-13
Grid : Message : 93.930750 s : #### Dhop calls report
Grid : Message : 93.930758 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 93.930761 s : WilsonFermion5D TotalTime /Calls : 1380.46 us
Grid : Message : 93.930763 s : WilsonFermion5D CommTime /Calls : 935.406 us
Grid : Message : 93.930765 s : WilsonFermion5D FaceTime /Calls : 223.911 us
Grid : Message : 93.930767 s : WilsonFermion5D ComputeTime1/Calls : 2.84526 us
Grid : Message : 93.930769 s : WilsonFermion5D ComputeTime2/Calls : 233.719 us
Grid : Message : 93.930799 s : Average mflops/s per call : 1.7744e+10
Grid : Message : 93.930802 s : Average mflops/s per call per rank : 5.54499e+08
Grid : Message : 93.930804 s : Average mflops/s per call per node : 2.218e+09
Grid : Message : 93.930806 s : Average mflops/s per call (full) : 4.13456e+07
Grid : Message : 93.930810 s : Average mflops/s per call per rank (full): 1.29205e+06
Grid : Message : 93.930812 s : Average mflops/s per call per node (full): 5.1682e+06
Grid : Message : 93.930814 s : WilsonFermion5D Stencil
Grid : Message : 93.930815 s : WilsonFermion5D StencilEven
Grid : Message : 93.930817 s : WilsonFermion5D StencilOdd
Grid : Message : 93.930818 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 93.930819 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 93.930820 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 102.631972 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 102.631993 s : Called DwDag
Grid : Message : 102.631994 s : norm dag result 12.0422
Grid : Message : 102.634228 s : norm dag ref 12.0422
Grid : Message : 102.637138 s : norm dag diff 7.13141e-14
Grid : Message : 102.646956 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 102.713481 s : src_e0.499992
Grid : Message : 102.788976 s : src_o0.500008
Grid : Message : 102.805384 s : *********************************************************
Grid : Message : 102.805388 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 102.805391 s : * Vectorising space-time by 8
Grid : Message : 102.805393 s : * SINGLE precision
Grid : Message : 102.805396 s : * Using Overlapped Comms/Compute
Grid : Message : 102.805398 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 102.805400 s : *********************************************************
Grid : Message : 147.202877 s : Deo mflop/s = 3.78867e+07
Grid : Message : 147.202909 s : Deo mflop/s per rank 1.18396e+06
Grid : Message : 147.202914 s : Deo mflop/s per node 4.73584e+06
Grid : Message : 147.202918 s : #### Dhop calls report
Grid : Message : 147.202920 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 147.202923 s : WilsonFermion5D TotalTime /Calls : 1479.75 us
Grid : Message : 147.202927 s : WilsonFermion5D CommTime /Calls : 975.318 us
Grid : Message : 147.202929 s : WilsonFermion5D FaceTime /Calls : 293.474 us
Grid : Message : 147.202932 s : WilsonFermion5D ComputeTime1/Calls : 4.93714 us
Grid : Message : 147.202935 s : WilsonFermion5D ComputeTime2/Calls : 236.494 us
Grid : Message : 147.202962 s : Average mflops/s per call : 1.02376e+10
Grid : Message : 147.202965 s : Average mflops/s per call per rank : 3.19924e+08
Grid : Message : 147.202967 s : Average mflops/s per call per node : 1.2797e+09
Grid : Message : 147.202969 s : Average mflops/s per call (full) : 3.85713e+07
Grid : Message : 147.202971 s : Average mflops/s per call per rank (full): 1.20535e+06
Grid : Message : 147.202973 s : Average mflops/s per call per node (full): 4.82142e+06
Grid : Message : 147.202976 s : WilsonFermion5D Stencil
Grid : Message : 147.202978 s : WilsonFermion5D StencilEven
Grid : Message : 147.202980 s : WilsonFermion5D StencilOdd
Grid : Message : 147.202982 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 147.202985 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 147.202988 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 147.219874 s : r_e6.02129
Grid : Message : 147.221715 s : r_o6.02097
Grid : Message : 147.223077 s : res12.0423
Grid : Message : 147.332465 s : norm diff 0
Grid : Message : 147.471882 s : norm diff even 0
Grid : Message : 147.546548 s : norm diff odd 0

View File

@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-8A-1140
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1140
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 48.48.48.48 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point, the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################

View File

@ -0,0 +1,2 @@
Sat Aug 20 21:21:10 BST 2022
epoch 1661026870

Some files were not shown because too many files have changed in this diff