Initial commit

2022-09-07 17:31:28 +01:00
commit ade190016a
8502 changed files with 4552538 additions and 0 deletions

@@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large

@@ -0,0 +1,2 @@
Tue Aug 23 02:04:53 BST 2022
epoch 1661216693

File diff suppressed because one or more lines are too long

@@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007fffb8bf1000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014bae8f40000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014bae8b78000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014bae8686000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014bae835c000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014bae807b000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014bae7e1a000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014bae8ec7000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014bae7a3a000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014bae62de000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014bae5f0e000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014bae5c6d000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014bae5b42000)
libm.so.6 => /lib64/libm.so.6 (0x000014bae57c0000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014bae5589000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014bae5371000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014bae5151000)
libc.so.6 => /lib64/libc.so.6 (0x000014bae4d8c000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014bae4b88000)
/lib64/ld-linux-x86-64.so.2 (0x000014bae8d90000)
librt.so.1 => /lib64/librt.so.1 (0x000014bae4980000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014bae8dfb000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014bae8df6000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014bae4874000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014bae466a000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014bae4466000)

@@ -0,0 +1,286 @@
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 2 device 0 bus id: 0000:84:00.0
local rank 3 device 0 bus id: 0000:C4:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 1 device 0 bus id: 0000:44:00.0
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x152200000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.564675 s : Grid Layout
Grid : Message : 1.564678 s : Global lattice size : 64 64 64 256
Grid : Message : 1.564685 s : OpenMP threads : 4
Grid : Message : 1.564688 s : MPI tasks : 2 2 2 8
Grid : Message : 1.603567 s : Making s innermost grids
Grid : Message : 1.688222 s : Initialising 4d RNG
Grid : Message : 1.786208 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.786240 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 3.391014 s : Initialising 5d RNG
Grid : Message : 4.831522 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 4.831565 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 37.988173 s : Initialised RNGs
Grid : Message : 42.605948 s : Drawing gauge field
Grid : Message : 43.494632 s : Random gauge initialised
Grid : Message : 43.507832 s : Setting up Cshift based reference
Grid : Message : 72.502242 s : *****************************************************************
Grid : Message : 72.502275 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 72.502277 s : *****************************************************************
Grid : Message : 72.502278 s : *****************************************************************
Grid : Message : 72.502279 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 72.502280 s : * Vectorising space-time by 8
Grid : Message : 72.502281 s : * VComplexF size is 64 B
Grid : Message : 72.502282 s : * SINGLE precision
Grid : Message : 72.502285 s : * Using Overlapped Comms/Compute
Grid : Message : 72.502286 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 72.502287 s : *****************************************************************
Grid : Message : 74.519440 s : Called warmup
Grid : Message : 363.130822 s : Called Dw 30000 times in 2.8861e+08 us
Grid : Message : 363.130900 s : mflop/s = 1.47327e+08
Grid : Message : 363.130902 s : mflop/s per rank = 2.30199e+06
Grid : Message : 363.130904 s : mflop/s per node = 9.20796e+06
Grid : Message : 363.130906 s : RF GiB/s (base 2) = 299366
Grid : Message : 363.130908 s : mem GiB/s (base 2) = 187104
Grid : Message : 363.134410 s : norm diff 1.06407e-13
Grid : Message : 363.184684 s : #### Dhop calls report
Grid : Message : 363.184691 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 363.184699 s : WilsonFermion5D TotalTime /Calls : 4813.86 us
Grid : Message : 363.184703 s : WilsonFermion5D CommTime /Calls : 3339.84 us
Grid : Message : 363.184707 s : WilsonFermion5D FaceTime /Calls : 483.232 us
Grid : Message : 363.184711 s : WilsonFermion5D ComputeTime1/Calls : 5.02507 us
Grid : Message : 363.184715 s : WilsonFermion5D ComputeTime2/Calls : 1003.93 us
Grid : Message : 363.184807 s : Average mflops/s per call : 1.25444e+11
Grid : Message : 363.184812 s : Average mflops/s per call per rank : 1.96006e+09
Grid : Message : 363.184814 s : Average mflops/s per call per node : 7.84022e+09
Grid : Message : 363.184817 s : Average mflops/s per call (full) : 1.49891e+08
Grid : Message : 363.184821 s : Average mflops/s per call per rank (full): 2.34205e+06
Grid : Message : 363.184824 s : Average mflops/s per call per node (full): 9.36819e+06
Grid : Message : 363.184828 s : WilsonFermion5D Stencil
Grid : Message : 363.184830 s : WilsonFermion5D StencilEven
Grid : Message : 363.184832 s : WilsonFermion5D StencilOdd
Grid : Message : 363.184834 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 363.184836 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 363.184838 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 418.891291 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 418.891319 s : Called DwDag
Grid : Message : 418.891320 s : norm dag result 12.0421
Grid : Message : 418.959967 s : norm dag ref 12.0421
Grid : Message : 418.976362 s : norm dag diff 7.21924e-14
Grid : Message : 419.178360 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 419.420606 s : src_e0.499998
Grid : Message : 419.876063 s : src_o0.500002
Grid : Message : 419.990496 s : *********************************************************
Grid : Message : 419.990499 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 419.990500 s : * Vectorising space-time by 8
Grid : Message : 419.990501 s : * SINGLE precision
Grid : Message : 419.990502 s : * Using Overlapped Comms/Compute
Grid : Message : 419.990503 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 419.990504 s : *********************************************************
Grid : Message : 563.432552 s : Deo mflop/s = 1.48268e+08
Grid : Message : 563.432592 s : Deo mflop/s per rank 2.31669e+06
Grid : Message : 563.432595 s : Deo mflop/s per node 9.26677e+06
Grid : Message : 563.432601 s : #### Dhop calls report
Grid : Message : 563.432603 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 563.432606 s : WilsonFermion5D TotalTime /Calls : 4781.1 us
Grid : Message : 563.432610 s : WilsonFermion5D CommTime /Calls : 3216.26 us
Grid : Message : 563.432613 s : WilsonFermion5D FaceTime /Calls : 613.962 us
Grid : Message : 563.432616 s : WilsonFermion5D ComputeTime1/Calls : 6.00683 us
Grid : Message : 563.432619 s : WilsonFermion5D ComputeTime2/Calls : 975.369 us
Grid : Message : 563.432642 s : Average mflops/s per call : 1.03322e+11
Grid : Message : 563.432645 s : Average mflops/s per call per rank : 1.6144e+09
Grid : Message : 563.432647 s : Average mflops/s per call per node : 6.45761e+09
Grid : Message : 563.432649 s : Average mflops/s per call (full) : 1.50918e+08
Grid : Message : 563.432652 s : Average mflops/s per call per rank (full): 2.35809e+06
Grid : Message : 563.432655 s : Average mflops/s per call per node (full): 9.43237e+06
Grid : Message : 563.432658 s : WilsonFermion5D Stencil
Grid : Message : 563.432661 s : WilsonFermion5D StencilEven
Grid : Message : 563.432664 s : WilsonFermion5D StencilOdd
Grid : Message : 563.432667 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 563.432669 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 563.432671 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 563.507368 s : r_e6.02108
Grid : Message : 563.514171 s : r_o6.02101
Grid : Message : 563.520846 s : res12.0421
Grid : Message : 564.221415 s : norm diff 0
Grid : Message : 564.968341 s : norm diff even 0
Grid : Message : 565.377980 s : norm diff odd 0
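The throughput summary above is internally consistent: with 64 MPI ranks on 16 nodes (4 per node, as reported by SharedMemoryMpi), the per-rank and per-node figures are the total divided by 64 and by 16. A minimal cross-check from a job log, assuming the 'Grid : Message' line format shown above; the file name 'log' is a placeholder for the log written by the job script:

awk -F'= ' '/ mflop\/s = /         {total = $2 + 0}
            /mflop\/s per rank = / {rank  = $2 + 0}
            /mflop\/s per node = / {node  = $2 + 0; exit}
            END {
                printf "total/64 = %.5e (log: %.5e)\n", total / 64, rank
                printf "total/16 = %.5e (log: %.5e)\n", total / 16, node
            }' log

For the run above this prints 2.30198e+06 and 9.20794e+06, matching the reported per-rank and per-node lines up to rounding of the printed total.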

@@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]
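The node list above is stored in Slurm's compressed form and expands to the 16 hosts the job ran on. The script below recovers individual hostnames the same way, via scontrol (standard on any Slurm system):

scontrol show hostnames 'tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]'
# -> tu-c0r1n00, tu-c0r1n03, ..., tu-c0r2n21, one hostname per line, 16 in total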

@@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-16A-1005
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1005
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 64.64.64.256 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point, the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
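The ./gpu-mpi-wrapper.sh invoked by mpirun above is not included in this directory, but the per-rank lines at the top of the log ('tu-c0r1n00 - 1 device=1 binding=--interleave=2,3') and Grid's own hint that a wrapping script should set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and NUMA binding suggest its shape. A minimal sketch, assuming OpenMPI's OMPI_COMM_WORLD_LOCAL_RANK, one GPU per local rank, GPU n attached to NUMA nodes 2n and 2n+1, and one HCA per GPU (the exact device mapping is an assumption):

#!/usr/bin/env bash
# hypothetical reconstruction, not the original wrapper
lrank="${OMPI_COMM_WORLD_LOCAL_RANK}"        # 0..3 with 4 tasks per node
numa="$((2 * lrank)),$((2 * lrank + 1))"     # NUMA nodes closest to this GPU
export CUDA_VISIBLE_DEVICES="${lrank}"       # mask so each rank owns one GPU
export UCX_NET_DEVICES="mlx5_${lrank}:1"     # assumed one-HCA-per-GPU mapping
echo "$(hostname -s) - ${lrank} device=${lrank} binding=--interleave=${numa}"
exec numactl --interleave="${numa}" "$@"

The 'local rank N device 0 bus id' lines later in the log are consistent with this masking: every rank addresses its GPU as device 0, each on a different PCI bus.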

@@ -0,0 +1,2 @@
Tue Aug 23 01:55:18 BST 2022
epoch 1661216118

@@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large

@@ -0,0 +1,2 @@
Tue Aug 23 02:17:27 BST 2022
epoch 1661217447

File diff suppressed because one or more lines are too long

@@ -0,0 +1,26 @@
linux-vdso.so.1 (0x0000149b2d1fc000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x0000149b2d17c000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x0000149b2cdba000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x0000149b2c8c8000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x0000149b2c59e000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x0000149b2c2bd000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x0000149b2c05c000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x0000149b2d103000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x0000149b2bc7c000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x0000149b2a520000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x0000149b2a150000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x0000149b29eaf000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x0000149b29d84000)
libm.so.6 => /lib64/libm.so.6 (0x0000149b29a02000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x0000149b297cb000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x0000149b295b3000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x0000149b29393000)
libc.so.6 => /lib64/libc.so.6 (0x0000149b28fce000)
libdl.so.2 => /lib64/libdl.so.2 (0x0000149b28dca000)
/lib64/ld-linux-x86-64.so.2 (0x0000149b2cfd2000)
librt.so.1 => /lib64/librt.so.1 (0x0000149b28bc2000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x0000149b2d037000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x0000149b2d032000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x0000149b28ab6000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x0000149b288ac000)
libutil.so.1 => /lib64/libutil.so.1 (0x0000149b286a8000)

@@ -0,0 +1,286 @@
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 2 device 0 bus id: 0000:84:00.0
local rank 1 device 0 bus id: 0000:44:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x147620000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.489093 s : Grid Layout
Grid : Message : 1.489098 s : Global lattice size : 64 64 64 256
Grid : Message : 1.489103 s : OpenMP threads : 4
Grid : Message : 1.489104 s : MPI tasks : 2 2 2 8
Grid : Message : 1.528591 s : Making s innermost grids
Grid : Message : 1.575350 s : Initialising 4d RNG
Grid : Message : 1.669100 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.669130 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 3.309927 s : Initialising 5d RNG
Grid : Message : 4.777262 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 4.777300 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 37.984452 s : Initialised RNGs
Grid : Message : 42.778467 s : Drawing gauge field
Grid : Message : 43.470532 s : Random gauge initialised
Grid : Message : 43.486586 s : Setting up Cshift based reference
Grid : Message : 72.617045 s : *****************************************************************
Grid : Message : 72.617078 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 72.617080 s : *****************************************************************
Grid : Message : 72.617081 s : *****************************************************************
Grid : Message : 72.617082 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 72.617083 s : * Vectorising space-time by 8
Grid : Message : 72.617084 s : * VComplexF size is 64 B
Grid : Message : 72.617085 s : * SINGLE precision
Grid : Message : 72.617088 s : * Using Overlapped Comms/Compute
Grid : Message : 72.617089 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 72.617090 s : *****************************************************************
Grid : Message : 74.694006 s : Called warmup
Grid : Message : 361.481577 s : Called Dw 30000 times in 2.86786e+08 us
Grid : Message : 361.481630 s : mflop/s = 1.48264e+08
Grid : Message : 361.481632 s : mflop/s per rank = 2.31663e+06
Grid : Message : 361.481634 s : mflop/s per node = 9.26652e+06
Grid : Message : 361.481636 s : RF GiB/s (base 2) = 301270
Grid : Message : 361.481638 s : mem GiB/s (base 2) = 188294
Grid : Message : 361.485135 s : norm diff 1.06407e-13
Grid : Message : 361.534528 s : #### Dhop calls report
Grid : Message : 361.534535 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 361.534538 s : WilsonFermion5D TotalTime /Calls : 4782.86 us
Grid : Message : 361.534540 s : WilsonFermion5D CommTime /Calls : 3290.85 us
Grid : Message : 361.534542 s : WilsonFermion5D FaceTime /Calls : 486.032 us
Grid : Message : 361.534544 s : WilsonFermion5D ComputeTime1/Calls : 4.80859 us
Grid : Message : 361.534546 s : WilsonFermion5D ComputeTime2/Calls : 1019.4 us
Grid : Message : 361.534640 s : Average mflops/s per call : 1.22988e+11
Grid : Message : 361.534644 s : Average mflops/s per call per rank : 1.92169e+09
Grid : Message : 361.534646 s : Average mflops/s per call per node : 7.68674e+09
Grid : Message : 361.534648 s : Average mflops/s per call (full) : 1.50863e+08
Grid : Message : 361.534650 s : Average mflops/s per call per rank (full): 2.35723e+06
Grid : Message : 361.534652 s : Average mflops/s per call per node (full): 9.42891e+06
Grid : Message : 361.534654 s : WilsonFermion5D Stencil
Grid : Message : 361.534655 s : WilsonFermion5D StencilEven
Grid : Message : 361.534656 s : WilsonFermion5D StencilOdd
Grid : Message : 361.534657 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 361.534658 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 361.534659 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 417.144436 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 417.144464 s : Called DwDag
Grid : Message : 417.144465 s : norm dag result 12.0421
Grid : Message : 417.157328 s : norm dag ref 12.0421
Grid : Message : 417.173632 s : norm dag diff 7.21924e-14
Grid : Message : 417.219769 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 417.654289 s : src_e0.499998
Grid : Message : 418.528240 s : src_o0.500002
Grid : Message : 418.197825 s : *********************************************************
Grid : Message : 418.197830 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 418.197832 s : * Vectorising space-time by 8
Grid : Message : 418.197836 s : * SINGLE precision
Grid : Message : 418.197839 s : * Using Overlapped Comms/Compute
Grid : Message : 418.197841 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 418.197843 s : *********************************************************
Grid : Message : 560.488767 s : Deo mflop/s = 1.49471e+08
Grid : Message : 560.488803 s : Deo mflop/s per rank 2.33548e+06
Grid : Message : 560.488805 s : Deo mflop/s per node 9.34194e+06
Grid : Message : 560.488807 s : #### Dhop calls report
Grid : Message : 560.488809 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 560.488811 s : WilsonFermion5D TotalTime /Calls : 4742.71 us
Grid : Message : 560.488813 s : WilsonFermion5D CommTime /Calls : 3161.44 us
Grid : Message : 560.488815 s : WilsonFermion5D FaceTime /Calls : 612.972 us
Grid : Message : 560.488817 s : WilsonFermion5D ComputeTime1/Calls : 5.88747 us
Grid : Message : 560.488819 s : WilsonFermion5D ComputeTime2/Calls : 993.403 us
Grid : Message : 560.488840 s : Average mflops/s per call : 1.03167e+11
Grid : Message : 560.488844 s : Average mflops/s per call per rank : 1.61199e+09
Grid : Message : 560.488846 s : Average mflops/s per call per node : 6.44794e+09
Grid : Message : 560.488848 s : Average mflops/s per call (full) : 1.5214e+08
Grid : Message : 560.488855 s : Average mflops/s per call per rank (full): 2.37718e+06
Grid : Message : 560.488860 s : Average mflops/s per call per node (full): 9.50872e+06
Grid : Message : 560.488863 s : WilsonFermion5D Stencil
Grid : Message : 560.488865 s : WilsonFermion5D StencilEven
Grid : Message : 560.488868 s : WilsonFermion5D StencilOdd
Grid : Message : 560.488873 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 560.488876 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 560.488878 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 560.561334 s : r_e6.02108
Grid : Message : 560.569153 s : r_o6.02101
Grid : Message : 560.575803 s : res12.0421
Grid : Message : 561.377555 s : norm diff 0
Grid : Message : 562.683230 s : norm diff even 0
Grid : Message : 562.467576 s : norm diff odd 0
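The layout lines near the top of each log tie the run options together: the global lattice 64.64.64.256 split over the 2.2.2.8 MPI decomposition gives every one of the 64 ranks a 32x32x32x32 local volume, consistent with the 'size-loc32' component of the benchmark path. A one-line check:

echo "$((64 / 2)) $((64 / 2)) $((64 / 2)) $((256 / 8))"   # -> 32 32 32 32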

@@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]

@@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-16A-1020
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1020
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 64.64.64.256 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point, the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
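Like the wrapper, dmon-to-db.sh is not part of this commit. The call above suggests it loads the nvidia-smi dmon samples collected in ${tmp} into an SQLite database, tagged with the clock limit. A minimal sketch under that assumption, using dmon's default columns (date and time from '-o DT', then gpu, pwr, gtemp, mtemp, sm, mem, enc, dec, mclk, pclk):

#!/usr/bin/env bash
# hypothetical reconstruction, not the original script
# usage: dmon-to-db.sh <dmon-output> <db-file> <tag>
dmon_out="$1"; db="$2"; tag="$3"
sqlite3 "${db}" 'CREATE TABLE IF NOT EXISTS dmon
    (tag TEXT, date TEXT, time TEXT, gpu INT, pwr REAL, gtemp REAL, mtemp REAL,
     sm REAL, mem REAL, enc REAL, dec REAL, mclk REAL, pclk REAL);'
csv="$(mktemp)"
# drop the '#' header lines, squeeze whitespace into CSV, prepend the tag column
grep -v '^#' "${dmon_out}" \
    | awk -v tag="${tag}" 'BEGIN{OFS=","} {$1 = $1; print tag, $0}' > "${csv}"
sqlite3 -cmd '.mode csv' "${db}" ".import ${csv} dmon"
rm -f "${csv}"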

@@ -0,0 +1,2 @@
Tue Aug 23 02:07:55 BST 2022
epoch 1661216875

@@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large

@@ -0,0 +1,2 @@
Tue Aug 23 02:30:01 BST 2022
epoch 1661218201

File diff suppressed because one or more lines are too long

@@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffd49dba000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x00001478590e3000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x0000147858d1b000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x0000147858829000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x00001478584ff000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014785821e000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x0000147857fbd000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014785906a000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x0000147857bdd000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x0000147856481000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x00001478560b1000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x0000147855e10000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x0000147855ce5000)
libm.so.6 => /lib64/libm.so.6 (0x0000147855963000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014785572c000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x0000147855514000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x00001478552f4000)
libc.so.6 => /lib64/libc.so.6 (0x0000147854f2f000)
libdl.so.2 => /lib64/libdl.so.2 (0x0000147854d2b000)
/lib64/ld-linux-x86-64.so.2 (0x0000147858f33000)
librt.so.1 => /lib64/librt.so.1 (0x0000147854b23000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x0000147858f9e000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x0000147858f99000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x0000147854a17000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014785480d000)
libutil.so.1 => /lib64/libutil.so.1 (0x0000147854609000)

@@ -0,0 +1,286 @@
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 2 device 0 bus id: 0000:84:00.0
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
local rank 1 device 0 bus id: 0000:44:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648 bytes at 0x14e320000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommitted changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.519784 s : Grid Layout
Grid : Message : 1.519788 s : Global lattice size : 64 64 64 256
Grid : Message : 1.519795 s : OpenMP threads : 4
Grid : Message : 1.519797 s : MPI tasks : 2 2 2 8
Grid : Message : 1.559178 s : Making s innermost grids
Grid : Message : 1.633619 s : Initialising 4d RNG
Grid : Message : 1.730190 s : Initialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.730225 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 3.601962 s : Initialising 5d RNG
Grid : Message : 5.128210 s : Initialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 5.128840 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 38.200978 s : Initialised RNGs
Grid : Message : 42.335127 s : Drawing gauge field
Grid : Message : 43.156005 s : Random gauge initialised
Grid : Message : 43.178252 s : Setting up Cshift based reference
Grid : Message : 72.129059 s : *****************************************************************
Grid : Message : 72.129086 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 72.129088 s : *****************************************************************
Grid : Message : 72.129089 s : *****************************************************************
Grid : Message : 72.129090 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 72.129091 s : * Vectorising space-time by 8
Grid : Message : 72.129092 s : * VComplexF size is 64 B
Grid : Message : 72.129093 s : * SINGLE precision
Grid : Message : 72.129096 s : * Using Overlapped Comms/Compute
Grid : Message : 72.129097 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 72.129098 s : *****************************************************************
Grid : Message : 74.239965 s : Called warmup
Grid : Message : 358.784280 s : Called Dw 30000 times in 2.84544e+08 us
Grid : Message : 358.784334 s : mflop/s = 1.49433e+08
Grid : Message : 358.784336 s : mflop/s per rank = 2.33489e+06
Grid : Message : 358.784338 s : mflop/s per node = 9.33955e+06
Grid : Message : 358.784340 s : RF GiB/s (base 2) = 303644
Grid : Message : 358.784342 s : mem GiB/s (base 2) = 189778
Grid : Message : 358.787842 s : norm diff 1.06407e-13
Grid : Message : 358.838249 s : #### Dhop calls report
Grid : Message : 358.838256 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 358.838260 s : WilsonFermion5D TotalTime /Calls : 4746.18 us
Grid : Message : 358.838262 s : WilsonFermion5D CommTime /Calls : 3270.98 us
Grid : Message : 358.838264 s : WilsonFermion5D FaceTime /Calls : 483.168 us
Grid : Message : 358.838266 s : WilsonFermion5D ComputeTime1/Calls : 4.63241 us
Grid : Message : 358.838268 s : WilsonFermion5D ComputeTime2/Calls : 1005.55 us
Grid : Message : 358.838374 s : Average mflops/s per call : 1.24606e+11
Grid : Message : 358.838377 s : Average mflops/s per call per rank : 1.94697e+09
Grid : Message : 358.838379 s : Average mflops/s per call per node : 7.78786e+09
Grid : Message : 358.838381 s : Average mflops/s per call (full) : 1.52028e+08
Grid : Message : 358.838388 s : Average mflops/s per call per rank (full): 2.37544e+06
Grid : Message : 358.838391 s : Average mflops/s per call per node (full): 9.50177e+06
Grid : Message : 358.838394 s : WilsonFermion5D Stencil
Grid : Message : 358.838396 s : WilsonFermion5D StencilEven
Grid : Message : 358.838399 s : WilsonFermion5D StencilOdd
Grid : Message : 358.838401 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 358.838404 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 358.838405 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 414.333761 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 414.333785 s : Called DwDag
Grid : Message : 414.333786 s : norm dag result 12.0421
Grid : Message : 414.361728 s : norm dag ref 12.0421
Grid : Message : 414.377939 s : norm dag diff 7.21924e-14
Grid : Message : 414.419027 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 414.775952 s : src_e0.499998
Grid : Message : 415.196810 s : src_o0.500002
Grid : Message : 415.306157 s : *********************************************************
Grid : Message : 415.306160 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 415.306161 s : * Vectorising space-time by 8
Grid : Message : 415.306162 s : * SINGLE precision
Grid : Message : 415.306163 s : * Using Overlapped Comms/Compute
Grid : Message : 415.306164 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 415.306165 s : *********************************************************
Grid : Message : 556.694065 s : Deo mflop/s = 1.50429e+08
Grid : Message : 556.694096 s : Deo mflop/s per rank 2.35046e+06
Grid : Message : 556.694098 s : Deo mflop/s per node 9.40183e+06
Grid : Message : 556.694101 s : #### Dhop calls report
Grid : Message : 556.694103 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 556.694105 s : WilsonFermion5D TotalTime /Calls : 4712.63 us
Grid : Message : 556.694107 s : WilsonFermion5D CommTime /Calls : 3147.17 us
Grid : Message : 556.694109 s : WilsonFermion5D FaceTime /Calls : 604.892 us
Grid : Message : 556.694111 s : WilsonFermion5D ComputeTime1/Calls : 5.86504 us
Grid : Message : 556.694113 s : WilsonFermion5D ComputeTime2/Calls : 986.801 us
Grid : Message : 556.694138 s : Average mflops/s per call : 1.02948e+11
Grid : Message : 556.694141 s : Average mflops/s per call per rank : 1.60857e+09
Grid : Message : 556.694143 s : Average mflops/s per call per node : 6.43427e+09
Grid : Message : 556.694145 s : Average mflops/s per call (full) : 1.53111e+08
Grid : Message : 556.694147 s : Average mflops/s per call per rank (full): 2.39236e+06
Grid : Message : 556.694151 s : Average mflops/s per call per node (full): 9.56943e+06
Grid : Message : 556.694154 s : WilsonFermion5D Stencil
Grid : Message : 556.694155 s : WilsonFermion5D StencilEven
Grid : Message : 556.694157 s : WilsonFermion5D StencilOdd
Grid : Message : 556.694158 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 556.694159 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 556.694160 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 556.767541 s : r_e6.02108
Grid : Message : 556.776129 s : r_o6.02101
Grid : Message : 556.782720 s : res12.0421
Grid : Message : 557.406532 s : norm diff 0
Grid : Message : 558.192507 s : norm diff even 0
Grid : Message : 558.691926 s : norm diff odd 0
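As a sanity check, the Dw mflop/s figure above can be reconstructed from the call count and wall time, assuming the conventional 1320 floating-point operations per 4d site per s-slice for the Wilson dslash and Ls = 16 (neither number is printed in this log, so both are assumptions):

# back-of-the-envelope check of the reported 1.49433e+08 mflop/s
flops_per_call=$(echo "1320 * 16 * 64^3 * 256" | bc)   # ~1.417e12 flop per Dw call
echo "${flops_per_call} * 30000 / 284544000" | bc      # mflop/s, since the time is in us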

View File

@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-16A-1035
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
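# force the UCX PML and disable the uct/openib BTLs; UCX is restricted to
# RC verbs, shared memory and the CUDA transports so that device buffers
# move over GPUDirect RDMA (UCX_IB_GPU_DIRECT_RDMA=yes); the pointer-type
# cache is disabled (UCX_MEMTYPE_CACHE=n), a common workaround for
# misdetected CUDA allocations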
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
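# NB: the literal 16 below is presumably the node count baked in when this
# script was generated (hence the SC2050/SC2170 suppressions above);
# ompio is only selected for single-node jobs, romio321 otherwise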
if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
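# 'nvidia-smi -ac <mem,graphics>' pins the application clocks: 1215 MHz is
# the A100 memory clock and ${freq} the graphics clock under test; the
# clocks are restored to the 1410 MHz default at the end of the job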
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1035
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
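# --mpi 2.2.2.8 decomposes the 64 ranks into a 2x2x2x8 processor grid over
# the 64.64.64.256 global lattice, i.e. a 32.32.32.32 local volume per GPU
# (the "size-loc32" of this directory); --shm 2048 reserves 2048 MiB of
# device memory for communication buffers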
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 64.64.64.256 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point, the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
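The per-rank 'device=... binding=--interleave=...' lines at the top of each log come from gpu-mpi-wrapper.sh, which the script invokes but which is not part of this commit. A minimal sketch of such a wrapper, assuming OpenMPI's OMPI_COMM_WORLD_LOCAL_RANK, four ranks per node, and one HCA per GPU (the mlx5 naming is an assumption), could look like:

#!/usr/bin/env bash
# hypothetical gpu-mpi-wrapper.sh: pin each local MPI rank to one GPU,
# one NIC and two NUMA nodes, then exec the application under numactl
lrank=${OMPI_COMM_WORLD_LOCAL_RANK:?must be launched through mpirun}
numa="$((2 * lrank)),$((2 * lrank + 1))"
export CUDA_VISIBLE_DEVICES=${lrank}       # each rank then sees its GPU as device 0
export UCX_NET_DEVICES=mlx5_${lrank}:1     # assumed HCA naming, one per GPU
echo "$(hostname) - ${lrank} device=${lrank} binding=--interleave=${numa}"
exec numactl --interleave="${numa}" "$@"

With four ranks per node this reproduces the bindings logged above (e.g. local rank 3 -> GPU 3, NUMA nodes 6,7) and is consistent with the 'local rank N device 0' lines printed by AcceleratorCudaInit.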

View File

@ -0,0 +1,2 @@
Tue Aug 23 02:20:32 BST 2022
epoch 1661217632

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Tue Aug 23 02:42:29 BST 2022
epoch 1661218949

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffe50742000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014bad727e000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014bad6eb6000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014bad69c4000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014bad669a000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014bad63b9000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014bad6158000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014bad7205000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014bad5d78000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014bad461c000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014bad424c000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014bad3fab000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014bad3e80000)
libm.so.6 => /lib64/libm.so.6 (0x000014bad3afe000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014bad38c7000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014bad36af000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014bad348f000)
libc.so.6 => /lib64/libc.so.6 (0x000014bad30ca000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014bad2ec6000)
/lib64/ld-linux-x86-64.so.2 (0x000014bad70ce000)
librt.so.1 => /lib64/librt.so.1 (0x000014bad2cbe000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014bad7139000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014bad7134000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014bad2bb2000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014bad29a8000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014bad27a4000)

View File

@ -0,0 +1,286 @@
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 1 device 0 bus id: 0000:44:00.0
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
local rank 2 device 0 bus id: 0000:84:00.0
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648 bytes at 0x1508a0000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommitted changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.547283 s : Grid Layout
Grid : Message : 1.547287 s : Global lattice size : 64 64 64 256
Grid : Message : 1.547292 s : OpenMP threads : 4
Grid : Message : 1.547294 s : MPI tasks : 2 2 2 8
Grid : Message : 1.587435 s : Making s innermost grids
Grid : Message : 1.643389 s : Initialising 4d RNG
Grid : Message : 1.736475 s : Initialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.736504 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 3.526644 s : Initialising 5d RNG
Grid : Message : 5.205300 s : Initialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 5.211500 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 38.189754 s : Initialised RNGs
Grid : Message : 42.704030 s : Drawing gauge field
Grid : Message : 43.504353 s : Random gauge initialised
Grid : Message : 43.515972 s : Setting up Cshift based reference
Grid : Message : 72.524541 s : *****************************************************************
Grid : Message : 72.524570 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 72.524572 s : *****************************************************************
Grid : Message : 72.524579 s : *****************************************************************
Grid : Message : 72.524581 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 72.524584 s : * Vectorising space-time by 8
Grid : Message : 72.524587 s : * VComplexF size is 64 B
Grid : Message : 72.524589 s : * SINGLE precision
Grid : Message : 72.524592 s : * Using Overlapped Comms/Compute
Grid : Message : 72.524593 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 72.524595 s : *****************************************************************
Grid : Message : 74.535913 s : Called warmup
Grid : Message : 358.201124 s : Called Dw 30000 times in 2.83665e+08 us
Grid : Message : 358.201171 s : mflop/s = 1.49896e+08
Grid : Message : 358.201174 s : mflop/s per rank = 2.34212e+06
Grid : Message : 358.201180 s : mflop/s per node = 9.36848e+06
Grid : Message : 358.201183 s : RF GiB/s (base 2) = 304585
Grid : Message : 358.201186 s : mem GiB/s (base 2) = 190365
Grid : Message : 358.204691 s : norm diff 1.06407e-13
Grid : Message : 358.255404 s : #### Dhop calls report
Grid : Message : 358.255412 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 358.255417 s : WilsonFermion5D TotalTime /Calls : 4731.41 us
Grid : Message : 358.255421 s : WilsonFermion5D CommTime /Calls : 3246.57 us
Grid : Message : 358.255424 s : WilsonFermion5D FaceTime /Calls : 480.383 us
Grid : Message : 358.255427 s : WilsonFermion5D ComputeTime1/Calls : 4.99283 us
Grid : Message : 358.255431 s : WilsonFermion5D ComputeTime2/Calls : 1017.68 us
Grid : Message : 358.255489 s : Average mflops/s per call : 1.25152e+11
Grid : Message : 358.255493 s : Average mflops/s per call per rank : 1.9555e+09
Grid : Message : 358.255495 s : Average mflops/s per call per node : 7.82198e+09
Grid : Message : 358.255498 s : Average mflops/s per call (full) : 1.52503e+08
Grid : Message : 358.255501 s : Average mflops/s per call per rank (full): 2.38286e+06
Grid : Message : 358.255504 s : Average mflops/s per call per node (full): 9.53145e+06
Grid : Message : 358.255508 s : WilsonFermion5D Stencil
Grid : Message : 358.255510 s : WilsonFermion5D StencilEven
Grid : Message : 358.255513 s : WilsonFermion5D StencilOdd
Grid : Message : 358.255516 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 358.255518 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 358.255521 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 413.645728 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 413.645754 s : Called DwDag
Grid : Message : 413.645755 s : norm dag result 12.0421
Grid : Message : 413.669832 s : norm dag ref 12.0421
Grid : Message : 413.685970 s : norm dag diff 7.21924e-14
Grid : Message : 413.728176 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 414.102420 s : src_e0.499998
Grid : Message : 414.504152 s : src_o0.500002
Grid : Message : 414.602493 s : *********************************************************
Grid : Message : 414.602496 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 414.602497 s : * Vectorising space-time by 8
Grid : Message : 414.602498 s : * SINGLE precision
Grid : Message : 414.602499 s : * Using Overlapped Comms/Compute
Grid : Message : 414.602501 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 414.602502 s : *********************************************************
Grid : Message : 555.126889 s : Deo mflop/s = 1.51345e+08
Grid : Message : 555.126928 s : Deo mflop/s per rank 2.36476e+06
Grid : Message : 555.126930 s : Deo mflop/s per node 9.45904e+06
Grid : Message : 555.126933 s : #### Dhop calls report
Grid : Message : 555.126935 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 555.126937 s : WilsonFermion5D TotalTime /Calls : 4683.8 us
Grid : Message : 555.126939 s : WilsonFermion5D CommTime /Calls : 3108.43 us
Grid : Message : 555.126941 s : WilsonFermion5D FaceTime /Calls : 597.314 us
Grid : Message : 555.126943 s : WilsonFermion5D ComputeTime1/Calls : 5.94274 us
Grid : Message : 555.126945 s : WilsonFermion5D ComputeTime2/Calls : 1003.24 us
Grid : Message : 555.126973 s : Average mflops/s per call : 1.04341e+11
Grid : Message : 555.126978 s : Average mflops/s per call per rank : 1.63033e+09
Grid : Message : 555.126984 s : Average mflops/s per call per node : 6.52133e+09
Grid : Message : 555.126989 s : Average mflops/s per call (full) : 1.54053e+08
Grid : Message : 555.126991 s : Average mflops/s per call per rank (full): 2.40708e+06
Grid : Message : 555.126996 s : Average mflops/s per call per node (full): 9.62833e+06
Grid : Message : 555.126998 s : WilsonFermion5D Stencil
Grid : Message : 555.127001 s : WilsonFermion5D StencilEven
Grid : Message : 555.127003 s : WilsonFermion5D StencilOdd
Grid : Message : 555.127006 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 555.127008 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 555.127011 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 555.200537 s : r_e6.02108
Grid : Message : 555.207490 s : r_o6.02101
Grid : Message : 555.214018 s : res12.0421
Grid : Message : 555.847686 s : norm diff 0
Grid : Message : 556.597525 s : norm diff even 0
Grid : Message : 557.711160 s : norm diff odd 0

View File

@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-16A-1050
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1050
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 64.64.64.256 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point, the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
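Like the wrapper, dmon-to-db.sh is not included in this commit. Assuming it loads the 'nvidia-smi dmon -o DT' samples collected above into a SQLite table named after the clock limit (the third argument), a sketch might be:

#!/usr/bin/env bash
# hypothetical dmon-to-db.sh: $1 = dmon log, $2 = sqlite database, $3 = table
log=$1; db=$2; table=$3
sqlite3 "${db}" "CREATE TABLE IF NOT EXISTS ${table}
  (date TEXT, time TEXT, gpu INT, pwr REAL, sm REAL, mclk REAL, pclk REAL);"
# default 'dmon -o DT' columns: date time gpu pwr gtemp mtemp sm mem enc dec mclk pclk
grep -v '^#' "${log}" | awk -v t="${table}" '{
  printf "INSERT INTO %s VALUES(\"%s\",\"%s\",%s,\"%s\",\"%s\",\"%s\",\"%s\");\n",
         t, $1, $2, $3, $4, $7, $11, $12
}' | sqlite3 "${db}"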

View File

@ -0,0 +1,2 @@
Tue Aug 23 02:33:02 BST 2022
epoch 1661218382

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Tue Aug 23 02:54:59 BST 2022
epoch 1661219699

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffec9b2e000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x00001470ad4fc000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x00001470ad134000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x00001470acc42000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x00001470ac918000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x00001470ac637000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x00001470ac3d6000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x00001470ad483000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x00001470abff6000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x00001470aa89a000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x00001470aa4ca000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x00001470aa229000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x00001470aa0fe000)
libm.so.6 => /lib64/libm.so.6 (0x00001470a9d7c000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x00001470a9b45000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x00001470a992d000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x00001470a970d000)
libc.so.6 => /lib64/libc.so.6 (0x00001470a9348000)
libdl.so.2 => /lib64/libdl.so.2 (0x00001470a9144000)
/lib64/ld-linux-x86-64.so.2 (0x00001470ad34c000)
librt.so.1 => /lib64/librt.so.1 (0x00001470a8f3c000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x00001470ad3b7000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x00001470ad3b2000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x00001470a8e30000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x00001470a8c26000)
libutil.so.1 => /lib64/libutil.so.1 (0x00001470a8a22000)

View File

@ -0,0 +1,286 @@
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 1 device 0 bus id: 0000:44:00.0
local rank 2 device 0 bus id: 0000:84:00.0
local rank 3 device 0 bus id: 0000:C4:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648 bytes at 0x14eca0000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommitted changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.674336 s : Grid Layout
Grid : Message : 1.674340 s : Global lattice size : 64 64 64 256
Grid : Message : 1.674346 s : OpenMP threads : 4
Grid : Message : 1.674349 s : MPI tasks : 2 2 2 8
Grid : Message : 1.716560 s : Making s innermost grids
Grid : Message : 1.771902 s : Initialising 4d RNG
Grid : Message : 1.868575 s : Initialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.868615 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 3.637285 s : Initialising 5d RNG
Grid : Message : 5.332800 s : Initialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 5.333390 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 39.118671 s : Initialised RNGs
Grid : Message : 42.869064 s : Drawing gauge field
Grid : Message : 43.712953 s : Random gauge initialised
Grid : Message : 43.724865 s : Setting up Cshift based reference
Grid : Message : 72.822608 s : *****************************************************************
Grid : Message : 72.822634 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 72.822636 s : *****************************************************************
Grid : Message : 72.822637 s : *****************************************************************
Grid : Message : 72.822638 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 72.822639 s : * Vectorising space-time by 8
Grid : Message : 72.822640 s : * VComplexF size is 64 B
Grid : Message : 72.822641 s : * SINGLE precision
Grid : Message : 72.822644 s : * Using Overlapped Comms/Compute
Grid : Message : 72.822645 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 72.822646 s : *****************************************************************
Grid : Message : 74.927617 s : Called warmup
Grid : Message : 359.291971 s : Called Dw 30000 times in 2.84363e+08 us
Grid : Message : 359.292022 s : mflop/s = 1.49528e+08
Grid : Message : 359.292025 s : mflop/s per rank = 2.33637e+06
Grid : Message : 359.292032 s : mflop/s per node = 9.34548e+06
Grid : Message : 359.292035 s : RF GiB/s (base 2) = 303837
Grid : Message : 359.292038 s : mem GiB/s (base 2) = 189898
Grid : Message : 359.295542 s : norm diff 1.06407e-13
Grid : Message : 359.345144 s : #### Dhop calls report
Grid : Message : 359.345152 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 359.345158 s : WilsonFermion5D TotalTime /Calls : 4743.11 us
Grid : Message : 359.345162 s : WilsonFermion5D CommTime /Calls : 3276.42 us
Grid : Message : 359.345165 s : WilsonFermion5D FaceTime /Calls : 480.328 us
Grid : Message : 359.345167 s : WilsonFermion5D ComputeTime1/Calls : 4.57995 us
Grid : Message : 359.345169 s : WilsonFermion5D ComputeTime2/Calls : 999.257 us
Grid : Message : 359.345237 s : Average mflops/s per call : 1.2427e+11
Grid : Message : 359.345241 s : Average mflops/s per call per rank : 1.94172e+09
Grid : Message : 359.345244 s : Average mflops/s per call per node : 7.7669e+09
Grid : Message : 359.345247 s : Average mflops/s per call (full) : 1.52127e+08
Grid : Message : 359.345251 s : Average mflops/s per call per rank (full): 2.37698e+06
Grid : Message : 359.345253 s : Average mflops/s per call per node (full): 9.50794e+06
Grid : Message : 359.345256 s : WilsonFermion5D Stencil
Grid : Message : 359.345258 s : WilsonFermion5D StencilEven
Grid : Message : 359.345260 s : WilsonFermion5D StencilOdd
Grid : Message : 359.345262 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 359.345264 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 359.345266 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 414.850960 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 414.850985 s : Called DwDag
Grid : Message : 414.850986 s : norm dag result 12.0421
Grid : Message : 414.876032 s : norm dag ref 12.0421
Grid : Message : 414.892131 s : norm dag diff 7.21924e-14
Grid : Message : 414.932833 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 415.336415 s : src_e0.499998
Grid : Message : 415.784915 s : src_o0.500002
Grid : Message : 415.911800 s : *********************************************************
Grid : Message : 415.911803 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 415.911804 s : * Vectorising space-time by 8
Grid : Message : 415.911805 s : * SINGLE precision
Grid : Message : 415.911806 s : * Using Overlapped Comms/Compute
Grid : Message : 415.911807 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 415.911808 s : *********************************************************
Grid : Message : 556.278711 s : Deo mflop/s = 1.51519e+08
Grid : Message : 556.278742 s : Deo mflop/s per rank 2.36748e+06
Grid : Message : 556.278745 s : Deo mflop/s per node 9.46991e+06
Grid : Message : 556.278751 s : #### Dhop calls report
Grid : Message : 556.278753 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 556.278757 s : WilsonFermion5D TotalTime /Calls : 4678.6 us
Grid : Message : 556.278761 s : WilsonFermion5D CommTime /Calls : 3123.43 us
Grid : Message : 556.278764 s : WilsonFermion5D FaceTime /Calls : 594.933 us
Grid : Message : 556.278768 s : WilsonFermion5D ComputeTime1/Calls : 6.01337 us
Grid : Message : 556.278771 s : WilsonFermion5D ComputeTime2/Calls : 985.509 us
Grid : Message : 556.278793 s : Average mflops/s per call : 1.02827e+11
Grid : Message : 556.278797 s : Average mflops/s per call per rank : 1.60668e+09
Grid : Message : 556.278799 s : Average mflops/s per call per node : 6.42671e+09
Grid : Message : 556.278801 s : Average mflops/s per call (full) : 1.54225e+08
Grid : Message : 556.278805 s : Average mflops/s per call per rank (full): 2.40976e+06
Grid : Message : 556.278809 s : Average mflops/s per call per node (full): 9.63904e+06
Grid : Message : 556.278812 s : WilsonFermion5D Stencil
Grid : Message : 556.278815 s : WilsonFermion5D StencilEven
Grid : Message : 556.278817 s : WilsonFermion5D StencilOdd
Grid : Message : 556.278820 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 556.278823 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 556.278825 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 556.353644 s : r_e6.02108
Grid : Message : 556.361051 s : r_o6.02101
Grid : Message : 556.367550 s : res12.0421
Grid : Message : 557.632190 s : norm diff 0
Grid : Message : 557.851410 s : norm diff even 0
Grid : Message : 558.237043 s : norm diff odd 0
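As a quick cross-check of the throughput figures above (a minimal sketch, not part of the original job: it assumes the usual 1320 flops per 4d site per s-slice and Ls=16, which is consistent with the numbers reported in this log):

awk 'BEGIN {
  flops_per_call = 1320 * 16 * 64*64*64*256    # one Dw call on the 64.64.64.256 grid, Ls=16
  mflops = flops_per_call * 30000 / 2.84363e8  # 30000 calls in 2.84363e+08 us; flop/us == Mflop/s
  printf "mflop/s          = %.5e\n", mflops   # ~1.49528e+08, matching the log
  printf "mflop/s per rank = %.5e\n", mflops/64  # 64 MPI ranks
  printf "mflop/s per node = %.5e\n", mflops/16  # 16 nodes
}'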

@@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]
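This is a SLURM bracketed host list covering the 16 nodes used by the jobs below; it can be expanded with, e.g.:

scontrol show hostnames 'tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]'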

@@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-16A-1065
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
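# (the literal 16 in the test above is presumably the node count substituted in
#  when this script is generated from a template, hence the constant condition
#  and the SC2050/SC2170 shellcheck disables at the top)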
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1065
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
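# nvidia-smi -ac takes <memory,graphics> application clocks in MHz, so this pins
# the graphics clock at ${freq} MHz with the memory clock fixed at 1215 MHz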
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
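# dmon's -o DT flag prepends date and time columns to each sample, presumably
# what dmon-to-db.sh uses for timestamps when building the database below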
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 64.64.64.256 \
--shm 2048 &> "${job_info_dir}/log"
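# the 2.2.2.8 MPI decomposition gives each of the 2*2*2*8 = 64 ranks a local
# 32.32.32.32 volume of the 64.64.64.256 grid, matching "size-loc32" in the paths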
# if we reach this point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################

@@ -0,0 +1,2 @@
Tue Aug 23 02:45:31 BST 2022
epoch 1661219131

@@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large

@@ -0,0 +1,2 @@
Tue Aug 23 03:07:25 BST 2022
epoch 1661220445

File diff suppressed because one or more lines are too long

@@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffe74893000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014d458e71000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014d458aa9000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014d4585b7000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014d45828d000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014d457fac000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014d457d4b000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014d458df8000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014d45796b000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014d45620f000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014d455e3f000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014d455b9e000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014d455a73000)
libm.so.6 => /lib64/libm.so.6 (0x000014d4556f1000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014d4554ba000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014d4552a2000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014d455082000)
libc.so.6 => /lib64/libc.so.6 (0x000014d454cbd000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014d454ab9000)
/lib64/ld-linux-x86-64.so.2 (0x000014d458cc1000)
librt.so.1 => /lib64/librt.so.1 (0x000014d4548b1000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014d458d2c000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014d458d27000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014d4547a5000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014d45459b000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014d454397000)

@@ -0,0 +1,286 @@
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 1 device 0 bus id: 0000:44:00.0
local rank 3 device 0 bus id: 0000:C4:00.0
local rank 2 device 0 bus id: 0000:84:00.0
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14e280000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommitted changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.460224 s : Grid Layout
Grid : Message : 1.460230 s : Global lattice size : 64 64 64 256
Grid : Message : 1.460237 s : OpenMP threads : 4
Grid : Message : 1.460239 s : MPI tasks : 2 2 2 8
Grid : Message : 1.501038 s : Making s innermost grids
Grid : Message : 1.548447 s : Initialising 4d RNG
Grid : Message : 1.641350 s : Initialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.641379 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 3.415745 s : Initialising 5d RNG
Grid : Message : 4.858093 s : Initialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 4.858127 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 38.377226 s : Initialised RNGs
Grid : Message : 42.241334 s : Drawing gauge field
Grid : Message : 43.676130 s : Random gauge initialised
Grid : Message : 43.793380 s : Setting up Cshift based reference
Grid : Message : 72.567410 s : *****************************************************************
Grid : Message : 72.567680 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 72.567700 s : *****************************************************************
Grid : Message : 72.567710 s : *****************************************************************
Grid : Message : 72.567720 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 72.567730 s : * Vectorising space-time by 8
Grid : Message : 72.567740 s : * VComplexF size is 64 B
Grid : Message : 72.567750 s : * SINGLE precision
Grid : Message : 72.567780 s : * Using Overlapped Comms/Compute
Grid : Message : 72.567790 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 72.567800 s : *****************************************************************
Grid : Message : 74.181282 s : Called warmup
Grid : Message : 356.575277 s : Called Dw 30000 times in 2.82393e+08 us
Grid : Message : 356.575346 s : mflop/s = 1.50571e+08
Grid : Message : 356.575348 s : mflop/s per rank = 2.35267e+06
Grid : Message : 356.575350 s : mflop/s per node = 9.41067e+06
Grid : Message : 356.575352 s : RF GiB/s (base 2) = 305956
Grid : Message : 356.575354 s : mem GiB/s (base 2) = 191223
Grid : Message : 356.578856 s : norm diff 1.06407e-13
Grid : Message : 356.627867 s : #### Dhop calls report
Grid : Message : 356.627875 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 356.627879 s : WilsonFermion5D TotalTime /Calls : 4710.82 us
Grid : Message : 356.627881 s : WilsonFermion5D CommTime /Calls : 3226.37 us
Grid : Message : 356.627883 s : WilsonFermion5D FaceTime /Calls : 477.831 us
Grid : Message : 356.627885 s : WilsonFermion5D ComputeTime1/Calls : 5.85034 us
Grid : Message : 356.627887 s : WilsonFermion5D ComputeTime2/Calls : 1019.91 us
Grid : Message : 356.627914 s : Average mflops/s per call : 1.229e+11
Grid : Message : 356.627918 s : Average mflops/s per call per rank : 1.92031e+09
Grid : Message : 356.627920 s : Average mflops/s per call per node : 7.68122e+09
Grid : Message : 356.627922 s : Average mflops/s per call (full) : 1.5317e+08
Grid : Message : 356.627927 s : Average mflops/s per call per rank (full): 2.39328e+06
Grid : Message : 356.627930 s : Average mflops/s per call per node (full): 9.5731e+06
Grid : Message : 356.627932 s : WilsonFermion5D Stencil
Grid : Message : 356.627934 s : WilsonFermion5D StencilEven
Grid : Message : 356.627935 s : WilsonFermion5D StencilOdd
Grid : Message : 356.627936 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 356.627938 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 356.627940 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 411.993212 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 411.993238 s : Called DwDag
Grid : Message : 411.993240 s : norm dag result 12.0421
Grid : Message : 412.893700 s : norm dag ref 12.0421
Grid : Message : 412.249000 s : norm dag diff 7.21924e-14
Grid : Message : 412.655740 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 412.459794 s : src_e0.499998
Grid : Message : 412.869558 s : src_o0.500002
Grid : Message : 412.998877 s : *********************************************************
Grid : Message : 412.998881 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 412.998883 s : * Vectorising space-time by 8
Grid : Message : 412.998887 s : * SINGLE precision
Grid : Message : 412.998889 s : * Using Overlapped Comms/Compute
Grid : Message : 412.998892 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 412.998894 s : *********************************************************
Grid : Message : 552.599242 s : Deo mflop/s = 1.52349e+08
Grid : Message : 552.599275 s : Deo mflop/s per rank 2.38045e+06
Grid : Message : 552.599277 s : Deo mflop/s per node 9.52178e+06
Grid : Message : 552.599280 s : #### Dhop calls report
Grid : Message : 552.599282 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 552.599284 s : WilsonFermion5D TotalTime /Calls : 4653.08 us
Grid : Message : 552.599286 s : WilsonFermion5D CommTime /Calls : 3098.34 us
Grid : Message : 552.599288 s : WilsonFermion5D FaceTime /Calls : 594.442 us
Grid : Message : 552.599290 s : WilsonFermion5D ComputeTime1/Calls : 5.8861 us
Grid : Message : 552.599292 s : WilsonFermion5D ComputeTime2/Calls : 985.791 us
Grid : Message : 552.599320 s : Average mflops/s per call : 1.0387e+11
Grid : Message : 552.599324 s : Average mflops/s per call per rank : 1.62297e+09
Grid : Message : 552.599327 s : Average mflops/s per call per node : 6.49188e+09
Grid : Message : 552.599330 s : Average mflops/s per call (full) : 1.5507e+08
Grid : Message : 552.599333 s : Average mflops/s per call per rank (full): 2.42297e+06
Grid : Message : 552.599336 s : Average mflops/s per call per node (full): 9.69189e+06
Grid : Message : 552.599340 s : WilsonFermion5D Stencil
Grid : Message : 552.599341 s : WilsonFermion5D StencilEven
Grid : Message : 552.599344 s : WilsonFermion5D StencilOdd
Grid : Message : 552.599345 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 552.599347 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 552.599350 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 552.670528 s : r_e6.02108
Grid : Message : 552.680935 s : r_o6.02101
Grid : Message : 552.687380 s : res12.0421
Grid : Message : 553.325113 s : norm diff 0
Grid : Message : 554.100564 s : norm diff even 0
Grid : Message : 554.561181 s : norm diff odd 0

@@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]

@@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-16A-1080
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1080
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 64.64.64.256 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################

@@ -0,0 +1,2 @@
Tue Aug 23 02:58:01 BST 2022
epoch 1661219881

@@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large

@@ -0,0 +1,2 @@
Tue Aug 23 03:19:51 BST 2022
epoch 1661221191

File diff suppressed because one or more lines are too long

@@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffcebffa000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014f17865e000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014f178296000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014f177da4000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014f177a7a000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014f177799000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014f177538000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014f1785e5000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014f177158000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014f1759fc000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014f17562c000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014f17538b000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014f175260000)
libm.so.6 => /lib64/libm.so.6 (0x000014f174ede000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014f174ca7000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014f174a8f000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014f17486f000)
libc.so.6 => /lib64/libc.so.6 (0x000014f1744aa000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014f1742a6000)
/lib64/ld-linux-x86-64.so.2 (0x000014f1784ae000)
librt.so.1 => /lib64/librt.so.1 (0x000014f17409e000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014f178519000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014f178514000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014f173f92000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014f173d88000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014f173b84000)

@@ -0,0 +1,286 @@
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
local rank 1 device 0 bus id: 0000:44:00.0
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x148ce0000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommitted changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.503426 s : Grid Layout
Grid : Message : 1.503430 s : Global lattice size : 64 64 64 256
Grid : Message : 1.503436 s : OpenMP threads : 4
Grid : Message : 1.503438 s : MPI tasks : 2 2 2 8
Grid : Message : 1.545288 s : Making s innermost grids
Grid : Message : 1.600811 s : Initialising 4d RNG
Grid : Message : 1.693389 s : Initialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.693424 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 3.322964 s : Initialising 5d RNG
Grid : Message : 4.759541 s : Initialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 4.759582 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 37.877117 s : Initialised RNGs
Grid : Message : 42.920224 s : Drawing gauge field
Grid : Message : 43.530915 s : Random gauge initialised
Grid : Message : 43.542723 s : Setting up Cshift based reference
Grid : Message : 72.809099 s : *****************************************************************
Grid : Message : 72.809129 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 72.809131 s : *****************************************************************
Grid : Message : 72.809132 s : *****************************************************************
Grid : Message : 72.809133 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 72.809134 s : * Vectorising space-time by 8
Grid : Message : 72.809135 s : * VComplexF size is 64 B
Grid : Message : 72.809136 s : * SINGLE precision
Grid : Message : 72.809139 s : * Using Overlapped Comms/Compute
Grid : Message : 72.809140 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 72.809141 s : *****************************************************************
Grid : Message : 74.811618 s : Called warmup
Grid : Message : 356.437280 s : Called Dw 30000 times in 2.81231e+08 us
Grid : Message : 356.437910 s : mflop/s = 1.51193e+08
Grid : Message : 356.437940 s : mflop/s per rank = 2.36239e+06
Grid : Message : 356.437990 s : mflop/s per node = 9.44956e+06
Grid : Message : 356.438020 s : RF GiB/s (base 2) = 307220
Grid : Message : 356.438050 s : mem GiB/s (base 2) = 192013
Grid : Message : 356.473120 s : norm diff 1.06407e-13
Grid : Message : 356.964660 s : #### Dhop calls report
Grid : Message : 356.964730 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 356.964770 s : WilsonFermion5D TotalTime /Calls : 4690.91 us
Grid : Message : 356.964790 s : WilsonFermion5D CommTime /Calls : 3217.42 us
Grid : Message : 356.964810 s : WilsonFermion5D FaceTime /Calls : 478.163 us
Grid : Message : 356.964830 s : WilsonFermion5D ComputeTime1/Calls : 4.80376 us
Grid : Message : 356.964850 s : WilsonFermion5D ComputeTime2/Calls : 1009.04 us
Grid : Message : 356.965410 s : Average mflops/s per call : 1.24517e+11
Grid : Message : 356.965450 s : Average mflops/s per call per rank : 1.94558e+09
Grid : Message : 356.965470 s : Average mflops/s per call per node : 7.78233e+09
Grid : Message : 356.965490 s : Average mflops/s per call (full) : 1.5382e+08
Grid : Message : 356.965510 s : Average mflops/s per call per rank (full): 2.40343e+06
Grid : Message : 356.965530 s : Average mflops/s per call per node (full): 9.61373e+06
Grid : Message : 356.965550 s : WilsonFermion5D Stencil
Grid : Message : 356.965560 s : WilsonFermion5D StencilEven
Grid : Message : 356.965570 s : WilsonFermion5D StencilOdd
Grid : Message : 356.965580 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 356.965590 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 356.965600 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 411.545363 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 411.545389 s : Called DwDag
Grid : Message : 411.545390 s : norm dag result 12.0421
Grid : Message : 411.583980 s : norm dag ref 12.0421
Grid : Message : 411.599853 s : norm dag diff 7.21924e-14
Grid : Message : 411.641431 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 412.849100 s : src_e0.499998
Grid : Message : 412.456953 s : src_o0.500002
Grid : Message : 412.561709 s : *********************************************************
Grid : Message : 412.561712 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 412.561713 s : * Vectorising space-time by 8
Grid : Message : 412.561714 s : * SINGLE precision
Grid : Message : 412.561715 s : * Using Overlapped Comms/Compute
Grid : Message : 412.561716 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 412.561717 s : *********************************************************
Grid : Message : 551.425534 s : Deo mflop/s = 1.53159e+08
Grid : Message : 551.425571 s : Deo mflop/s per rank 2.39311e+06
Grid : Message : 551.425573 s : Deo mflop/s per node 9.57245e+06
Grid : Message : 551.425576 s : #### Dhop calls report
Grid : Message : 551.425578 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 551.425580 s : WilsonFermion5D TotalTime /Calls : 4628.49 us
Grid : Message : 551.425582 s : WilsonFermion5D CommTime /Calls : 3073.6 us
Grid : Message : 551.425584 s : WilsonFermion5D FaceTime /Calls : 589.193 us
Grid : Message : 551.425586 s : WilsonFermion5D ComputeTime1/Calls : 5.908 us
Grid : Message : 551.425588 s : WilsonFermion5D ComputeTime2/Calls : 991.151 us
Grid : Message : 551.425610 s : Average mflops/s per call : 1.04262e+11
Grid : Message : 551.425614 s : Average mflops/s per call per rank : 1.6291e+09
Grid : Message : 551.425616 s : Average mflops/s per call per node : 6.5164e+09
Grid : Message : 551.425618 s : Average mflops/s per call (full) : 1.55894e+08
Grid : Message : 551.425625 s : Average mflops/s per call per rank (full): 2.43585e+06
Grid : Message : 551.425630 s : Average mflops/s per call per node (full): 9.74338e+06
Grid : Message : 551.425633 s : WilsonFermion5D Stencil
Grid : Message : 551.425635 s : WilsonFermion5D StencilEven
Grid : Message : 551.425638 s : WilsonFermion5D StencilOdd
Grid : Message : 551.425639 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 551.425641 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 551.425643 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 551.498308 s : r_e6.02108
Grid : Message : 551.506233 s : r_o6.02101
Grid : Message : 551.512628 s : res12.0421
Grid : Message : 552.147704 s : norm diff 0
Grid : Message : 553.522450 s : norm diff even 0
Grid : Message : 553.479623 s : norm diff odd 0

@@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]

@@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-16A-1095
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1095
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 64.64.64.256 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################

@@ -0,0 +1,2 @@
Tue Aug 23 03:10:28 BST 2022
epoch 1661220628

@@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large

@@ -0,0 +1,2 @@
Tue Aug 23 03:32:16 BST 2022
epoch 1661221936

File diff suppressed because one or more lines are too long

@@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffe30aa7000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000015253eefc000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000015253eb34000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000015253e642000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000015253e318000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000015253e037000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000015253ddd6000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000015253ee83000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000015253d9f6000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000015253c29a000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000015253beca000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000015253bc29000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000015253bafe000)
libm.so.6 => /lib64/libm.so.6 (0x000015253b77c000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000015253b545000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000015253b32d000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000015253b10d000)
libc.so.6 => /lib64/libc.so.6 (0x000015253ad48000)
libdl.so.2 => /lib64/libdl.so.2 (0x000015253ab44000)
/lib64/ld-linux-x86-64.so.2 (0x000015253ed4c000)
librt.so.1 => /lib64/librt.so.1 (0x000015253a93c000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000015253edb7000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000015253edb2000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000015253a830000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000015253a626000)
libutil.so.1 => /lib64/libutil.so.1 (0x000015253a422000)

View File

@ -0,0 +1,286 @@
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
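The per-rank lines above are printed by gpu-mpi-wrapper.sh, which is invoked by mpirun in the batch scripts but is not included in this commit. A minimal sketch consistent with this output (hypothetical; assumes Open MPI's OMPI_COMM_WORLD_LOCAL_RANK variable and numactl) would be:

#!/usr/bin/env bash
# hypothetical gpu-mpi-wrapper.sh: give each local MPI rank one GPU and
# interleave its memory over the matching pair of NUMA nodes
lrank="${OMPI_COMM_WORLD_LOCAL_RANK}"
numa="$((2 * lrank)),$((2 * lrank + 1))"
export CUDA_VISIBLE_DEVICES="${lrank}"
echo "$(hostname -s) - ${lrank} device=${lrank} binding=--interleave=${numa}"
exec numactl --interleave="${numa}" "$@"

Masking with CUDA_VISIBLE_DEVICES is consistent with every rank later reporting "device 0" but a different PCI bus id.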
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
local rank 2 device 0 bus id: 0000:84:00.0
local rank 1 device 0 bus id: 0000:44:00.0
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14fe00000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
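As a side note (an observation from the numbers in this log, not a documented guarantee): the MemoryManager cache size above matches 80% of the totalGlobalMem reported during CUDA initialisation,

0.8 * 42505273344 bytes = 34004218675.2 ≈ 34004218675 bytes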
Grid : Message : 1.604389 s : Grid Layout
Grid : Message : 1.604394 s : Global lattice size : 64 64 64 256
Grid : Message : 1.604403 s : OpenMP threads : 4
Grid : Message : 1.604406 s : MPI tasks : 2 2 2 8
Grid : Message : 1.643570 s : Making s innermost grids
Grid : Message : 1.694630 s : Initialising 4d RNG
Grid : Message : 1.787050 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.787079 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 3.366084 s : Initialising 5d RNG
Grid : Message : 4.819386 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 4.819425 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 38.300660 s : Initialised RNGs
Grid : Message : 43.255423 s : Drawing gauge field
Grid : Message : 44.115774 s : Random gauge initialised
Grid : Message : 44.127168 s : Setting up Cshift based reference
Grid : Message : 72.933387 s : *****************************************************************
Grid : Message : 72.933414 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 72.933416 s : *****************************************************************
Grid : Message : 72.933417 s : *****************************************************************
Grid : Message : 72.933418 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 72.933419 s : * Vectorising space-time by 8
Grid : Message : 72.933420 s : * VComplexF size is 64 B
Grid : Message : 72.933422 s : * SINGLE precision
Grid : Message : 72.933425 s : * Using Overlapped Comms/Compute
Grid : Message : 72.933426 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 72.933427 s : *****************************************************************
Grid : Message : 74.920872 s : Called warmup
Grid : Message : 355.775111 s : Called Dw 30000 times in 2.80853e+08 us
Grid : Message : 355.775156 s : mflop/s = 1.51396e+08
Grid : Message : 355.775158 s : mflop/s per rank = 2.36557e+06
Grid : Message : 355.775160 s : mflop/s per node = 9.46227e+06
Grid : Message : 355.775162 s : RF GiB/s (base 2) = 307634
Grid : Message : 355.775164 s : mem GiB/s (base 2) = 192271
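Sanity check (not part of the original log): with the 64 ranks on 16 nodes set in the batch script below,

1.51396e+08 mflop/s / 64 ranks ≈ 2.366e+06 mflop/s per rank
1.51396e+08 mflop/s / 16 nodes ≈ 9.462e+06 mflop/s per node

matching the per-rank and per-node figures above.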
Grid : Message : 355.778673 s : norm diff 1.06407e-13
Grid : Message : 355.827430 s : #### Dhop calls report
Grid : Message : 355.827437 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 355.827440 s : WilsonFermion5D TotalTime /Calls : 4684.11 us
Grid : Message : 355.827442 s : WilsonFermion5D CommTime /Calls : 3200.33 us
Grid : Message : 355.827444 s : WilsonFermion5D FaceTime /Calls : 479.567 us
Grid : Message : 355.827446 s : WilsonFermion5D ComputeTime1/Calls : 4.82936 us
Grid : Message : 355.827448 s : WilsonFermion5D ComputeTime2/Calls : 1018.33 us
Grid : Message : 355.827545 s : Average mflops/s per call : 1.24896e+11
Grid : Message : 355.827549 s : Average mflops/s per call per rank : 1.9515e+09
Grid : Message : 355.827551 s : Average mflops/s per call per node : 7.80601e+09
Grid : Message : 355.827553 s : Average mflops/s per call (full) : 1.54043e+08
Grid : Message : 355.827555 s : Average mflops/s per call per rank (full): 2.40692e+06
Grid : Message : 355.827559 s : Average mflops/s per call per node (full): 9.6277e+06
Grid : Message : 355.827561 s : WilsonFermion5D Stencil
Grid : Message : 355.827563 s : WilsonFermion5D StencilEven
Grid : Message : 355.827564 s : WilsonFermion5D StencilOdd
Grid : Message : 355.827569 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 355.827571 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 355.827573 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 411.449084 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 411.449109 s : Called DwDag
Grid : Message : 411.449110 s : norm dag result 12.0421
Grid : Message : 411.469399 s : norm dag ref 12.0421
Grid : Message : 411.485218 s : norm dag diff 7.21924e-14
Grid : Message : 411.525702 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 411.934554 s : src_e0.499998
Grid : Message : 412.308595 s : src_o0.500002
Grid : Message : 412.416600 s : *********************************************************
Grid : Message : 412.416604 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 412.416605 s : * Vectorising space-time by 8
Grid : Message : 412.416606 s : * SINGLE precision
Grid : Message : 412.416607 s : * Using Overlapped Comms/Compute
Grid : Message : 412.416608 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 412.416609 s : *********************************************************
Grid : Message : 550.472923 s : Deo mflop/s = 1.54056e+08
Grid : Message : 550.472960 s : Deo mflop/s per rank 2.40712e+06
Grid : Message : 550.472962 s : Deo mflop/s per node 9.6285e+06
Grid : Message : 550.472965 s : #### Dhop calls report
Grid : Message : 550.472967 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 550.472969 s : WilsonFermion5D TotalTime /Calls : 4601.58 us
Grid : Message : 550.472972 s : WilsonFermion5D CommTime /Calls : 3044.3 us
Grid : Message : 550.472974 s : WilsonFermion5D FaceTime /Calls : 582.772 us
Grid : Message : 550.472976 s : WilsonFermion5D ComputeTime1/Calls : 6.03287 us
Grid : Message : 550.472979 s : WilsonFermion5D ComputeTime2/Calls : 1000.92 us
Grid : Message : 550.473006 s : Average mflops/s per call : 1.03512e+11
Grid : Message : 550.473012 s : Average mflops/s per call per rank : 1.61737e+09
Grid : Message : 550.473015 s : Average mflops/s per call per node : 6.46949e+09
Grid : Message : 550.473017 s : Average mflops/s per call (full) : 1.56806e+08
Grid : Message : 550.473023 s : Average mflops/s per call per rank (full): 2.45009e+06
Grid : Message : 550.473026 s : Average mflops/s per call per node (full): 9.80036e+06
Grid : Message : 550.473030 s : WilsonFermion5D Stencil
Grid : Message : 550.473033 s : WilsonFermion5D StencilEven
Grid : Message : 550.473035 s : WilsonFermion5D StencilOdd
Grid : Message : 550.473038 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 550.473040 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 550.473043 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 550.546695 s : r_e6.02108
Grid : Message : 550.553572 s : r_o6.02101
Grid : Message : 550.559924 s : res12.0421
Grid : Message : 551.230295 s : norm diff 0
Grid : Message : 552.839070 s : norm diff even 0
Grid : Message : 552.554967 s : norm diff odd 0

View File

@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-16A-1110
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1110
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 64.64.64.256 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUS ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
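dmon-to-db.sh is referenced above but not included in this commit. A minimal sketch of what such a helper could do (hypothetical schema, assuming the default "nvidia-smi dmon" columns plus the date/time fields added by "-o DT", and sqlite3 on PATH):

#!/usr/bin/env bash
# hypothetical dmon-to-db.sh: load an nvidia-smi dmon log into an SQLite table
tmp="$1"; db="$2"; table="$3"
# drop '#' header lines, squeeze runs of spaces into commas for CSV import
grep -v '^#' "${tmp}" | tr -s ' ' ',' | sed 's/^,//' > "${tmp}.csv"
sqlite3 "${db}" <<EOF
CREATE TABLE IF NOT EXISTS ${table} (date TEXT, time TEXT, gpu INT, pwr INT,
  gtemp INT, mtemp INT, sm INT, mem INT, enc INT, dec INT, mclk INT, pclk INT);
.mode csv
.import ${tmp}.csv ${table}
EOF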

View File

@ -0,0 +1,2 @@
Tue Aug 23 03:22:53 BST 2022
epoch 1661221373

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Tue Aug 23 03:44:38 BST 2022
epoch 1661222678

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffd3170b000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014ac04bcc000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014ac04804000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014ac04312000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014ac03fe8000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014ac03d07000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014ac03aa6000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014ac04b53000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014ac036c6000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014ac01f6a000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014ac01b9a000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014ac018f9000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014ac017ce000)
libm.so.6 => /lib64/libm.so.6 (0x000014ac0144c000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014ac01215000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014ac00ffd000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014ac00ddd000)
libc.so.6 => /lib64/libc.so.6 (0x000014ac00a18000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014ac00814000)
/lib64/ld-linux-x86-64.so.2 (0x000014ac04a1c000)
librt.so.1 => /lib64/librt.so.1 (0x000014ac0060c000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014ac04a87000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014ac04a82000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014ac00500000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014ac002f6000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014ac000f2000)

View File

@ -0,0 +1,286 @@
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
local rank 1 device 0 bus id: 0000:44:00.0
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14bc00000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.741388 s : Grid Layout
Grid : Message : 1.741393 s : Global lattice size : 64 64 64 256
Grid : Message : 1.741398 s : OpenMP threads : 4
Grid : Message : 1.741399 s : MPI tasks : 2 2 2 8
Grid : Message : 1.779973 s : Making s innermost grids
Grid : Message : 1.841238 s : Initialising 4d RNG
Grid : Message : 1.936538 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.936565 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 3.514090 s : Initialising 5d RNG
Grid : Message : 4.953826 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 4.953867 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 38.142286 s : Initialised RNGs
Grid : Message : 42.283705 s : Drawing gauge field
Grid : Message : 43.103984 s : Random gauge initialised
Grid : Message : 43.120000 s : Setting up Cshift based reference
Grid : Message : 71.998584 s : *****************************************************************
Grid : Message : 71.998611 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 71.998613 s : *****************************************************************
Grid : Message : 71.998614 s : *****************************************************************
Grid : Message : 71.998615 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 71.998616 s : * Vectorising space-time by 8
Grid : Message : 71.998617 s : * VComplexF size is 64 B
Grid : Message : 71.998618 s : * SINGLE precision
Grid : Message : 71.998621 s : * Using Overlapped Comms/Compute
Grid : Message : 71.998622 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 71.998623 s : *****************************************************************
Grid : Message : 74.860610 s : Called warmup
Grid : Message : 354.501585 s : Called Dw 30000 times in 2.80414e+08 us
Grid : Message : 354.501646 s : mflop/s = 1.51633e+08
Grid : Message : 354.501649 s : mflop/s per rank = 2.36927e+06
Grid : Message : 354.501651 s : mflop/s per node = 9.47708e+06
Grid : Message : 354.501653 s : RF GiB/s (base 2) = 308115
Grid : Message : 354.501655 s : mem GiB/s (base 2) = 192572
Grid : Message : 354.505161 s : norm diff 1.06407e-13
Grid : Message : 354.553410 s : #### Dhop calls report
Grid : Message : 354.553418 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 354.553422 s : WilsonFermion5D TotalTime /Calls : 4677.53 us
Grid : Message : 354.553424 s : WilsonFermion5D CommTime /Calls : 3194.52 us
Grid : Message : 354.553426 s : WilsonFermion5D FaceTime /Calls : 480.197 us
Grid : Message : 354.553428 s : WilsonFermion5D ComputeTime1/Calls : 5.0421 us
Grid : Message : 354.553430 s : WilsonFermion5D ComputeTime2/Calls : 1015.76 us
Grid : Message : 354.553528 s : Average mflops/s per call : 1.2287e+11
Grid : Message : 354.553532 s : Average mflops/s per call per rank : 1.91984e+09
Grid : Message : 354.553534 s : Average mflops/s per call per node : 7.67937e+09
Grid : Message : 354.553536 s : Average mflops/s per call (full) : 1.5426e+08
Grid : Message : 354.553538 s : Average mflops/s per call per rank (full): 2.41031e+06
Grid : Message : 354.553540 s : Average mflops/s per call per node (full): 9.64123e+06
Grid : Message : 354.553542 s : WilsonFermion5D Stencil
Grid : Message : 354.553543 s : WilsonFermion5D StencilEven
Grid : Message : 354.553544 s : WilsonFermion5D StencilOdd
Grid : Message : 354.553545 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 354.553546 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 354.553547 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 409.963064 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 409.963090 s : Called DwDag
Grid : Message : 409.963091 s : norm dag result 12.0421
Grid : Message : 409.997480 s : norm dag ref 12.0421
Grid : Message : 410.132270 s : norm dag diff 7.21924e-14
Grid : Message : 410.545350 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 410.417445 s : src_e0.499998
Grid : Message : 410.871395 s : src_o0.500002
Grid : Message : 411.107600 s : *********************************************************
Grid : Message : 411.107650 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 411.107660 s : * Vectorising space-time by 8
Grid : Message : 411.107670 s : * SINGLE precision
Grid : Message : 411.107680 s : * Using Overlapped Comms/Compute
Grid : Message : 411.107690 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 411.107700 s : *********************************************************
Grid : Message : 548.471947 s : Deo mflop/s = 1.5473e+08
Grid : Message : 548.471984 s : Deo mflop/s per rank 2.41766e+06
Grid : Message : 548.471986 s : Deo mflop/s per node 9.67065e+06
Grid : Message : 548.471989 s : #### Dhop calls report
Grid : Message : 548.471991 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 548.471993 s : WilsonFermion5D TotalTime /Calls : 4581.78 us
Grid : Message : 548.471995 s : WilsonFermion5D CommTime /Calls : 3018.83 us
Grid : Message : 548.471997 s : WilsonFermion5D FaceTime /Calls : 580 us
Grid : Message : 548.471999 s : WilsonFermion5D ComputeTime1/Calls : 6.02087 us
Grid : Message : 548.472001 s : WilsonFermion5D ComputeTime2/Calls : 1008.22 us
Grid : Message : 548.472021 s : Average mflops/s per call : 1.02165e+11
Grid : Message : 548.472025 s : Average mflops/s per call per rank : 1.59633e+09
Grid : Message : 548.472027 s : Average mflops/s per call per node : 6.38532e+09
Grid : Message : 548.472030 s : Average mflops/s per call (full) : 1.57483e+08
Grid : Message : 548.472032 s : Average mflops/s per call per rank (full): 2.46068e+06
Grid : Message : 548.472035 s : Average mflops/s per call per node (full): 9.84271e+06
Grid : Message : 548.472037 s : WilsonFermion5D Stencil
Grid : Message : 548.472038 s : WilsonFermion5D StencilEven
Grid : Message : 548.472039 s : WilsonFermion5D StencilOdd
Grid : Message : 548.472041 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 548.472042 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 548.472045 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 548.546943 s : r_e6.02108
Grid : Message : 548.553846 s : r_o6.02101
Grid : Message : 548.560197 s : res12.0421
Grid : Message : 549.240929 s : norm diff 0
Grid : Message : 550.799580 s : norm diff even 0
Grid : Message : 550.551633 s : norm diff odd 0

View File

@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-16A-1125
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1125
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 64.64.64.256 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUS ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
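Note on the run geometry: "--mpi 2.2.2.8" decomposes the "--grid 64.64.64.256" global lattice over 2*2*2*8 = 64 ranks (matching --ntasks=64 across the 16 nodes), giving a local volume of 64/2 x 64/2 x 64/2 x 256/8 = 32.32.32.32 per GPU, which is the "loc32" appearing in the job names.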

View File

@ -0,0 +1,2 @@
Tue Aug 23 03:35:17 BST 2022
epoch 1661222117

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Tue Aug 23 03:56:59 BST 2022
epoch 1661223419

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffe4a10b000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014e1f7395000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014e1f6fcd000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014e1f6adb000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014e1f67b1000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014e1f64d0000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014e1f626f000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014e1f731c000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014e1f5e8f000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014e1f4733000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014e1f4363000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014e1f40c2000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014e1f3f97000)
libm.so.6 => /lib64/libm.so.6 (0x000014e1f3c15000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014e1f39de000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014e1f37c6000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014e1f35a6000)
libc.so.6 => /lib64/libc.so.6 (0x000014e1f31e1000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014e1f2fdd000)
/lib64/ld-linux-x86-64.so.2 (0x000014e1f71e5000)
librt.so.1 => /lib64/librt.so.1 (0x000014e1f2dd5000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014e1f7250000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014e1f724b000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014e1f2cc9000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014e1f2abf000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014e1f28bb000)

View File

@ -0,0 +1,286 @@
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
local rank 1 device 0 bus id: 0000:44:00.0
local rank 2 device 0 bus id: 0000:84:00.0
local rank 3 device 0 bus id: 0000:C4:00.0
SharedMemoryMpi: World communicator of size 64
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x151da0000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommitted changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.624935 s : Grid Layout
Grid : Message : 1.624939 s : Global lattice size : 64 64 64 256
Grid : Message : 1.624944 s : OpenMP threads : 4
Grid : Message : 1.624946 s : MPI tasks : 2 2 2 8
Grid : Message : 1.665490 s : Making s innermost grids
Grid : Message : 1.724722 s : Initialising 4d RNG
Grid : Message : 1.820577 s : Initialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.820601 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 3.397501 s : Initialising 5d RNG
Grid : Message : 4.840410 s : Initialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 4.840450 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 38.264300 s : Initialised RNGs
Grid : Message : 42.777074 s : Drawing gauge field
Grid : Message : 43.619715 s : Random gauge initialised
Grid : Message : 43.632921 s : Setting up Cshift based reference
Grid : Message : 72.511474 s : *****************************************************************
Grid : Message : 72.511512 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 72.511514 s : *****************************************************************
Grid : Message : 72.511515 s : *****************************************************************
Grid : Message : 72.511516 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 72.511518 s : * Vectorising space-time by 8
Grid : Message : 72.511520 s : * VComplexF size is 64 B
Grid : Message : 72.511522 s : * SINGLE precision
Grid : Message : 72.511525 s : * Using Overlapped Comms/Compute
Grid : Message : 72.511529 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 72.511531 s : *****************************************************************
Grid : Message : 74.532358 s : Called warmup
Grid : Message : 354.723946 s : Called Dw 30000 times in 2.8019e+08 us
Grid : Message : 354.723995 s : mflop/s = 1.51755e+08
Grid : Message : 354.723997 s : mflop/s per rank = 2.37116e+06
Grid : Message : 354.723999 s : mflop/s per node = 9.48466e+06
Grid : Message : 354.724001 s : RF GiB/s (base 2) = 308362
Grid : Message : 354.724003 s : mem GiB/s (base 2) = 192726
Grid : Message : 354.727509 s : norm diff 1.06407e-13
Grid : Message : 354.777181 s : #### Dhop calls report
Grid : Message : 354.777188 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 354.777192 s : WilsonFermion5D TotalTime /Calls : 4673.52 us
Grid : Message : 354.777194 s : WilsonFermion5D CommTime /Calls : 3186.13 us
Grid : Message : 354.777196 s : WilsonFermion5D FaceTime /Calls : 478.836 us
Grid : Message : 354.777198 s : WilsonFermion5D ComputeTime1/Calls : 4.98882 us
Grid : Message : 354.777200 s : WilsonFermion5D ComputeTime2/Calls : 1022.19 us
Grid : Message : 354.777269 s : Average mflops/s per call : 1.252e+11
Grid : Message : 354.777272 s : Average mflops/s per call per rank : 1.95624e+09
Grid : Message : 354.777274 s : Average mflops/s per call per node : 7.82497e+09
Grid : Message : 354.777276 s : Average mflops/s per call (full) : 1.54392e+08
Grid : Message : 354.777279 s : Average mflops/s per call per rank (full): 2.41237e+06
Grid : Message : 354.777281 s : Average mflops/s per call per node (full): 9.6495e+06
Grid : Message : 354.777283 s : WilsonFermion5D Stencil
Grid : Message : 354.777284 s : WilsonFermion5D StencilEven
Grid : Message : 354.777286 s : WilsonFermion5D StencilOdd
Grid : Message : 354.777287 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 354.777289 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 354.777290 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 410.641840 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 410.642230 s : Called DwDag
Grid : Message : 410.642250 s : norm dag result 12.0421
Grid : Message : 410.776270 s : norm dag ref 12.0421
Grid : Message : 410.933470 s : norm dag diff 7.21924e-14
Grid : Message : 410.141942 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 410.561423 s : src_e0.499998
Grid : Message : 410.986491 s : src_o0.500002
Grid : Message : 411.130524 s : *********************************************************
Grid : Message : 411.130528 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 411.130530 s : * Vectorising space-time by 8
Grid : Message : 411.130532 s : * SINGLE precision
Grid : Message : 411.130534 s : * Using Overlapped Comms/Compute
Grid : Message : 411.130536 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 411.130537 s : *********************************************************
Grid : Message : 548.109968 s : Deo mflop/s = 1.55267e+08
Grid : Message : 548.110008 s : Deo mflop/s per rank 2.42604e+06
Grid : Message : 548.110010 s : Deo mflop/s per node 9.70418e+06
Grid : Message : 548.110013 s : #### Dhop calls report
Grid : Message : 548.110015 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 548.110018 s : WilsonFermion5D TotalTime /Calls : 4565.66 us
Grid : Message : 548.110020 s : WilsonFermion5D CommTime /Calls : 2996.68 us
Grid : Message : 548.110022 s : WilsonFermion5D FaceTime /Calls : 574.72 us
Grid : Message : 548.110024 s : WilsonFermion5D ComputeTime1/Calls : 6.0247 us
Grid : Message : 548.110026 s : WilsonFermion5D ComputeTime2/Calls : 1020.13 us
Grid : Message : 548.110050 s : Average mflops/s per call : 1.03225e+11
Grid : Message : 548.110054 s : Average mflops/s per call per rank : 1.6129e+09
Grid : Message : 548.110056 s : Average mflops/s per call per node : 6.45159e+09
Grid : Message : 548.110060 s : Average mflops/s per call (full) : 1.58039e+08
Grid : Message : 548.110063 s : Average mflops/s per call per rank (full): 2.46937e+06
Grid : Message : 548.110067 s : Average mflops/s per call per node (full): 9.87746e+06
Grid : Message : 548.110072 s : WilsonFermion5D Stencil
Grid : Message : 548.110074 s : WilsonFermion5D StencilEven
Grid : Message : 548.110076 s : WilsonFermion5D StencilOdd
Grid : Message : 548.110078 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 548.110080 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 548.110087 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 548.182456 s : r_e6.02108
Grid : Message : 548.190125 s : r_o6.02101
Grid : Message : 548.196449 s : res12.0421
Grid : Message : 548.875188 s : norm diff 0
Grid : Message : 549.798793 s : norm diff even 0
Grid : Message : 550.237927 s : norm diff odd 0
@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]
@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-16A-1140
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
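# abort on the first failing command so the 'success' marker below is only
# created after a clean run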
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
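# UCX transports: GPUDirect copy (gdr_copy), InfiniBand RC verbs (rc, rc_x),
# shared memory (sm), and CUDA copy/IPC paths for device-resident buffers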
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
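# disabling the UCX memory-type cache is the commonly recommended workaround
# when CUDA device buffers are misdetected as host memory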
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
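# the literal 16 below is presumably the node count substituted in by a job
# template (hence the SC2050 disable above): single-node jobs would use OMPIO,
# multi-node jobs ROMIO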
if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
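# optional parameter file path; empty for this run, so nothing is archived
# below and no extra argument reaches the binary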
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd "${app}" > "${job_info_dir}/ldd"
md5sum "${app}" > "${job_info_dir}/app-hash"
readelf -a "${app}" > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1140
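# 'nvidia-smi -ac' sets <memory,graphics> application clocks in MHz: 1215 MHz
# is the A100 memory clock, 1140 MHz the capped SM clock for this run (clocks
# are reset to the stock 1410 MHz at the end of the script)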
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
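# sample GPU metrics in the background; '-o DT' prefixes each dmon sample with
# date and time so the rows can be aligned with the job log afterwards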
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
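# one rank per GPU: --mpi 2.2.2.8 is a 64-rank decomposition matching
# --ntasks=64 and the 64.64.64.256 lattice; --shm 2048 requests the 2 GiB
# stencil comms buffer reported in the log; gpu-mpi-wrapper.sh presumably sets
# CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and NUMA binding per local rank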
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
    ./gpu-mpi-wrapper.sh \
    "${app}" ${par:+"${par}"} "${opt[@]}" \
    --mpi 2.2.2.8 \
    --accelerator-threads 8 \
    --grid 64.64.64.256 \
    --shm 2048 &> "${job_info_dir}/log"
# if we reach this point the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUS ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
@ -0,0 +1,2 @@
Tue Aug 23 03:47:39 BST 2022
epoch 1661222859
Some files were not shown because too many files have changed in this diff.