Initial commit

commit ade190016a
2022-09-07 17:31:28 +01:00

8502 changed files with 4552538 additions and 0 deletions


@@ -0,0 +1,13 @@
#!/usr/bin/env bash
# Per-rank CPU binding wrapper for OpenMPI: pin each local rank to its own
# 16-core block and the matching NUMA domain.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
numa=${lrank}
# cores lrank*16 .. (lrank+1)*16-1, used for taskset and as an OMP_PLACES interval
cpus="$(( lrank*16 ))-$(( (lrank+1)*16-1 ))"
places="$(( lrank*16 )):$(( (lrank+1)*16 ))"
BINDING="taskset -c ${cpus} numactl -m ${numa}"
export OMP_PLACES=${places}
echo "$(hostname) - ${lrank} binding='${BINDING}'"
${BINDING} "$@"
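A minimal usage sketch (assuming the wrapper above is saved as cpu-mpi-wrapper.sh; the filename, rank count, and application arguments are illustrative): mpirun's own binding is disabled so that the taskset/numactl prefix in the wrapper controls placement.

# hypothetical launch: 4 ranks on one 64-core node, one 16-core block per rank
mpirun -np 4 --bind-to none ./cpu-mpi-wrapper.sh ./Benchmark_dwf_fp32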


@@ -0,0 +1 @@
../dwf_fp32.tok


@@ -0,0 +1,14 @@
#!/usr/bin/env bash
# Per-rank GPU binding wrapper for OpenMPI: give each local rank its own GPU
# and mlx5 NIC, and interleave its memory across the two nearest NUMA domains.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
numa1=$(( 2 * lrank ))
numa2=$(( 2 * lrank + 1 ))
netdev=mlx5_${lrank}:1
export CUDA_VISIBLE_DEVICES=$OMPI_COMM_WORLD_LOCAL_RANK
export UCX_NET_DEVICES=${netdev}
BINDING="--interleave=$numa1,$numa2"
echo "$(hostname) - $lrank device=$CUDA_VISIBLE_DEVICES binding=$BINDING"
numactl ${BINDING} "$@"
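This appears to be the gpu-mpi-wrapper.sh invoked by the job scripts further down; a minimal sketch of that launch pattern (rank count taken from the 8-node, 4-GPU-per-node jobs), with mpirun binding disabled so the wrapper controls placement:

# one rank per GPU; CUDA device, NIC, and NUMA binding come from the wrapper
mpirun -np 32 -x LD_LIBRARY_PATH --bind-to none \
    ./gpu-mpi-wrapper.sh ./Benchmark_dwf_fp32 --mpi 2.2.2.4 --grid 64.64.64.128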


@@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large


@@ -0,0 +1,2 @@
Tue Aug 23 02:04:27 BST 2022
epoch 1661216667

File diff suppressed because one or more lines are too long


@@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffc276d7000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000015208b908000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000015208b540000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000015208b04e000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000015208ad24000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000015208aa43000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000015208a7e2000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000015208b88f000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000015208a402000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x0000152088ca6000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x00001520888d6000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x0000152088635000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000015208850a000)
libm.so.6 => /lib64/libm.so.6 (0x0000152088188000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x0000152087f51000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x0000152087d39000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x0000152087b19000)
libc.so.6 => /lib64/libc.so.6 (0x0000152087754000)
libdl.so.2 => /lib64/libdl.so.2 (0x0000152087550000)
/lib64/ld-linux-x86-64.so.2 (0x000015208b758000)
librt.so.1 => /lib64/librt.so.1 (0x0000152087348000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000015208b7c3000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000015208b7be000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000015208723c000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x0000152087032000)
libutil.so.1 => /lib64/libutil.so.1 (0x0000152086e2e000)


@@ -0,0 +1,254 @@
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 1 device 0 bus id: 0000:44:00.0
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x1520a0000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.451132 s : Grid Layout
Grid : Message : 1.451136 s : Global lattice size : 64 64 64 128
Grid : Message : 1.451141 s : OpenMP threads : 4
Grid : Message : 1.451143 s : MPI tasks : 2 2 2 4
Grid : Message : 1.490207 s : Making s innermost grids
Grid : Message : 1.546698 s : Initialising 4d RNG
Grid : Message : 1.639951 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.639978 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 2.448983 s : Initialising 5d RNG
Grid : Message : 3.857910 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 3.857941 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 20.528379 s : Initialised RNGs
Grid : Message : 24.642198 s : Drawing gauge field
Grid : Message : 25.370279 s : Random gauge initialised
Grid : Message : 25.386364 s : Setting up Cshift based reference
Grid : Message : 54.680530 s : *****************************************************************
Grid : Message : 54.680554 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 54.680555 s : *****************************************************************
Grid : Message : 54.680556 s : *****************************************************************
Grid : Message : 54.680557 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 54.680558 s : * Vectorising space-time by 8
Grid : Message : 54.680559 s : * VComplexF size is 64 B
Grid : Message : 54.680560 s : * SINGLE precision
Grid : Message : 54.680563 s : * Using Overlapped Comms/Compute
Grid : Message : 54.680564 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 54.680565 s : *****************************************************************
Grid : Message : 56.720636 s : Called warmup
Grid : Message : 341.354661 s : Called Dw 30000 times in 2.84633e+08 us
Grid : Message : 341.354717 s : mflop/s = 7.46929e+07
Grid : Message : 341.354719 s : mflop/s per rank = 2.33415e+06
Grid : Message : 341.354721 s : mflop/s per node = 9.33662e+06
Grid : Message : 341.354723 s : RF GiB/s (base 2) = 151774
Grid : Message : 341.354725 s : mem GiB/s (base 2) = 94858.9
Grid : Message : 341.358222 s : norm diff 1.07359e-13
Grid : Message : 341.408574 s : #### Dhop calls report
Grid : Message : 341.408581 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 341.408584 s : WilsonFermion5D TotalTime /Calls : 4747.94 us
Grid : Message : 341.408586 s : WilsonFermion5D CommTime /Calls : 3238.54 us
Grid : Message : 341.408588 s : WilsonFermion5D FaceTime /Calls : 477.179 us
Grid : Message : 341.408590 s : WilsonFermion5D ComputeTime1/Calls : 5.20729 us
Grid : Message : 341.408592 s : WilsonFermion5D ComputeTime2/Calls : 1046.27 us
Grid : Message : 341.408654 s : Average mflops/s per call : 6.22503e+10
Grid : Message : 341.408657 s : Average mflops/s per call per rank : 1.94532e+09
Grid : Message : 341.408659 s : Average mflops/s per call per node : 7.78129e+09
Grid : Message : 341.408661 s : Average mflops/s per call (full) : 7.59861e+07
Grid : Message : 341.408664 s : Average mflops/s per call per rank (full): 2.37457e+06
Grid : Message : 341.408666 s : Average mflops/s per call per node (full): 9.49826e+06
Grid : Message : 341.408668 s : WilsonFermion5D Stencil
Grid : Message : 341.408669 s : WilsonFermion5D StencilEven
Grid : Message : 341.408672 s : WilsonFermion5D StencilOdd
Grid : Message : 341.408674 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 341.408676 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 341.408678 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 396.742581 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 396.742602 s : Called DwDag
Grid : Message : 396.742603 s : norm dag result 12.0421
Grid : Message : 396.756893 s : norm dag ref 12.0421
Grid : Message : 396.773260 s : norm dag diff 7.28475e-14
Grid : Message : 396.816075 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 397.193717 s : src_e0.499997
Grid : Message : 397.577696 s : src_o0.500003
Grid : Message : 397.675628 s : *********************************************************
Grid : Message : 397.675631 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 397.675632 s : * Vectorising space-time by 8
Grid : Message : 397.675633 s : * SINGLE precision
Grid : Message : 397.675634 s : * Using Overlapped Comms/Compute
Grid : Message : 397.675635 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 397.675636 s : *********************************************************
Grid : Message : 540.232805 s : Deo mflop/s = 7.45971e+07
Grid : Message : 540.232832 s : Deo mflop/s per rank 2.33116e+06
Grid : Message : 540.232834 s : Deo mflop/s per node 9.32463e+06
Grid : Message : 540.232837 s : #### Dhop calls report
Grid : Message : 540.232839 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 540.232841 s : WilsonFermion5D TotalTime /Calls : 4751.62 us
Grid : Message : 540.232843 s : WilsonFermion5D CommTime /Calls : 3173.44 us
Grid : Message : 540.232845 s : WilsonFermion5D FaceTime /Calls : 604.695 us
Grid : Message : 540.232847 s : WilsonFermion5D ComputeTime1/Calls : 6.28629 us
Grid : Message : 540.232849 s : WilsonFermion5D ComputeTime2/Calls : 999.947 us
Grid : Message : 540.232870 s : Average mflops/s per call : 5.14652e+10
Grid : Message : 540.232874 s : Average mflops/s per call per rank : 1.60829e+09
Grid : Message : 540.232878 s : Average mflops/s per call per node : 6.43315e+09
Grid : Message : 540.232881 s : Average mflops/s per call (full) : 7.59272e+07
Grid : Message : 540.232884 s : Average mflops/s per call per rank (full): 2.37273e+06
Grid : Message : 540.232887 s : Average mflops/s per call per node (full): 9.49091e+06
Grid : Message : 540.232890 s : WilsonFermion5D Stencil
Grid : Message : 540.232892 s : WilsonFermion5D StencilEven
Grid : Message : 540.232893 s : WilsonFermion5D StencilOdd
Grid : Message : 540.232896 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 540.232897 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 540.232900 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 540.304917 s : r_e6.02113
Grid : Message : 540.311850 s : r_o6.02101
Grid : Message : 540.318515 s : res12.0421
Grid : Message : 540.994922 s : norm diff 0
Grid : Message : 541.747359 s : norm diff even 0
Grid : Message : 542.139558 s : norm diff odd 0


@@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]


@@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-8A-1005
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1005
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 64.64.64.128 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUS ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################


@@ -0,0 +1,2 @@
Tue Aug 23 01:55:19 BST 2022
epoch 1661216119


@@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large


@@ -0,0 +1,2 @@
Tue Aug 23 02:17:01 BST 2022
epoch 1661217421

File diff suppressed because one or more lines are too long


@@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffe20397000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014e4e5e5f000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014e4e5a97000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014e4e55a5000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014e4e527b000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014e4e4f9a000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014e4e4d39000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014e4e5de6000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014e4e4959000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014e4e31fd000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014e4e2e2d000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014e4e2b8c000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014e4e2a61000)
libm.so.6 => /lib64/libm.so.6 (0x000014e4e26df000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014e4e24a8000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014e4e2290000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014e4e2070000)
libc.so.6 => /lib64/libc.so.6 (0x000014e4e1cab000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014e4e1aa7000)
/lib64/ld-linux-x86-64.so.2 (0x000014e4e5caf000)
librt.so.1 => /lib64/librt.so.1 (0x000014e4e189f000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014e4e5d1a000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014e4e5d15000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014e4e1793000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014e4e1589000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014e4e1385000)


@@ -0,0 +1,254 @@
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
local rank 2 device 0 bus id: 0000:84:00.0
local rank 1 device 0 bus id: 0000:44:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x152d40000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.411137 s : Grid Layout
Grid : Message : 1.411142 s : Global lattice size : 64 64 64 128
Grid : Message : 1.411149 s : OpenMP threads : 4
Grid : Message : 1.411152 s : MPI tasks : 2 2 2 4
Grid : Message : 1.450334 s : Making s innermost grids
Grid : Message : 1.501343 s : Initialising 4d RNG
Grid : Message : 1.598884 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.598907 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 2.629236 s : Initialising 5d RNG
Grid : Message : 4.714710 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 4.715320 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 20.751504 s : Initialised RNGs
Grid : Message : 24.602581 s : Drawing gauge field
Grid : Message : 25.485290 s : Random gauge initialised
Grid : Message : 25.497324 s : Setting up Cshift based reference
Grid : Message : 54.590031 s : *****************************************************************
Grid : Message : 54.590055 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 54.590056 s : *****************************************************************
Grid : Message : 54.590057 s : *****************************************************************
Grid : Message : 54.590058 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 54.590059 s : * Vectorising space-time by 8
Grid : Message : 54.590060 s : * VComplexF size is 64 B
Grid : Message : 54.590061 s : * SINGLE precision
Grid : Message : 54.590063 s : * Using Overlapped Comms/Compute
Grid : Message : 54.590064 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 54.590065 s : *****************************************************************
Grid : Message : 56.600017 s : Called warmup
Grid : Message : 340.439124 s : Called Dw 30000 times in 2.83839e+08 us
Grid : Message : 340.439168 s : mflop/s = 7.4902e+07
Grid : Message : 340.439170 s : mflop/s per rank = 2.34069e+06
Grid : Message : 340.439172 s : mflop/s per node = 9.36276e+06
Grid : Message : 340.439174 s : RF GiB/s (base 2) = 152199
Grid : Message : 340.439176 s : mem GiB/s (base 2) = 95124.5
Grid : Message : 340.442672 s : norm diff 1.07359e-13
Grid : Message : 340.492982 s : #### Dhop calls report
Grid : Message : 340.492989 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 340.492992 s : WilsonFermion5D TotalTime /Calls : 4734.47 us
Grid : Message : 340.492994 s : WilsonFermion5D CommTime /Calls : 3215.16 us
Grid : Message : 340.492996 s : WilsonFermion5D FaceTime /Calls : 476.312 us
Grid : Message : 340.492998 s : WilsonFermion5D ComputeTime1/Calls : 4.61805 us
Grid : Message : 340.493000 s : WilsonFermion5D ComputeTime2/Calls : 1057.09 us
Grid : Message : 340.493034 s : Average mflops/s per call : 6.20648e+10
Grid : Message : 340.493039 s : Average mflops/s per call per rank : 1.93952e+09
Grid : Message : 340.493041 s : Average mflops/s per call per node : 7.75809e+09
Grid : Message : 340.493043 s : Average mflops/s per call (full) : 7.62022e+07
Grid : Message : 340.493050 s : Average mflops/s per call per rank (full): 2.38132e+06
Grid : Message : 340.493054 s : Average mflops/s per call per node (full): 9.52528e+06
Grid : Message : 340.493057 s : WilsonFermion5D Stencil
Grid : Message : 340.493059 s : WilsonFermion5D StencilEven
Grid : Message : 340.493061 s : WilsonFermion5D StencilOdd
Grid : Message : 340.493064 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 340.493066 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 340.493068 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 395.685600 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 395.685621 s : Called DwDag
Grid : Message : 395.685622 s : norm dag result 12.0421
Grid : Message : 395.711061 s : norm dag ref 12.0421
Grid : Message : 395.727365 s : norm dag diff 7.28475e-14
Grid : Message : 395.777073 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 396.215583 s : src_e0.499997
Grid : Message : 396.523749 s : src_o0.500003
Grid : Message : 396.640132 s : *********************************************************
Grid : Message : 396.640135 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 396.640136 s : * Vectorising space-time by 8
Grid : Message : 396.640137 s : * SINGLE precision
Grid : Message : 396.640138 s : * Using Overlapped Comms/Compute
Grid : Message : 396.640139 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 396.640140 s : *********************************************************
Grid : Message : 538.477450 s : Deo mflop/s = 7.52017e+07
Grid : Message : 538.477760 s : Deo mflop/s per rank 2.35005e+06
Grid : Message : 538.477780 s : Deo mflop/s per node 9.40022e+06
Grid : Message : 538.477810 s : #### Dhop calls report
Grid : Message : 538.477830 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 538.477850 s : WilsonFermion5D TotalTime /Calls : 4713.31 us
Grid : Message : 538.477870 s : WilsonFermion5D CommTime /Calls : 3138.8 us
Grid : Message : 538.477890 s : WilsonFermion5D FaceTime /Calls : 592.51 us
Grid : Message : 538.477910 s : WilsonFermion5D ComputeTime1/Calls : 5.73034 us
Grid : Message : 538.477930 s : WilsonFermion5D ComputeTime2/Calls : 1007.86 us
Grid : Message : 538.478120 s : Average mflops/s per call : 5.12899e+10
Grid : Message : 538.478160 s : Average mflops/s per call per rank : 1.60281e+09
Grid : Message : 538.478180 s : Average mflops/s per call per node : 6.41124e+09
Grid : Message : 538.478200 s : Average mflops/s per call (full) : 7.65444e+07
Grid : Message : 538.478240 s : Average mflops/s per call per rank (full): 2.39201e+06
Grid : Message : 538.478260 s : Average mflops/s per call per node (full): 9.56805e+06
Grid : Message : 538.478290 s : WilsonFermion5D Stencil
Grid : Message : 538.478300 s : WilsonFermion5D StencilEven
Grid : Message : 538.478320 s : WilsonFermion5D StencilOdd
Grid : Message : 538.478330 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 538.478350 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 538.478360 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 538.119186 s : r_e6.02113
Grid : Message : 538.127316 s : r_o6.02101
Grid : Message : 538.133936 s : res12.0421
Grid : Message : 538.790970 s : norm diff 0
Grid : Message : 539.605836 s : norm diff even 0
Grid : Message : 539.988598 s : norm diff odd 0


@@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]


@@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-8A-1020
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1020
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 64.64.64.128 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUS ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################


@@ -0,0 +1,2 @@
Tue Aug 23 02:07:55 BST 2022
epoch 1661216875


@@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large


@@ -0,0 +1,2 @@
Tue Aug 23 02:29:38 BST 2022
epoch 1661218178

File diff suppressed because one or more lines are too long


@@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffe2c559000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x0000154b69193000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x0000154b68dcb000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x0000154b688d9000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x0000154b685af000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x0000154b682ce000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x0000154b6806d000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x0000154b6911a000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x0000154b67c8d000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x0000154b66531000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x0000154b66161000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x0000154b65ec0000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x0000154b65d95000)
libm.so.6 => /lib64/libm.so.6 (0x0000154b65a13000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x0000154b657dc000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x0000154b655c4000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x0000154b653a4000)
libc.so.6 => /lib64/libc.so.6 (0x0000154b64fdf000)
libdl.so.2 => /lib64/libdl.so.2 (0x0000154b64ddb000)
/lib64/ld-linux-x86-64.so.2 (0x0000154b68fe3000)
librt.so.1 => /lib64/librt.so.1 (0x0000154b64bd3000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x0000154b6904e000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x0000154b69049000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x0000154b64ac7000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x0000154b648bd000)
libutil.so.1 => /lib64/libutil.so.1 (0x0000154b646b9000)


@@ -0,0 +1,254 @@
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 1 device 0 bus id: 0000:44:00.0
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14e380000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.487038 s : Grid Layout
Grid : Message : 1.487042 s : Global lattice size : 64 64 64 128
Grid : Message : 1.487049 s : OpenMP threads : 4
Grid : Message : 1.487051 s : MPI tasks : 2 2 2 4
Grid : Message : 1.526579 s : Making s innermost grids
Grid : Message : 1.575050 s : Initialising 4d RNG
Grid : Message : 1.668970 s : Initialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.668993 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 2.690908 s : Initialising 5d RNG
Grid : Message : 4.186060 s : Initialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 4.186090 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 21.108778 s : Initialised RNGs
Grid : Message : 24.628384 s : Drawing gauge field
Grid : Message : 25.244935 s : Random gauge initialised
Grid : Message : 25.260871 s : Setting up Cshift based reference
Grid : Message : 54.297463 s : *****************************************************************
Grid : Message : 54.297490 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 54.297492 s : *****************************************************************
Grid : Message : 54.297493 s : *****************************************************************
Grid : Message : 54.297494 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 54.297495 s : * Vectorising space-time by 8
Grid : Message : 54.297496 s : * VComplexF size is 64 B
Grid : Message : 54.297498 s : * SINGLE precision
Grid : Message : 54.297500 s : * Using Overlapped Comms/Compute
Grid : Message : 54.297501 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 54.297502 s : *****************************************************************
Grid : Message : 56.289649 s : Called warmup
Grid : Message : 339.475576 s : Called Dw 30000 times in 2.83186e+08 us
Grid : Message : 339.475624 s : mflop/s = 7.50747e+07
Grid : Message : 339.475626 s : mflop/s per rank = 2.34608e+06
Grid : Message : 339.475628 s : mflop/s per node = 9.38434e+06
Grid : Message : 339.475630 s : RF GiB/s (base 2) = 152550
Grid : Message : 339.475632 s : mem GiB/s (base 2) = 95343.7
Grid : Message : 339.479133 s : norm diff 1.07359e-13
Grid : Message : 339.528508 s : #### Dhop calls report
Grid : Message : 339.528515 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 339.528519 s : WilsonFermion5D TotalTime /Calls : 4723.23 us
Grid : Message : 339.528521 s : WilsonFermion5D CommTime /Calls : 3196.3 us
Grid : Message : 339.528523 s : WilsonFermion5D FaceTime /Calls : 478.284 us
Grid : Message : 339.528525 s : WilsonFermion5D ComputeTime1/Calls : 4.58175 us
Grid : Message : 339.528527 s : WilsonFermion5D ComputeTime2/Calls : 1062.24 us
Grid : Message : 339.528552 s : Average mflops/s per call : 6.12426e+10
Grid : Message : 339.528556 s : Average mflops/s per call per rank : 1.91383e+09
Grid : Message : 339.528558 s : Average mflops/s per call per node : 7.65533e+09
Grid : Message : 339.528560 s : Average mflops/s per call (full) : 7.63836e+07
Grid : Message : 339.528564 s : Average mflops/s per call per rank (full): 2.38699e+06
Grid : Message : 339.528567 s : Average mflops/s per call per node (full): 9.54795e+06
Grid : Message : 339.528569 s : WilsonFermion5D Stencil
Grid : Message : 339.528572 s : WilsonFermion5D StencilEven
Grid : Message : 339.528575 s : WilsonFermion5D StencilOdd
Grid : Message : 339.528576 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 339.528578 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 339.528580 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 394.933228 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 394.933253 s : Called DwDag
Grid : Message : 394.933254 s : norm dag result 12.0421
Grid : Message : 394.953559 s : norm dag ref 12.0421
Grid : Message : 394.969769 s : norm dag diff 7.28475e-14
Grid : Message : 395.189670 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 395.356222 s : src_e0.499997
Grid : Message : 395.800392 s : src_o0.500003
Grid : Message : 395.896090 s : *********************************************************
Grid : Message : 395.896093 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 395.896094 s : * Vectorising space-time by 8
Grid : Message : 395.896096 s : * SINGLE precision
Grid : Message : 395.896097 s : * Using Overlapped Comms/Compute
Grid : Message : 395.896098 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 395.896099 s : *********************************************************
Grid : Message : 536.452166 s : Deo mflop/s = 7.56569e+07
Grid : Message : 536.452197 s : Deo mflop/s per rank 2.36428e+06
Grid : Message : 536.452199 s : Deo mflop/s per node 9.45711e+06
Grid : Message : 536.452202 s : #### Dhop calls report
Grid : Message : 536.452204 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 536.452206 s : WilsonFermion5D TotalTime /Calls : 4684.93 us
Grid : Message : 536.452208 s : WilsonFermion5D CommTime /Calls : 3112.34 us
Grid : Message : 536.452210 s : WilsonFermion5D FaceTime /Calls : 591.04 us
Grid : Message : 536.452212 s : WilsonFermion5D ComputeTime1/Calls : 5.70331 us
Grid : Message : 536.452214 s : WilsonFermion5D ComputeTime2/Calls : 1007.07 us
Grid : Message : 536.452239 s : Average mflops/s per call : 5.15026e+10
Grid : Message : 536.452243 s : Average mflops/s per call per rank : 1.60946e+09
Grid : Message : 536.452245 s : Average mflops/s per call per node : 6.43783e+09
Grid : Message : 536.452247 s : Average mflops/s per call (full) : 7.70081e+07
Grid : Message : 536.452252 s : Average mflops/s per call per rank (full): 2.4065e+06
Grid : Message : 536.452256 s : Average mflops/s per call per node (full): 9.62601e+06
Grid : Message : 536.452259 s : WilsonFermion5D Stencil
Grid : Message : 536.452262 s : WilsonFermion5D StencilEven
Grid : Message : 536.452264 s : WilsonFermion5D StencilOdd
Grid : Message : 536.452267 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 536.452270 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 536.452271 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 536.525206 s : r_e6.02113
Grid : Message : 536.532317 s : r_o6.02101
Grid : Message : 536.538894 s : res12.0421
Grid : Message : 537.137938 s : norm diff 0
Grid : Message : 537.903953 s : norm diff even 0
Grid : Message : 538.313669 s : norm diff odd 0
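A quick cross-check of the headline figure above (not part of the original log): assuming Grid's usual 1320 flops per 4d site per Dhop application and Ls=16 (both consistent with, but not stated in, the log), the reported rate follows directly from the call count and wall time:

#!/usr/bin/env bash
# Sanity check; 1320 flops/site and Ls=16 are assumptions, not log output.
v4=$(( 64*64*64*128 ))   # global 4d volume from "Global lattice size"
awk -v v4="$v4" 'BEGIN {
    flops = 1320 * 16 * v4 * 30000               # flops/site * Ls * V4 * ncall
    printf "mflop/s = %.5e\n", flops / 2.83186e8 # flop/us is the same as Mflop/s
}'
# prints mflop/s = 7.50746e+07, in agreement with the 7.50747e+07 reported
# above (the small difference comes from the rounded wall time in the log)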

View File

@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-8A-1035
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
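# (the literal 8 in the test below is presumably the node count substituted by
#  the job template, cf. --nodes=8 above, so ompio would only be selected for
#  single-node runs; hence the SC2050 shellcheck exemption at the top)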
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd "${app}" > "${job_info_dir}/ldd"
md5sum "${app}" > "${job_info_dir}/app-hash"
readelf -a "${app}" > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1035
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
    "${app}" ${par:+"${par}"} "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 64.64.64.128 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUS ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
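For reference, the --mpi 2.2.2.4 decomposition in the run line above factors the 32 ranks (4 per node over 8 nodes) across the --grid 64.64.64.128 global lattice, leaving a 32.32.32.32 local lattice per GPU; the size-loc32 path component presumably refers to this local extent. A quick standalone check, not part of the original job:

#!/usr/bin/env bash
grid=(64 64 64 128); mpi=(2 2 2 4)
ranks=1
for i in 0 1 2 3; do
    echo "dim ${i}: local extent $(( grid[i] / mpi[i] ))"   # prints 32 each time
    ranks=$(( ranks * mpi[i] ))
done
echo "total ranks: ${ranks}"   # prints 32, matching #SBATCH --ntasks=32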

View File

@ -0,0 +1,2 @@
Tue Aug 23 02:20:33 BST 2022
epoch 1661217633

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Tue Aug 23 02:42:07 BST 2022
epoch 1661218927

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffc35509000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x00001501d8950000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x00001501d8588000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x00001501d8096000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x00001501d7d6c000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x00001501d7a8b000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x00001501d782a000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x00001501d88d7000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x00001501d744a000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x00001501d5cee000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x00001501d591e000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x00001501d567d000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x00001501d5552000)
libm.so.6 => /lib64/libm.so.6 (0x00001501d51d0000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x00001501d4f99000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x00001501d4d81000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x00001501d4b61000)
libc.so.6 => /lib64/libc.so.6 (0x00001501d479c000)
libdl.so.2 => /lib64/libdl.so.2 (0x00001501d4598000)
/lib64/ld-linux-x86-64.so.2 (0x00001501d87a0000)
librt.so.1 => /lib64/librt.so.1 (0x00001501d4390000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x00001501d880b000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x00001501d8806000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x00001501d4284000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x00001501d407a000)
libutil.so.1 => /lib64/libutil.so.1 (0x00001501d3e76000)

View File

@ -0,0 +1,254 @@
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
local rank 1 device 0 bus id: 0000:44:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
local rank 2 device 0 bus id: 0000:84:00.0
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648 bytes at 0x14aa80000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommitted changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.383615 s : Grid Layout
Grid : Message : 1.383619 s : Global lattice size : 64 64 64 128
Grid : Message : 1.383627 s : OpenMP threads : 4
Grid : Message : 1.383630 s : MPI tasks : 2 2 2 4
Grid : Message : 1.426416 s : Making s innermost grids
Grid : Message : 1.472587 s : Initialising 4d RNG
Grid : Message : 1.567580 s : Initialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.567607 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 2.379949 s : Initialising 5d RNG
Grid : Message : 3.819686 s : Initialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 3.819712 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 20.471267 s : Initialised RNGs
Grid : Message : 25.497600 s : Drawing gauge field
Grid : Message : 25.826925 s : Random gauge initialised
Grid : Message : 25.842484 s : Setting up Cshift based reference
Grid : Message : 54.870795 s : *****************************************************************
Grid : Message : 54.870825 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 54.870827 s : *****************************************************************
Grid : Message : 54.870829 s : *****************************************************************
Grid : Message : 54.870830 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 54.870839 s : * Vectorising space-time by 8
Grid : Message : 54.870841 s : * VComplexF size is 64 B
Grid : Message : 54.870843 s : * SINGLE precision
Grid : Message : 54.870846 s : * Using Overlapped Comms/Compute
Grid : Message : 54.870848 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 54.870850 s : *****************************************************************
Grid : Message : 56.943818 s : Called warmup
Grid : Message : 339.527765 s : Called Dw 30000 times in 2.82583e+08 us
Grid : Message : 339.527813 s : mflop/s = 7.52349e+07
Grid : Message : 339.527816 s : mflop/s per rank = 2.35109e+06
Grid : Message : 339.527823 s : mflop/s per node = 9.40436e+06
Grid : Message : 339.527826 s : RF GiB/s (base 2) = 152876
Grid : Message : 339.527830 s : mem GiB/s (base 2) = 95547.2
Grid : Message : 339.531335 s : norm diff 1.07359e-13
Grid : Message : 339.580818 s : #### Dhop calls report
Grid : Message : 339.580824 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 339.580827 s : WilsonFermion5D TotalTime /Calls : 4713.67 us
Grid : Message : 339.580829 s : WilsonFermion5D CommTime /Calls : 3193.06 us
Grid : Message : 339.580831 s : WilsonFermion5D FaceTime /Calls : 475.013 us
Grid : Message : 339.580833 s : WilsonFermion5D ComputeTime1/Calls : 4.81574 us
Grid : Message : 339.580835 s : WilsonFermion5D ComputeTime2/Calls : 1059.46 us
Grid : Message : 339.580923 s : Average mflops/s per call : 6.07786e+10
Grid : Message : 339.580927 s : Average mflops/s per call per rank : 1.89933e+09
Grid : Message : 339.580929 s : Average mflops/s per call per node : 7.59733e+09
Grid : Message : 339.580931 s : Average mflops/s per call (full) : 7.65385e+07
Grid : Message : 339.580933 s : Average mflops/s per call per rank (full): 2.39183e+06
Grid : Message : 339.580937 s : Average mflops/s per call per node (full): 9.56731e+06
Grid : Message : 339.580940 s : WilsonFermion5D Stencil
Grid : Message : 339.580942 s : WilsonFermion5D StencilEven
Grid : Message : 339.580944 s : WilsonFermion5D StencilOdd
Grid : Message : 339.580945 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 339.580947 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 339.580949 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 394.987790 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 394.987814 s : Called DwDag
Grid : Message : 394.987815 s : norm dag result 12.0421
Grid : Message : 395.185510 s : norm dag ref 12.0421
Grid : Message : 395.346940 s : norm dag diff 7.28475e-14
Grid : Message : 395.773530 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 395.465746 s : src_e0.499997
Grid : Message : 395.917171 s : src_o0.500003
Grid : Message : 396.574200 s : *********************************************************
Grid : Message : 396.574240 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 396.574250 s : * Vectorising space-time by 8
Grid : Message : 396.574260 s : * SINGLE precision
Grid : Message : 396.574270 s : * Using Overlapped Comms/Compute
Grid : Message : 396.574280 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 396.574290 s : *********************************************************
Grid : Message : 535.834881 s : Deo mflop/s = 7.60799e+07
Grid : Message : 535.834913 s : Deo mflop/s per rank 2.3775e+06
Grid : Message : 535.834915 s : Deo mflop/s per node 9.50999e+06
Grid : Message : 535.834918 s : #### Dhop calls report
Grid : Message : 535.834920 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 535.834922 s : WilsonFermion5D TotalTime /Calls : 4658.97 us
Grid : Message : 535.834924 s : WilsonFermion5D CommTime /Calls : 3090.81 us
Grid : Message : 535.834930 s : WilsonFermion5D FaceTime /Calls : 583.833 us
Grid : Message : 535.834935 s : WilsonFermion5D ComputeTime1/Calls : 5.88087 us
Grid : Message : 535.834939 s : WilsonFermion5D ComputeTime2/Calls : 1011.38 us
Grid : Message : 535.834960 s : Average mflops/s per call : 5.08454e+10
Grid : Message : 535.834964 s : Average mflops/s per call per rank : 1.58892e+09
Grid : Message : 535.834966 s : Average mflops/s per call per node : 6.35567e+09
Grid : Message : 535.834969 s : Average mflops/s per call (full) : 7.74371e+07
Grid : Message : 535.834973 s : Average mflops/s per call per rank (full): 2.41991e+06
Grid : Message : 535.834975 s : Average mflops/s per call per node (full): 9.67963e+06
Grid : Message : 535.834978 s : WilsonFermion5D Stencil
Grid : Message : 535.834979 s : WilsonFermion5D StencilEven
Grid : Message : 535.834981 s : WilsonFermion5D StencilOdd
Grid : Message : 535.834983 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 535.834985 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 535.834988 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 535.907590 s : r_e6.02113
Grid : Message : 535.918975 s : r_o6.02101
Grid : Message : 535.925515 s : res12.0421
Grid : Message : 536.576844 s : norm diff 0
Grid : Message : 537.448681 s : norm diff even 0
Grid : Message : 537.774321 s : norm diff odd 0
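The r_e/r_o/res lines above are a checkerboard consistency check: Grid's norm2 is a sum of |psi|^2 over sites, so the even- and odd-site contributions must add up to the norm of the full unpreconditioned result. Checking with the values printed above:

awk 'BEGIN { printf "r_e + r_o = %.5f\n", 6.02113 + 6.02101 }'
# prints r_e + r_o = 12.04214, consistent with "res12.0421" and the
# "Deo+Doe == Dunprec" assertion mentioned earlier in the log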

View File

@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-8A-1050
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd "${app}" > "${job_info_dir}/ldd"
md5sum "${app}" > "${job_info_dir}/app-hash"
readelf -a "${app}" > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1050
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
    "${app}" ${par:+"${par}"} "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 64.64.64.128 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUS ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
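These job scripts stop their GPU monitor with kill -INT "${COPROC_PID}": bash's coproc runs the dmon process in the background and exposes its PID in COPROC_PID. A minimal standalone sketch of that pattern, with sleep standing in for both nvidia-smi dmon and the mpirun step:

#!/usr/bin/env bash
tmp=$(mktemp)
coproc sleep 3600 &> "${tmp}"     # background monitor (dmon in the real script)
echo "monitor running as PID ${COPROC_PID}"
sleep 2                           # stand-in for the actual workload
kill -INT "${COPROC_PID}"         # interrupt the monitor once the run is done
wait "${COPROC_PID}" 2>/dev/null || true
echo "monitor output (if any) captured in ${tmp}"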

View File

@ -0,0 +1,2 @@
Tue Aug 23 02:33:03 BST 2022
epoch 1661218383

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Tue Aug 23 02:54:33 BST 2022
epoch 1661219673

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffc084f3000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014e7b474a000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014e7b4382000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014e7b3e90000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014e7b3b66000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014e7b3885000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014e7b3624000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014e7b46d1000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014e7b3244000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014e7b1ae8000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014e7b1718000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014e7b1477000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014e7b134c000)
libm.so.6 => /lib64/libm.so.6 (0x000014e7b0fca000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014e7b0d93000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014e7b0b7b000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014e7b095b000)
libc.so.6 => /lib64/libc.so.6 (0x000014e7b0596000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014e7b0392000)
/lib64/ld-linux-x86-64.so.2 (0x000014e7b459a000)
librt.so.1 => /lib64/librt.so.1 (0x000014e7b018a000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014e7b4605000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014e7b4600000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014e7b007e000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014e7afe74000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014e7afc70000)

View File

@ -0,0 +1,254 @@
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
local rank 1 device 0 bus id: 0000:44:00.0
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648 bytes at 0x14eea0000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommitted changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.366393 s : Grid Layout
Grid : Message : 1.366397 s : Global lattice size : 64 64 64 128
Grid : Message : 1.366402 s : OpenMP threads : 4
Grid : Message : 1.366404 s : MPI tasks : 2 2 2 4
Grid : Message : 1.407148 s : Making s innermost grids
Grid : Message : 1.464257 s : Initialising 4d RNG
Grid : Message : 1.557730 s : Initialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.557758 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 2.356467 s : Initialising 5d RNG
Grid : Message : 3.801979 s : Initialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 3.802012 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 20.483524 s : Initialised RNGs
Grid : Message : 24.230918 s : Drawing gauge field
Grid : Message : 25.177490 s : Random gauge initialised
Grid : Message : 25.295480 s : Setting up Cshift based reference
Grid : Message : 54.973180 s : *****************************************************************
Grid : Message : 54.973410 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 54.973430 s : *****************************************************************
Grid : Message : 54.973440 s : *****************************************************************
Grid : Message : 54.973490 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 54.973510 s : * Vectorising space-time by 8
Grid : Message : 54.973530 s : * VComplexF size is 64 B
Grid : Message : 54.973560 s : * SINGLE precision
Grid : Message : 54.973580 s : * Using Overlapped Comms/Compute
Grid : Message : 54.973600 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 54.973630 s : *****************************************************************
Grid : Message : 56.188907 s : Called warmup
Grid : Message : 338.221994 s : Called Dw 30000 times in 2.82032e+08 us
Grid : Message : 338.222041 s : mflop/s = 7.53818e+07
Grid : Message : 338.222043 s : mflop/s per rank = 2.35568e+06
Grid : Message : 338.222045 s : mflop/s per node = 9.42273e+06
Grid : Message : 338.222047 s : RF GiB/s (base 2) = 153174
Grid : Message : 338.222049 s : mem GiB/s (base 2) = 95733.8
Grid : Message : 338.225548 s : norm diff 1.07359e-13
Grid : Message : 338.275111 s : #### Dhop calls report
Grid : Message : 338.275118 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 338.275121 s : WilsonFermion5D TotalTime /Calls : 4704.45 us
Grid : Message : 338.275123 s : WilsonFermion5D CommTime /Calls : 3197.7 us
Grid : Message : 338.275125 s : WilsonFermion5D FaceTime /Calls : 471.81 us
Grid : Message : 338.275127 s : WilsonFermion5D ComputeTime1/Calls : 5.0956 us
Grid : Message : 338.275129 s : WilsonFermion5D ComputeTime2/Calls : 1048.58 us
Grid : Message : 338.275196 s : Average mflops/s per call : 6.11343e+10
Grid : Message : 338.275200 s : Average mflops/s per call per rank : 1.91045e+09
Grid : Message : 338.275202 s : Average mflops/s per call per node : 7.64179e+09
Grid : Message : 338.275204 s : Average mflops/s per call (full) : 7.66885e+07
Grid : Message : 338.275206 s : Average mflops/s per call per rank (full): 2.39652e+06
Grid : Message : 338.275208 s : Average mflops/s per call per node (full): 9.58606e+06
Grid : Message : 338.275211 s : WilsonFermion5D Stencil
Grid : Message : 338.275212 s : WilsonFermion5D StencilEven
Grid : Message : 338.275213 s : WilsonFermion5D StencilOdd
Grid : Message : 338.275214 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 338.275215 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 338.275216 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 393.586448 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 393.586467 s : Called DwDag
Grid : Message : 393.586468 s : norm dag result 12.0421
Grid : Message : 393.600340 s : norm dag ref 12.0421
Grid : Message : 393.616373 s : norm dag diff 7.28475e-14
Grid : Message : 393.662063 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 394.592240 s : src_e0.499997
Grid : Message : 394.431906 s : src_o0.500003
Grid : Message : 394.530690 s : *********************************************************
Grid : Message : 394.530693 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 394.530694 s : * Vectorising space-time by 8
Grid : Message : 394.530695 s : * SINGLE precision
Grid : Message : 394.530696 s : * Using Overlapped Comms/Compute
Grid : Message : 394.530697 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 394.530698 s : *********************************************************
Grid : Message : 534.917450 s : Deo mflop/s = 7.61957e+07
Grid : Message : 534.917770 s : Deo mflop/s per rank 2.38112e+06
Grid : Message : 534.917790 s : Deo mflop/s per node 9.52446e+06
Grid : Message : 534.917860 s : #### Dhop calls report
Grid : Message : 534.917900 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 534.917930 s : WilsonFermion5D TotalTime /Calls : 4651.72 us
Grid : Message : 534.917970 s : WilsonFermion5D CommTime /Calls : 3091.21 us
Grid : Message : 534.918000 s : WilsonFermion5D FaceTime /Calls : 583.213 us
Grid : Message : 534.918030 s : WilsonFermion5D ComputeTime1/Calls : 6.04677 us
Grid : Message : 534.918070 s : WilsonFermion5D ComputeTime2/Calls : 1003.12 us
Grid : Message : 534.918280 s : Average mflops/s per call : 5.15468e+10
Grid : Message : 534.918320 s : Average mflops/s per call per rank : 1.61084e+09
Grid : Message : 534.918350 s : Average mflops/s per call per node : 6.44335e+09
Grid : Message : 534.918380 s : Average mflops/s per call (full) : 7.75578e+07
Grid : Message : 534.918410 s : Average mflops/s per call per rank (full): 2.42368e+06
Grid : Message : 534.918450 s : Average mflops/s per call per node (full): 9.69473e+06
Grid : Message : 534.918480 s : WilsonFermion5D Stencil
Grid : Message : 534.918510 s : WilsonFermion5D StencilEven
Grid : Message : 534.918520 s : WilsonFermion5D StencilOdd
Grid : Message : 534.918540 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 534.918560 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 534.918590 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 534.162791 s : r_e6.02113
Grid : Message : 534.171110 s : r_o6.02101
Grid : Message : 534.177848 s : res12.0421
Grid : Message : 534.858243 s : norm diff 0
Grid : Message : 535.620756 s : norm diff even 0
Grid : Message : 536.317800 s : norm diff odd 0

View File

@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-8A-1065
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd "${app}" > "${job_info_dir}/ldd"
md5sum "${app}" > "${job_info_dir}/app-hash"
readelf -a "${app}" > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1065
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
    "${app}" ${par:+"${par}"} "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 64.64.64.128 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUS ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
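Taken together, the three job files above form a clock sweep: they are identical except for freq (1035, 1050, 1065 MHz). nvidia-smi -ac pins the <memory,graphics> application clocks, and the final 1215,1410 presumably restores the defaults for these A100s. Illustrative only, the sweep condensed to one loop:

#!/usr/bin/env bash
for freq in 1035 1050 1065; do
    echo "would pin clocks with: nvidia-smi -ac 1215,${freq}"
done
echo "and reset afterwards with: nvidia-smi -ac 1215,1410"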

View File

@ -0,0 +1,2 @@
Tue Aug 23 02:45:31 BST 2022
epoch 1661219131

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Tue Aug 23 03:07:02 BST 2022
epoch 1661220422

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffcea739000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x0000155068944000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000015506857c000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000015506808a000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x0000155067d60000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x0000155067a7f000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000015506781e000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x00001550688cb000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000015506743e000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x0000155065ce2000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x0000155065912000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x0000155065671000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x0000155065546000)
libm.so.6 => /lib64/libm.so.6 (0x00001550651c4000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x0000155064f8d000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x0000155064d75000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x0000155064b55000)
libc.so.6 => /lib64/libc.so.6 (0x0000155064790000)
libdl.so.2 => /lib64/libdl.so.2 (0x000015506458c000)
/lib64/ld-linux-x86-64.so.2 (0x0000155068794000)
librt.so.1 => /lib64/librt.so.1 (0x0000155064384000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x00001550687ff000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x00001550687fa000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x0000155064278000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000015506406e000)
libutil.so.1 => /lib64/libutil.so.1 (0x0000155063e6a000)

View File

@ -0,0 +1,254 @@
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
local rank 1 device 0 bus id: 0000:44:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x1508e0000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.278711 s : Grid Layout
Grid : Message : 1.278716 s : Global lattice size : 64 64 64 128
Grid : Message : 1.278724 s : OpenMP threads : 4
Grid : Message : 1.278728 s : MPI tasks : 2 2 2 4
Grid : Message : 1.317967 s : Making s innermost grids
Grid : Message : 1.383230 s : Initialising 4d RNG
Grid : Message : 1.475617 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.475643 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 2.275104 s : Initialising 5d RNG
Grid : Message : 3.714759 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 3.714789 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 20.388702 s : Initialised RNGs
Grid : Message : 24.670402 s : Drawing gauge field
Grid : Message : 25.467328 s : Random gauge initialised
Grid : Message : 25.482764 s : Setting up Cshift based reference
Grid : Message : 54.598912 s : *****************************************************************
Grid : Message : 54.598935 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 54.598936 s : *****************************************************************
Grid : Message : 54.598937 s : *****************************************************************
Grid : Message : 54.598938 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 54.598939 s : * Vectorising space-time by 8
Grid : Message : 54.598940 s : * VComplexF size is 64 B
Grid : Message : 54.598941 s : * SINGLE precision
Grid : Message : 54.598943 s : * Using Overlapped Comms/Compute
Grid : Message : 54.598944 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 54.598945 s : *****************************************************************
Grid : Message : 56.670371 s : Called warmup
Grid : Message : 338.287712 s : Called Dw 30000 times in 2.81616e+08 us
Grid : Message : 338.287768 s : mflop/s = 7.54932e+07
Grid : Message : 338.287770 s : mflop/s per rank = 2.35916e+06
Grid : Message : 338.287772 s : mflop/s per node = 9.43665e+06
Grid : Message : 338.287774 s : RF GiB/s (base 2) = 153400
Grid : Message : 338.287776 s : mem GiB/s (base 2) = 95875.3
Grid : Message : 338.291283 s : norm diff 1.07359e-13
Grid : Message : 338.340583 s : #### Dhop calls report
Grid : Message : 338.340590 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 338.340593 s : WilsonFermion5D TotalTime /Calls : 4697.65 us
Grid : Message : 338.340597 s : WilsonFermion5D CommTime /Calls : 3173.49 us
Grid : Message : 338.340601 s : WilsonFermion5D FaceTime /Calls : 473.12 us
Grid : Message : 338.340603 s : WilsonFermion5D ComputeTime1/Calls : 5.07821 us
Grid : Message : 338.340606 s : WilsonFermion5D ComputeTime2/Calls : 1065.26 us
Grid : Message : 338.340704 s : Average mflops/s per call : 6.05431e+10
Grid : Message : 338.340708 s : Average mflops/s per call per rank : 1.89197e+09
Grid : Message : 338.340710 s : Average mflops/s per call per node : 7.56789e+09
Grid : Message : 338.340713 s : Average mflops/s per call (full) : 7.67995e+07
Grid : Message : 338.340716 s : Average mflops/s per call per rank (full): 2.39998e+06
Grid : Message : 338.340719 s : Average mflops/s per call per node (full): 9.59993e+06
Grid : Message : 338.340722 s : WilsonFermion5D Stencil
Grid : Message : 338.340723 s : WilsonFermion5D StencilEven
Grid : Message : 338.340724 s : WilsonFermion5D StencilOdd
Grid : Message : 338.340725 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 338.340729 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 338.340730 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 393.531951 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 393.531972 s : Called DwDag
Grid : Message : 393.531973 s : norm dag result 12.0421
Grid : Message : 393.550274 s : norm dag ref 12.0421
Grid : Message : 393.566206 s : norm dag diff 7.28475e-14
Grid : Message : 393.614226 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 393.987152 s : src_e0.499997
Grid : Message : 394.411352 s : src_o0.500003
Grid : Message : 394.510104 s : *********************************************************
Grid : Message : 394.510107 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 394.510108 s : * Vectorising space-time by 8
Grid : Message : 394.510109 s : * SINGLE precision
Grid : Message : 394.510110 s : * Using Overlapped Comms/Compute
Grid : Message : 394.510111 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 394.510112 s : *********************************************************
Grid : Message : 533.445236 s : Deo mflop/s = 7.65411e+07
Grid : Message : 533.445265 s : Deo mflop/s per rank 2.39191e+06
Grid : Message : 533.445267 s : Deo mflop/s per node 9.56764e+06
Grid : Message : 533.445270 s : #### Dhop calls report
Grid : Message : 533.445272 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 533.445274 s : WilsonFermion5D TotalTime /Calls : 4630.89 us
Grid : Message : 533.445276 s : WilsonFermion5D CommTime /Calls : 3066.01 us
Grid : Message : 533.445278 s : WilsonFermion5D FaceTime /Calls : 580.865 us
Grid : Message : 533.445280 s : WilsonFermion5D ComputeTime1/Calls : 6.1052 us
Grid : Message : 533.445282 s : WilsonFermion5D ComputeTime2/Calls : 1011.57 us
Grid : Message : 533.445307 s : Average mflops/s per call : 5.14748e+10
Grid : Message : 533.445313 s : Average mflops/s per call per rank : 1.60859e+09
Grid : Message : 533.445316 s : Average mflops/s per call per node : 6.43435e+09
Grid : Message : 533.445319 s : Average mflops/s per call (full) : 7.79067e+07
Grid : Message : 533.445323 s : Average mflops/s per call per rank (full): 2.43458e+06
Grid : Message : 533.445326 s : Average mflops/s per call per node (full): 9.73833e+06
Grid : Message : 533.445328 s : WilsonFermion5D Stencil
Grid : Message : 533.445330 s : WilsonFermion5D StencilEven
Grid : Message : 533.445332 s : WilsonFermion5D StencilOdd
Grid : Message : 533.445334 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 533.445336 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 533.445337 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 533.517980 s : r_e6.02113
Grid : Message : 533.525139 s : r_o6.02101
Grid : Message : 533.531555 s : res12.0421
Grid : Message : 534.208348 s : norm diff 0
Grid : Message : 534.958399 s : norm diff even 0
Grid : Message : 535.407067 s : norm diff odd 0

View File

@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-8A-1080
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd "${app}" > "${job_info_dir}/ldd"
md5sum "${app}" > "${job_info_dir}/app-hash"
readelf -a "${app}" > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1080
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 64.64.64.128 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point, the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
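dmon-to-db.sh is referenced above but not included in this diff. For illustration only: the '-o DT' samples it consumes carry a date and a time column in front of the usual numeric columns, with '#'-prefixed header and unit lines, so they can be flattened into tagged CSV rows by a short awk pass. The script name and the tag handling below are assumptions, not the real tool:

#!/usr/bin/env bash
# dmon-to-csv.sh -- hypothetical stand-in, *not* the dmon-to-db.sh used
# above: flatten 'nvidia-smi dmon -o DT' samples into tagged CSV on stdout
in="${1:?usage: $0 <dmon-output> [tag]}"
tag="${2:-untagged}"
awk -v tag="${tag}" '
    /^#/ { next }                       # skip header and unit lines
    NF   {                              # Date Time gpu pwr gtemp mtemp ...
        row = tag
        for (i = 1; i <= NF; i++) row = row "," $i
        print row
    }
' "${in}"

A call mirroring the job script would be: ./dmon-to-csv.sh "${tmp}" "clock_limit_${freq}" > samples.csv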

View File

@ -0,0 +1,2 @@
Tue Aug 23 02:58:01 BST 2022
epoch 1661219881

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Tue Aug 23 03:19:28 BST 2022
epoch 1661221168

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffe10fbb000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014e051810000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014e051448000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014e050f56000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014e050c2c000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014e05094b000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014e0506ea000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014e051797000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014e05030a000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014e04ebae000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014e04e7de000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014e04e53d000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014e04e412000)
libm.so.6 => /lib64/libm.so.6 (0x000014e04e090000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014e04de59000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014e04dc41000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014e04da21000)
libc.so.6 => /lib64/libc.so.6 (0x000014e04d65c000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014e04d458000)
/lib64/ld-linux-x86-64.so.2 (0x000014e051660000)
librt.so.1 => /lib64/librt.so.1 (0x000014e04d250000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014e0516cb000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014e0516c6000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014e04d144000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014e04cf3a000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014e04cd36000)

View File

@ -0,0 +1,254 @@
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
local rank 1 device 0 bus id: 0000:44:00.0
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x145b60000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.417949 s : Grid Layout
Grid : Message : 1.417957 s : Global lattice size : 64 64 64 128
Grid : Message : 1.417963 s : OpenMP threads : 4
Grid : Message : 1.417965 s : MPI tasks : 2 2 2 4
Grid : Message : 1.456030 s : Making s innermost grids
Grid : Message : 1.519833 s : Initialising 4d RNG
Grid : Message : 1.609461 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.609488 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 2.407737 s : Initialising 5d RNG
Grid : Message : 3.807194 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 3.807228 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 20.500197 s : Initialised RNGs
Grid : Message : 25.140001 s : Drawing gauge field
Grid : Message : 25.625310 s : Random gauge initialised
Grid : Message : 25.637123 s : Setting up Cshift based reference
Grid : Message : 54.900199 s : *****************************************************************
Grid : Message : 54.900217 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 54.900219 s : *****************************************************************
Grid : Message : 54.900220 s : *****************************************************************
Grid : Message : 54.900221 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 54.900222 s : * Vectorising space-time by 8
Grid : Message : 54.900223 s : * VComplexF size is 64 B
Grid : Message : 54.900224 s : * SINGLE precision
Grid : Message : 54.900226 s : * Using Overlapped Comms/Compute
Grid : Message : 54.900227 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 54.900228 s : *****************************************************************
Grid : Message : 56.902046 s : Called warmup
Grid : Message : 338.493870 s : Called Dw 30000 times in 2.81147e+08 us
Grid : Message : 338.494560 s : mflop/s = 7.56192e+07
Grid : Message : 338.494630 s : mflop/s per rank = 2.3631e+06
Grid : Message : 338.494660 s : mflop/s per node = 9.4524e+06
Grid : Message : 338.494690 s : RF GiB/s (base 2) = 153656
Grid : Message : 338.494720 s : mem GiB/s (base 2) = 96035.3
Grid : Message : 338.529840 s : norm diff 1.07359e-13
Grid : Message : 338.101662 s : #### Dhop calls report
Grid : Message : 338.101668 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 338.101671 s : WilsonFermion5D TotalTime /Calls : 4689.71 us
Grid : Message : 338.101673 s : WilsonFermion5D CommTime /Calls : 3173.53 us
Grid : Message : 338.101675 s : WilsonFermion5D FaceTime /Calls : 471.412 us
Grid : Message : 338.101677 s : WilsonFermion5D ComputeTime1/Calls : 4.96082 us
Grid : Message : 338.101679 s : WilsonFermion5D ComputeTime2/Calls : 1058.91 us
Grid : Message : 338.101768 s : Average mflops/s per call : 6.14534e+10
Grid : Message : 338.101771 s : Average mflops/s per call per rank : 1.92042e+09
Grid : Message : 338.101773 s : Average mflops/s per call per node : 7.68168e+09
Grid : Message : 338.101775 s : Average mflops/s per call (full) : 7.69295e+07
Grid : Message : 338.101777 s : Average mflops/s per call per rank (full): 2.40405e+06
Grid : Message : 338.101779 s : Average mflops/s per call per node (full): 9.61619e+06
Grid : Message : 338.101781 s : WilsonFermion5D Stencil
Grid : Message : 338.101782 s : WilsonFermion5D StencilEven
Grid : Message : 338.101783 s : WilsonFermion5D StencilOdd
Grid : Message : 338.101784 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 338.101785 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 338.101786 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 393.332960 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 393.333200 s : Called DwDag
Grid : Message : 393.333210 s : norm dag result 12.0421
Grid : Message : 393.535800 s : norm dag ref 12.0421
Grid : Message : 393.694460 s : norm dag diff 7.28475e-14
Grid : Message : 393.117660 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 393.554222 s : src_e0.499997
Grid : Message : 393.883369 s : src_o0.500003
Grid : Message : 393.981150 s : *********************************************************
Grid : Message : 393.981152 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 393.981154 s : * Vectorising space-time by 8
Grid : Message : 393.981155 s : * SINGLE precision
Grid : Message : 393.981156 s : * Using Overlapped Comms/Compute
Grid : Message : 393.981157 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 393.981158 s : *********************************************************
Grid : Message : 532.261495 s : Deo mflop/s = 7.69025e+07
Grid : Message : 532.261527 s : Deo mflop/s per rank 2.4032e+06
Grid : Message : 532.261529 s : Deo mflop/s per node 9.61281e+06
Grid : Message : 532.261532 s : #### Dhop calls report
Grid : Message : 532.261534 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 532.261536 s : WilsonFermion5D TotalTime /Calls : 4609.05 us
Grid : Message : 532.261538 s : WilsonFermion5D CommTime /Calls : 3043.36 us
Grid : Message : 532.261540 s : WilsonFermion5D FaceTime /Calls : 576.704 us
Grid : Message : 532.261542 s : WilsonFermion5D ComputeTime1/Calls : 6.1045 us
Grid : Message : 532.261544 s : WilsonFermion5D ComputeTime2/Calls : 1016.01 us
Grid : Message : 532.261569 s : Average mflops/s per call : 5.12445e+10
Grid : Message : 532.261572 s : Average mflops/s per call per rank : 1.60139e+09
Grid : Message : 532.261574 s : Average mflops/s per call per node : 6.40556e+09
Grid : Message : 532.261576 s : Average mflops/s per call (full) : 7.82759e+07
Grid : Message : 532.261578 s : Average mflops/s per call per rank (full): 2.44612e+06
Grid : Message : 532.261580 s : Average mflops/s per call per node (full): 9.78449e+06
Grid : Message : 532.261582 s : WilsonFermion5D Stencil
Grid : Message : 532.261583 s : WilsonFermion5D StencilEven
Grid : Message : 532.261585 s : WilsonFermion5D StencilOdd
Grid : Message : 532.261586 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 532.261587 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 532.261588 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 532.334163 s : r_e6.02113
Grid : Message : 532.341225 s : r_o6.02101
Grid : Message : 532.347608 s : res12.0421
Grid : Message : 533.303030 s : norm diff 0
Grid : Message : 533.802608 s : norm diff even 0
Grid : Message : 534.170331 s : norm diff odd 0

View File

@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-8A-1095
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd "${app}" > "${job_info_dir}/ldd"
md5sum "${app}" > "${job_info_dir}/app-hash"
readelf -a "${app}" > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1095
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 64.64.64.128 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point, the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
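remote-sudo.sh is likewise not part of this diff. Given that the job scripts run 'spack load sshpass' before calling it, a plausible stand-in is sketched below; the password file path, the ssh options, and the assumption of passwordless sudo for nvidia-smi on the compute nodes are all guesses, not the real helper:

#!/usr/bin/env bash
# remote-sudo.sh -- plausible stand-in (the real helper is not in this
# diff): run a privileged command on a single node via sshpass + ssh;
# assumes a password file and passwordless sudo for the target command
host="${1:?usage: $0 <host> <command...>}"
shift
sshpass -f "${HOME}/.power-bench-pass" \
    ssh -o StrictHostKeyChecking=no "${host}" "sudo $*"

Invoked as in the clock-reset loop above: ./remote-sudo.sh tu-c0r1n72 'nvidia-smi -ac 1215,1410'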

View File

@ -0,0 +1,2 @@
Tue Aug 23 03:10:28 BST 2022
epoch 1661220628

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Tue Aug 23 03:31:53 BST 2022
epoch 1661221913

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffe747d5000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014a1749b3000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014a1745eb000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014a1740f9000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014a173dcf000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014a173aee000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014a17388d000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014a17493a000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014a1734ad000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014a171d51000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014a171981000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014a1716e0000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014a1715b5000)
libm.so.6 => /lib64/libm.so.6 (0x000014a171233000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014a170ffc000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014a170de4000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014a170bc4000)
libc.so.6 => /lib64/libc.so.6 (0x000014a1707ff000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014a1705fb000)
/lib64/ld-linux-x86-64.so.2 (0x000014a174803000)
librt.so.1 => /lib64/librt.so.1 (0x000014a1703f3000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014a17486e000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014a174869000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014a1702e7000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014a1700dd000)
libutil.so.1 => /lib64/libutil.so.1 (0x000014a16fed9000)

View File

@ -0,0 +1,254 @@
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
local rank 1 device 0 bus id: 0000:44:00.0
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x150400000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.553137 s : Grid Layout
Grid : Message : 1.553140 s : Global lattice size : 64 64 64 128
Grid : Message : 1.553144 s : OpenMP threads : 4
Grid : Message : 1.553146 s : MPI tasks : 2 2 2 4
Grid : Message : 1.591415 s : Making s innermost grids
Grid : Message : 1.643527 s : Initialising 4d RNG
Grid : Message : 1.733769 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.733793 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 2.687944 s : Initialising 5d RNG
Grid : Message : 4.932540 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 4.933190 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 21.773430 s : Initialised RNGs
Grid : Message : 24.775839 s : Drawing gauge field
Grid : Message : 25.506673 s : Random gauge initialised
Grid : Message : 25.516620 s : Setting up Cshift based reference
Grid : Message : 54.420958 s : *****************************************************************
Grid : Message : 54.420980 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 54.420981 s : *****************************************************************
Grid : Message : 54.420982 s : *****************************************************************
Grid : Message : 54.420983 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 54.420984 s : * Vectorising space-time by 8
Grid : Message : 54.420985 s : * VComplexF size is 64 B
Grid : Message : 54.420986 s : * SINGLE precision
Grid : Message : 54.420987 s : * Using Overlapped Comms/Compute
Grid : Message : 54.420988 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 54.420989 s : *****************************************************************
Grid : Message : 56.418238 s : Called warmup
Grid : Message : 337.944860 s : Called Dw 30000 times in 2.80676e+08 us
Grid : Message : 337.945460 s : mflop/s = 7.5746e+07
Grid : Message : 337.945480 s : mflop/s per rank = 2.36706e+06
Grid : Message : 337.945500 s : mflop/s per node = 9.46825e+06
Grid : Message : 337.945520 s : RF GiB/s (base 2) = 153914
Grid : Message : 337.945540 s : mem GiB/s (base 2) = 96196.3
Grid : Message : 337.980600 s : norm diff 1.07359e-13
Grid : Message : 337.146303 s : #### Dhop calls report
Grid : Message : 337.146310 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 337.146313 s : WilsonFermion5D TotalTime /Calls : 4682.02 us
Grid : Message : 337.146315 s : WilsonFermion5D CommTime /Calls : 3157.4 us
Grid : Message : 337.146317 s : WilsonFermion5D FaceTime /Calls : 471.742 us
Grid : Message : 337.146319 s : WilsonFermion5D ComputeTime1/Calls : 4.66553 us
Grid : Message : 337.146324 s : WilsonFermion5D ComputeTime2/Calls : 1066.42 us
Grid : Message : 337.146339 s : Average mflops/s per call : 6.05546e+10
Grid : Message : 337.146342 s : Average mflops/s per call per rank : 1.89233e+09
Grid : Message : 337.146344 s : Average mflops/s per call per node : 7.56933e+09
Grid : Message : 337.146347 s : Average mflops/s per call (full) : 7.70559e+07
Grid : Message : 337.146349 s : Average mflops/s per call per rank (full): 2.408e+06
Grid : Message : 337.146352 s : Average mflops/s per call per node (full): 9.63198e+06
Grid : Message : 337.146354 s : WilsonFermion5D Stencil
Grid : Message : 337.146355 s : WilsonFermion5D StencilEven
Grid : Message : 337.146356 s : WilsonFermion5D StencilOdd
Grid : Message : 337.146357 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 337.146358 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 337.146361 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 392.570148 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 392.570167 s : Called DwDag
Grid : Message : 392.570168 s : norm dag result 12.0421
Grid : Message : 392.597817 s : norm dag ref 12.0421
Grid : Message : 392.613608 s : norm dag diff 7.28475e-14
Grid : Message : 392.655240 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 393.337360 s : src_e 0.499997
Grid : Message : 393.428755 s : src_o 0.500003
Grid : Message : 393.551540 s : *********************************************************
Grid : Message : 393.551542 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 393.551543 s : * Vectorising space-time by 8
Grid : Message : 393.551544 s : * SINGLE precision
Grid : Message : 393.551545 s : * Using Overlapped Comms/Compute
Grid : Message : 393.551546 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 393.551547 s : *********************************************************
Grid : Message : 531.331972 s : Deo mflop/s = 7.71814e+07
Grid : Message : 531.331998 s : Deo mflop/s per rank 2.41192e+06
Grid : Message : 531.332000 s : Deo mflop/s per node 9.64768e+06
Grid : Message : 531.332003 s : #### Dhop calls report
Grid : Message : 531.332008 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 531.332011 s : WilsonFermion5D TotalTime /Calls : 4592.41 us
Grid : Message : 531.332015 s : WilsonFermion5D CommTime /Calls : 3024.25 us
Grid : Message : 531.332017 s : WilsonFermion5D FaceTime /Calls : 573.717 us
Grid : Message : 531.332020 s : WilsonFermion5D ComputeTime1/Calls : 5.83554 us
Grid : Message : 531.332023 s : WilsonFermion5D ComputeTime2/Calls : 1021.88 us
Grid : Message : 531.332042 s : Average mflops/s per call : 5.06597e+10
Grid : Message : 531.332046 s : Average mflops/s per call per rank : 1.58312e+09
Grid : Message : 531.332048 s : Average mflops/s per call per node : 6.33246e+09
Grid : Message : 531.332050 s : Average mflops/s per call (full) : 7.85594e+07
Grid : Message : 531.332053 s : Average mflops/s per call per rank (full): 2.45498e+06
Grid : Message : 531.332055 s : Average mflops/s per call per node (full): 9.81993e+06
Grid : Message : 531.332057 s : WilsonFermion5D Stencil
Grid : Message : 531.332059 s : WilsonFermion5D StencilEven
Grid : Message : 531.332060 s : WilsonFermion5D StencilOdd
Grid : Message : 531.332063 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 531.332064 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 531.332065 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 531.403155 s : r_e 6.02113
Grid : Message : 531.411033 s : r_o 6.02101
Grid : Message : 531.417417 s : res 12.0421
Grid : Message : 532.503110 s : norm diff 0
Grid : Message : 532.830118 s : norm diff even 0
Grid : Message : 533.297755 s : norm diff odd 0
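The headline throughput can be cross-checked from the figures quoted above. A minimal sketch, assuming the conventional 1320 flops per Wilson dslash site per s-slice and Ls=16 (neither the flop convention nor the fifth-dimension extent is printed in this log):

awk 'BEGIN {
  vol  = 64*64*64*128      # global 4d lattice volume from the log
  Ls   = 16                # assumed fifth-dimension extent
  flop = 1320 * Ls * vol   # assumed flop count per Dw application
  # total flop / total microseconds = Mflop/s
  printf "%.4e mflop/s\n", flop * 30000 / 2.80676e8
}'

This prints 7.5746e+07, reproducing the reported mflop/s figure to the printed precision.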

View File

@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-8A-1110
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1110
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 64.64.64.128 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
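nvidia-smi -ac only sets application clocks, so before trusting a power sweep it can be worth spot-checking that every node accepted the limit. A hypothetical check along these lines (plain ssh stands in for the remote-sudo.sh helper, whose contents are not part of this diff):

for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
    echo "== $h =="
    ssh "$h" nvidia-smi --query-gpu=clocks.applications.graphics --format=csv,noheader
done

Each GPU should report the requested frequency (here 1110 MHz) until the reset at the end of the job.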

View File

@ -0,0 +1,2 @@
Tue Aug 23 03:22:53 BST 2022
epoch 1661221373

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32
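Because app-hash is written in md5sum's native two-column format, the recorded binary can be re-verified later straight from the job directory, e.g. (assuming the job_info_dir layout created by the script above):

md5sum -c "${job_info_dir}/app-hash"   # prints '... Benchmark_dwf_fp32: OK' if the binary is unchanged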

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Tue Aug 23 03:44:16 BST 2022
epoch 1661222656

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffc85c3e000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x00001496f34e3000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x00001496f311b000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x00001496f2c29000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x00001496f28ff000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x00001496f261e000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x00001496f23bd000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x00001496f346a000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x00001496f1fdd000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x00001496f0881000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x00001496f04b1000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x00001496f0210000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x00001496f00e5000)
libm.so.6 => /lib64/libm.so.6 (0x00001496efd63000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x00001496efb2c000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x00001496ef914000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x00001496ef6f4000)
libc.so.6 => /lib64/libc.so.6 (0x00001496ef32f000)
libdl.so.2 => /lib64/libdl.so.2 (0x00001496ef12b000)
/lib64/ld-linux-x86-64.so.2 (0x00001496f3333000)
librt.so.1 => /lib64/librt.so.1 (0x00001496eef23000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x00001496f339e000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x00001496f3399000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x00001496eee17000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x00001496eec0d000)
libutil.so.1 => /lib64/libutil.so.1 (0x00001496eea09000)

View File

@ -0,0 +1,254 @@
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 2 device 0 bus id: 0000:84:00.0
local rank 1 device 0 bus id: 0000:44:00.0
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0 SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648 bytes at 0x153200000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommitted changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.477679 s : Grid Layout
Grid : Message : 1.477683 s : Global lattice size : 64 64 64 128
Grid : Message : 1.477687 s : OpenMP threads : 4
Grid : Message : 1.477689 s : MPI tasks : 2 2 2 4
Grid : Message : 1.518431 s : Making s innermost grids
Grid : Message : 1.578744 s : Initialising 4d RNG
Grid : Message : 1.670962 s : Initialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.670985 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 2.479199 s : Initialising 5d RNG
Grid : Message : 3.928882 s : Initialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 3.928918 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 20.621348 s : Initialised RNGs
Grid : Message : 24.791943 s : Drawing gauge field
Grid : Message : 25.611789 s : Random gauge initialised
Grid : Message : 25.623532 s : Setting up Cshift based reference
Grid : Message : 54.459836 s : *****************************************************************
Grid : Message : 54.459859 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 54.459860 s : *****************************************************************
Grid : Message : 54.459861 s : *****************************************************************
Grid : Message : 54.459862 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 54.459863 s : * Vectorising space-time by 8
Grid : Message : 54.459864 s : * VComplexF size is 64 B
Grid : Message : 54.459866 s : * SINGLE precision
Grid : Message : 54.459868 s : * Using Overlapped Comms/Compute
Grid : Message : 54.459869 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 54.459870 s : *****************************************************************
Grid : Message : 56.472989 s : Called warmup
Grid : Message : 336.723650 s : Called Dw 30000 times in 2.8025e+08 us
Grid : Message : 336.723702 s : mflop/s = 7.58611e+07
Grid : Message : 336.723705 s : mflop/s per rank = 2.37066e+06
Grid : Message : 336.723710 s : mflop/s per node = 9.48264e+06
Grid : Message : 336.723713 s : RF GiB/s (base 2) = 154148
Grid : Message : 336.723716 s : mem GiB/s (base 2) = 96342.5
Grid : Message : 336.727230 s : norm diff 1.07359e-13
Grid : Message : 336.775672 s : #### Dhop calls report
Grid : Message : 336.775679 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 336.775682 s : WilsonFermion5D TotalTime /Calls : 4674.86 us
Grid : Message : 336.775684 s : WilsonFermion5D CommTime /Calls : 3140.23 us
Grid : Message : 336.775686 s : WilsonFermion5D FaceTime /Calls : 472.444 us
Grid : Message : 336.775688 s : WilsonFermion5D ComputeTime1/Calls : 5.04048 us
Grid : Message : 336.775690 s : WilsonFermion5D ComputeTime2/Calls : 1076.29 us
Grid : Message : 336.775779 s : Average mflops/s per call : 6.27024e+10
Grid : Message : 336.775783 s : Average mflops/s per call per rank : 1.95945e+09
Grid : Message : 336.775785 s : Average mflops/s per call per node : 7.8378e+09
Grid : Message : 336.775787 s : Average mflops/s per call (full) : 7.71738e+07
Grid : Message : 336.775789 s : Average mflops/s per call per rank (full): 2.41168e+06
Grid : Message : 336.775791 s : Average mflops/s per call per node (full): 9.64673e+06
Grid : Message : 336.775793 s : WilsonFermion5D Stencil
Grid : Message : 336.775794 s : WilsonFermion5D StencilEven
Grid : Message : 336.775795 s : WilsonFermion5D StencilOdd
Grid : Message : 336.775796 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 336.775797 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 336.775798 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 391.954992 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 391.955015 s : Called DwDag
Grid : Message : 391.955016 s : norm dag result 12.0421
Grid : Message : 391.968492 s : norm dag ref 12.0421
Grid : Message : 391.984269 s : norm dag diff 7.28475e-14
Grid : Message : 392.244680 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 392.446039 s : src_e 0.499997
Grid : Message : 392.821692 s : src_o 0.500003
Grid : Message : 392.939557 s : *********************************************************
Grid : Message : 392.939563 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 392.939566 s : * Vectorising space-time by 8
Grid : Message : 392.939568 s : * SINGLE precision
Grid : Message : 392.939570 s : * Using Overlapped Comms/Compute
Grid : Message : 392.939573 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 392.939575 s : *********************************************************
Grid : Message : 530.161337 s : Deo mflop/s = 7.74985e+07
Grid : Message : 530.161369 s : Deo mflop/s per rank 2.42183e+06
Grid : Message : 530.161371 s : Deo mflop/s per node 9.68731e+06
Grid : Message : 530.161374 s : #### Dhop calls report
Grid : Message : 530.161376 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 530.161378 s : WilsonFermion5D TotalTime /Calls : 4573.78 us
Grid : Message : 530.161380 s : WilsonFermion5D CommTime /Calls : 3002.97 us
Grid : Message : 530.161382 s : WilsonFermion5D FaceTime /Calls : 567.519 us
Grid : Message : 530.161384 s : WilsonFermion5D ComputeTime1/Calls : 6.14496 us
Grid : Message : 530.161386 s : WilsonFermion5D ComputeTime2/Calls : 1030.26 us
Grid : Message : 530.161413 s : Average mflops/s per call : 5.0739e+10
Grid : Message : 530.161418 s : Average mflops/s per call per rank : 1.58559e+09
Grid : Message : 530.161420 s : Average mflops/s per call per node : 6.34237e+09
Grid : Message : 530.161424 s : Average mflops/s per call (full) : 7.88794e+07
Grid : Message : 530.161428 s : Average mflops/s per call per rank (full): 2.46498e+06
Grid : Message : 530.161432 s : Average mflops/s per call per node (full): 9.85993e+06
Grid : Message : 530.161436 s : WilsonFermion5D Stencil
Grid : Message : 530.161438 s : WilsonFermion5D StencilEven
Grid : Message : 530.161440 s : WilsonFermion5D StencilOdd
Grid : Message : 530.161442 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 530.161445 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 530.161448 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 530.232671 s : r_e 6.02113
Grid : Message : 530.240193 s : r_o 6.02101
Grid : Message : 530.246528 s : res 12.0421
Grid : Message : 530.976149 s : norm diff 0
Grid : Message : 531.780007 s : norm diff even 0
Grid : Message : 532.298753 s : norm diff odd 0
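The per-call breakdown above makes the bottleneck explicit: CommTime/Calls is 3140.23 us out of a 4674.86 us TotalTime, so even with overlapped comms roughly two thirds of each Dhop call is communication. Pulling the fraction from the quoted numbers:

awk 'BEGIN { printf "comm fraction: %.1f%%\n", 100 * 3140.23 / 4674.86 }'   # 67.2%

This also explains the gap between the compute-only 'Average mflops/s per call' and the much lower '(full)' figures, which divide by total time including communication.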

View File

@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
#SBATCH -J power-loc32-8A-1125
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=8
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue
set -e
# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
# IO environment ###############################################################
if [ 8 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass
# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''
# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"
date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32'
freq=1125
# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"
# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.4 \
--accelerator-threads 8 \
--grid 64.64.64.128 \
--shm 2048 &> "${job_info_dir}/log"
# if we reach this point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"
# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-8A.db "clock_limit_${freq}"
# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
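dmon-to-db.sh is not included in this diff, but its job is easy to sketch: nvidia-smi dmon -o DT emits whitespace-separated samples prefixed by date and time columns, which map onto one SQLite table per clock limit. A hypothetical stand-in, assuming sqlite3 on PATH, the default dmon column order (date, time, gpu, pwr, gtemp, mtemp, sm, mem, enc, dec, mclk, pclk), and the ${tmp} and ${freq} variables from the script above:

db='smi-dmon-8A.db'
table="clock_limit_${freq}"
sqlite3 "${db}" "CREATE TABLE IF NOT EXISTS ${table}
    (date TEXT, time TEXT, gpu INT, pwr REAL, sm INT, mem INT, mclk INT, pclk INT);"
grep -v '^#' "${tmp}" \
    | awk -v t="${table}" '{ printf "INSERT INTO %s VALUES (\047%s\047,\047%s\047,%s,%s,%s,%s,%s,%s);\n",
                             t, $1, $2, $3, $4, $7, $8, $11, $12 }' \
    | sqlite3 "${db}"   # \047 is an awk octal escape for the single quote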

View File

@ -0,0 +1,2 @@
Tue Aug 23 03:35:17 BST 2022
epoch 1661222117

View File

@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/size-loc32/Benchmark_dwf_fp32

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
Tue Aug 23 03:56:38 BST 2022
epoch 1661223398

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffc77ffc000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000015363287b000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x00001536324b3000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x0000153631fc1000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x0000153631c97000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x00001536319b6000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x0000153631755000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x0000153632802000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x0000153631375000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000015362fc19000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000015362f849000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000015362f5a8000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000015362f47d000)
libm.so.6 => /lib64/libm.so.6 (0x000015362f0fb000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000015362eec4000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000015362ecac000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000015362ea8c000)
libc.so.6 => /lib64/libc.so.6 (0x000015362e6c7000)
libdl.so.2 => /lib64/libdl.so.2 (0x000015362e4c3000)
/lib64/ld-linux-x86-64.so.2 (0x00001536326cb000)
librt.so.1 => /lib64/librt.so.1 (0x000015362e2bb000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x0000153632736000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x0000153632731000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000015362e1af000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000015362dfa5000)
libutil.so.1 => /lib64/libutil.so.1 (0x000015362dda1000)

View File

@ -0,0 +1,254 @@
tu-c0r1n84 - 0 device=0 binding=--interleave=0,1
tu-c0r1n75 - 0 device=0 binding=--interleave=0,1
tu-c0r1n87 - 0 device=0 binding=--interleave=0,1
tu-c0r1n84 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 1 device=1 binding=--interleave=2,3
tu-c0r1n93 - 0 device=0 binding=--interleave=0,1
tu-c0r1n72 - 0 device=0 binding=--interleave=0,1
tu-c0r1n87 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 1 device=1 binding=--interleave=2,3
tu-c0r1n75 - 2 device=2 binding=--interleave=4,5
tu-c0r1n93 - 1 device=1 binding=--interleave=2,3
tu-c0r1n72 - 3 device=3 binding=--interleave=6,7
tu-c0r1n75 - 3 device=3 binding=--interleave=6,7
tu-c0r1n72 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 2 device=2 binding=--interleave=4,5
tu-c0r1n84 - 3 device=3 binding=--interleave=6,7
tu-c0r1n84 - 2 device=2 binding=--interleave=4,5
tu-c0r1n87 - 3 device=3 binding=--interleave=6,7
tu-c0r1n93 - 3 device=3 binding=--interleave=6,7
tu-c0r1n93 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 0 device=0 binding=--interleave=0,1
tu-c0r1n78 - 1 device=1 binding=--interleave=2,3
tu-c0r1n78 - 2 device=2 binding=--interleave=4,5
tu-c0r1n78 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 1 device=1 binding=--interleave=2,3
tu-c0r1n81 - 0 device=0 binding=--interleave=0,1
tu-c0r1n81 - 2 device=2 binding=--interleave=4,5
tu-c0r1n81 - 3 device=3 binding=--interleave=6,7
tu-c0r1n90 - 1 device=1 binding=--interleave=2,3
tu-c0r1n90 - 2 device=2 binding=--interleave=4,5
tu-c0r1n90 - 3 device=3 binding=--interleave=6,7
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
OPENMPI detected
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 3 device 0 bus id: 0000:C4:00.0
local rank 0 device 0 bus id: 0000:03:00.0
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
AcceleratorCudaInit: ================================================
local rank 1 device 0 bus id: 0000:44:00.0
local rank 2 device 0 bus id: 0000:84:00.0
SharedMemoryMpi: World communicator of size 32
SharedMemoryMpi: Node communicator of size 4
0 SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648 bytes at 0x150380000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommitted changes
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.525814 s : Grid Layout
Grid : Message : 1.525817 s : Global lattice size : 64 64 64 128
Grid : Message : 1.525823 s : OpenMP threads : 4
Grid : Message : 1.525825 s : MPI tasks : 2 2 2 4
Grid : Message : 1.564141 s : Making s innermost grids
Grid : Message : 1.612317 s : Initialising 4d RNG
Grid : Message : 1.709482 s : Initialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.709506 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 2.629751 s : Initialising 5d RNG
Grid : Message : 4.980840 s : Initialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 4.981520 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 20.874034 s : Initialised RNGs
Grid : Message : 25.403080 s : Drawing gauge field
Grid : Message : 25.963343 s : Random gauge initialised
Grid : Message : 25.975665 s : Setting up Cshift based reference
Grid : Message : 55.107124 s : *****************************************************************
Grid : Message : 55.107147 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 55.107149 s : *****************************************************************
Grid : Message : 55.107150 s : *****************************************************************
Grid : Message : 55.107151 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 55.107152 s : * Vectorising space-time by 8
Grid : Message : 55.107153 s : * VComplexF size is 64 B
Grid : Message : 55.107154 s : * SINGLE precision
Grid : Message : 55.107157 s : * Using Overlapped Comms/Compute
Grid : Message : 55.107158 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 55.107159 s : *****************************************************************
Grid : Message : 57.115222 s : Called warmup
Grid : Message : 337.196879 s : Called Dw 30000 times in 2.80081e+08 us
Grid : Message : 337.196925 s : mflop/s = 7.5907e+07
Grid : Message : 337.196927 s : mflop/s per rank = 2.37209e+06
Grid : Message : 337.196929 s : mflop/s per node = 9.48837e+06
Grid : Message : 337.196931 s : RF GiB/s (base 2) = 154241
Grid : Message : 337.196933 s : mem GiB/s (base 2) = 96400.8
Grid : Message : 337.200448 s : norm diff 1.07359e-13
Grid : Message : 337.249056 s : #### Dhop calls report
Grid : Message : 337.249062 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 337.249069 s : WilsonFermion5D TotalTime /Calls : 4671.91 us
Grid : Message : 337.249072 s : WilsonFermion5D CommTime /Calls : 3138.67 us
Grid : Message : 337.249075 s : WilsonFermion5D FaceTime /Calls : 467.933 us
Grid : Message : 337.249078 s : WilsonFermion5D ComputeTime1/Calls : 4.97537 us
Grid : Message : 337.249081 s : WilsonFermion5D ComputeTime2/Calls : 1078.84 us
Grid : Message : 337.249093 s : Average mflops/s per call : 6.2791e+10
Grid : Message : 337.249096 s : Average mflops/s per call per rank : 1.96222e+09
Grid : Message : 337.249099 s : Average mflops/s per call per node : 7.84887e+09
Grid : Message : 337.249103 s : Average mflops/s per call (full) : 7.72227e+07
Grid : Message : 337.249106 s : Average mflops/s per call per rank (full): 2.41321e+06
Grid : Message : 337.249109 s : Average mflops/s per call per node (full): 9.65284e+06
Grid : Message : 337.249111 s : WilsonFermion5D Stencil
Grid : Message : 337.249113 s : WilsonFermion5D StencilEven
Grid : Message : 337.249115 s : WilsonFermion5D StencilOdd
Grid : Message : 337.249116 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 337.249118 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 337.249119 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 392.546037 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 392.546061 s : Called DwDag
Grid : Message : 392.546062 s : norm dag result 12.0421
Grid : Message : 392.593558 s : norm dag ref 12.0421
Grid : Message : 392.609258 s : norm dag diff 7.28475e-14
Grid : Message : 392.657672 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 393.911450 s : src_e 0.499997
Grid : Message : 393.412726 s : src_o 0.500003
Grid : Message : 393.510751 s : *********************************************************
Grid : Message : 393.510754 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 393.510755 s : * Vectorising space-time by 8
Grid : Message : 393.510756 s : * SINGLE precision
Grid : Message : 393.510757 s : * Using Overlapped Comms/Compute
Grid : Message : 393.510758 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 393.510759 s : *********************************************************
Grid : Message : 530.311860 s : Deo mflop/s = 7.77338e+07
Grid : Message : 530.311887 s : Deo mflop/s per rank 2.42918e+06
Grid : Message : 530.311889 s : Deo mflop/s per node 9.71673e+06
Grid : Message : 530.311892 s : #### Dhop calls report
Grid : Message : 530.311894 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 530.311896 s : WilsonFermion5D TotalTime /Calls : 4559.74 us
Grid : Message : 530.311898 s : WilsonFermion5D CommTime /Calls : 2983.56 us
Grid : Message : 530.311900 s : WilsonFermion5D FaceTime /Calls : 561.612 us
Grid : Message : 530.311902 s : WilsonFermion5D ComputeTime1/Calls : 6.06806 us
Grid : Message : 530.311904 s : WilsonFermion5D ComputeTime2/Calls : 1041.48 us
Grid : Message : 530.311932 s : Average mflops/s per call : 5.13843e+10
Grid : Message : 530.311937 s : Average mflops/s per call per rank : 1.60576e+09
Grid : Message : 530.311940 s : Average mflops/s per call per node : 6.42304e+09
Grid : Message : 530.311944 s : Average mflops/s per call (full) : 7.91223e+07
Grid : Message : 530.311947 s : Average mflops/s per call per rank (full): 2.47257e+06
Grid : Message : 530.311951 s : Average mflops/s per call per node (full): 9.89028e+06
Grid : Message : 530.311954 s : WilsonFermion5D Stencil
Grid : Message : 530.311957 s : WilsonFermion5D StencilEven
Grid : Message : 530.311958 s : WilsonFermion5D StencilOdd
Grid : Message : 530.311961 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 530.311963 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 530.311965 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 530.389174 s : r_e 6.02113
Grid : Message : 530.397070 s : r_o 6.02101
Grid : Message : 530.403387 s : res 12.0421
Grid : Message : 531.146771 s : norm diff 0
Grid : Message : 531.837346 s : norm diff even 0
Grid : Message : 532.217730 s : norm diff odd 0

View File

@ -0,0 +1 @@
tu-c0r1n[72,75,78,81,84,87,90,93]

Some files were not shown because too many files have changed in this diff