Initial commit

commit ade190016a

1  .gitignore  vendored  Normal file
@@ -0,0 +1 @@
*.code-workspace
BIN  2-racks/rack-power.db  Normal file
Binary file not shown.
5  2-racks/size-C0/16-nodes/.geom  Normal file
@@ -0,0 +1,5 @@
nnodes : 16
ntasks : 64
partition : gpu
mpi-geom : 2.2.2.8
grid-geom : 48.48.48.96
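The geometry file pins down the run shape used throughout this directory: 64 MPI ranks in a 2.2.2.8 process grid over a 48.48.48.96 global lattice. A quick sanity check of those numbers, sketched in the same shell style as the wrapper scripts below (nothing here is part of the commit):

# Check that the MPI geometry matches ntasks and gives an integer local volume per rank.
mpi_geom=(2 2 2 8)
grid_geom=(48 48 48 96)
ranks=$(( mpi_geom[0] * mpi_geom[1] * mpi_geom[2] * mpi_geom[3] ))
echo "ranks = ${ranks}"    # 64, matching 'ntasks : 64' (16 nodes x 4 GPUs)
for i in 0 1 2 3; do
    echo "local dim ${i} = $(( grid_geom[i] / mpi_geom[i] ))"    # 24 24 24 12
done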
13  2-racks/size-C0/16-nodes/cpu-mpi-wrapper.sh  Normal file
@@ -0,0 +1,13 @@
#!/usr/bin/env bash

lrank=$OMPI_COMM_WORLD_LOCAL_RANK
numa=${lrank}
cpus="$(( lrank*16 ))-$(( (lrank+1)*16-1 ))"
places="$(( lrank*16 )):$(( (lrank+1)*16 ))"

BINDING="taskset -c ${cpus} numactl -m ${numa}"
export OMP_PLACES=${places}

echo "$(hostname) - ${lrank} binding='${BINDING}'"

${BINDING} "$@"
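cpu-mpi-wrapper.sh assigns 16 cores and one NUMA node per local rank and is meant to sit between the MPI launcher and the application, the same way gpu-mpi-wrapper.sh is used by the job scripts further down. A minimal usage sketch (the binary path and rank count are illustrative, not taken from this commit):

# mpirun must not bind ranks itself (--bind-to none), otherwise the
# taskset/numactl binding chosen inside the wrapper would be overridden.
mpirun -np 64 --bind-to none \
    ./cpu-mpi-wrapper.sh ./Benchmark_dwf_fp32 --mpi 2.2.2.8 --grid 48.48.48.96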
1  2-racks/size-C0/16-nodes/dwf_fp32.tok  Symbolic link
@@ -0,0 +1 @@
../dwf_fp32.tok
14  2-racks/size-C0/16-nodes/gpu-mpi-wrapper.sh  Executable file
@@ -0,0 +1,14 @@
#!/usr/bin/env bash

lrank=$OMPI_COMM_WORLD_LOCAL_RANK
numa1=$(( 2 * lrank))
numa2=$(( 2 * lrank + 1 ))
netdev=mlx5_${lrank}:1

export CUDA_VISIBLE_DEVICES=$OMPI_COMM_WORLD_LOCAL_RANK
export UCX_NET_DEVICES=${netdev}
BINDING="--interleave=$numa1,$numa2"

echo "$(hostname) - $lrank device=$CUDA_VISIBLE_DEVICES binding=$BINDING"

numactl ${BINDING} "$@"
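Each of the four local ranks per node gets one GPU, one mlx5 NIC port, and a pair of NUMA nodes to interleave allocations over. The mapping follows directly from the arithmetic above and matches the binding lines echoed in the job logs below (a sketch, not part of the commit):

# Reproduce the per-rank mapping computed by gpu-mpi-wrapper.sh for local ranks 0..3.
for lrank in 0 1 2 3; do
    echo "rank ${lrank}: GPU ${lrank}, NIC mlx5_${lrank}:1, NUMA interleave $(( 2*lrank )),$(( 2*lrank + 1 ))"
done
# rank 0: GPU 0, NIC mlx5_0:1, NUMA interleave 0,1
# rank 1: GPU 1, NIC mlx5_1:1, NUMA interleave 2,3
# rank 2: GPU 2, NIC mlx5_2:1, NUMA interleave 4,5
# rank 3: GPU 3, NIC mlx5_3:1, NUMA interleave 6,7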
@@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32
4310  2-racks/size-C0/16-nodes/job/power-16A-1005.64059/elf  Normal file
File diff suppressed because it is too large
@@ -0,0 +1,2 @@
Sat Aug 20 20:25:12 BST 2022
epoch 1661023512
2062  2-racks/size-C0/16-nodes/job/power-16A-1005.64059/env  Normal file
File diff suppressed because one or more lines are too long
26  2-racks/size-C0/16-nodes/job/power-16A-1005.64059/ldd  Normal file
@@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffef5f3f000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000015459e0bd000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000015459dcf5000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000015459d803000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000015459d4d9000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000015459d1f8000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000015459cf97000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000015459e044000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000015459cbb7000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x000015459b45b000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000015459b08b000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000015459adea000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000015459acbf000)
libm.so.6 => /lib64/libm.so.6 (0x000015459a93d000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000015459a706000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000015459a4ee000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000015459a2ce000)
libc.so.6 => /lib64/libc.so.6 (0x0000154599f09000)
libdl.so.2 => /lib64/libdl.so.2 (0x0000154599d05000)
/lib64/ld-linux-x86-64.so.2 (0x000015459df0d000)
librt.so.1 => /lib64/librt.so.1 (0x0000154599afd000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000015459df78000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000015459df73000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x00001545999f1000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x00001545997e7000)
libutil.so.1 => /lib64/libutil.so.1 (0x00001545995e3000)
286  2-racks/size-C0/16-nodes/job/power-16A-1005.64059/log  Normal file
@@ -0,0 +1,286 @@
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 3
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 3
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
local rank 1 device 0 bus id: 0000:44:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 0 device 0 bus id: 0000:03:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 2 device 0 bus id: 0000:84:00.0
|
||||||
|
local rank 0 device 0 bus id: 0000:03:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 3 device 0 bus id: 0000:C4:00.0
|
||||||
|
SharedMemoryMpi: World communicator of size 64
|
||||||
|
SharedMemoryMpi: Node communicator of size 4
|
||||||
|
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14ea00000000 for comms buffers
|
||||||
|
Setting up IPC
|
||||||
|
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|_ | | | | | | | | | | | | _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|_ GGGG RRRR III DDDD _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G GG RRRR I D D _|__
|
||||||
|
__|_ G G R R I D D _|__
|
||||||
|
__|_ GGGG R R III DDDD _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
| | | | | | | | | | | | | |
|
||||||
|
|
||||||
|
|
||||||
|
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
|
||||||
|
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : MPI is initialised and logging filters activated
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : Requested 2147483648 byte stencil comms buffers
|
||||||
|
Grid : Message : MemoryManager Cache 34004218675 bytes
|
||||||
|
Grid : Message : MemoryManager::Init() setting up
|
||||||
|
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
|
||||||
|
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
|
||||||
|
Grid : Message : MemoryManager::Init() Using cudaMalloc
|
||||||
|
Grid : Message : 1.499143 s : Grid Layout
|
||||||
|
Grid : Message : 1.499148 s : Global lattice size : 48 48 48 96
|
||||||
|
Grid : Message : 1.499155 s : OpenMP threads : 4
|
||||||
|
Grid : Message : 1.499157 s : MPI tasks : 2 2 2 8
|
||||||
|
Grid : Message : 1.515541 s : Making s innermost grids
|
||||||
|
Grid : Message : 1.532470 s : Initialising 4d RNG
|
||||||
|
Grid : Message : 1.550455 s : Intialising parallel RNG with unique string 'The 4D RNG'
|
||||||
|
Grid : Message : 1.550491 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
|
||||||
|
Grid : Message : 1.937366 s : Initialising 5d RNG
|
||||||
|
Grid : Message : 2.163040 s : Intialising parallel RNG with unique string 'The 5D RNG'
|
||||||
|
Grid : Message : 2.163078 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
|
||||||
|
Grid : Message : 7.467109 s : Initialised RNGs
|
||||||
|
Grid : Message : 8.261272 s : Drawing gauge field
|
||||||
|
Grid : Message : 8.380110 s : Random gauge initialised
|
||||||
|
Grid : Message : 8.388989 s : Setting up Cshift based reference
|
||||||
|
Grid : Message : 13.599668 s : *****************************************************************
|
||||||
|
Grid : Message : 13.599694 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
|
||||||
|
Grid : Message : 13.599696 s : *****************************************************************
|
||||||
|
Grid : Message : 13.599700 s : *****************************************************************
|
||||||
|
Grid : Message : 13.599702 s : * Benchmarking DomainWallFermionR::Dhop
|
||||||
|
Grid : Message : 13.599705 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 13.599708 s : * VComplexF size is 64 B
|
||||||
|
Grid : Message : 13.599710 s : * SINGLE precision
|
||||||
|
Grid : Message : 13.599712 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 13.599716 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 13.599719 s : *****************************************************************
|
||||||
|
Grid : Message : 14.992290 s : Called warmup
|
||||||
|
Grid : Message : 104.236264 s : Called Dw 30000 times in 9.01365e+07 us
|
||||||
|
Grid : Message : 104.236329 s : mflop/s = 7.46293e+07
|
||||||
|
Grid : Message : 104.236331 s : mflop/s per rank = 1.16608e+06
|
||||||
|
Grid : Message : 104.236333 s : mflop/s per node = 4.66433e+06
|
||||||
|
Grid : Message : 104.236335 s : RF GiB/s (base 2) = 151645
|
||||||
|
Grid : Message : 104.236337 s : mem GiB/s (base 2) = 94778.1
|
||||||
|
Grid : Message : 104.236908 s : norm diff 1.05775e-13
|
||||||
|
Grid : Message : 104.247209 s : #### Dhop calls report
|
||||||
|
Grid : Message : 104.247215 s : WilsonFermion5D Number of DhopEO Calls : 60002
|
||||||
|
Grid : Message : 104.247219 s : WilsonFermion5D TotalTime /Calls : 1503.52 us
|
||||||
|
Grid : Message : 104.247221 s : WilsonFermion5D CommTime /Calls : 1054.2 us
|
||||||
|
Grid : Message : 104.247223 s : WilsonFermion5D FaceTime /Calls : 225.375 us
|
||||||
|
Grid : Message : 104.247225 s : WilsonFermion5D ComputeTime1/Calls : 3.01152 us
|
||||||
|
Grid : Message : 104.247227 s : WilsonFermion5D ComputeTime2/Calls : 236.377 us
|
||||||
|
Grid : Message : 104.247294 s : Average mflops/s per call : 3.59587e+10
|
||||||
|
Grid : Message : 104.247300 s : Average mflops/s per call per rank : 5.61855e+08
|
||||||
|
Grid : Message : 104.247303 s : Average mflops/s per call per node : 2.24742e+09
|
||||||
|
Grid : Message : 104.247305 s : Average mflops/s per call (full) : 7.59233e+07
|
||||||
|
Grid : Message : 104.247307 s : Average mflops/s per call per rank (full): 1.1863e+06
|
||||||
|
Grid : Message : 104.247309 s : Average mflops/s per call per node (full): 4.7452e+06
|
||||||
|
Grid : Message : 104.247311 s : WilsonFermion5D Stencil
|
||||||
|
Grid : Message : 104.247312 s : WilsonFermion5D StencilEven
|
||||||
|
Grid : Message : 104.247313 s : WilsonFermion5D StencilOdd
|
||||||
|
Grid : Message : 104.247314 s : WilsonFermion5D Stencil Reporti()
|
||||||
|
Grid : Message : 104.247315 s : WilsonFermion5D StencilEven Reporti()
|
||||||
|
Grid : Message : 104.247316 s : WilsonFermion5D StencilOdd Reporti()
|
||||||
|
Grid : Message : 112.998074 s : Compare to naive wilson implementation Dag to verify correctness
|
||||||
|
Grid : Message : 112.998099 s : Called DwDag
|
||||||
|
Grid : Message : 112.998100 s : norm dag result 12.0422
|
||||||
|
Grid : Message : 113.585000 s : norm dag ref 12.0422
|
||||||
|
Grid : Message : 113.380300 s : norm dag diff 7.28899e-14
|
||||||
|
Grid : Message : 113.140290 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
|
||||||
|
Grid : Message : 113.790730 s : src_e0.5
|
||||||
|
Grid : Message : 113.153215 s : src_o0.5
|
||||||
|
Grid : Message : 113.170341 s : *********************************************************
|
||||||
|
Grid : Message : 113.170346 s : * Benchmarking DomainWallFermionF::DhopEO
|
||||||
|
Grid : Message : 113.170347 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 113.170353 s : * SINGLE precision
|
||||||
|
Grid : Message : 113.170356 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 113.170357 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 113.170361 s : *********************************************************
|
||||||
|
Grid : Message : 161.702832 s : Deo mflop/s = 6.93159e+07
|
||||||
|
Grid : Message : 161.702861 s : Deo mflop/s per rank 1.08306e+06
|
||||||
|
Grid : Message : 161.702863 s : Deo mflop/s per node 4.33224e+06
|
||||||
|
Grid : Message : 161.702866 s : #### Dhop calls report
|
||||||
|
Grid : Message : 161.702868 s : WilsonFermion5D Number of DhopEO Calls : 30001
|
||||||
|
Grid : Message : 161.702870 s : WilsonFermion5D TotalTime /Calls : 1617.57 us
|
||||||
|
Grid : Message : 161.702872 s : WilsonFermion5D CommTime /Calls : 1105.14 us
|
||||||
|
Grid : Message : 161.702874 s : WilsonFermion5D FaceTime /Calls : 294.218 us
|
||||||
|
Grid : Message : 161.702876 s : WilsonFermion5D ComputeTime1/Calls : 4.85114 us
|
||||||
|
Grid : Message : 161.702878 s : WilsonFermion5D ComputeTime2/Calls : 241.569 us
|
||||||
|
Grid : Message : 161.702900 s : Average mflops/s per call : 2.0686e+10
|
||||||
|
Grid : Message : 161.702904 s : Average mflops/s per call per rank : 3.23219e+08
|
||||||
|
Grid : Message : 161.702906 s : Average mflops/s per call per node : 1.29288e+09
|
||||||
|
Grid : Message : 161.702908 s : Average mflops/s per call (full) : 7.05701e+07
|
||||||
|
Grid : Message : 161.702912 s : Average mflops/s per call per rank (full): 1.10266e+06
|
||||||
|
Grid : Message : 161.702914 s : Average mflops/s per call per node (full): 4.41063e+06
|
||||||
|
Grid : Message : 161.702920 s : WilsonFermion5D Stencil
|
||||||
|
Grid : Message : 161.702922 s : WilsonFermion5D StencilEven
|
||||||
|
Grid : Message : 161.702923 s : WilsonFermion5D StencilOdd
|
||||||
|
Grid : Message : 161.702926 s : WilsonFermion5D Stencil Reporti()
|
||||||
|
Grid : Message : 161.702927 s : WilsonFermion5D StencilEven Reporti()
|
||||||
|
Grid : Message : 161.702928 s : WilsonFermion5D StencilOdd Reporti()
|
||||||
|
Grid : Message : 161.722751 s : r_e6.02106
|
||||||
|
Grid : Message : 161.724439 s : r_o6.0211
|
||||||
|
Grid : Message : 161.725861 s : res12.0422
|
||||||
|
Grid : Message : 161.827558 s : norm diff 0
|
||||||
|
Grid : Message : 161.972191 s : norm diff even 0
|
||||||
|
Grid : Message : 162.433730 s : norm diff odd 0
|
1  2-racks/size-C0/16-nodes/job/power-16A-1005.64059/nodes  Normal file
@@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]
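The nodes file stores the compressed Slurm nodelist for the job. The script below expands it with scontrol before looping over hosts; the same expansion can be reproduced on a login node (a sketch, assuming Slurm's scontrol is available):

# Expand the compressed nodelist into one hostname per line, as the job script does.
scontrol show hostnames 'tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]'
# tu-c0r1n00
# tu-c0r1n03
# ...
# tu-c0r2n21    (16 hosts in total)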
112  2-racks/size-C0/16-nodes/job/power-16A-1005.64059/script  Executable file
@@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170

# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa

#SBATCH -J power-16A-1005
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue

set -e

# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n

# IO environment ###############################################################

if [ 16 -eq 1 ]; then
    export OMPI_MCA_io=ompio
else
    export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3

# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
    source "${env_dir}/env-gpu.sh"
else
    echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
    exit 1
fi
spack load sshpass

# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''

# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"

date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi

# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1005

# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
    ${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"

# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
    ./gpu-mpi-wrapper.sh \
    ${app} "${par}" "${opt[@]}" \
    --mpi 2.2.2.8 \
    --accelerator-threads 8 \
    --grid 48.48.48.96 \
    --shm 2048 &> "${job_info_dir}/log"

# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"

# reset GPUS ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"

# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"

# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
    ${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
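The only parameter that changes between the job scripts captured in this commit is freq: before the run, nvidia-smi -ac pins the application clocks to 1215 MHz (memory) and ${freq} MHz (graphics) on every node, and the final loop restores the default 1215,1410 afterwards. Condensed to a single-node sketch:

# Bracket the benchmark with a fixed graphics clock, then restore the default.
freq=1005                      # 1020 in the next job in this series
nvidia-smi -ac 1215,${freq}    # applications clocks: <memory MHz>,<graphics MHz>
# ... run the benchmark and the dmon monitoring here ...
nvidia-smi -ac 1215,1410       # back to the default application clocks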
@@ -0,0 +1,2 @@
Sat Aug 20 20:22:21 BST 2022
epoch 1661023341
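Together with the two-line date stamp recorded earlier in this diff (Sat Aug 20 20:25:12 BST 2022, epoch 1661023512), this start stamp appears to bracket job power-16A-1005.64059, giving its wall time directly:

echo "$(( 1661023512 - 1661023341 )) s"    # 171 s, i.e. 2 min 51 s between the two stamps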
@@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32
4310  2-racks/size-C0/16-nodes/job/power-16A-1020.64063/elf  Normal file
File diff suppressed because it is too large
@@ -0,0 +1,2 @@
Sat Aug 20 20:37:35 BST 2022
epoch 1661024255
2062  2-racks/size-C0/16-nodes/job/power-16A-1020.64063/env  Normal file
File diff suppressed because one or more lines are too long
26  2-racks/size-C0/16-nodes/job/power-16A-1020.64063/ldd  Normal file
@@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffff456d000)
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x0000154c9a375000)
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x0000154c99fad000)
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x0000154c99abb000)
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x0000154c99791000)
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x0000154c994b0000)
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x0000154c9924f000)
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x0000154c9a2fc000)
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x0000154c98e6f000)
libcuda.so.1 => /lib64/libcuda.so.1 (0x0000154c97713000)
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x0000154c97343000)
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x0000154c970a2000)
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x0000154c96f77000)
libm.so.6 => /lib64/libm.so.6 (0x0000154c96bf5000)
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x0000154c969be000)
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x0000154c967a6000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x0000154c96586000)
libc.so.6 => /lib64/libc.so.6 (0x0000154c961c1000)
libdl.so.2 => /lib64/libdl.so.2 (0x0000154c95fbd000)
/lib64/ld-linux-x86-64.so.2 (0x0000154c9a1c5000)
librt.so.1 => /lib64/librt.so.1 (0x0000154c95db5000)
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x0000154c9a230000)
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x0000154c9a22b000)
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x0000154c95ca9000)
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x0000154c95a9f000)
libutil.so.1 => /lib64/libutil.so.1 (0x0000154c9589b000)
286  2-racks/size-C0/16-nodes/job/power-16A-1020.64063/log  Normal file
@@ -0,0 +1,286 @@
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 3
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 3
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
local rank 1 device 0 bus id: 0000:44:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 0 device 0 bus id: 0000:03:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 3 device 0 bus id: 0000:C4:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 0 device 0 bus id: 0000:03:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 2 device 0 bus id: 0000:84:00.0
|
||||||
|
SharedMemoryMpi: World communicator of size 64
|
||||||
|
SharedMemoryMpi: Node communicator of size 4
|
||||||
|
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14d8e0000000 for comms buffers
|
||||||
|
Setting up IPC
|
||||||
|
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|_ | | | | | | | | | | | | _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|_ GGGG RRRR III DDDD _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G GG RRRR I D D _|__
|
||||||
|
__|_ G G R R I D D _|__
|
||||||
|
__|_ GGGG R R III DDDD _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
| | | | | | | | | | | | | |
|
||||||
|
|
||||||
|
|
||||||
|
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
|
||||||
|
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : MPI is initialised and logging filters activated
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : Requested 2147483648 byte stencil comms buffers
|
||||||
|
Grid : Message : MemoryManager Cache 34004218675 bytes
|
||||||
|
Grid : Message : MemoryManager::Init() setting up
|
||||||
|
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
|
||||||
|
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
|
||||||
|
Grid : Message : MemoryManager::Init() Using cudaMalloc
|
||||||
|
Grid : Message : 1.312638 s : Grid Layout
|
||||||
|
Grid : Message : 1.312643 s : Global lattice size : 48 48 48 96
|
||||||
|
Grid : Message : 1.312650 s : OpenMP threads : 4
|
||||||
|
Grid : Message : 1.312652 s : MPI tasks : 2 2 2 8
|
||||||
|
Grid : Message : 1.327971 s : Making s innermost grids
|
||||||
|
Grid : Message : 1.344471 s : Initialising 4d RNG
|
||||||
|
Grid : Message : 1.361018 s : Intialising parallel RNG with unique string 'The 4D RNG'
|
||||||
|
Grid : Message : 1.361045 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
|
||||||
|
Grid : Message : 1.837887 s : Initialising 5d RNG
|
||||||
|
Grid : Message : 2.844490 s : Intialising parallel RNG with unique string 'The 5D RNG'
|
||||||
|
Grid : Message : 2.845110 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
|
||||||
|
Grid : Message : 7.428202 s : Initialised RNGs
|
||||||
|
Grid : Message : 8.439960 s : Drawing gauge field
|
||||||
|
Grid : Message : 8.560999 s : Random gauge initialised
|
||||||
|
Grid : Message : 8.573339 s : Setting up Cshift based reference
|
||||||
|
Grid : Message : 13.695651 s : *****************************************************************
|
||||||
|
Grid : Message : 13.695676 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
|
||||||
|
Grid : Message : 13.695677 s : *****************************************************************
|
||||||
|
Grid : Message : 13.695678 s : *****************************************************************
|
||||||
|
Grid : Message : 13.695679 s : * Benchmarking DomainWallFermionR::Dhop
|
||||||
|
Grid : Message : 13.695680 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 13.695681 s : * VComplexF size is 64 B
|
||||||
|
Grid : Message : 13.695682 s : * SINGLE precision
|
||||||
|
Grid : Message : 13.695684 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 13.695685 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 13.695686 s : *****************************************************************
|
||||||
|
Grid : Message : 14.234933 s : Called warmup
|
||||||
|
Grid : Message : 103.428452 s : Called Dw 30000 times in 8.91932e+07 us
|
||||||
|
Grid : Message : 103.428517 s : mflop/s = 7.54186e+07
|
||||||
|
Grid : Message : 103.428519 s : mflop/s per rank = 1.17842e+06
|
||||||
|
Grid : Message : 103.428521 s : mflop/s per node = 4.71366e+06
|
||||||
|
Grid : Message : 103.428523 s : RF GiB/s (base 2) = 153249
|
||||||
|
Grid : Message : 103.428525 s : mem GiB/s (base 2) = 95780.5
|
||||||
|
Grid : Message : 103.429097 s : norm diff 1.05775e-13
|
||||||
|
Grid : Message : 103.439111 s : #### Dhop calls report
|
||||||
|
Grid : Message : 103.439118 s : WilsonFermion5D Number of DhopEO Calls : 60002
|
||||||
|
Grid : Message : 103.439122 s : WilsonFermion5D TotalTime /Calls : 1487.69 us
|
||||||
|
Grid : Message : 103.439124 s : WilsonFermion5D CommTime /Calls : 1041.46 us
|
||||||
|
Grid : Message : 103.439126 s : WilsonFermion5D FaceTime /Calls : 222.459 us
|
||||||
|
Grid : Message : 103.439128 s : WilsonFermion5D ComputeTime1/Calls : 2.85969 us
|
||||||
|
Grid : Message : 103.439130 s : WilsonFermion5D ComputeTime2/Calls : 236.325 us
|
||||||
|
Grid : Message : 103.439201 s : Average mflops/s per call : 3.60313e+10
|
||||||
|
Grid : Message : 103.439207 s : Average mflops/s per call per rank : 5.62989e+08
|
||||||
|
Grid : Message : 103.439209 s : Average mflops/s per call per node : 2.25196e+09
|
||||||
|
Grid : Message : 103.439211 s : Average mflops/s per call (full) : 7.67311e+07
|
||||||
|
Grid : Message : 103.439213 s : Average mflops/s per call per rank (full): 1.19892e+06
|
||||||
|
Grid : Message : 103.439215 s : Average mflops/s per call per node (full): 4.7957e+06
|
||||||
|
Grid : Message : 103.439217 s : WilsonFermion5D Stencil
|
||||||
|
Grid : Message : 103.439218 s : WilsonFermion5D StencilEven
|
||||||
|
Grid : Message : 103.439219 s : WilsonFermion5D StencilOdd
|
||||||
|
Grid : Message : 103.439220 s : WilsonFermion5D Stencil Reporti()
|
||||||
|
Grid : Message : 103.439221 s : WilsonFermion5D StencilEven Reporti()
|
||||||
|
Grid : Message : 103.439222 s : WilsonFermion5D StencilOdd Reporti()
|
||||||
|
Grid : Message : 112.177904 s : Compare to naive wilson implementation Dag to verify correctness
|
||||||
|
Grid : Message : 112.177939 s : Called DwDag
|
||||||
|
Grid : Message : 112.177940 s : norm dag result 12.0422
|
||||||
|
Grid : Message : 112.186235 s : norm dag ref 12.0422
|
||||||
|
Grid : Message : 112.189309 s : norm dag diff 7.28899e-14
|
||||||
|
Grid : Message : 112.200523 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
|
||||||
|
Grid : Message : 112.263704 s : src_e0.5
|
||||||
|
Grid : Message : 112.335429 s : src_o0.5
|
||||||
|
Grid : Message : 112.352238 s : *********************************************************
|
||||||
|
Grid : Message : 112.352244 s : * Benchmarking DomainWallFermionF::DhopEO
|
||||||
|
Grid : Message : 112.352246 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 112.352248 s : * SINGLE precision
|
||||||
|
Grid : Message : 112.352250 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 112.352253 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 112.352254 s : *********************************************************
|
||||||
|
Grid : Message : 160.328889 s : Deo mflop/s = 7.01193e+07
|
||||||
|
Grid : Message : 160.328922 s : Deo mflop/s per rank 1.09561e+06
|
||||||
|
Grid : Message : 160.328924 s : Deo mflop/s per node 4.38246e+06
|
||||||
|
Grid : Message : 160.328927 s : #### Dhop calls report
|
||||||
|
Grid : Message : 160.328929 s : WilsonFermion5D Number of DhopEO Calls : 30001
|
||||||
|
Grid : Message : 160.328931 s : WilsonFermion5D TotalTime /Calls : 1599.04 us
|
||||||
|
Grid : Message : 160.328933 s : WilsonFermion5D CommTime /Calls : 1088.05 us
|
||||||
|
Grid : Message : 160.328935 s : WilsonFermion5D FaceTime /Calls : 294.436 us
|
||||||
|
Grid : Message : 160.328937 s : WilsonFermion5D ComputeTime1/Calls : 4.78577 us
|
||||||
|
Grid : Message : 160.328939 s : WilsonFermion5D ComputeTime2/Calls : 241.411 us
|
||||||
|
Grid : Message : 160.328966 s : Average mflops/s per call : 2.07599e+10
|
||||||
|
Grid : Message : 160.328971 s : Average mflops/s per call per rank : 3.24373e+08
|
||||||
|
Grid : Message : 160.328975 s : Average mflops/s per call per node : 1.29749e+09
|
||||||
|
Grid : Message : 160.328980 s : Average mflops/s per call (full) : 7.13878e+07
|
||||||
|
Grid : Message : 160.328983 s : Average mflops/s per call per rank (full): 1.11543e+06
|
||||||
|
Grid : Message : 160.328987 s : Average mflops/s per call per node (full): 4.46174e+06
|
||||||
|
Grid : Message : 160.328989 s : WilsonFermion5D Stencil
|
||||||
|
Grid : Message : 160.328990 s : WilsonFermion5D StencilEven
|
||||||
|
Grid : Message : 160.328992 s : WilsonFermion5D StencilOdd
|
||||||
|
Grid : Message : 160.328995 s : WilsonFermion5D Stencil Reporti()
|
||||||
|
Grid : Message : 160.328997 s : WilsonFermion5D StencilEven Reporti()
|
||||||
|
Grid : Message : 160.329000 s : WilsonFermion5D StencilOdd Reporti()
|
||||||
|
Grid : Message : 160.348014 s : r_e6.02106
|
||||||
|
Grid : Message : 160.350033 s : r_o6.0211
|
||||||
|
Grid : Message : 160.351497 s : res12.0422
|
||||||
|
Grid : Message : 160.466811 s : norm diff 0
|
||||||
|
Grid : Message : 160.599190 s : norm diff even 0
|
||||||
|
Grid : Message : 160.669838 s : norm diff odd 0
|
1  2-racks/size-C0/16-nodes/job/power-16A-1020.64063/nodes  Normal file
@@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]
112
2-racks/size-C0/16-nodes/job/power-16A-1020.64063/script
Executable file
112
2-racks/size-C0/16-nodes/job/power-16A-1020.64063/script
Executable file
@ -0,0 +1,112 @@
|
|||||||
|
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170

# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa

#SBATCH -J power-16A-1020
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue

set -e

# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n

# IO environment ###############################################################

if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3

# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass

# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''

# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"

date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi

# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1020

# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"

# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 48.48.48.96 \
--shm 2048 &> "${job_info_dir}/log"

# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"

# reset GPUS ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"

# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"

# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
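The geometry flags in the mpirun call above have to be mutually consistent: the --mpi decomposition must multiply out to the number of MPI ranks requested from SLURM, and dividing --grid by --mpi gives the local lattice handled by each rank. A minimal sketch of that arithmetic for this job (the variable names are illustrative and not part of the script):

mpi_geom=(2 2 2 8)        # --mpi 2.2.2.8
grid_geom=(48 48 48 96)   # --grid 48.48.48.96
ranks=$(( mpi_geom[0] * mpi_geom[1] * mpi_geom[2] * mpi_geom[3] ))
echo "ranks = ${ranks}"   # 64, matching --ntasks=64 (16 nodes x 4 GPUs per node)
for i in 0 1 2 3; do
  echo "local extent $i = $(( grid_geom[i] / mpi_geom[i] ))"   # 24 24 24 12 per rank
done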
@ -0,0 +1,2 @@
Sat Aug 20 20:34:46 BST 2022
epoch 1661024086
@ -0,0 +1 @@
6a99c164661d090b82990d130b305895  /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32
4310
2-racks/size-C0/16-nodes/job/power-16A-1035.64067/elf
Normal file
File diff suppressed because it is too large
@ -0,0 +1,2 @@
Sat Aug 20 20:43:25 BST 2022
epoch 1661024605
2062
2-racks/size-C0/16-nodes/job/power-16A-1035.64067/env
Normal file
File diff suppressed because one or more lines are too long
26
2-racks/size-C0/16-nodes/job/power-16A-1035.64067/ldd
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
linux-vdso.so.1 (0x00007ffd625a8000)
|
||||||
|
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014ff21a6a000)
|
||||||
|
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014ff216a2000)
|
||||||
|
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014ff211b0000)
|
||||||
|
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014ff20e86000)
|
||||||
|
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014ff20ba5000)
|
||||||
|
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014ff20944000)
|
||||||
|
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014ff219f1000)
|
||||||
|
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014ff20564000)
|
||||||
|
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014ff1ee08000)
|
||||||
|
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014ff1ea38000)
|
||||||
|
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014ff1e797000)
|
||||||
|
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014ff1e66c000)
|
||||||
|
libm.so.6 => /lib64/libm.so.6 (0x000014ff1e2ea000)
|
||||||
|
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014ff1e0b3000)
|
||||||
|
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014ff1de9b000)
|
||||||
|
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014ff1dc7b000)
|
||||||
|
libc.so.6 => /lib64/libc.so.6 (0x000014ff1d8b6000)
|
||||||
|
libdl.so.2 => /lib64/libdl.so.2 (0x000014ff1d6b2000)
|
||||||
|
/lib64/ld-linux-x86-64.so.2 (0x000014ff218ba000)
|
||||||
|
librt.so.1 => /lib64/librt.so.1 (0x000014ff1d4aa000)
|
||||||
|
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014ff21925000)
|
||||||
|
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014ff21920000)
|
||||||
|
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014ff1d39e000)
|
||||||
|
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014ff1d194000)
|
||||||
|
libutil.so.1 => /lib64/libutil.so.1 (0x000014ff1cf90000)
|
286
2-racks/size-C0/16-nodes/job/power-16A-1035.64067/log
Normal file
@ -0,0 +1,286 @@
|
|||||||
|
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 3
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 3
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
local rank 1 device 0 bus id: 0000:44:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 0 device 0 bus id: 0000:03:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 0 device 0 bus id: 0000:03:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 2 device 0 bus id: 0000:84:00.0
|
||||||
|
local rank 3 device 0 bus id: 0000:C4:00.0
|
||||||
|
SharedMemoryMpi: World communicator of size 64
|
||||||
|
SharedMemoryMpi: Node communicator of size 4
|
||||||
|
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x146a80000000 for comms buffers
|
||||||
|
Setting up IPC
|
||||||
|
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|_ | | | | | | | | | | | | _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|_ GGGG RRRR III DDDD _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G GG RRRR I D D _|__
|
||||||
|
__|_ G G R R I D D _|__
|
||||||
|
__|_ GGGG R R III DDDD _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
| | | | | | | | | | | | | |
|
||||||
|
|
||||||
|
|
||||||
|
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
|
||||||
|
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : MPI is initialised and logging filters activated
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : Requested 2147483648 byte stencil comms buffers
|
||||||
|
Grid : Message : MemoryManager Cache 34004218675 bytes
|
||||||
|
Grid : Message : MemoryManager::Init() setting up
|
||||||
|
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
|
||||||
|
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
|
||||||
|
Grid : Message : MemoryManager::Init() Using cudaMalloc
|
||||||
|
Grid : Message : 1.623478 s : Grid Layout
|
||||||
|
Grid : Message : 1.623482 s : Global lattice size : 48 48 48 96
|
||||||
|
Grid : Message : 1.623486 s : OpenMP threads : 4
|
||||||
|
Grid : Message : 1.623488 s : MPI tasks : 2 2 2 8
|
||||||
|
Grid : Message : 1.637678 s : Making s innermost grids
|
||||||
|
Grid : Message : 1.654638 s : Initialising 4d RNG
|
||||||
|
Grid : Message : 1.670417 s : Intialising parallel RNG with unique string 'The 4D RNG'
|
||||||
|
Grid : Message : 1.670443 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
|
||||||
|
Grid : Message : 2.165386 s : Initialising 5d RNG
|
||||||
|
Grid : Message : 2.399472 s : Intialising parallel RNG with unique string 'The 5D RNG'
|
||||||
|
Grid : Message : 2.399504 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
|
||||||
|
Grid : Message : 7.787095 s : Initialised RNGs
|
||||||
|
Grid : Message : 8.568006 s : Drawing gauge field
|
||||||
|
Grid : Message : 8.661012 s : Random gauge initialised
|
||||||
|
Grid : Message : 8.665024 s : Setting up Cshift based reference
|
||||||
|
Grid : Message : 13.760660 s : *****************************************************************
|
||||||
|
Grid : Message : 13.760685 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
|
||||||
|
Grid : Message : 13.760687 s : *****************************************************************
|
||||||
|
Grid : Message : 13.760690 s : *****************************************************************
|
||||||
|
Grid : Message : 13.760691 s : * Benchmarking DomainWallFermionR::Dhop
|
||||||
|
Grid : Message : 13.760692 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 13.760694 s : * VComplexF size is 64 B
|
||||||
|
Grid : Message : 13.760696 s : * SINGLE precision
|
||||||
|
Grid : Message : 13.760697 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 13.760698 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 13.760700 s : *****************************************************************
|
||||||
|
Grid : Message : 14.326353 s : Called warmup
|
||||||
|
Grid : Message : 102.469231 s : Called Dw 30000 times in 8.81428e+07 us
|
||||||
|
Grid : Message : 102.469296 s : mflop/s = 7.63173e+07
|
||||||
|
Grid : Message : 102.469299 s : mflop/s per rank = 1.19246e+06
|
||||||
|
Grid : Message : 102.469307 s : mflop/s per node = 4.76983e+06
|
||||||
|
Grid : Message : 102.469310 s : RF GiB/s (base 2) = 155075
|
||||||
|
Grid : Message : 102.469313 s : mem GiB/s (base 2) = 96921.9
|
||||||
|
Grid : Message : 102.469886 s : norm diff 1.05775e-13
|
||||||
|
Grid : Message : 102.480527 s : #### Dhop calls report
|
||||||
|
Grid : Message : 102.480534 s : WilsonFermion5D Number of DhopEO Calls : 60002
|
||||||
|
Grid : Message : 102.480538 s : WilsonFermion5D TotalTime /Calls : 1470.47 us
|
||||||
|
Grid : Message : 102.480540 s : WilsonFermion5D CommTime /Calls : 1029.89 us
|
||||||
|
Grid : Message : 102.480542 s : WilsonFermion5D FaceTime /Calls : 217.938 us
|
||||||
|
Grid : Message : 102.480544 s : WilsonFermion5D ComputeTime1/Calls : 3.09645 us
|
||||||
|
Grid : Message : 102.480546 s : WilsonFermion5D ComputeTime2/Calls : 235.402 us
|
||||||
|
Grid : Message : 102.480575 s : Average mflops/s per call : 3.61099e+10
|
||||||
|
Grid : Message : 102.480579 s : Average mflops/s per call per rank : 5.64217e+08
|
||||||
|
Grid : Message : 102.480581 s : Average mflops/s per call per node : 2.25687e+09
|
||||||
|
Grid : Message : 102.480583 s : Average mflops/s per call (full) : 7.76299e+07
|
||||||
|
Grid : Message : 102.480587 s : Average mflops/s per call per rank (full): 1.21297e+06
|
||||||
|
Grid : Message : 102.480590 s : Average mflops/s per call per node (full): 4.85187e+06
|
||||||
|
Grid : Message : 102.480593 s : WilsonFermion5D Stencil
|
||||||
|
Grid : Message : 102.480596 s : WilsonFermion5D StencilEven
|
||||||
|
Grid : Message : 102.480598 s : WilsonFermion5D StencilOdd
|
||||||
|
Grid : Message : 102.480600 s : WilsonFermion5D Stencil Reporti()
|
||||||
|
Grid : Message : 102.480603 s : WilsonFermion5D StencilEven Reporti()
|
||||||
|
Grid : Message : 102.480605 s : WilsonFermion5D StencilOdd Reporti()
|
||||||
|
Grid : Message : 111.202302 s : Compare to naive wilson implementation Dag to verify correctness
|
||||||
|
Grid : Message : 111.202331 s : Called DwDag
|
||||||
|
Grid : Message : 111.202332 s : norm dag result 12.0422
|
||||||
|
Grid : Message : 111.204652 s : norm dag ref 12.0422
|
||||||
|
Grid : Message : 111.207748 s : norm dag diff 7.28899e-14
|
||||||
|
Grid : Message : 111.218376 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
|
||||||
|
Grid : Message : 111.273653 s : src_e0.5
|
||||||
|
Grid : Message : 111.352934 s : src_o0.5
|
||||||
|
Grid : Message : 111.369965 s : *********************************************************
|
||||||
|
Grid : Message : 111.369970 s : * Benchmarking DomainWallFermionF::DhopEO
|
||||||
|
Grid : Message : 111.369974 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 111.369976 s : * SINGLE precision
|
||||||
|
Grid : Message : 111.369977 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 111.369981 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 111.369983 s : *********************************************************
|
||||||
|
Grid : Message : 158.806725 s : Deo mflop/s = 7.09164e+07
|
||||||
|
Grid : Message : 158.806755 s : Deo mflop/s per rank 1.10807e+06
|
||||||
|
Grid : Message : 158.806757 s : Deo mflop/s per node 4.43227e+06
|
||||||
|
Grid : Message : 158.806760 s : #### Dhop calls report
|
||||||
|
Grid : Message : 158.806762 s : WilsonFermion5D Number of DhopEO Calls : 30001
|
||||||
|
Grid : Message : 158.806764 s : WilsonFermion5D TotalTime /Calls : 1581.06 us
|
||||||
|
Grid : Message : 158.806766 s : WilsonFermion5D CommTime /Calls : 1077.77 us
|
||||||
|
Grid : Message : 158.806768 s : WilsonFermion5D FaceTime /Calls : 286.721 us
|
||||||
|
Grid : Message : 158.806770 s : WilsonFermion5D ComputeTime1/Calls : 4.98297 us
|
||||||
|
Grid : Message : 158.806772 s : WilsonFermion5D ComputeTime2/Calls : 240.035 us
|
||||||
|
Grid : Message : 158.806792 s : Average mflops/s per call : 2.0753e+10
|
||||||
|
Grid : Message : 158.806796 s : Average mflops/s per call per rank : 3.24266e+08
|
||||||
|
Grid : Message : 158.806798 s : Average mflops/s per call per node : 1.29706e+09
|
||||||
|
Grid : Message : 158.806800 s : Average mflops/s per call (full) : 7.21996e+07
|
||||||
|
Grid : Message : 158.806804 s : Average mflops/s per call per rank (full): 1.12812e+06
|
||||||
|
Grid : Message : 158.806807 s : Average mflops/s per call per node (full): 4.51247e+06
|
||||||
|
Grid : Message : 158.806809 s : WilsonFermion5D Stencil
|
||||||
|
Grid : Message : 158.806810 s : WilsonFermion5D StencilEven
|
||||||
|
Grid : Message : 158.806812 s : WilsonFermion5D StencilOdd
|
||||||
|
Grid : Message : 158.806814 s : WilsonFermion5D Stencil Reporti()
|
||||||
|
Grid : Message : 158.806816 s : WilsonFermion5D StencilEven Reporti()
|
||||||
|
Grid : Message : 158.806818 s : WilsonFermion5D StencilOdd Reporti()
|
||||||
|
Grid : Message : 158.823821 s : r_e6.02106
|
||||||
|
Grid : Message : 158.827207 s : r_o6.0211
|
||||||
|
Grid : Message : 158.828617 s : res12.0422
|
||||||
|
Grid : Message : 158.938772 s : norm diff 0
|
||||||
|
Grid : Message : 159.724700 s : norm diff even 0
|
||||||
|
Grid : Message : 159.148761 s : norm diff odd 0
|
1
2-racks/size-C0/16-nodes/job/power-16A-1035.64067/nodes
Normal file
@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]
112
2-racks/size-C0/16-nodes/job/power-16A-1035.64067/script
Executable file
@ -0,0 +1,112 @@
#!/usr/bin/env bash
|
||||||
|
# shellcheck disable=SC1091,SC2050,SC2170
|
||||||
|
|
||||||
|
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
|
||||||
|
|
||||||
|
#SBATCH -J power-16A-1035
|
||||||
|
#SBATCH -A dp207
|
||||||
|
#SBATCH -t 48:00:00
|
||||||
|
#SBATCH --nodes=16
|
||||||
|
#SBATCH --ntasks=64
|
||||||
|
#SBATCH --ntasks-per-node=4
|
||||||
|
#SBATCH --cpus-per-task=8
|
||||||
|
#SBATCH --partition=gpu
|
||||||
|
#SBATCH --gres=gpu:4
|
||||||
|
#SBATCH --output=%x.%j.out
|
||||||
|
#SBATCH --error=%x.%j.err
|
||||||
|
#SBATCH --reservation=dc-port1_61
|
||||||
|
#SBATCH --qos=reservation
|
||||||
|
#SBATCH --no-requeue
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
# OpenMP/OpenMPI/UCX environment ###############################################
|
||||||
|
export OMP_NUM_THREADS=4
|
||||||
|
export OMPI_MCA_btl=^uct,openib
|
||||||
|
export OMPI_MCA_pml=ucx
|
||||||
|
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
|
||||||
|
export UCX_RNDV_SCHEME=put_zcopy
|
||||||
|
export UCX_RNDV_THRESH=16384
|
||||||
|
export UCX_IB_GPU_DIRECT_RDMA=yes
|
||||||
|
export UCX_MEMTYPE_CACHE=n
|
||||||
|
|
||||||
|
# IO environment ###############################################################
|
||||||
|
|
||||||
|
if [ 16 -eq 1 ]; then
|
||||||
|
export OMPI_MCA_io=ompio
|
||||||
|
else
|
||||||
|
export OMPI_MCA_io=romio321
|
||||||
|
fi
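# Note (added, assumed rather than documented): the literal 16 in the test above is
# presumably the node count substituted in by whatever generated this script
# (cf. #SBATCH --nodes=16), which is why shellcheck SC2050/SC2170 ("constant
# expression") are disabled at the top; a single-node job would select OpenMPI's
# ompio component, multi-node jobs fall back to ROMIO.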
|
||||||
|
export OMPI_MCA_btl_openib_allow_ib=true
|
||||||
|
export OMPI_MCA_btl_openib_device_type=infiniband
|
||||||
|
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
|
||||||
|
|
||||||
|
# load environment #############################################################
|
||||||
|
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
|
||||||
|
source "${env_dir}/env-base.sh"
|
||||||
|
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
|
||||||
|
source "${env_dir}/env-gpu.sh"
|
||||||
|
else
|
||||||
|
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
spack load sshpass
|
||||||
|
|
||||||
|
# application and parameters ###################################################
|
||||||
|
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
|
||||||
|
opt=('--comms-overlap' '--comms-concurrent')
|
||||||
|
par=''
|
||||||
|
|
||||||
|
# collect job information ######################################################
|
||||||
|
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
|
||||||
|
mkdir -p "${job_info_dir}"
|
||||||
|
|
||||||
|
date > "${job_info_dir}/start-date"
|
||||||
|
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
|
||||||
|
set > "${job_info_dir}/env"
|
||||||
|
ldd ${app} > "${job_info_dir}/ldd"
|
||||||
|
md5sum ${app} > "${job_info_dir}/app-hash"
|
||||||
|
readelf -a ${app} > "${job_info_dir}/elf"
|
||||||
|
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
|
||||||
|
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
|
||||||
|
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
|
||||||
|
|
||||||
|
# GPU frequency control ########################################################
|
||||||
|
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
|
||||||
|
freq=1035
|
||||||
|
|
||||||
|
# set frequency
|
||||||
|
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
|
||||||
|
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
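# Note (added): "nvidia-smi -ac <memory,graphics>" sets the application clocks in
# MHz, so this pins the HBM clock at 1215 MHz and caps the SM clock at ${freq} MHz
# (1035 for this job); the loop at the end of the script puts the pair back to 1215,1410.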
|
||||||
|
done
|
||||||
|
# start NVIDIA SMI monitoring
|
||||||
|
tmp=$(mktemp)
|
||||||
|
sleep 1
|
||||||
|
coproc nvidia-smi dmon -o DT &> "${tmp}"
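# Note (added): "nvidia-smi dmon" samples per-GPU clocks, power and utilisation
# about once per second, and -o DT prefixes each sample with date and time columns.
# Running it under coproc keeps the monitor alive in the background; COPROC_PID is
# used further down to stop it with SIGINT once the benchmark has finished.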
|
||||||
|
|
||||||
|
# run! #########################################################################
|
||||||
|
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
|
||||||
|
./gpu-mpi-wrapper.sh \
|
||||||
|
${app} "${par}" "${opt[@]}" \
|
||||||
|
--mpi 2.2.2.8 \
|
||||||
|
--accelerator-threads 8 \
|
||||||
|
--grid 48.48.48.96 \
|
||||||
|
--shm 2048 &> "${job_info_dir}/log"
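# Note (added): --shm is given in MiB; 2048 MiB matches the "Requested 2147483648
# byte stencil comms buffers" line that Grid prints in the captured log for this job.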
|
||||||
|
|
||||||
|
# if we reach that point the application exited successfully ###################
|
||||||
|
touch "${job_info_dir}/success"
|
||||||
|
date > "${job_info_dir}/end-date"
|
||||||
|
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
|
||||||
|
|
||||||
|
# reset GPUS ###################################################################
|
||||||
|
# stop monitoring
|
||||||
|
kill -INT "${COPROC_PID}"
|
||||||
|
|
||||||
|
# make monitoring DB
|
||||||
|
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
|
||||||
|
|
||||||
|
# reset clocks
|
||||||
|
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
|
||||||
|
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
|
||||||
|
done
|
||||||
|
################################################################################
|
@ -0,0 +1,2 @@
Sat Aug 20 20:40:36 BST 2022
epoch 1661024436
@ -0,0 +1 @@
6a99c164661d090b82990d130b305895  /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32
4310
2-racks/size-C0/16-nodes/job/power-16A-1050.64071/elf
Normal file
File diff suppressed because it is too large
@ -0,0 +1,2 @@
Sat Aug 20 20:49:15 BST 2022
epoch 1661024955
2062
2-racks/size-C0/16-nodes/job/power-16A-1050.64071/env
Normal file
File diff suppressed because one or more lines are too long
26
2-racks/size-C0/16-nodes/job/power-16A-1050.64071/ldd
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
linux-vdso.so.1 (0x00007ffe2b5fb000)
|
||||||
|
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x00001470cbce5000)
|
||||||
|
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x00001470cb91d000)
|
||||||
|
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x00001470cb42b000)
|
||||||
|
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x00001470cb101000)
|
||||||
|
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x00001470cae20000)
|
||||||
|
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x00001470cabbf000)
|
||||||
|
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x00001470cbc6c000)
|
||||||
|
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x00001470ca7df000)
|
||||||
|
libcuda.so.1 => /lib64/libcuda.so.1 (0x00001470c9083000)
|
||||||
|
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x00001470c8cb3000)
|
||||||
|
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x00001470c8a12000)
|
||||||
|
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x00001470c88e7000)
|
||||||
|
libm.so.6 => /lib64/libm.so.6 (0x00001470c8565000)
|
||||||
|
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x00001470c832e000)
|
||||||
|
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x00001470c8116000)
|
||||||
|
libpthread.so.0 => /lib64/libpthread.so.0 (0x00001470c7ef6000)
|
||||||
|
libc.so.6 => /lib64/libc.so.6 (0x00001470c7b31000)
|
||||||
|
libdl.so.2 => /lib64/libdl.so.2 (0x00001470c792d000)
|
||||||
|
/lib64/ld-linux-x86-64.so.2 (0x00001470cbb35000)
|
||||||
|
librt.so.1 => /lib64/librt.so.1 (0x00001470c7725000)
|
||||||
|
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x00001470cbba0000)
|
||||||
|
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x00001470cbb9b000)
|
||||||
|
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x00001470c7619000)
|
||||||
|
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x00001470c740f000)
|
||||||
|
libutil.so.1 => /lib64/libutil.so.1 (0x00001470c720b000)
|
286
2-racks/size-C0/16-nodes/job/power-16A-1050.64071/log
Normal file
@ -0,0 +1,286 @@
|
|||||||
|
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 3
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 3
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
local rank 3 device 0 bus id: 0000:C4:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 2 device 0 bus id: 0000:84:00.0
|
||||||
|
local rank 0 device 0 bus id: 0000:03:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 1 device 0 bus id: 0000:44:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 0 device 0 bus id: 0000:03:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
SharedMemoryMpi: World communicator of size 64
|
||||||
|
SharedMemoryMpi: Node communicator of size 4
|
||||||
|
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14f600000000 for comms buffers
|
||||||
|
Setting up IPC
|
||||||
|
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|_ | | | | | | | | | | | | _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|_ GGGG RRRR III DDDD _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G GG RRRR I D D _|__
|
||||||
|
__|_ G G R R I D D _|__
|
||||||
|
__|_ GGGG R R III DDDD _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
| | | | | | | | | | | | | |
|
||||||
|
|
||||||
|
|
||||||
|
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
|
||||||
|
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : MPI is initialised and logging filters activated
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : Requested 2147483648 byte stencil comms buffers
|
||||||
|
Grid : Message : MemoryManager Cache 34004218675 bytes
|
||||||
|
Grid : Message : MemoryManager::Init() setting up
|
||||||
|
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
|
||||||
|
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
|
||||||
|
Grid : Message : MemoryManager::Init() Using cudaMalloc
|
||||||
|
Grid : Message : 1.720184 s : Grid Layout
|
||||||
|
Grid : Message : 1.720188 s : Global lattice size : 48 48 48 96
|
||||||
|
Grid : Message : 1.720196 s : OpenMP threads : 4
|
||||||
|
Grid : Message : 1.720199 s : MPI tasks : 2 2 2 8
|
||||||
|
Grid : Message : 1.735275 s : Making s innermost grids
|
||||||
|
Grid : Message : 1.752323 s : Initialising 4d RNG
|
||||||
|
Grid : Message : 1.768478 s : Intialising parallel RNG with unique string 'The 4D RNG'
|
||||||
|
Grid : Message : 1.768504 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
|
||||||
|
Grid : Message : 2.201838 s : Initialising 5d RNG
|
||||||
|
Grid : Message : 2.438683 s : Intialising parallel RNG with unique string 'The 5D RNG'
|
||||||
|
Grid : Message : 2.438714 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
|
||||||
|
Grid : Message : 7.906459 s : Initialised RNGs
|
||||||
|
Grid : Message : 8.718015 s : Drawing gauge field
|
||||||
|
Grid : Message : 8.851801 s : Random gauge initialised
|
||||||
|
Grid : Message : 8.862438 s : Setting up Cshift based reference
|
||||||
|
Grid : Message : 13.896599 s : *****************************************************************
|
||||||
|
Grid : Message : 13.896621 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
|
||||||
|
Grid : Message : 13.896622 s : *****************************************************************
|
||||||
|
Grid : Message : 13.896623 s : *****************************************************************
|
||||||
|
Grid : Message : 13.896624 s : * Benchmarking DomainWallFermionR::Dhop
|
||||||
|
Grid : Message : 13.896625 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 13.896626 s : * VComplexF size is 64 B
|
||||||
|
Grid : Message : 13.896627 s : * SINGLE precision
|
||||||
|
Grid : Message : 13.896628 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 13.896629 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 13.896630 s : *****************************************************************
|
||||||
|
Grid : Message : 14.428387 s : Called warmup
|
||||||
|
Grid : Message : 101.915473 s : Called Dw 30000 times in 8.74869e+07 us
|
||||||
|
Grid : Message : 101.915527 s : mflop/s = 7.68895e+07
|
||||||
|
Grid : Message : 101.915529 s : mflop/s per rank = 1.2014e+06
|
||||||
|
Grid : Message : 101.915531 s : mflop/s per node = 4.80559e+06
|
||||||
|
Grid : Message : 101.915533 s : RF GiB/s (base 2) = 156238
|
||||||
|
Grid : Message : 101.915535 s : mem GiB/s (base 2) = 97648.5
|
||||||
|
Grid : Message : 101.916107 s : norm diff 1.05775e-13
|
||||||
|
Grid : Message : 101.926218 s : #### Dhop calls report
|
||||||
|
Grid : Message : 101.926225 s : WilsonFermion5D Number of DhopEO Calls : 60002
|
||||||
|
Grid : Message : 101.926228 s : WilsonFermion5D TotalTime /Calls : 1459.21 us
|
||||||
|
Grid : Message : 101.926230 s : WilsonFermion5D CommTime /Calls : 1016.78 us
|
||||||
|
Grid : Message : 101.926232 s : WilsonFermion5D FaceTime /Calls : 219.506 us
|
||||||
|
Grid : Message : 101.926234 s : WilsonFermion5D ComputeTime1/Calls : 2.78512 us
|
||||||
|
Grid : Message : 101.926236 s : WilsonFermion5D ComputeTime2/Calls : 235.25 us
|
||||||
|
Grid : Message : 101.926330 s : Average mflops/s per call : 3.60206e+10
|
||||||
|
Grid : Message : 101.926334 s : Average mflops/s per call per rank : 5.62822e+08
|
||||||
|
Grid : Message : 101.926336 s : Average mflops/s per call per node : 2.25129e+09
|
||||||
|
Grid : Message : 101.926338 s : Average mflops/s per call (full) : 7.82287e+07
|
||||||
|
Grid : Message : 101.926340 s : Average mflops/s per call per rank (full): 1.22232e+06
|
||||||
|
Grid : Message : 101.926342 s : Average mflops/s per call per node (full): 4.88929e+06
Grid : Message : 101.926344 s : WilsonFermion5D Stencil
Grid : Message : 101.926345 s : WilsonFermion5D StencilEven
Grid : Message : 101.926346 s : WilsonFermion5D StencilOdd
Grid : Message : 101.926347 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 101.926348 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 101.926349 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 110.616405 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 110.616430 s : Called DwDag
Grid : Message : 110.616431 s : norm dag result 12.0422
Grid : Message : 110.621134 s : norm dag ref 12.0422
Grid : Message : 110.624323 s : norm dag diff 7.28899e-14
Grid : Message : 110.637247 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 110.698940 s : src_e0.5
Grid : Message : 110.766761 s : src_o0.5
Grid : Message : 110.783307 s : *********************************************************
Grid : Message : 110.783311 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 110.783313 s : * Vectorising space-time by 8
Grid : Message : 110.783315 s : * SINGLE precision
Grid : Message : 110.783316 s : * Using Overlapped Comms/Compute
Grid : Message : 110.783317 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 110.783318 s : *********************************************************
Grid : Message : 157.764942 s : Deo mflop/s = 7.16075e+07
Grid : Message : 157.764976 s : Deo mflop/s per rank 1.11887e+06
Grid : Message : 157.764978 s : Deo mflop/s per node 4.47547e+06
Grid : Message : 157.764981 s : #### Dhop calls report
Grid : Message : 157.764983 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 157.764985 s : WilsonFermion5D TotalTime /Calls : 1565.89 us
Grid : Message : 157.764987 s : WilsonFermion5D CommTime /Calls : 1058.27 us
Grid : Message : 157.764989 s : WilsonFermion5D FaceTime /Calls : 292.487 us
Grid : Message : 157.764991 s : WilsonFermion5D ComputeTime1/Calls : 4.72584 us
Grid : Message : 157.764993 s : WilsonFermion5D ComputeTime2/Calls : 239.678 us
Grid : Message : 157.765020 s : Average mflops/s per call : 2.07994e+10
Grid : Message : 157.765024 s : Average mflops/s per call per rank : 3.2499e+08
Grid : Message : 157.765027 s : Average mflops/s per call per node : 1.29996e+09
Grid : Message : 157.765031 s : Average mflops/s per call (full) : 7.28994e+07
Grid : Message : 157.765035 s : Average mflops/s per call per rank (full): 1.13905e+06
Grid : Message : 157.765039 s : Average mflops/s per call per node (full): 4.55621e+06
Grid : Message : 157.765042 s : WilsonFermion5D Stencil
Grid : Message : 157.765044 s : WilsonFermion5D StencilEven
Grid : Message : 157.765046 s : WilsonFermion5D StencilOdd
Grid : Message : 157.765049 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 157.765051 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 157.765053 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 157.783731 s : r_e6.02106
Grid : Message : 157.786036 s : r_o6.0211
Grid : Message : 157.787470 s : res12.0422
Grid : Message : 157.905573 s : norm diff 0
Grid : Message : 158.337590 s : norm diff even 0
Grid : Message : 158.959010 s : norm diff odd 0
1
2-racks/size-C0/16-nodes/job/power-16A-1050.64071/nodes
Normal file
@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]
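The nodes file above records the Slurm nodelist in its compressed bracket form. The job script below expands it back into individual hostnames with scontrol before looping over the nodes; a minimal illustration of that expansion (the loop body here is a placeholder, not taken from the repository):

#!/usr/bin/env bash
# Expand a compressed Slurm nodelist into one hostname per line.
nodelist='tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]'
for h in $(scontrol show hostnames "${nodelist}"); do
    echo "per-node command would run on ${h}"   # placeholder action
done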
112
2-racks/size-C0/16-nodes/job/power-16A-1050.64071/script
Executable file
@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170

# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa

#SBATCH -J power-16A-1050
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue

set -e

# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n

# IO environment ###############################################################

if [ 16 -eq 1 ]; then
    export OMPI_MCA_io=ompio
else
    export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3

# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
    source "${env_dir}/env-gpu.sh"
else
    echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
    exit 1
fi
spack load sshpass

# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''

# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"

date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi

# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1050

# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
    ${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"

# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
    ./gpu-mpi-wrapper.sh \
    ${app} "${par}" "${opt[@]}" \
    --mpi 2.2.2.8 \
    --accelerator-threads 8 \
    --grid 48.48.48.96 \
    --shm 2048 &> "${job_info_dir}/log"

# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"

# reset GPUS ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"

# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"

# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
    ${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
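The dmon-to-db.sh helper called near the end of the script is referenced but not included in this part of the listing. As a rough sketch of the kind of conversion it performs, nvidia-smi dmon text into an SQLite table, one could imagine something like the following; the script name, schema, and column layout here are assumptions for illustration, not the actual helper or the schema of smi-dmon-16A.db:

#!/usr/bin/env bash
# Hypothetical sketch: load 'nvidia-smi dmon' samples into an SQLite table.
# Usage: ./dmon-to-db-sketch.sh <dmon-output-file> <db-file> <tag>
in_file=$1; db=$2; tag=$3

sqlite3 "${db}" 'CREATE TABLE IF NOT EXISTS dmon (tag TEXT, sample TEXT);'

# Skip the '#' header lines that dmon prints and store each sample row
# verbatim, together with the run label (e.g. clock_limit_1050).
grep -v '^#' "${in_file}" | while read -r line; do
    printf "INSERT INTO dmon (tag, sample) VALUES ('%s', '%s');\n" "${tag}" "${line}"
done | sqlite3 "${db}"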
@ -0,0 +1,2 @@
Sat Aug 20 20:46:27 BST 2022
epoch 1661024788
@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32
4310
2-racks/size-C0/16-nodes/job/power-16A-1065.64076/elf
Normal file
File diff suppressed because it is too large
@ -0,0 +1,2 @@
Sat Aug 20 20:55:03 BST 2022
epoch 1661025303
2062
2-racks/size-C0/16-nodes/job/power-16A-1065.64076/env
Normal file
File diff suppressed because one or more lines are too long
26
2-racks/size-C0/16-nodes/job/power-16A-1065.64076/ldd
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
linux-vdso.so.1 (0x00007ffd9b1d1000)
|
||||||
|
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014a2805dc000)
|
||||||
|
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014a280214000)
|
||||||
|
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014a27fd22000)
|
||||||
|
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014a27f9f8000)
|
||||||
|
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014a27f717000)
|
||||||
|
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014a27f4b6000)
|
||||||
|
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014a280563000)
|
||||||
|
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014a27f0d6000)
|
||||||
|
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014a27d97a000)
|
||||||
|
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014a27d5aa000)
|
||||||
|
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014a27d309000)
|
||||||
|
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014a27d1de000)
|
||||||
|
libm.so.6 => /lib64/libm.so.6 (0x000014a27ce5c000)
|
||||||
|
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014a27cc25000)
|
||||||
|
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014a27ca0d000)
|
||||||
|
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014a27c7ed000)
|
||||||
|
libc.so.6 => /lib64/libc.so.6 (0x000014a27c428000)
|
||||||
|
libdl.so.2 => /lib64/libdl.so.2 (0x000014a27c224000)
|
||||||
|
/lib64/ld-linux-x86-64.so.2 (0x000014a28042c000)
|
||||||
|
librt.so.1 => /lib64/librt.so.1 (0x000014a27c01c000)
|
||||||
|
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014a280497000)
|
||||||
|
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014a280492000)
|
||||||
|
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014a27bf10000)
|
||||||
|
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014a27bd06000)
|
||||||
|
libutil.so.1 => /lib64/libutil.so.1 (0x000014a27bb02000)
|
286
2-racks/size-C0/16-nodes/job/power-16A-1065.64076/log
Normal file
@ -0,0 +1,286 @@
|
|||||||
|
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 3
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 3
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
local rank 2 device 0 bus id: 0000:84:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 0 device 0 bus id: 0000:03:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 3 device 0 bus id: 0000:C4:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 1 device 0 bus id: 0000:44:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 0 device 0 bus id: 0000:03:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
SharedMemoryMpi: World communicator of size 64
|
||||||
|
SharedMemoryMpi: Node communicator of size 4
|
||||||
|
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x150120000000 for comms buffers
|
||||||
|
Setting up IPC
|
||||||
|
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|_ | | | | | | | | | | | | _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|_ GGGG RRRR III DDDD _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G GG RRRR I D D _|__
|
||||||
|
__|_ G G R R I D D _|__
|
||||||
|
__|_ GGGG R R III DDDD _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
| | | | | | | | | | | | | |
|
||||||
|
|
||||||
|
|
||||||
|
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
|
||||||
|
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : MPI is initialised and logging filters activated
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : Requested 2147483648 byte stencil comms buffers
|
||||||
|
Grid : Message : MemoryManager Cache 34004218675 bytes
|
||||||
|
Grid : Message : MemoryManager::Init() setting up
|
||||||
|
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
|
||||||
|
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
|
||||||
|
Grid : Message : MemoryManager::Init() Using cudaMalloc
|
||||||
|
Grid : Message : 1.428183 s : Grid Layout
|
||||||
|
Grid : Message : 1.428187 s : Global lattice size : 48 48 48 96
|
||||||
|
Grid : Message : 1.428193 s : OpenMP threads : 4
|
||||||
|
Grid : Message : 1.428196 s : MPI tasks : 2 2 2 8
|
||||||
|
Grid : Message : 1.443217 s : Making s innermost grids
|
||||||
|
Grid : Message : 1.455165 s : Initialising 4d RNG
|
||||||
|
Grid : Message : 1.471981 s : Intialising parallel RNG with unique string 'The 4D RNG'
|
||||||
|
Grid : Message : 1.472007 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
|
||||||
|
Grid : Message : 1.853366 s : Initialising 5d RNG
|
||||||
|
Grid : Message : 2.875960 s : Intialising parallel RNG with unique string 'The 5D RNG'
|
||||||
|
Grid : Message : 2.876470 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
|
||||||
|
Grid : Message : 7.305707 s : Initialised RNGs
|
||||||
|
Grid : Message : 8.397843 s : Drawing gauge field
|
||||||
|
Grid : Message : 8.484443 s : Random gauge initialised
|
||||||
|
Grid : Message : 8.488387 s : Setting up Cshift based reference
|
||||||
|
Grid : Message : 13.563627 s : *****************************************************************
|
||||||
|
Grid : Message : 13.563653 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
|
||||||
|
Grid : Message : 13.563655 s : *****************************************************************
|
||||||
|
Grid : Message : 13.563658 s : *****************************************************************
|
||||||
|
Grid : Message : 13.563659 s : * Benchmarking DomainWallFermionR::Dhop
|
||||||
|
Grid : Message : 13.563660 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 13.563663 s : * VComplexF size is 64 B
|
||||||
|
Grid : Message : 13.563665 s : * SINGLE precision
|
||||||
|
Grid : Message : 13.563667 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 13.563668 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 13.563669 s : *****************************************************************
|
||||||
|
Grid : Message : 14.958310 s : Called warmup
|
||||||
|
Grid : Message : 101.445133 s : Called Dw 30000 times in 8.73489e+07 us
|
||||||
|
Grid : Message : 101.445198 s : mflop/s = 7.7011e+07
|
||||||
|
Grid : Message : 101.445200 s : mflop/s per rank = 1.2033e+06
|
||||||
|
Grid : Message : 101.445202 s : mflop/s per node = 4.81319e+06
|
||||||
|
Grid : Message : 101.445204 s : RF GiB/s (base 2) = 156485
|
||||||
|
Grid : Message : 101.445206 s : mem GiB/s (base 2) = 97802.9
|
||||||
|
Grid : Message : 101.445777 s : norm diff 1.05775e-13
|
||||||
|
Grid : Message : 101.455931 s : #### Dhop calls report
|
||||||
|
Grid : Message : 101.455939 s : WilsonFermion5D Number of DhopEO Calls : 60002
|
||||||
|
Grid : Message : 101.455943 s : WilsonFermion5D TotalTime /Calls : 1457.12 us
|
||||||
|
Grid : Message : 101.455945 s : WilsonFermion5D CommTime /Calls : 1014.92 us
|
||||||
|
Grid : Message : 101.455947 s : WilsonFermion5D FaceTime /Calls : 219.441 us
|
||||||
|
Grid : Message : 101.455949 s : WilsonFermion5D ComputeTime1/Calls : 2.84344 us
|
||||||
|
Grid : Message : 101.455951 s : WilsonFermion5D ComputeTime2/Calls : 235.367 us
|
||||||
|
Grid : Message : 101.455978 s : Average mflops/s per call : 3.61947e+10
|
||||||
|
Grid : Message : 101.455982 s : Average mflops/s per call per rank : 5.65543e+08
|
||||||
|
Grid : Message : 101.455984 s : Average mflops/s per call per node : 2.26217e+09
|
||||||
|
Grid : Message : 101.455986 s : Average mflops/s per call (full) : 7.83407e+07
|
||||||
|
Grid : Message : 101.455990 s : Average mflops/s per call per rank (full): 1.22407e+06
|
||||||
|
Grid : Message : 101.455992 s : Average mflops/s per call per node (full): 4.8963e+06
|
||||||
|
Grid : Message : 101.455994 s : WilsonFermion5D Stencil
|
||||||
|
Grid : Message : 101.455995 s : WilsonFermion5D StencilEven
|
||||||
|
Grid : Message : 101.455999 s : WilsonFermion5D StencilOdd
|
||||||
|
Grid : Message : 101.456001 s : WilsonFermion5D Stencil Reporti()
|
||||||
|
Grid : Message : 101.456002 s : WilsonFermion5D StencilEven Reporti()
|
||||||
|
Grid : Message : 101.456004 s : WilsonFermion5D StencilOdd Reporti()
|
||||||
|
Grid : Message : 110.188024 s : Compare to naive wilson implementation Dag to verify correctness
|
||||||
|
Grid : Message : 110.188051 s : Called DwDag
|
||||||
|
Grid : Message : 110.188052 s : norm dag result 12.0422
|
||||||
|
Grid : Message : 110.200211 s : norm dag ref 12.0422
|
||||||
|
Grid : Message : 110.203215 s : norm dag diff 7.28899e-14
|
||||||
|
Grid : Message : 110.213199 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
|
||||||
|
Grid : Message : 110.281787 s : src_e0.5
|
||||||
|
Grid : Message : 110.353808 s : src_o0.5
|
||||||
|
Grid : Message : 110.370985 s : *********************************************************
|
||||||
|
Grid : Message : 110.370991 s : * Benchmarking DomainWallFermionF::DhopEO
|
||||||
|
Grid : Message : 110.370992 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 110.370995 s : * SINGLE precision
|
||||||
|
Grid : Message : 110.370997 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 110.370998 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 110.371000 s : *********************************************************
|
||||||
|
Grid : Message : 157.314519 s : Deo mflop/s = 7.16631e+07
|
||||||
|
Grid : Message : 157.314545 s : Deo mflop/s per rank 1.11974e+06
|
||||||
|
Grid : Message : 157.314547 s : Deo mflop/s per node 4.47894e+06
|
||||||
|
Grid : Message : 157.314550 s : #### Dhop calls report
|
||||||
|
Grid : Message : 157.314552 s : WilsonFermion5D Number of DhopEO Calls : 30001
|
||||||
|
Grid : Message : 157.314554 s : WilsonFermion5D TotalTime /Calls : 1564.64 us
|
||||||
|
Grid : Message : 157.314556 s : WilsonFermion5D CommTime /Calls : 1060.37 us
|
||||||
|
Grid : Message : 157.314558 s : WilsonFermion5D FaceTime /Calls : 287.98 us
|
||||||
|
Grid : Message : 157.314560 s : WilsonFermion5D ComputeTime1/Calls : 4.91794 us
|
||||||
|
Grid : Message : 157.314562 s : WilsonFermion5D ComputeTime2/Calls : 239.551 us
|
||||||
|
Grid : Message : 157.314587 s : Average mflops/s per call : 2.07265e+10
|
||||||
|
Grid : Message : 157.314591 s : Average mflops/s per call per rank : 3.23852e+08
|
||||||
|
Grid : Message : 157.314593 s : Average mflops/s per call per node : 1.29541e+09
|
||||||
|
Grid : Message : 157.314596 s : Average mflops/s per call (full) : 7.29577e+07
|
||||||
|
Grid : Message : 157.314600 s : Average mflops/s per call per rank (full): 1.13996e+06
|
||||||
|
Grid : Message : 157.314602 s : Average mflops/s per call per node (full): 4.55985e+06
|
||||||
|
Grid : Message : 157.314605 s : WilsonFermion5D Stencil
|
||||||
|
Grid : Message : 157.314606 s : WilsonFermion5D StencilEven
|
||||||
|
Grid : Message : 157.314608 s : WilsonFermion5D StencilOdd
|
||||||
|
Grid : Message : 157.314610 s : WilsonFermion5D Stencil Reporti()
|
||||||
|
Grid : Message : 157.314613 s : WilsonFermion5D StencilEven Reporti()
|
||||||
|
Grid : Message : 157.314614 s : WilsonFermion5D StencilOdd Reporti()
|
||||||
|
Grid : Message : 157.334523 s : r_e6.02106
|
||||||
|
Grid : Message : 157.336050 s : r_o6.0211
|
||||||
|
Grid : Message : 157.337424 s : res12.0422
|
||||||
|
Grid : Message : 157.450236 s : norm diff 0
|
||||||
|
Grid : Message : 157.586163 s : norm diff even 0
|
||||||
|
Grid : Message : 157.657558 s : norm diff odd 0
|
1
2-racks/size-C0/16-nodes/job/power-16A-1065.64076/nodes
Normal file
@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]
112
2-racks/size-C0/16-nodes/job/power-16A-1065.64076/script
Executable file
@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170

# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa

#SBATCH -J power-16A-1065
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue

set -e

# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n

# IO environment ###############################################################

if [ 16 -eq 1 ]; then
    export OMPI_MCA_io=ompio
else
    export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3

# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
    source "${env_dir}/env-gpu.sh"
else
    echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
    exit 1
fi
spack load sshpass

# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''

# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"

date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi

# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1065

# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
    ${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"

# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
    ./gpu-mpi-wrapper.sh \
    ${app} "${par}" "${opt[@]}" \
    --mpi 2.2.2.8 \
    --accelerator-threads 8 \
    --grid 48.48.48.96 \
    --shm 2048 &> "${job_info_dir}/log"

# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"

# reset GPUS ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"

# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"

# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
    ${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
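The per-frequency job scripts in this listing (1050, 1065, 1080, ...) differ only in the job name and the freq variable. A hypothetical way to generate such a sweep from a single template is sketched below; the template file, the @FREQ@ placeholder, and the submission loop are illustrative assumptions, not part of the repository:

#!/usr/bin/env bash
# Hypothetical sweep launcher: one Slurm job per GPU clock limit.
for freq in 1050 1065 1080; do
    # script.template is assumed to contain @FREQ@ where the clock limit goes
    sed -e "s/@FREQ@/${freq}/g" script.template > "script-${freq}.sh"
    sbatch "script-${freq}.sh"
done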
@ -0,0 +1,2 @@
Sat Aug 20 20:52:16 BST 2022
epoch 1661025136
@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32
4310
2-racks/size-C0/16-nodes/job/power-16A-1080.64082/elf
Normal file
File diff suppressed because it is too large
@ -0,0 +1,2 @@
Sat Aug 20 21:00:52 BST 2022
epoch 1661025652
2062
2-racks/size-C0/16-nodes/job/power-16A-1080.64082/env
Normal file
File diff suppressed because one or more lines are too long
26
2-racks/size-C0/16-nodes/job/power-16A-1080.64082/ldd
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
linux-vdso.so.1 (0x00007ffceffcb000)
|
||||||
|
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014c73048f000)
|
||||||
|
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014c7300c7000)
|
||||||
|
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014c72fbd5000)
|
||||||
|
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014c72f8ab000)
|
||||||
|
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014c72f5ca000)
|
||||||
|
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014c72f369000)
|
||||||
|
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014c730416000)
|
||||||
|
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014c72ef89000)
|
||||||
|
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014c72d82d000)
|
||||||
|
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014c72d45d000)
|
||||||
|
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014c72d1bc000)
|
||||||
|
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014c72d091000)
|
||||||
|
libm.so.6 => /lib64/libm.so.6 (0x000014c72cd0f000)
|
||||||
|
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014c72cad8000)
|
||||||
|
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014c72c8c0000)
|
||||||
|
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014c72c6a0000)
|
||||||
|
libc.so.6 => /lib64/libc.so.6 (0x000014c72c2db000)
|
||||||
|
libdl.so.2 => /lib64/libdl.so.2 (0x000014c72c0d7000)
|
||||||
|
/lib64/ld-linux-x86-64.so.2 (0x000014c7302df000)
|
||||||
|
librt.so.1 => /lib64/librt.so.1 (0x000014c72becf000)
|
||||||
|
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014c73034a000)
|
||||||
|
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014c730345000)
|
||||||
|
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014c72bdc3000)
|
||||||
|
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014c72bbb9000)
|
||||||
|
libutil.so.1 => /lib64/libutil.so.1 (0x000014c72b9b5000)
|
286
2-racks/size-C0/16-nodes/job/power-16A-1080.64082/log
Normal file
@ -0,0 +1,286 @@
|
|||||||
|
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 3
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 3
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
local rank 0 device 0 bus id: 0000:03:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 1 device 0 bus id: 0000:44:00.0
|
||||||
|
local rank 2 device 0 bus id: 0000:84:00.0
|
||||||
|
local rank 3 device 0 bus id: 0000:C4:00.0
|
||||||
|
local rank 0 device 0 bus id: 0000:03:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
SharedMemoryMpi: World communicator of size 64
|
||||||
|
SharedMemoryMpi: Node communicator of size 4
|
||||||
|
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x1548a0000000 for comms buffers
|
||||||
|
Setting up IPC
|
||||||
|
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|_ | | | | | | | | | | | | _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|_ GGGG RRRR III DDDD _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G GG RRRR I D D _|__
|
||||||
|
__|_ G G R R I D D _|__
|
||||||
|
__|_ GGGG R R III DDDD _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
| | | | | | | | | | | | | |
|
||||||
|
|
||||||
|
|
||||||
|
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
|
||||||
|
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : MPI is initialised and logging filters activated
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : Requested 2147483648 byte stencil comms buffers
|
||||||
|
Grid : Message : MemoryManager Cache 34004218675 bytes
|
||||||
|
Grid : Message : MemoryManager::Init() setting up
|
||||||
|
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
|
||||||
|
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
|
||||||
|
Grid : Message : MemoryManager::Init() Using cudaMalloc
|
||||||
|
Grid : Message : 1.498999 s : Grid Layout
|
||||||
|
Grid : Message : 1.499003 s : Global lattice size : 48 48 48 96
|
||||||
|
Grid : Message : 1.499009 s : OpenMP threads : 4
|
||||||
|
Grid : Message : 1.499010 s : MPI tasks : 2 2 2 8
|
||||||
|
Grid : Message : 1.516697 s : Making s innermost grids
|
||||||
|
Grid : Message : 1.528026 s : Initialising 4d RNG
|
||||||
|
Grid : Message : 1.543296 s : Intialising parallel RNG with unique string 'The 4D RNG'
|
||||||
|
Grid : Message : 1.543322 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
|
||||||
|
Grid : Message : 1.803104 s : Initialising 5d RNG
|
||||||
|
Grid : Message : 2.280210 s : Intialising parallel RNG with unique string 'The 5D RNG'
|
||||||
|
Grid : Message : 2.280810 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
|
||||||
|
Grid : Message : 7.463560 s : Initialised RNGs
|
||||||
|
Grid : Message : 8.316566 s : Drawing gauge field
|
||||||
|
Grid : Message : 8.441882 s : Random gauge initialised
|
||||||
|
Grid : Message : 8.454498 s : Setting up Cshift based reference
|
||||||
|
Grid : Message : 13.615874 s : *****************************************************************
|
||||||
|
Grid : Message : 13.615901 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
|
||||||
|
Grid : Message : 13.615903 s : *****************************************************************
|
||||||
|
Grid : Message : 13.615904 s : *****************************************************************
|
||||||
|
Grid : Message : 13.615905 s : * Benchmarking DomainWallFermionR::Dhop
|
||||||
|
Grid : Message : 13.615906 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 13.615910 s : * VComplexF size is 64 B
|
||||||
|
Grid : Message : 13.615912 s : * SINGLE precision
|
||||||
|
Grid : Message : 13.615914 s : * Using Overlapped Comms/Compute
Grid : Message : 13.615916 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 13.615918 s : *****************************************************************
Grid : Message : 14.175758 s : Called warmup
Grid : Message : 100.948265 s : Called Dw 30000 times in 8.67724e+07 us
Grid : Message : 100.948328 s : mflop/s = 7.75226e+07
Grid : Message : 100.948330 s : mflop/s per rank = 1.21129e+06
Grid : Message : 100.948332 s : mflop/s per node = 4.84516e+06
Grid : Message : 100.948334 s : RF GiB/s (base 2) = 157524
Grid : Message : 100.948336 s : mem GiB/s (base 2) = 98452.5
Grid : Message : 100.948912 s : norm diff 1.05775e-13
Grid : Message : 100.958922 s : #### Dhop calls report
Grid : Message : 100.958930 s : WilsonFermion5D Number of DhopEO Calls : 60002
Grid : Message : 100.958934 s : WilsonFermion5D TotalTime /Calls : 1447.35 us
Grid : Message : 100.958936 s : WilsonFermion5D CommTime /Calls : 1006.18 us
Grid : Message : 100.958938 s : WilsonFermion5D FaceTime /Calls : 218.625 us
Grid : Message : 100.958940 s : WilsonFermion5D ComputeTime1/Calls : 2.6472 us
Grid : Message : 100.958942 s : WilsonFermion5D ComputeTime2/Calls : 235.108 us
Grid : Message : 100.958970 s : Average mflops/s per call : 3.6261e+10
Grid : Message : 100.958974 s : Average mflops/s per call per rank : 5.66578e+08
Grid : Message : 100.958976 s : Average mflops/s per call per node : 2.26631e+09
Grid : Message : 100.958978 s : Average mflops/s per call (full) : 7.88698e+07
Grid : Message : 100.958981 s : Average mflops/s per call per rank (full): 1.23234e+06
Grid : Message : 100.958983 s : Average mflops/s per call per node (full): 4.92936e+06
Grid : Message : 100.958986 s : WilsonFermion5D Stencil
Grid : Message : 100.958987 s : WilsonFermion5D StencilEven
Grid : Message : 100.958988 s : WilsonFermion5D StencilOdd
Grid : Message : 100.958991 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 100.958992 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 100.958995 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 109.635912 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 109.635940 s : Called DwDag
Grid : Message : 109.635941 s : norm dag result 12.0422
Grid : Message : 109.641498 s : norm dag ref 12.0422
Grid : Message : 109.644623 s : norm dag diff 7.28899e-14
Grid : Message : 109.654599 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 109.718075 s : src_e0.5
Grid : Message : 109.790285 s : src_o0.5
Grid : Message : 109.807211 s : *********************************************************
Grid : Message : 109.807217 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 109.807219 s : * Vectorising space-time by 8
Grid : Message : 109.807221 s : * SINGLE precision
Grid : Message : 109.807224 s : * Using Overlapped Comms/Compute
Grid : Message : 109.807225 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 109.807226 s : *********************************************************
Grid : Message : 156.357075 s : Deo mflop/s = 7.22704e+07
Grid : Message : 156.357109 s : Deo mflop/s per rank 1.12923e+06
Grid : Message : 156.357111 s : Deo mflop/s per node 4.5169e+06
Grid : Message : 156.357114 s : #### Dhop calls report
Grid : Message : 156.357116 s : WilsonFermion5D Number of DhopEO Calls : 30001
Grid : Message : 156.357118 s : WilsonFermion5D TotalTime /Calls : 1551.51 us
Grid : Message : 156.357120 s : WilsonFermion5D CommTime /Calls : 1049.38 us
Grid : Message : 156.357122 s : WilsonFermion5D FaceTime /Calls : 285.792 us
Grid : Message : 156.357124 s : WilsonFermion5D ComputeTime1/Calls : 4.81357 us
Grid : Message : 156.357126 s : WilsonFermion5D ComputeTime2/Calls : 239.16 us
Grid : Message : 156.357146 s : Average mflops/s per call : 2.07719e+10
Grid : Message : 156.357150 s : Average mflops/s per call per rank : 3.24561e+08
Grid : Message : 156.357152 s : Average mflops/s per call per node : 1.29824e+09
Grid : Message : 156.357154 s : Average mflops/s per call (full) : 7.35747e+07
Grid : Message : 156.357158 s : Average mflops/s per call per rank (full): 1.1496e+06
Grid : Message : 156.357161 s : Average mflops/s per call per node (full): 4.59842e+06
Grid : Message : 156.357163 s : WilsonFermion5D Stencil
Grid : Message : 156.357165 s : WilsonFermion5D StencilEven
Grid : Message : 156.357166 s : WilsonFermion5D StencilOdd
Grid : Message : 156.357168 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 156.357175 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 156.357176 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 156.375718 s : r_e6.02106
Grid : Message : 156.378883 s : r_o6.0211
Grid : Message : 156.380335 s : res12.0422
Grid : Message : 156.489162 s : norm diff 0
Grid : Message : 156.617774 s : norm diff even 0
Grid : Message : 156.694536 s : norm diff odd 0
1
2-racks/size-C0/16-nodes/job/power-16A-1080.64082/nodes
Normal file
@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]
112
2-racks/size-C0/16-nodes/job/power-16A-1080.64082/script
Executable file
@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170

# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa

#SBATCH -J power-16A-1080
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue

set -e

# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n

# IO environment ###############################################################

if [ 16 -eq 1 ]; then
export OMPI_MCA_io=ompio
else
export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3

# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
source "${env_dir}/env-gpu.sh"
else
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
exit 1
fi
spack load sshpass

# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''

# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"

date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi

# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1080

# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"

# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
./gpu-mpi-wrapper.sh \
${app} "${par}" "${opt[@]}" \
--mpi 2.2.2.8 \
--accelerator-threads 8 \
--grid 48.48.48.96 \
--shm 2048 &> "${job_info_dir}/log"

# if we reach that point the application exited successfully ###################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"

# reset GPUS ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"

# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"

# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
@ -0,0 +1,2 @@
Sat Aug 20 20:58:06 BST 2022
epoch 1661025486
@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32
4310
2-racks/size-C0/16-nodes/job/power-16A-1095.64087/elf
Normal file
File diff suppressed because it is too large
@ -0,0 +1,2 @@
Sat Aug 20 21:06:38 BST 2022
epoch 1661025998
2062
2-racks/size-C0/16-nodes/job/power-16A-1095.64087/env
Normal file
File diff suppressed because one or more lines are too long
26
2-racks/size-C0/16-nodes/job/power-16A-1095.64087/ldd
Normal file
@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffc219f0000)
|
||||||
|
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014aa89605000)
|
||||||
|
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014aa8923d000)
|
||||||
|
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014aa88d4b000)
|
||||||
|
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014aa88a21000)
|
||||||
|
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014aa88740000)
|
||||||
|
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014aa884df000)
|
||||||
|
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014aa8958c000)
|
||||||
|
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014aa880ff000)
|
||||||
|
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014aa869a3000)
|
||||||
|
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014aa865d3000)
|
||||||
|
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014aa86332000)
|
||||||
|
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014aa86207000)
|
||||||
|
libm.so.6 => /lib64/libm.so.6 (0x000014aa85e85000)
|
||||||
|
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014aa85c4e000)
|
||||||
|
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014aa85a36000)
|
||||||
|
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014aa85816000)
|
||||||
|
libc.so.6 => /lib64/libc.so.6 (0x000014aa85451000)
|
||||||
|
libdl.so.2 => /lib64/libdl.so.2 (0x000014aa8524d000)
|
||||||
|
/lib64/ld-linux-x86-64.so.2 (0x000014aa89455000)
|
||||||
|
librt.so.1 => /lib64/librt.so.1 (0x000014aa85045000)
|
||||||
|
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014aa894c0000)
|
||||||
|
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014aa894bb000)
|
||||||
|
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014aa84f39000)
|
||||||
|
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014aa84d2f000)
|
||||||
|
libutil.so.1 => /lib64/libutil.so.1 (0x000014aa84b2b000)
|
286
2-racks/size-C0/16-nodes/job/power-16A-1095.64087/log
Normal file
@ -0,0 +1,286 @@
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 3
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 3
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 0 device 0 bus id: 0000:03:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 2 device 0 bus id: 0000:84:00.0
|
||||||
|
local rank 3 device 0 bus id: 0000:C4:00.0
|
||||||
|
local rank 1 device 0 bus id: 0000:44:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 0 device 0 bus id: 0000:03:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
SharedMemoryMpi: World communicator of size 64
|
||||||
|
SharedMemoryMpi: Node communicator of size 4
|
||||||
|
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x146d00000000 for comms buffers
|
||||||
|
Setting up IPC
|
||||||
|
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|_ | | | | | | | | | | | | _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|_ GGGG RRRR III DDDD _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G GG RRRR I D D _|__
|
||||||
|
__|_ G G R R I D D _|__
|
||||||
|
__|_ GGGG R R III DDDD _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
| | | | | | | | | | | | | |
|
||||||
|
|
||||||
|
|
||||||
|
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
|
||||||
|
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : MPI is initialised and logging filters activated
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : Requested 2147483648 byte stencil comms buffers
|
||||||
|
Grid : Message : MemoryManager Cache 34004218675 bytes
|
||||||
|
Grid : Message : MemoryManager::Init() setting up
|
||||||
|
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
|
||||||
|
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
|
||||||
|
Grid : Message : MemoryManager::Init() Using cudaMalloc
|
||||||
|
Grid : Message : 1.412895 s : Grid Layout
|
||||||
|
Grid : Message : 1.412899 s : Global lattice size : 48 48 48 96
|
||||||
|
Grid : Message : 1.412905 s : OpenMP threads : 4
|
||||||
|
Grid : Message : 1.412909 s : MPI tasks : 2 2 2 8
|
||||||
|
Grid : Message : 1.428319 s : Making s innermost grids
|
||||||
|
Grid : Message : 1.445373 s : Initialising 4d RNG
|
||||||
|
Grid : Message : 1.461658 s : Intialising parallel RNG with unique string 'The 4D RNG'
|
||||||
|
Grid : Message : 1.461680 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
|
||||||
|
Grid : Message : 1.902912 s : Initialising 5d RNG
|
||||||
|
Grid : Message : 2.141255 s : Intialising parallel RNG with unique string 'The 5D RNG'
|
||||||
|
Grid : Message : 2.141291 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
|
||||||
|
Grid : Message : 7.353326 s : Initialised RNGs
|
||||||
|
Grid : Message : 8.518633 s : Drawing gauge field
|
||||||
|
Grid : Message : 8.626652 s : Random gauge initialised
|
||||||
|
Grid : Message : 8.630634 s : Setting up Cshift based reference
|
||||||
|
Grid : Message : 13.722925 s : *****************************************************************
|
||||||
|
Grid : Message : 13.722949 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
|
||||||
|
Grid : Message : 13.722950 s : *****************************************************************
|
||||||
|
Grid : Message : 13.722951 s : *****************************************************************
|
||||||
|
Grid : Message : 13.722952 s : * Benchmarking DomainWallFermionR::Dhop
|
||||||
|
Grid : Message : 13.722953 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 13.722954 s : * VComplexF size is 64 B
|
||||||
|
Grid : Message : 13.722955 s : * SINGLE precision
|
||||||
|
Grid : Message : 13.722956 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 13.722957 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 13.722958 s : *****************************************************************
|
||||||
|
Grid : Message : 14.254628 s : Called warmup
|
||||||
|
Grid : Message : 100.327406 s : Called Dw 30000 times in 8.60725e+07 us
|
||||||
|
Grid : Message : 100.327470 s : mflop/s = 7.8153e+07
|
||||||
|
Grid : Message : 100.327472 s : mflop/s per rank = 1.22114e+06
|
||||||
|
Grid : Message : 100.327474 s : mflop/s per node = 4.88456e+06
|
||||||
|
Grid : Message : 100.327476 s : RF GiB/s (base 2) = 158805
|
||||||
|
Grid : Message : 100.327478 s : mem GiB/s (base 2) = 99253.2
|
||||||
|
Grid : Message : 100.328051 s : norm diff 1.05775e-13
|
||||||
|
Grid : Message : 100.337927 s : #### Dhop calls report
|
||||||
|
Grid : Message : 100.337935 s : WilsonFermion5D Number of DhopEO Calls : 60002
|
||||||
|
Grid : Message : 100.337943 s : WilsonFermion5D TotalTime /Calls : 1435.69 us
|
||||||
|
Grid : Message : 100.337946 s : WilsonFermion5D CommTime /Calls : 996.547 us
|
||||||
|
Grid : Message : 100.337949 s : WilsonFermion5D FaceTime /Calls : 217.079 us
|
||||||
|
Grid : Message : 100.337953 s : WilsonFermion5D ComputeTime1/Calls : 2.78067 us
|
||||||
|
Grid : Message : 100.337955 s : WilsonFermion5D ComputeTime2/Calls : 234.472 us
|
||||||
|
Grid : Message : 100.337971 s : Average mflops/s per call : 3.63872e+10
|
||||||
|
Grid : Message : 100.337974 s : Average mflops/s per call per rank : 5.68549e+08
|
||||||
|
Grid : Message : 100.337976 s : Average mflops/s per call per node : 2.2742e+09
|
||||||
|
Grid : Message : 100.337980 s : Average mflops/s per call (full) : 7.95104e+07
|
||||||
|
Grid : Message : 100.337982 s : Average mflops/s per call per rank (full): 1.24235e+06
|
||||||
|
Grid : Message : 100.337986 s : Average mflops/s per call per node (full): 4.9694e+06
|
||||||
|
Grid : Message : 100.337988 s : WilsonFermion5D Stencil
|
||||||
|
Grid : Message : 100.337990 s : WilsonFermion5D StencilEven
|
||||||
|
Grid : Message : 100.337992 s : WilsonFermion5D StencilOdd
|
||||||
|
Grid : Message : 100.337995 s : WilsonFermion5D Stencil Reporti()
|
||||||
|
Grid : Message : 100.337998 s : WilsonFermion5D StencilEven Reporti()
|
||||||
|
Grid : Message : 100.338000 s : WilsonFermion5D StencilOdd Reporti()
|
||||||
|
Grid : Message : 109.354730 s : Compare to naive wilson implementation Dag to verify correctness
|
||||||
|
Grid : Message : 109.355200 s : Called DwDag
|
||||||
|
Grid : Message : 109.355210 s : norm dag result 12.0422
|
||||||
|
Grid : Message : 109.404420 s : norm dag ref 12.0422
|
||||||
|
Grid : Message : 109.435430 s : norm dag diff 7.28899e-14
|
||||||
|
Grid : Message : 109.565940 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
|
||||||
|
Grid : Message : 109.123204 s : src_e0.5
|
||||||
|
Grid : Message : 109.194082 s : src_o0.5
|
||||||
|
Grid : Message : 109.211743 s : *********************************************************
|
||||||
|
Grid : Message : 109.211749 s : * Benchmarking DomainWallFermionF::DhopEO
|
||||||
|
Grid : Message : 109.211751 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 109.211754 s : * SINGLE precision
|
||||||
|
Grid : Message : 109.211756 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 109.211759 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 109.211761 s : *********************************************************
|
||||||
|
Grid : Message : 155.351395 s : Deo mflop/s = 7.29132e+07
|
||||||
|
Grid : Message : 155.351424 s : Deo mflop/s per rank 1.13927e+06
|
||||||
|
Grid : Message : 155.351427 s : Deo mflop/s per node 4.55708e+06
|
||||||
|
Grid : Message : 155.351433 s : #### Dhop calls report
|
||||||
|
Grid : Message : 155.351436 s : WilsonFermion5D Number of DhopEO Calls : 30001
|
||||||
|
Grid : Message : 155.351440 s : WilsonFermion5D TotalTime /Calls : 1537.8 us
|
||||||
|
Grid : Message : 155.351445 s : WilsonFermion5D CommTime /Calls : 1037.77 us
|
||||||
|
Grid : Message : 155.351449 s : WilsonFermion5D FaceTime /Calls : 285.044 us
|
||||||
|
Grid : Message : 155.351453 s : WilsonFermion5D ComputeTime1/Calls : 4.8771 us
|
||||||
|
Grid : Message : 155.351457 s : WilsonFermion5D ComputeTime2/Calls : 237.861 us
|
||||||
|
Grid : Message : 155.351481 s : Average mflops/s per call : 2.07287e+10
|
||||||
|
Grid : Message : 155.351485 s : Average mflops/s per call per rank : 3.23886e+08
|
||||||
|
Grid : Message : 155.351488 s : Average mflops/s per call per node : 1.29554e+09
|
||||||
|
Grid : Message : 155.351492 s : Average mflops/s per call (full) : 7.42306e+07
|
||||||
|
Grid : Message : 155.351496 s : Average mflops/s per call per rank (full): 1.15985e+06
|
||||||
|
Grid : Message : 155.351500 s : Average mflops/s per call per node (full): 4.63942e+06
|
||||||
|
Grid : Message : 155.351504 s : WilsonFermion5D Stencil
|
||||||
|
Grid : Message : 155.351506 s : WilsonFermion5D StencilEven
|
||||||
|
Grid : Message : 155.351508 s : WilsonFermion5D StencilOdd
|
||||||
|
Grid : Message : 155.351511 s : WilsonFermion5D Stencil Reporti()
|
||||||
|
Grid : Message : 155.351513 s : WilsonFermion5D StencilEven Reporti()
|
||||||
|
Grid : Message : 155.351515 s : WilsonFermion5D StencilOdd Reporti()
|
||||||
|
Grid : Message : 155.370290 s : r_e6.02106
|
||||||
|
Grid : Message : 155.372244 s : r_o6.0211
|
||||||
|
Grid : Message : 155.373660 s : res12.0422
|
||||||
|
Grid : Message : 155.495172 s : norm diff 0
|
||||||
|
Grid : Message : 155.622362 s : norm diff even 0
|
||||||
|
Grid : Message : 155.695812 s : norm diff odd 0
|
1
2-racks/size-C0/16-nodes/job/power-16A-1095.64087/nodes
Normal file
@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]
112
2-racks/size-C0/16-nodes/job/power-16A-1095.64087/script
Executable file
@ -0,0 +1,112 @@
#!/usr/bin/env bash
|
||||||
|
# shellcheck disable=SC1091,SC2050,SC2170
|
||||||
|
|
||||||
|
# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
|
||||||
|
|
||||||
|
#SBATCH -J power-16A-1095
|
||||||
|
#SBATCH -A dp207
|
||||||
|
#SBATCH -t 48:00:00
|
||||||
|
#SBATCH --nodes=16
|
||||||
|
#SBATCH --ntasks=64
|
||||||
|
#SBATCH --ntasks-per-node=4
|
||||||
|
#SBATCH --cpus-per-task=8
|
||||||
|
#SBATCH --partition=gpu
|
||||||
|
#SBATCH --gres=gpu:4
|
||||||
|
#SBATCH --output=%x.%j.out
|
||||||
|
#SBATCH --error=%x.%j.err
|
||||||
|
#SBATCH --reservation=dc-port1_61
|
||||||
|
#SBATCH --qos=reservation
|
||||||
|
#SBATCH --no-requeue
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
# OpenMP/OpenMPI/UCX environment ###############################################
|
||||||
|
export OMP_NUM_THREADS=4
|
||||||
|
export OMPI_MCA_btl=^uct,openib
|
||||||
|
export OMPI_MCA_pml=ucx
|
||||||
|
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
|
||||||
|
export UCX_RNDV_SCHEME=put_zcopy
|
||||||
|
export UCX_RNDV_THRESH=16384
|
||||||
|
export UCX_IB_GPU_DIRECT_RDMA=yes
|
||||||
|
export UCX_MEMTYPE_CACHE=n
|
||||||
|
|
||||||
|
# IO environment ###############################################################
|
||||||
|
|
||||||
|
if [ 16 -eq 1 ]; then
|
||||||
|
export OMPI_MCA_io=ompio
|
||||||
|
else
|
||||||
|
export OMPI_MCA_io=romio321
|
||||||
|
fi
|
||||||
|
export OMPI_MCA_btl_openib_allow_ib=true
|
||||||
|
export OMPI_MCA_btl_openib_device_type=infiniband
|
||||||
|
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
|
||||||
|
|
||||||
|
# load environment #############################################################
|
||||||
|
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
|
||||||
|
source "${env_dir}/env-base.sh"
|
||||||
|
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
|
||||||
|
source "${env_dir}/env-gpu.sh"
|
||||||
|
else
|
||||||
|
echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
spack load sshpass
|
||||||
|
|
||||||
|
# application and parameters ###################################################
|
||||||
|
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
|
||||||
|
opt=('--comms-overlap' '--comms-concurrent')
|
||||||
|
par=''
|
||||||
|
|
||||||
|
# collect job information ######################################################
|
||||||
|
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
|
||||||
|
mkdir -p "${job_info_dir}"
|
||||||
|
|
||||||
|
date > "${job_info_dir}/start-date"
|
||||||
|
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
|
||||||
|
set > "${job_info_dir}/env"
|
||||||
|
ldd ${app} > "${job_info_dir}/ldd"
|
||||||
|
md5sum ${app} > "${job_info_dir}/app-hash"
|
||||||
|
readelf -a ${app} > "${job_info_dir}/elf"
|
||||||
|
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
|
||||||
|
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
|
||||||
|
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
|
||||||
|
|
||||||
|
# GPU frequency control ########################################################
|
||||||
|
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
|
||||||
|
freq=1095
|
||||||
|
|
||||||
|
# set frequency
|
||||||
|
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
|
||||||
|
${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
|
||||||
|
done
|
||||||
|
# start NVIDIA SMI monitoring
|
||||||
|
tmp=$(mktemp)
|
||||||
|
sleep 1
|
||||||
|
coproc nvidia-smi dmon -o DT &> "${tmp}"
|
||||||
|
|
||||||
|
# run! #########################################################################
|
||||||
|
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
|
||||||
|
./gpu-mpi-wrapper.sh \
|
||||||
|
${app} "${par}" "${opt[@]}" \
|
||||||
|
--mpi 2.2.2.8 \
|
||||||
|
--accelerator-threads 8 \
|
||||||
|
--grid 48.48.48.96 \
|
||||||
|
--shm 2048 &> "${job_info_dir}/log"
|
||||||
|
|
||||||
|
# if we reach that point the application exited successfully ###################
|
||||||
|
touch "${job_info_dir}/success"
|
||||||
|
date > "${job_info_dir}/end-date"
|
||||||
|
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"
|
||||||
|
|
||||||
|
# reset GPUS ###################################################################
|
||||||
|
# stop monitoring
|
||||||
|
kill -INT "${COPROC_PID}"
|
||||||
|
|
||||||
|
# make monitoring DB
|
||||||
|
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"
|
||||||
|
|
||||||
|
# reset clocks
|
||||||
|
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
|
||||||
|
${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
|
||||||
|
done
|
||||||
|
################################################################################
@ -0,0 +1,2 @@
Sat Aug 20 21:03:53 BST 2022
epoch 1661025833
@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32
4310
2-racks/size-C0/16-nodes/job/power-16A-1110.64091/elf
Normal file
File diff suppressed because it is too large
@ -0,0 +1,2 @@
Sat Aug 20 21:12:23 BST 2022
epoch 1661026343
2062
2-racks/size-C0/16-nodes/job/power-16A-1110.64091/env
Normal file
File diff suppressed because one or more lines are too long
26
2-racks/size-C0/16-nodes/job/power-16A-1110.64091/ldd
Normal file
@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffdef5db000)
|
||||||
|
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x0000152bce209000)
|
||||||
|
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x0000152bcde41000)
|
||||||
|
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x0000152bcd94f000)
|
||||||
|
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x0000152bcd625000)
|
||||||
|
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x0000152bcd344000)
|
||||||
|
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x0000152bcd0e3000)
|
||||||
|
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x0000152bce190000)
|
||||||
|
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x0000152bccd03000)
|
||||||
|
libcuda.so.1 => /lib64/libcuda.so.1 (0x0000152bcb5a7000)
|
||||||
|
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x0000152bcb1d7000)
|
||||||
|
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x0000152bcaf36000)
|
||||||
|
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x0000152bcae0b000)
|
||||||
|
libm.so.6 => /lib64/libm.so.6 (0x0000152bcaa89000)
|
||||||
|
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x0000152bca852000)
|
||||||
|
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x0000152bca63a000)
|
||||||
|
libpthread.so.0 => /lib64/libpthread.so.0 (0x0000152bca41a000)
|
||||||
|
libc.so.6 => /lib64/libc.so.6 (0x0000152bca055000)
|
||||||
|
libdl.so.2 => /lib64/libdl.so.2 (0x0000152bc9e51000)
|
||||||
|
/lib64/ld-linux-x86-64.so.2 (0x0000152bce059000)
|
||||||
|
librt.so.1 => /lib64/librt.so.1 (0x0000152bc9c49000)
|
||||||
|
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x0000152bce0c4000)
|
||||||
|
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x0000152bce0bf000)
|
||||||
|
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x0000152bc9b3d000)
|
||||||
|
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x0000152bc9933000)
|
||||||
|
libutil.so.1 => /lib64/libutil.so.1 (0x0000152bc972f000)
|
286
2-racks/size-C0/16-nodes/job/power-16A-1110.64091/log
Normal file
@ -0,0 +1,286 @@
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 3
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
local rank 1 device 0 bus id: 0000:44:00.0
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 3
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
local rank 2 device 0 bus id: 0000:84:00.0
|
||||||
|
local rank 3 device 0 bus id: 0000:C4:00.0
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 0 device 0 bus id: 0000:03:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 0 device 0 bus id: 0000:03:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
SharedMemoryMpi: World communicator of size 64
|
||||||
|
SharedMemoryMpi: Node communicator of size 4
|
||||||
|
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x147320000000 for comms buffers
|
||||||
|
Setting up IPC
|
||||||
|
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|_ | | | | | | | | | | | | _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|_ GGGG RRRR III DDDD _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G GG RRRR I D D _|__
|
||||||
|
__|_ G G R R I D D _|__
|
||||||
|
__|_ GGGG R R III DDDD _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
| | | | | | | | | | | | | |
|
||||||
|
|
||||||
|
|
||||||
|
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
|
||||||
|
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : MPI is initialised and logging filters activated
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : Requested 2147483648 byte stencil comms buffers
|
||||||
|
Grid : Message : MemoryManager Cache 34004218675 bytes
|
||||||
|
Grid : Message : MemoryManager::Init() setting up
|
||||||
|
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
|
||||||
|
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
|
||||||
|
Grid : Message : MemoryManager::Init() Using cudaMalloc
|
||||||
|
Grid : Message : 1.574553 s : Grid Layout
|
||||||
|
Grid : Message : 1.574555 s : Global lattice size : 48 48 48 96
|
||||||
|
Grid : Message : 1.574559 s : OpenMP threads : 4
|
||||||
|
Grid : Message : 1.574561 s : MPI tasks : 2 2 2 8
|
||||||
|
Grid : Message : 1.590560 s : Making s innermost grids
|
||||||
|
Grid : Message : 1.602336 s : Initialising 4d RNG
|
||||||
|
Grid : Message : 1.619266 s : Intialising parallel RNG with unique string 'The 4D RNG'
|
||||||
|
Grid : Message : 1.619291 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
|
||||||
|
Grid : Message : 1.883640 s : Initialising 5d RNG
|
||||||
|
Grid : Message : 2.117383 s : Intialising parallel RNG with unique string 'The 5D RNG'
|
||||||
|
Grid : Message : 2.117419 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
|
||||||
|
Grid : Message : 7.594282 s : Initialised RNGs
|
||||||
|
Grid : Message : 8.809615 s : Drawing gauge field
|
||||||
|
Grid : Message : 8.954788 s : Random gauge initialised
|
||||||
|
Grid : Message : 8.965668 s : Setting up Cshift based reference
|
||||||
|
Grid : Message : 13.965128 s : *****************************************************************
|
||||||
|
Grid : Message : 13.965152 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
|
||||||
|
Grid : Message : 13.965153 s : *****************************************************************
|
||||||
|
Grid : Message : 13.965154 s : *****************************************************************
|
||||||
|
Grid : Message : 13.965155 s : * Benchmarking DomainWallFermionR::Dhop
|
||||||
|
Grid : Message : 13.965156 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 13.965157 s : * VComplexF size is 64 B
|
||||||
|
Grid : Message : 13.965159 s : * SINGLE precision
|
||||||
|
Grid : Message : 13.965160 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 13.965161 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 13.965162 s : *****************************************************************
|
||||||
|
Grid : Message : 14.515202 s : Called warmup
|
||||||
|
Grid : Message : 99.730150 s : Called Dw 30000 times in 8.52149e+07 us
|
||||||
|
Grid : Message : 99.730204 s : mflop/s = 7.89395e+07
|
||||||
|
Grid : Message : 99.730206 s : mflop/s per rank = 1.23343e+06
|
||||||
|
Grid : Message : 99.730208 s : mflop/s per node = 4.93372e+06
|
||||||
|
Grid : Message : 99.730210 s : RF GiB/s (base 2) = 160403
|
||||||
|
Grid : Message : 99.730212 s : mem GiB/s (base 2) = 100252
|
||||||
|
Grid : Message : 99.730784 s : norm diff 1.05775e-13
|
||||||
|
Grid : Message : 99.740621 s : #### Dhop calls report
|
||||||
|
Grid : Message : 99.740628 s : WilsonFermion5D Number of DhopEO Calls : 60002
|
||||||
|
Grid : Message : 99.740631 s : WilsonFermion5D TotalTime /Calls : 1421.72 us
|
||||||
|
Grid : Message : 99.740633 s : WilsonFermion5D CommTime /Calls : 984.801 us
|
||||||
|
Grid : Message : 99.740635 s : WilsonFermion5D FaceTime /Calls : 215.72 us
|
||||||
|
Grid : Message : 99.740637 s : WilsonFermion5D ComputeTime1/Calls : 2.65594 us
|
||||||
|
Grid : Message : 99.740639 s : WilsonFermion5D ComputeTime2/Calls : 233.727 us
|
||||||
|
Grid : Message : 99.740655 s : Average mflops/s per call : 3.59268e+10
|
||||||
|
Grid : Message : 99.740658 s : Average mflops/s per call per rank : 5.61356e+08
|
||||||
|
Grid : Message : 99.740660 s : Average mflops/s per call per node : 2.24542e+09
|
||||||
|
Grid : Message : 99.740662 s : Average mflops/s per call (full) : 8.02916e+07
|
||||||
|
Grid : Message : 99.740665 s : Average mflops/s per call per rank (full): 1.25456e+06
|
||||||
|
Grid : Message : 99.740667 s : Average mflops/s per call per node (full): 5.01823e+06
|
||||||
|
Grid : Message : 99.740669 s : WilsonFermion5D Stencil
|
||||||
|
Grid : Message : 99.740670 s : WilsonFermion5D StencilEven
|
||||||
|
Grid : Message : 99.740672 s : WilsonFermion5D StencilOdd
|
||||||
|
Grid : Message : 99.740673 s : WilsonFermion5D Stencil Reporti()
|
||||||
|
Grid : Message : 99.740675 s : WilsonFermion5D StencilEven Reporti()
|
||||||
|
Grid : Message : 99.740679 s : WilsonFermion5D StencilOdd Reporti()
|
||||||
|
Grid : Message : 108.466783 s : Compare to naive wilson implementation Dag to verify correctness
|
||||||
|
Grid : Message : 108.466816 s : Called DwDag
|
||||||
|
Grid : Message : 108.466817 s : norm dag result 12.0422
|
||||||
|
Grid : Message : 108.470193 s : norm dag ref 12.0422
|
||||||
|
Grid : Message : 108.473428 s : norm dag diff 7.28899e-14
|
||||||
|
Grid : Message : 108.486838 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
|
||||||
|
Grid : Message : 108.550312 s : src_e0.5
|
||||||
|
Grid : Message : 108.623836 s : src_o0.5
|
||||||
|
Grid : Message : 108.640541 s : *********************************************************
|
||||||
|
Grid : Message : 108.640545 s : * Benchmarking DomainWallFermionF::DhopEO
|
||||||
|
Grid : Message : 108.640546 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 108.640548 s : * SINGLE precision
|
||||||
|
Grid : Message : 108.640553 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 108.640555 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 108.640556 s : *********************************************************
|
||||||
|
Grid : Message : 154.233908 s : Deo mflop/s = 7.37872e+07
|
||||||
|
Grid : Message : 154.233941 s : Deo mflop/s per rank 1.15293e+06
|
||||||
|
Grid : Message : 154.233943 s : Deo mflop/s per node 4.6117e+06
|
||||||
|
Grid : Message : 154.233946 s : #### Dhop calls report
|
||||||
|
Grid : Message : 154.233948 s : WilsonFermion5D Number of DhopEO Calls : 30001
|
||||||
|
Grid : Message : 154.233950 s : WilsonFermion5D TotalTime /Calls : 1519.59 us
|
||||||
|
Grid : Message : 154.233952 s : WilsonFermion5D CommTime /Calls : 1019.64 us
|
||||||
|
Grid : Message : 154.233954 s : WilsonFermion5D FaceTime /Calls : 288.201 us
|
||||||
|
Grid : Message : 154.233956 s : WilsonFermion5D ComputeTime1/Calls : 4.91837 us
|
||||||
|
Grid : Message : 154.233958 s : WilsonFermion5D ComputeTime2/Calls : 236.348 us
|
||||||
|
Grid : Message : 154.233977 s : Average mflops/s per call : 2.07539e+10
|
||||||
|
Grid : Message : 154.233980 s : Average mflops/s per call per rank : 3.24279e+08
|
||||||
|
Grid : Message : 154.233982 s : Average mflops/s per call per node : 1.29712e+09
|
||||||
|
Grid : Message : 154.233984 s : Average mflops/s per call (full) : 7.51203e+07
|
||||||
|
Grid : Message : 154.233986 s : Average mflops/s per call per rank (full): 1.17375e+06
|
||||||
|
Grid : Message : 154.233988 s : Average mflops/s per call per node (full): 4.69502e+06
|
||||||
|
Grid : Message : 154.233991 s : WilsonFermion5D Stencil
|
||||||
|
Grid : Message : 154.233992 s : WilsonFermion5D StencilEven
|
||||||
|
Grid : Message : 154.233993 s : WilsonFermion5D StencilOdd
|
||||||
|
Grid : Message : 154.233994 s : WilsonFermion5D Stencil Reporti()
|
||||||
|
Grid : Message : 154.233995 s : WilsonFermion5D StencilEven Reporti()
|
||||||
|
Grid : Message : 154.233996 s : WilsonFermion5D StencilOdd Reporti()
|
||||||
|
Grid : Message : 154.253979 s : r_e6.02106
|
||||||
|
Grid : Message : 154.255883 s : r_o6.0211
|
||||||
|
Grid : Message : 154.257289 s : res12.0422
|
||||||
|
Grid : Message : 154.364123 s : norm diff 0
|
||||||
|
Grid : Message : 154.496590 s : norm diff even 0
|
||||||
|
Grid : Message : 154.572879 s : norm diff odd 0
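
For reference, a minimal sketch (not one of the committed files; the log path below is only an example) that cross-checks the headline figures reported above: with 64 MPI ranks on 16 nodes, the per-rank and per-node mflop/s are simply the aggregate divided by 64 and 16, e.g. 7.89395e+07 / 16 ≈ 4.93e+06 mflop/s per node.

#!/usr/bin/env bash
# Sketch only: pull the first aggregate "mflop/s = ..." line from a job log
# and derive the per-rank / per-node figures for a 64-rank, 16-node job.
log=${1:-job/power-16A-1110.64091/log}   # example path, adjust as needed

grep -m1 'mflop/s = ' "${log}" \
  | awk -F'= ' '{printf "total %.5g  per-rank %.5g  per-node %.5g\n", $2, $2/64, $2/16}'
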
1  2-racks/size-C0/16-nodes/job/power-16A-1110.64091/nodes  Normal file
@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]
112  2-racks/size-C0/16-nodes/job/power-16A-1110.64091/script  Executable file
@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170

# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa

#SBATCH -J power-16A-1110
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue

set -e

# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n

# IO environment ###############################################################

if [ 16 -eq 1 ]; then
    export OMPI_MCA_io=ompio
else
    export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3

# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
    source "${env_dir}/env-gpu.sh"
else
    echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
    exit 1
fi
spack load sshpass

# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''

# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"

date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi

# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1110

# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
    ${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"

# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
    ./gpu-mpi-wrapper.sh \
    ${app} "${par}" "${opt[@]}" \
    --mpi 2.2.2.8 \
    --accelerator-threads 8 \
    --grid 48.48.48.96 \
    --shm 2048 &> "${job_info_dir}/log"

# if we reach this point, the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"

# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"

# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"

# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
    ${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
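
The script above pins the GPU application clocks with `nvidia-smi -ac 1215,${freq}` via remote-sudo.sh before the run. A minimal sketch for verifying that the requested limit was actually applied on every node of the job; it assumes plain ssh access to the compute nodes (an assumption about the site setup, not part of the committed workflow) and uses the standard clocks.applications.graphics query field.

#!/usr/bin/env bash
# Sketch only: report the application graphics clock on each GPU of each node
# and flag any node that does not match the target frequency.
freq=1110   # target graphics clock in MHz for this job

for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
    ssh "$h" nvidia-smi --query-gpu=clocks.applications.graphics --format=csv,noheader,nounits \
      | awk -v node="$h" -v want="$freq" '{print node, "GPU" NR-1, $1, ($1 == want ? "ok" : "MISMATCH")}'
done
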
@ -0,0 +1,2 @@
Sat Aug 20 21:09:39 BST 2022
epoch 1661026179
@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32
4310  2-racks/size-C0/16-nodes/job/power-16A-1125.64095/elf  Normal file
File diff suppressed because it is too large
@ -0,0 +1,2 @@
Sat Aug 20 21:18:10 BST 2022
epoch 1661026690
2062  2-racks/size-C0/16-nodes/job/power-16A-1125.64095/env  Normal file
File diff suppressed because one or more lines are too long
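
The start-date and end-date files recorded by each job hold a human-readable timestamp plus an epoch line. A minimal sketch (the job directory name is just an example taken from this diff) for turning the two epoch values into an elapsed wall-clock time, e.g. for power-16A-1125.64095: 1661026690 - 1661026527 = 163 s.

#!/usr/bin/env bash
# Sketch only: compute elapsed seconds from the "epoch ..." lines in the
# start-date and end-date files of one job-info directory.
dir=${1:-job/power-16A-1125.64095}   # example job-info directory

start=$(awk '/^epoch/{print $2}' "${dir}/start-date")
end=$(awk '/^epoch/{print $2}' "${dir}/end-date")
echo "${dir}: $(( end - start )) s elapsed"
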
26  2-racks/size-C0/16-nodes/job/power-16A-1125.64095/ldd  Normal file
@ -0,0 +1,26 @@
linux-vdso.so.1 (0x00007ffe04b26000)
|
||||||
|
libhdf5_cpp.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5_cpp.so.103 (0x000014ffbc78a000)
|
||||||
|
libz.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/zlib-1.2.12-53uezvxyb4cfy6w7etemrp7eykxgg6xm/lib/libz.so.1 (0x000014ffbc3c2000)
|
||||||
|
libcrypto.so.1.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/openssl-1.1.1n-k5pyjw75fnlzm2h4mzlxyg7i25kz5jml/lib/libcrypto.so.1.1 (0x000014ffbbed0000)
|
||||||
|
libfftw3f.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3f.so.3 (0x000014ffbbba6000)
|
||||||
|
libfftw3.so.3 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/fftw-3.3.10-myag3rqpxpydiohkmykiq5l7gjo2f2za/lib/libfftw3.so.3 (0x000014ffbb8c5000)
|
||||||
|
libmpfr.so.4 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/mpfr-3.1.6-qu7rdfgq2vkcyoujezmqdkwk27dfgn23/lib/libmpfr.so.4 (0x000014ffbb664000)
|
||||||
|
libgmp.so.10 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/gmp-6.2.1-4jnnslrfuvireqgjsukajk3hnc5fpmul/lib/libgmp.so.10 (0x000014ffbc711000)
|
||||||
|
libstdc++.so.6 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libstdc++.so.6 (0x000014ffbb284000)
|
||||||
|
libcuda.so.1 => /lib64/libcuda.so.1 (0x000014ffb9b28000)
|
||||||
|
libhdf5.so.103 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/hdf5-1.10.7-v46btwnx3afc6rli2d5i4cjiog25fhbx/lib/libhdf5.so.103 (0x000014ffb9758000)
|
||||||
|
libcudart.so.11.0 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen2/gcc-9.4.0/cuda-11.4.0-etxow4jb23qdbs7j6txczy44cdatpj22/lib64/libcudart.so.11.0 (0x000014ffb94b7000)
|
||||||
|
libmpi.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libmpi.so.40 (0x000014ffb938c000)
|
||||||
|
libm.so.6 => /lib64/libm.so.6 (0x000014ffb900a000)
|
||||||
|
libgomp.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgomp.so.1 (0x000014ffb8dd3000)
|
||||||
|
libgcc_s.so.1 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/gcc-9.4.0-g3vyv3te4ah634euh7phyokb3fiurprp/lib64/libgcc_s.so.1 (0x000014ffb8bbb000)
|
||||||
|
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014ffb899b000)
|
||||||
|
libc.so.6 => /lib64/libc.so.6 (0x000014ffb85d6000)
|
||||||
|
libdl.so.2 => /lib64/libdl.so.2 (0x000014ffb83d2000)
|
||||||
|
/lib64/ld-linux-x86-64.so.2 (0x000014ffbc5da000)
|
||||||
|
librt.so.1 => /lib64/librt.so.1 (0x000014ffb81ca000)
|
||||||
|
libopen-rte.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-rte.so.40 (0x000014ffbc645000)
|
||||||
|
libopen-orted-mpir.so => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-orted-mpir.so (0x000014ffbc640000)
|
||||||
|
libopen-pal.so.40 => /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428/prefix/ompi_gpu/lib/libopen-pal.so.40 (0x000014ffb80be000)
|
||||||
|
libpciaccess.so.0 => /lib64/libpciaccess.so.0 (0x000014ffb7eb4000)
|
||||||
|
libutil.so.1 => /lib64/libutil.so.1 (0x000014ffb7cb0000)
|
286  2-racks/size-C0/16-nodes/job/power-16A-1125.64095/log  Normal file
@ -0,0 +1,286 @@
tu-c0r1n00 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n12 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n00 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n18 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n06 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n00 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n06 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n09 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n12 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n12 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n21 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n09 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n06 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n21 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n15 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n18 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n06 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n00 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n09 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n15 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n12 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n12 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n06 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n15 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n06 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n18 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n18 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n18 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n18 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n09 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n15 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n21 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n06 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n15 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n21 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n21 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n18 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n12 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n06 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n18 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n15 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n09 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n09 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n21 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n09 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n12 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n12 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n15 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n15 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n21 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n09 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r1n21 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n03 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r1n03 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r1n03 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r1n03 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n03 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n03 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n03 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n00 - 0 device=0 binding=--interleave=0,1
|
||||||
|
tu-c0r2n03 - 3 device=3 binding=--interleave=6,7
|
||||||
|
tu-c0r2n00 - 1 device=1 binding=--interleave=2,3
|
||||||
|
tu-c0r2n00 - 2 device=2 binding=--interleave=4,5
|
||||||
|
tu-c0r2n00 - 3 device=3 binding=--interleave=6,7
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 3
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 3
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
OPENMPI detected
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 0 device 0 bus id: 0000:03:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 0 device 0 bus id: 0000:03:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
local rank 2 device 0 bus id: 0000:84:00.0
|
||||||
|
local rank 1 device 0 bus id: 0000:44:00.0
|
||||||
|
local rank 3 device 0 bus id: 0000:C4:00.0
|
||||||
|
SharedMemoryMpi: World communicator of size 64
|
||||||
|
SharedMemoryMpi: Node communicator of size 4
|
||||||
|
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x146500000000 for comms buffers
|
||||||
|
Setting up IPC
|
||||||
|
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|_ | | | | | | | | | | | | _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|_ GGGG RRRR III DDDD _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G GG RRRR I D D _|__
|
||||||
|
__|_ G G R R I D D _|__
|
||||||
|
__|_ GGGG R R III DDDD _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
| | | | | | | | | | | | | |
|
||||||
|
|
||||||
|
|
||||||
|
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
Current Grid git commit hash=188d2c7a4dc77807b545f5f2813cdb589b9e44ca: (HEAD -> develop, gh/develop, gh/HEAD) uncommited changes
|
||||||
|
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : MPI is initialised and logging filters activated
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : Requested 2147483648 byte stencil comms buffers
|
||||||
|
Grid : Message : MemoryManager Cache 34004218675 bytes
|
||||||
|
Grid : Message : MemoryManager::Init() setting up
|
||||||
|
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
|
||||||
|
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
|
||||||
|
Grid : Message : MemoryManager::Init() Using cudaMalloc
|
||||||
|
Grid : Message : 1.503072 s : Grid Layout
|
||||||
|
Grid : Message : 1.503076 s : Global lattice size : 48 48 48 96
|
||||||
|
Grid : Message : 1.503081 s : OpenMP threads : 4
|
||||||
|
Grid : Message : 1.503083 s : MPI tasks : 2 2 2 8
|
||||||
|
Grid : Message : 1.518479 s : Making s innermost grids
|
||||||
|
Grid : Message : 1.535611 s : Initialising 4d RNG
|
||||||
|
Grid : Message : 1.551229 s : Intialising parallel RNG with unique string 'The 4D RNG'
|
||||||
|
Grid : Message : 1.551252 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
|
||||||
|
Grid : Message : 1.805667 s : Initialising 5d RNG
|
||||||
|
Grid : Message : 2.356490 s : Intialising parallel RNG with unique string 'The 5D RNG'
|
||||||
|
Grid : Message : 2.357030 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
|
||||||
|
Grid : Message : 7.303785 s : Initialised RNGs
|
||||||
|
Grid : Message : 8.385261 s : Drawing gauge field
|
||||||
|
Grid : Message : 8.496485 s : Random gauge initialised
|
||||||
|
Grid : Message : 8.509783 s : Setting up Cshift based reference
|
||||||
|
Grid : Message : 13.609539 s : *****************************************************************
|
||||||
|
Grid : Message : 13.609564 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
|
||||||
|
Grid : Message : 13.609566 s : *****************************************************************
|
||||||
|
Grid : Message : 13.609568 s : *****************************************************************
|
||||||
|
Grid : Message : 13.609573 s : * Benchmarking DomainWallFermionR::Dhop
|
||||||
|
Grid : Message : 13.609575 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 13.609577 s : * VComplexF size is 64 B
|
||||||
|
Grid : Message : 13.609579 s : * SINGLE precision
|
||||||
|
Grid : Message : 13.609582 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 13.609584 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 13.609586 s : *****************************************************************
|
||||||
|
Grid : Message : 14.155991 s : Called warmup
|
||||||
|
Grid : Message : 98.420612 s : Called Dw 30000 times in 8.42644e+07 us
|
||||||
|
Grid : Message : 98.420675 s : mflop/s = 7.983e+07
|
||||||
|
Grid : Message : 98.420677 s : mflop/s per rank = 1.24734e+06
|
||||||
|
Grid : Message : 98.420679 s : mflop/s per node = 4.98937e+06
|
||||||
|
Grid : Message : 98.420681 s : RF GiB/s (base 2) = 162213
|
||||||
|
Grid : Message : 98.420683 s : mem GiB/s (base 2) = 101383
|
||||||
|
Grid : Message : 98.421254 s : norm diff 1.05775e-13
|
||||||
|
Grid : Message : 98.431170 s : #### Dhop calls report
|
||||||
|
Grid : Message : 98.431178 s : WilsonFermion5D Number of DhopEO Calls : 60002
|
||||||
|
Grid : Message : 98.431182 s : WilsonFermion5D TotalTime /Calls : 1405.63 us
|
||||||
|
Grid : Message : 98.431184 s : WilsonFermion5D CommTime /Calls : 961.451 us
|
||||||
|
Grid : Message : 98.431186 s : WilsonFermion5D FaceTime /Calls : 222.433 us
|
||||||
|
Grid : Message : 98.431188 s : WilsonFermion5D ComputeTime1/Calls : 2.80214 us
|
||||||
|
Grid : Message : 98.431190 s : WilsonFermion5D ComputeTime2/Calls : 234.1 us
|
||||||
|
Grid : Message : 98.431212 s : Average mflops/s per call : 3.60793e+10
|
||||||
|
Grid : Message : 98.431216 s : Average mflops/s per call per rank : 5.63738e+08
|
||||||
|
Grid : Message : 98.431218 s : Average mflops/s per call per node : 2.25495e+09
|
||||||
|
Grid : Message : 98.431220 s : Average mflops/s per call (full) : 8.12107e+07
|
||||||
|
Grid : Message : 98.431224 s : Average mflops/s per call per rank (full): 1.26892e+06
|
||||||
|
Grid : Message : 98.431226 s : Average mflops/s per call per node (full): 5.07567e+06
|
||||||
|
Grid : Message : 98.431229 s : WilsonFermion5D Stencil
|
||||||
|
Grid : Message : 98.431230 s : WilsonFermion5D StencilEven
|
||||||
|
Grid : Message : 98.431235 s : WilsonFermion5D StencilOdd
|
||||||
|
Grid : Message : 98.431239 s : WilsonFermion5D Stencil Reporti()
|
||||||
|
Grid : Message : 98.431240 s : WilsonFermion5D StencilEven Reporti()
|
||||||
|
Grid : Message : 98.431241 s : WilsonFermion5D StencilOdd Reporti()
|
||||||
|
Grid : Message : 107.161203 s : Compare to naive wilson implementation Dag to verify correctness
|
||||||
|
Grid : Message : 107.161230 s : Called DwDag
|
||||||
|
Grid : Message : 107.161231 s : norm dag result 12.0422
|
||||||
|
Grid : Message : 107.163717 s : norm dag ref 12.0422
|
||||||
|
Grid : Message : 107.166717 s : norm dag diff 7.28899e-14
|
||||||
|
Grid : Message : 107.181064 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
|
||||||
|
Grid : Message : 107.248613 s : src_e0.5
|
||||||
|
Grid : Message : 107.314227 s : src_o0.5
|
||||||
|
Grid : Message : 107.331787 s : *********************************************************
|
||||||
|
Grid : Message : 107.331790 s : * Benchmarking DomainWallFermionF::DhopEO
|
||||||
|
Grid : Message : 107.331792 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 107.331794 s : * SINGLE precision
|
||||||
|
Grid : Message : 107.331795 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 107.331796 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 107.331797 s : *********************************************************
|
||||||
|
Grid : Message : 152.337360 s : Deo mflop/s = 7.47496e+07
|
||||||
|
Grid : Message : 152.337387 s : Deo mflop/s per rank 1.16796e+06
|
||||||
|
Grid : Message : 152.337390 s : Deo mflop/s per node 4.67185e+06
|
||||||
|
Grid : Message : 152.337396 s : #### Dhop calls report
|
||||||
|
Grid : Message : 152.337399 s : WilsonFermion5D Number of DhopEO Calls : 30001
|
||||||
|
Grid : Message : 152.337402 s : WilsonFermion5D TotalTime /Calls : 1500 us
|
||||||
|
Grid : Message : 152.337405 s : WilsonFermion5D CommTime /Calls : 1002.91 us
|
||||||
|
Grid : Message : 152.337408 s : WilsonFermion5D FaceTime /Calls : 282.963 us
|
||||||
|
Grid : Message : 152.337410 s : WilsonFermion5D ComputeTime1/Calls : 4.71911 us
|
||||||
|
Grid : Message : 152.337412 s : WilsonFermion5D ComputeTime2/Calls : 237.647 us
|
||||||
|
Grid : Message : 152.337435 s : Average mflops/s per call : 2.07759e+10
|
||||||
|
Grid : Message : 152.337439 s : Average mflops/s per call per rank : 3.24624e+08
|
||||||
|
Grid : Message : 152.337441 s : Average mflops/s per call per node : 1.29849e+09
|
||||||
|
Grid : Message : 152.337445 s : Average mflops/s per call (full) : 7.61013e+07
|
||||||
|
Grid : Message : 152.337448 s : Average mflops/s per call per rank (full): 1.18908e+06
|
||||||
|
Grid : Message : 152.337451 s : Average mflops/s per call per node (full): 4.75633e+06
|
||||||
|
Grid : Message : 152.337453 s : WilsonFermion5D Stencil
|
||||||
|
Grid : Message : 152.337456 s : WilsonFermion5D StencilEven
|
||||||
|
Grid : Message : 152.337457 s : WilsonFermion5D StencilOdd
|
||||||
|
Grid : Message : 152.337459 s : WilsonFermion5D Stencil Reporti()
|
||||||
|
Grid : Message : 152.337462 s : WilsonFermion5D StencilEven Reporti()
|
||||||
|
Grid : Message : 152.337463 s : WilsonFermion5D StencilOdd Reporti()
|
||||||
|
Grid : Message : 152.358219 s : r_e6.02106
|
||||||
|
Grid : Message : 152.359968 s : r_o6.0211
|
||||||
|
Grid : Message : 152.361373 s : res12.0422
|
||||||
|
Grid : Message : 152.467780 s : norm diff 0
|
||||||
|
Grid : Message : 152.609427 s : norm diff even 0
|
||||||
|
Grid : Message : 152.675745 s : norm diff odd 0
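
The logs above report "Global lattice size : 48 48 48 96" and "MPI tasks : 2 2 2 8". A minimal sketch (not part of the committed scripts) checking that this decomposition multiplies to the 64 SLURM tasks requested by the job script and giving the local lattice volume handled by each rank (24 x 24 x 24 x 12).

#!/usr/bin/env bash
# Sketch only: verify the MPI decomposition against the task count and derive
# the per-rank local lattice from the global one.
mpi='2.2.2.8'
grid='48.48.48.96'

IFS=. read -r mx my mz mt <<< "${mpi}"
IFS=. read -r gx gy gz gt <<< "${grid}"

echo "ranks      : $(( mx * my * mz * mt ))"
echo "local size : $(( gx / mx )) $(( gy / my )) $(( gz / mz )) $(( gt / mt ))"
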
1  2-racks/size-C0/16-nodes/job/power-16A-1125.64095/nodes  Normal file
@ -0,0 +1 @@
tu-c0r1n[00,03,06,09,12,15,18,21],tu-c0r2n[00,03,06,09,12,15,18,21]
112  2-racks/size-C0/16-nodes/job/power-16A-1125.64095/script  Executable file
@ -0,0 +1,112 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091,SC2050,SC2170

# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa

#SBATCH -J power-16A-1125
#SBATCH -A dp207
#SBATCH -t 48:00:00
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --reservation=dc-port1_61
#SBATCH --qos=reservation
#SBATCH --no-requeue

set -e

# OpenMP/OpenMPI/UCX environment ###############################################
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export OMPI_MCA_pml=ucx
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n

# IO environment ###############################################################

if [ 16 -eq 1 ]; then
    export OMPI_MCA_io=ompio
else
    export OMPI_MCA_io=romio321
fi
export OMPI_MCA_btl_openib_allow_ib=true
export OMPI_MCA_btl_openib_device_type=infiniband
export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3

# load environment #############################################################
env_dir="$(readlink -f /mnt/lustre/tursafs1/home/dp207/dp207/shared/env/versions/220428)"
source "${env_dir}/env-base.sh"
if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
    source "${env_dir}/env-gpu.sh"
else
    echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
    exit 1
fi
spack load sshpass

# application and parameters ###################################################
app='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32'
opt=('--comms-overlap' '--comms-concurrent')
par=''

# collect job information ######################################################
job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
mkdir -p "${job_info_dir}"

date > "${job_info_dir}/start-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/start-date"
set > "${job_info_dir}/env"
ldd ${app} > "${job_info_dir}/ldd"
md5sum ${app} > "${job_info_dir}/app-hash"
readelf -a ${app} > "${job_info_dir}/elf"
echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi

# GPU frequency control ########################################################
power_dir='/mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/'
freq=1125

# set frequency
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
    ${power_dir}/remote-sudo.sh "$h" "nvidia-smi -ac 1215,${freq}"
done
# start NVIDIA SMI monitoring
tmp=$(mktemp)
sleep 1
coproc nvidia-smi dmon -o DT &> "${tmp}"

# run! #########################################################################
mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
    ./gpu-mpi-wrapper.sh \
    ${app} "${par}" "${opt[@]}" \
    --mpi 2.2.2.8 \
    --accelerator-threads 8 \
    --grid 48.48.48.96 \
    --shm 2048 &> "${job_info_dir}/log"

# if we reach this point, the application exited successfully ##################
touch "${job_info_dir}/success"
date > "${job_info_dir}/end-date"
echo "epoch $(date '+%s')" >> "${job_info_dir}/end-date"

# reset GPUs ###################################################################
# stop monitoring
kill -INT "${COPROC_PID}"

# make monitoring DB
${power_dir}/dmon-to-db.sh "${tmp}" smi-dmon-16A.db "clock_limit_${freq}"

# reset clocks
for h in $(scontrol show hostnames "${SLURM_JOB_NODELIST}"); do
    ${power_dir}/remote-sudo.sh "$h" 'nvidia-smi -ac 1215,1410'
done
################################################################################
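
Both job scripts capture `nvidia-smi dmon -o DT` into a temporary file and hand it to dmon-to-db.sh, which is not included in this diff. As a stand-in illustration only (the script name, its arguments, and the CSV target are assumptions, not the actual helper), the raw dmon capture can be flattened into CSV for later analysis like this:

#!/usr/bin/env bash
# Sketch only: convert whitespace-separated `nvidia-smi dmon -o DT` output
# (date and time columns first, '#'-prefixed header lines) into CSV.
raw=${1:?usage: dmon-to-csv.sh <dmon-output> <out.csv>}   # hypothetical helper
out=${2:?}

grep -v '^#' "${raw}" | awk 'BEGIN{OFS=","} {$1=$1; print}' > "${out}"
echo "wrote $(wc -l < "${out}") samples to ${out}"
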
@ -0,0 +1,2 @@
Sat Aug 20 21:15:27 BST 2022
epoch 1661026527
@ -0,0 +1 @@
6a99c164661d090b82990d130b305895 /mnt/lustre/tursafs1/home/dp207/dp207/dc-port1/power-bench/2-racks/Benchmark_dwf_fp32
4310  2-racks/size-C0/16-nodes/job/power-16A-1140.64100/elf  Normal file
File diff suppressed because it is too large
2062  2-racks/size-C0/16-nodes/job/power-16A-1140.64100/env  Normal file
File diff suppressed because one or more lines are too long
Some files were not shown because too many files have changed in this diff