From b2ce760cf4c5ceb7ee537e7b90f871735a65a772 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ed.ac.uk>
Date: Tue, 25 Feb 2025 16:55:23 +0000
Subject: [PATCH 01/15] Verbose issue with GPT

---
 Grid/util/Lexicographic.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/Grid/util/Lexicographic.h b/Grid/util/Lexicographic.h
index 422e42ee..bd73b0d3 100644
--- a/Grid/util/Lexicographic.h
+++ b/Grid/util/Lexicographic.h
@@ -50,7 +50,7 @@ namespace Grid{
       int64_t index64;
       IndexFromCoorReversed(coor,index64,dims);
       if ( index64>=2*1024*1024*1024LL ){
-	std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
+	std::cout << " IndexFromCoorReversed overflow"<<std::endl;
       }
       assert(index64<2*1024*1024*1024LL);
       index = (int) index64;

From 438dfbdb8340919525a404f20e42d908407a74ee Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ed.ac.uk>
Date: Tue, 25 Feb 2025 16:57:27 +0000
Subject: [PATCH 02/15] Only throw if there is a pending list entry in
 CommsComplete

---
 Grid/communicator/Communicator_none.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Grid/communicator/Communicator_none.cc b/Grid/communicator/Communicator_none.cc
index f162a903..3dee8f4d 100644
--- a/Grid/communicator/Communicator_none.cc
+++ b/Grid/communicator/Communicator_none.cc
@@ -91,7 +91,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 {
   assert(0);
 }
-void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(0);}
+void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(list.size()==0);}
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,

From 311e2aab3f48fc76923ae6a25b723de25317396f Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@users.noreply.github.com>
Date: Wed, 26 Feb 2025 11:42:52 -0500
Subject: [PATCH 03/15] Update Accelerator.h

---
 Grid/threads/Accelerator.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index f3371f98..b5aaccb4 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -674,7 +674,7 @@ inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
   acceleratorCopySynchronise();
 }
 
-template<class T> void acceleratorPut(T& dev,T&host)
+template<class T> void acceleratorPut(T& dev,const T&host)
 {
   acceleratorCopyToDevice(&host,&dev,sizeof(T));
 }

From 6ae809ed40937718d17402710f659685914dbbc7 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ed.ac.uk>
Date: Thu, 27 Feb 2025 20:11:45 +0000
Subject: [PATCH 04/15] Print not liked on GPT compile

---
 Grid/util/Lexicographic.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Grid/util/Lexicographic.h b/Grid/util/Lexicographic.h
index bd73b0d3..b4063e48 100644
--- a/Grid/util/Lexicographic.h
+++ b/Grid/util/Lexicographic.h
@@ -50,7 +50,7 @@ namespace Grid{
       int64_t index64;
       IndexFromCoorReversed(coor,index64,dims);
       if ( index64>=2*1024*1024*1024LL ){
-	std::cout << " IndexFromCoorReversed overflow"<<std::endl;
+	//	std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
       }
       assert(index64<2*1024*1024*1024LL);
       index = (int) index64;

From c4d9aa1a21c942a5910b32240470d7ab9e4ba6bc Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ed.ac.uk>
Date: Thu, 27 Feb 2025 20:12:12 +0000
Subject: [PATCH 05/15] Config command that makes GPT happier

---
 systems/Aurora/config-command | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/systems/Aurora/config-command b/systems/Aurora/config-command
index 6e5512ff..08b77f4f 100644
--- a/systems/Aurora/config-command
+++ b/systems/Aurora/config-command
@@ -1,18 +1,19 @@
 #Ahead of time compile for PVC
 
-export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib" 
-export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel  -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/"
+export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc" 
+export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel  -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/ -fPIC"
 
 #JIT compile 
 #export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel  -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl " 
 #export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel  -fsycl -fno-exceptions "
 
-../../configure \
+../configure \
 	--enable-simd=GPU \
 	--enable-reduction=grid \
 	--enable-gen-simd-width=64 \
 	--enable-comms=mpi-auto \
 	--enable-debug \
+	--prefix $HOME/gpt-install \
 	--disable-gparity \
 	--disable-fermion-reps \
 	--with-lime=$CLIME \

From eb8a008a8f1f5f8b8df2fc656faa8bf3b8342ab5 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@users.noreply.github.com>
Date: Wed, 5 Mar 2025 12:41:59 -0500
Subject: [PATCH 06/15] Create WorkArounds.txt

---
 systems/WorkArounds.txt | 90 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 90 insertions(+)
 create mode 100644 systems/WorkArounds.txt

diff --git a/systems/WorkArounds.txt b/systems/WorkArounds.txt
new file mode 100644
index 00000000..4d432589
--- /dev/null
+++ b/systems/WorkArounds.txt
@@ -0,0 +1,90 @@
+The purpose of this file is to collate all non-obvious known magic shell variables
+and compiler flags required for either correctness or performance on various systems.
+
+A repository of work-arounds.
+
+Contents:
+1. Interconnect + MPI
+2. Compilation
+
+
+* 1. INTERCONNECT + MPI
+
+--------------------------------------------------------------------
+MPI2-IO correctness: force OpenMPI to use the MPICH romio implementation for parallel I/O 
+--------------------------------------------------------------------
+export OMPI_MCA_io=romio321
+
+--------------------------------------
+ROMIO fail with > 2GB per node read (32 bit issue)
+--------------------------------------
+
+Use later MPICH
+
+https://github.com/paboyle/Grid/issues/381
+
+https://github.com/pmodels/mpich/commit/3a479ab0
+
+--------------------------------------------------------------------
+Slingshot: Frontier and Perlmutter libfabric slow down 
+and physical memory fragmentation 
+--------------------------------------------------------------------
+export FI_MR_CACHE_MONITOR=disabled
+or
+export FI_MR_CACHE_MONITOR=kdreg2
+
+--------------------------------------------------------------------
+Mellanox performance with A100 GPU 
+--------------------------------------------------------------------
+export OMPI_MCA_btl=^uct,openib
+export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
+export UCX_RNDV_SCHEME=put_zcopy
+export UCX_RNDV_THRESH=16384
+export UCX_IB_GPU_DIRECT_RDMA=yes
+
+--------------------------------------------------------------------
+Mellanox + A100 correctness 
+--------------------------------------------------------------------
+export UCX_MEMTYPE_CACHE=n
+
+--------------------------------------------------------------------
+MPICH/Aurora/PVC correctness and performance (Peter Boyle)
+--------------------------------------------------------------------
+
+https://github.com/pmodels/mpich/issues/7302
+
+--enable-cuda-aware-mpi=no  
+(Grid's internal D-H-H-D pipeline mode, avoid device memory in MPI)
+
+Ideally use MPICH with fix:
+
+https://github.com/pmodels/mpich/pull/7312
+
+Ideally:
+MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE=generic
+
+Alternatives:
+export MPIR_CVAR_NOLOCAL=1
+export MPIR_CVAR_CH4_IPC_GPU_P2P_THRESHOLD=1000000000
+
+
+* 2. COMPILATION
+
+--------------------------------------------------------------------
+G++ bugs
+--------------------------------------------------------------------
+
+
+--------------------------------------------------------------------
+AMD GPU nodes : multiple ROCM versions broken; use 5.3.0
+--------------------------------------------------------------------
+https://github.com/paboyle/Grid/issues/464
+
+--------------------------------------------------------------------
+Aurora/PVC
+
+SYCL ahead of time compilation (fixes rare runtime JIT errors and faster runtime, PB)
+SYCL slow link and relocatable code issues (Christoph Lehner)
+--------------------------------------------------------------------
+export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc" 
+export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel  -fsycl -fno-exceptions -fPIC"

From bc12dbbb38570ef4d7bbfd703204426b5179f466 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@users.noreply.github.com>
Date: Wed, 5 Mar 2025 12:48:56 -0500
Subject: [PATCH 07/15] Update WorkArounds.txt

---
 systems/WorkArounds.txt | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/systems/WorkArounds.txt b/systems/WorkArounds.txt
index 4d432589..b9570cbf 100644
--- a/systems/WorkArounds.txt
+++ b/systems/WorkArounds.txt
@@ -71,9 +71,28 @@ export MPIR_CVAR_CH4_IPC_GPU_P2P_THRESHOLD=1000000000
 * 2. COMPILATION
 
 --------------------------------------------------------------------
-G++ bugs
+G++ compiler breakage / graveyard
 --------------------------------------------------------------------
 
+9.3.0, 10.3.1, 
+https://github.com/paboyle/Grid/issues/290
+https://github.com/paboyle/Grid/issues/264
+
+Working (-) Broken (X):
+
+4.9.0 -
+4.9.1 -
+5.1.0 X
+5.2.0 X
+5.3.0 X
+5.4.0 X
+6.1.0 X
+6.2.0 X
+6.3.0 -
+7.1.0 -
+8.0.0 (HEAD) -
+
+https://github.com/paboyle/Grid/issues/100
 
 --------------------------------------------------------------------
 AMD GPU nodes : multiple ROCM versions broken; use 5.3.0

From 3624bd3d220a0b7a8a8e7792c3228b5630742843 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@users.noreply.github.com>
Date: Wed, 5 Mar 2025 13:45:09 -0500
Subject: [PATCH 08/15] Update WorkArounds.txt

---
 systems/WorkArounds.txt | 63 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 58 insertions(+), 5 deletions(-)

diff --git a/systems/WorkArounds.txt b/systems/WorkArounds.txt
index b9570cbf..96aa602b 100644
--- a/systems/WorkArounds.txt
+++ b/systems/WorkArounds.txt
@@ -7,8 +7,9 @@ Contents:
 1. Interconnect + MPI
 2. Compilation
 
-
+************************
 * 1. INTERCONNECT + MPI
+************************
 
 --------------------------------------------------------------------
 MPI2-IO correctness: force OpenMPI to use the MPICH romio implementation for parallel I/O 
@@ -33,6 +34,31 @@ export FI_MR_CACHE_MONITOR=disabled
 or
 export FI_MR_CACHE_MONITOR=kdreg2
 
+
+--------------------------------------------------------------------
+Frontier/LumiG
+--------------------------------------------------------------------
+
+Hiding ROCR_VISIBLE_DEVICES triggers SDMA engines to be used for GPU-GPU
+
+cat << EOF > select_gpu
+#!/bin/bash
+export MPICH_GPU_SUPPORT_ENABLED=1
+export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
+export GPU_MAP=(0 1 2 3 7 6 5 4)
+export NUMA_MAP=(3 3 1 1 2 2 0 0)
+export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
+export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]}
+export HIP_VISIBLE_DEVICES=\$GPU
+unset ROCR_VISIBLE_DEVICES
+echo RANK \$SLURM_LOCALID using GPU \$GPU    
+exec numactl -m \$NUMA -N \$NUMA \$*
+EOF
+chmod +x ./select_gpu
+
+srun ./select_gpu BINARY
+
+
 --------------------------------------------------------------------
 Mellanox performance with A100 GPU 
 --------------------------------------------------------------------
@@ -54,9 +80,12 @@ MPICH/Aurora/PVC correctness and performance (Peter Boyle)
 https://github.com/pmodels/mpich/issues/7302
 
 --enable-cuda-aware-mpi=no  
-(Grid's internal D-H-H-D pipeline mode, avoid device memory in MPI)
+--enable-unified=no
 
-Ideally use MPICH with fix:
+Grid's internal D-H-H-D pipeline mode, avoid device memory in MPI
+Do not use SVM
+
+Ideally use MPICH with fix to issue 7302:
 
 https://github.com/pmodels/mpich/pull/7312
 
@@ -67,8 +96,24 @@ Alternatives:
 export MPIR_CVAR_NOLOCAL=1
 export MPIR_CVAR_CH4_IPC_GPU_P2P_THRESHOLD=1000000000
 
+--------------------------------------------------------------------
+MPICH/Aurora/PVC correctness and performance (James Osborne)
+--------------------------------------------------------------------
 
+Broken:
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+
+This gives good peformance without requiring 
+--enable-cuda-aware-mpi=no  
+
+But is an open issue reported by James Osborne
+https://github.com/pmodels/mpich/issues/7139
+
+Possibly resolved but unclear if in the installed software yet.
+
+************************
 * 2. COMPILATION
+************************
 
 --------------------------------------------------------------------
 G++ compiler breakage / graveyard
@@ -95,15 +140,23 @@ Working (-) Broken (X):
 https://github.com/paboyle/Grid/issues/100
 
 --------------------------------------------------------------------
-AMD GPU nodes : multiple ROCM versions broken; use 5.3.0
+AMD GPU nodes :
 --------------------------------------------------------------------
+
+multiple ROCM versions broken; use 5.3.0
+manifests itself as wrong results in fp32 
+
 https://github.com/paboyle/Grid/issues/464
 
 --------------------------------------------------------------------
 Aurora/PVC
+--------------------------------------------------------------------
 
 SYCL ahead of time compilation (fixes rare runtime JIT errors and faster runtime, PB)
 SYCL slow link and relocatable code issues (Christoph Lehner)
---------------------------------------------------------------------
+Opt large register file required for good performance in fp64
+
+
+export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
 export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc" 
 export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel  -fsycl -fno-exceptions -fPIC"

From 267a39d9433fc2defd143b438812a0d4526613c3 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@users.noreply.github.com>
Date: Wed, 5 Mar 2025 13:49:43 -0500
Subject: [PATCH 09/15] Update WorkArounds.txt

---
 systems/WorkArounds.txt | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/systems/WorkArounds.txt b/systems/WorkArounds.txt
index 96aa602b..1c3b8084 100644
--- a/systems/WorkArounds.txt
+++ b/systems/WorkArounds.txt
@@ -34,6 +34,14 @@ export FI_MR_CACHE_MONITOR=disabled
 or
 export FI_MR_CACHE_MONITOR=kdreg2
 
+--------------------------------------------------------------------
+Perlmutter
+--------------------------------------------------------------------
+
+export MPICH_RDMA_ENABLED_CUDA=1
+export MPICH_GPU_IPC_ENABLED=1
+export MPICH_GPU_EAGER_REGISTER_HOST_MEM=0
+export MPICH_GPU_NO_ASYNC_MEMCPY=0
 
 --------------------------------------------------------------------
 Frontier/LumiG

From 795769c636bc24f72ea8e3b8ee4cd952366a388a Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@users.noreply.github.com>
Date: Wed, 5 Mar 2025 13:50:41 -0500
Subject: [PATCH 10/15] Update WorkArounds.txt

---
 systems/WorkArounds.txt | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/systems/WorkArounds.txt b/systems/WorkArounds.txt
index 1c3b8084..a62ba495 100644
--- a/systems/WorkArounds.txt
+++ b/systems/WorkArounds.txt
@@ -68,7 +68,7 @@ srun ./select_gpu BINARY
 
 
 --------------------------------------------------------------------
-Mellanox performance with A100 GPU 
+Mellanox performance with A100 GPU (Tursa, Booster, Leonardo)
 --------------------------------------------------------------------
 export OMPI_MCA_btl=^uct,openib
 export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
@@ -77,12 +77,12 @@ export UCX_RNDV_THRESH=16384
 export UCX_IB_GPU_DIRECT_RDMA=yes
 
 --------------------------------------------------------------------
-Mellanox + A100 correctness 
+Mellanox + A100 correctness (Tursa, Booster, Leonardo)
 --------------------------------------------------------------------
 export UCX_MEMTYPE_CACHE=n
 
 --------------------------------------------------------------------
-MPICH/Aurora/PVC correctness and performance (Peter Boyle)
+MPICH/Aurora/PVC correctness and performance 
 --------------------------------------------------------------------
 
 https://github.com/pmodels/mpich/issues/7302
@@ -105,7 +105,7 @@ export MPIR_CVAR_NOLOCAL=1
 export MPIR_CVAR_CH4_IPC_GPU_P2P_THRESHOLD=1000000000
 
 --------------------------------------------------------------------
-MPICH/Aurora/PVC correctness and performance (James Osborne)
+MPICH/Aurora/PVC correctness and performance 
 --------------------------------------------------------------------
 
 Broken:
@@ -114,7 +114,7 @@ export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 This gives good peformance without requiring 
 --enable-cuda-aware-mpi=no  
 
-But is an open issue reported by James Osborne
+But is an open issue reported by James Osborn
 https://github.com/pmodels/mpich/issues/7139
 
 Possibly resolved but unclear if in the installed software yet.

From e8ff9d8e50ee929dbd9505b0b32b591511ef27d5 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@users.noreply.github.com>
Date: Wed, 5 Mar 2025 14:00:04 -0500
Subject: [PATCH 11/15] Update WorkArounds.txt

---
 systems/WorkArounds.txt | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/systems/WorkArounds.txt b/systems/WorkArounds.txt
index a62ba495..7bae046f 100644
--- a/systems/WorkArounds.txt
+++ b/systems/WorkArounds.txt
@@ -168,3 +168,21 @@ Opt large register file required for good performance in fp64
 export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
 export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc" 
 export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel  -fsycl -fno-exceptions -fPIC"
+
+
+************************
+* 3. Visual profile tools
+************************
+
+--------------------------------------------------------------------
+Frontier/rocprof
+--------------------------------------------------------------------
+
+--------------------------------------------------------------------
+Aurora/unitrace
+--------------------------------------------------------------------
+
+
+--------------------------------------------------------------------
+Tursa/nsight-sys
+--------------------------------------------------------------------

From ad6db92690f0d22470971115ad9f4c51dea281a0 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@users.noreply.github.com>
Date: Wed, 5 Mar 2025 14:00:26 -0500
Subject: [PATCH 12/15] Update WorkArounds.txt

---
 systems/WorkArounds.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/systems/WorkArounds.txt b/systems/WorkArounds.txt
index 7bae046f..eda07223 100644
--- a/systems/WorkArounds.txt
+++ b/systems/WorkArounds.txt
@@ -6,6 +6,7 @@ A repository of work-arounds.
 Contents:
 1. Interconnect + MPI
 2. Compilation
+3. Profiling
 
 ************************
 * 1. INTERCONNECT + MPI

From a1cdda833fc1652dc8916233ed3f942d3a0337fb Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@users.noreply.github.com>
Date: Wed, 5 Mar 2025 14:04:23 -0500
Subject: [PATCH 13/15] Update WorkArounds.txt

---
 systems/WorkArounds.txt | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/systems/WorkArounds.txt b/systems/WorkArounds.txt
index eda07223..7191b4ff 100644
--- a/systems/WorkArounds.txt
+++ b/systems/WorkArounds.txt
@@ -170,6 +170,23 @@ export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
 export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc" 
 export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel  -fsycl -fno-exceptions -fPIC"
 
+--------------------------------------------------------------------
+Aurora/PVC useful extra options
+--------------------------------------------------------------------
+
+Host only sanitizer:
+-Xarch_host -fsanitize=leak
+-Xarch_host -fsanitize=address
+
+Deterministic MPI reduction:
+export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
+export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
+export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
+unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
+unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
+unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
+
+
 
 ************************
 * 3. Visual profile tools

From 1d22841811b3378d38e69809bf8d73e35fbd6451 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ed.ac.uk>
Date: Thu, 6 Mar 2025 03:19:35 +0000
Subject: [PATCH 14/15] Working on aurora, GPT issue turned up is fixed

---
 Grid/communicator/Communicator_mpi3.cc | 30 ++++++++++++++++----------
 Grid/threads/Accelerator.h             |  2 +-
 Grid/util/Init.cc                      | 10 ++++++++-
 3 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc
index b667d32e..38b9f9c6 100644
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -746,26 +746,34 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
-  //  int nreq=list.size();
+  acceleratorCopySynchronise(); // Complete all pending copy transfers D2D
 
-  //  if (nreq==0) return;
-  //  std::vector<MPI_Status> status(nreq);
-  //  std::vector<MPI_Request> MpiRequests(nreq);
+  std::vector<MPI_Status> status;
+  std::vector<MPI_Request> MpiRequests;
+    
+  for(int r=0;r<list.size();r++){
+    // Must check each Send buf is clear to reuse
+    if ( list[r].PacketType == InterNodeXmitISend ) MpiRequests.push_back(list[r].req);
+    //    if ( list[r].PacketType == InterNodeRecv ) MpiRequests.push_back(list[r].req); // Already "Test" passed
+  }
 
-  //  for(int r=0;r<nreq;r++){
-  //    MpiRequests[r] = list[r].req;
-  //  }
+  int nreq=MpiRequests.size();
+
+  std::cout << GridLogMessage << " StencilSendToRevFromComplete "<<nreq<<" Mpi Requests"<<std::endl;
+
+
+  if (nreq>0) {
+    status.resize(MpiRequests.size());
+    int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
+    assert(ierr==0);
+  }
   
-  //  int ierr = MPI_Waitall(nreq,&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
-  //  assert(ierr==0);
-
   //  for(int r=0;r<nreq;r++){
   //    if ( list[r].PacketType==InterNodeRecv ) {
   //      acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
   //    }
   //  }
   
-  acceleratorCopySynchronise(); // Complete all pending copy transfers D2D
   
   list.resize(0);               // Delete the list
   this->HostBufferFreeAll();    // Clean up the buffer allocs
diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index b5aaccb4..28c3aa0a 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -676,7 +676,7 @@ inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
 
 template<class T> void acceleratorPut(T& dev,const T&host)
 {
-  acceleratorCopyToDevice(&host,&dev,sizeof(T));
+  acceleratorCopyToDevice((void *)&host,&dev,sizeof(T));
 }
 template<class T> T acceleratorGet(T& dev)
 {
diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc
index 1424667e..feb44645 100644
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -509,7 +509,14 @@ void Grid_init(int *argc,char ***argv)
 		  Grid_default_latt,
 		  Grid_default_mpi);
 
-
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--flightrecorder") ){
+    std::cout << GridLogMessage <<" Enabling flight recorder " <<std::endl;
+    FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
+    FlightRecorder::PrintEntireLog = 1;
+    FlightRecorder::ChecksumComms  = 1;
+    FlightRecorder::ChecksumCommsSend=1;
+  }
+  
   if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
     std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
     std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
@@ -651,3 +658,4 @@ void Grid_debug_handler_init(void)
 }
 
 NAMESPACE_END(Grid);
+

From 3d014864e24f753a84c5bb4966eff12696b0509e Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 6 Mar 2025 14:18:43 -0500
Subject: [PATCH 15/15] Makinig LLVM happy

---
 Grid/threads/Accelerator.h        | 38 ++++++++---------
 Grid/threads/Threads.h            |  6 +--
 benchmarks/Benchmark_usqcd.cc     | 69 +++++++++++++++++--------------
 systems/sdcc-genoa/bench.slurm    | 32 ++++++++++++++
 systems/sdcc-genoa/bench2.slurm   | 36 ++++++++++++++++
 systems/sdcc-genoa/config-command | 16 +++++++
 systems/sdcc-genoa/sourceme.sh    |  4 ++
 7 files changed, 147 insertions(+), 54 deletions(-)
 create mode 100644 systems/sdcc-genoa/bench.slurm
 create mode 100644 systems/sdcc-genoa/bench2.slurm
 create mode 100644 systems/sdcc-genoa/config-command
 create mode 100644 systems/sdcc-genoa/sourceme.sh

diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index 28c3aa0a..b2a40e7b 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -245,12 +245,12 @@ inline void *acceleratorAllocDevice(size_t bytes)
 inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
 inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);};
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
-inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
-inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
+inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
+inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
+inline void acceleratorCopyToDeviceAsync(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
+inline void acceleratorCopyFromDeviceAsync(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
+inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch
 {
   cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
 }
@@ -359,12 +359,12 @@ inline int acceleratorEventIsComplete(acceleratorEvent_t ev)
   return (ev.get_info<sycl::info::event::command_execution_status>() == sycl::info::event_command_status::complete);
 }
 
-inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { return theCopyAccelerator->memcpy(to,from,bytes);}
-inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes)        { return theCopyAccelerator->memcpy(to,from,bytes); }
-inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes)      { return theCopyAccelerator->memcpy(to,from,bytes); }
+inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes)  { return theCopyAccelerator->memcpy(to,from,bytes);}
+inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(const void *from,void *to,size_t bytes)        { return theCopyAccelerator->memcpy(to,from,bytes); }
+inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(const void *from,void *to,size_t bytes)      { return theCopyAccelerator->memcpy(to,from,bytes); }
 
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
+inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
+inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}
 
 inline int  acceleratorIsCommunicable(void *ptr)
@@ -511,19 +511,19 @@ inline void *acceleratorAllocDevice(size_t bytes)
 inline void acceleratorFreeHost(void *ptr){ auto discard=hipFree(ptr);};
 inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
+inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
+inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
 
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}
 
-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
+inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch
 {
   auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream);
 }
-inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
+inline void acceleratorCopyToDeviceAsync(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
   auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream);
 }
-inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
+inline void acceleratorCopyFromDeviceAsync(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
   auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream);
 }
 inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); };
@@ -583,9 +583,9 @@ inline void acceleratorMem(void)
 
 accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
 
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes); }
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes);}
+inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes); }
+inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
+inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes);}
 inline void acceleratorCopySynchronise(void) {};
 
 inline int  acceleratorIsCommunicable(void *ptr){ return 1; }
@@ -668,7 +668,7 @@ accelerator_inline void acceleratorFence(void)
   return;
 }
 
-inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
+inline void acceleratorCopyDeviceToDevice(const void *from,void *to,size_t bytes)
 {
   acceleratorCopyDeviceToDeviceAsynch(from,to,bytes);
   acceleratorCopySynchronise();
diff --git a/Grid/threads/Threads.h b/Grid/threads/Threads.h
index 6887134d..cdb4fa62 100644
--- a/Grid/threads/Threads.h
+++ b/Grid/threads/Threads.h
@@ -73,9 +73,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define thread_critical                                     DO_PRAGMA(omp critical)
 
 #ifdef GRID_OMP
-inline void thread_bcopy(void *from, void *to,size_t bytes)
+inline void thread_bcopy(const void *from, void *to,size_t bytes)
 {
-  uint64_t *ufrom = (uint64_t *)from;
+  const uint64_t *ufrom = (const uint64_t *)from;
   uint64_t *uto   = (uint64_t *)to;
   assert(bytes%8==0);
   uint64_t words=bytes/8;
@@ -84,7 +84,7 @@ inline void thread_bcopy(void *from, void *to,size_t bytes)
   });
 }
 #else
-inline void thread_bcopy(void *from, void *to,size_t bytes)
+inline void thread_bcopy(const void *from, void *to,size_t bytes)
 {
   bcopy(from,to,bytes);
 }
diff --git a/benchmarks/Benchmark_usqcd.cc b/benchmarks/Benchmark_usqcd.cc
index e400138b..4b50121e 100644
--- a/benchmarks/Benchmark_usqcd.cc
+++ b/benchmarks/Benchmark_usqcd.cc
@@ -492,17 +492,18 @@ public:
 	}
 	FGrid->Barrier();
 	double t1=usecond();
-	uint64_t ncall = 500;
-
-	FGrid->Broadcast(0,&ncall,sizeof(ncall));
+	uint64_t no    = 50;
+	uint64_t ni    = 100;
 
 	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
 
 	time_statistics timestat;
-	std::vector<double> t_time(ncall);
-	for(uint64_t i=0;i<ncall;i++){
+	std::vector<double> t_time(no);
+	for(uint64_t i=0;i<no;i++){
 	  t0=usecond();
-	  Dw.DhopEO(src_o,r_e,DaggerNo);
+	  for(uint64_t j=0;j<ni;j++){
+	    Dw.DhopEO(src_o,r_e,DaggerNo);
+	  }
 	  t1=usecond();
 	  t_time[i] = t1-t0;
 	}
@@ -520,11 +521,11 @@ public:
 	double mf_hi, mf_lo, mf_err;
 
 	timestat.statistics(t_time);
-	mf_hi = flops/timestat.min;
-	mf_lo = flops/timestat.max;
+	mf_hi = flops/timestat.min*ni;
+	mf_lo = flops/timestat.max*ni;
 	mf_err= flops/timestat.min * timestat.err/timestat.mean;
 
-	mflops = flops/timestat.mean;
+	mflops = flops/timestat.mean*ni;
 	mflops_all.push_back(mflops);
 	if ( mflops_best == 0   ) mflops_best = mflops;
 	if ( mflops_worst== 0   ) mflops_worst= mflops;
@@ -535,6 +536,7 @@ public:
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call   "<< timestat.mean/ni<<std::endl;
 
       }
 
@@ -654,17 +656,19 @@ public:
 	}
 	FGrid->Barrier();
 	double t1=usecond();
-	uint64_t ncall = 500;
 
-	FGrid->Broadcast(0,&ncall,sizeof(ncall));
+	uint64_t no    = 50;
+	uint64_t ni    = 100;
 
 	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
 
 	time_statistics timestat;
-	std::vector<double> t_time(ncall);
-	for(uint64_t i=0;i<ncall;i++){
+	std::vector<double> t_time(no);
+	for(uint64_t i=0;i<no;i++){
 	  t0=usecond();
-	  Ds.DhopEO(src_o,r_e,DaggerNo);
+	  for(uint64_t j=0;j<ni;j++){
+	    Ds.DhopEO(src_o,r_e,DaggerNo);
+	  }
 	  t1=usecond();
 	  t_time[i] = t1-t0;
 	}
@@ -675,11 +679,11 @@ public:
 	double mf_hi, mf_lo, mf_err;
 	
 	timestat.statistics(t_time);
-	mf_hi = flops/timestat.min;
-	mf_lo = flops/timestat.max;
+	mf_hi = flops/timestat.min*ni;
+	mf_lo = flops/timestat.max*ni;
 	mf_err= flops/timestat.min * timestat.err/timestat.mean;
 
-	mflops = flops/timestat.mean;
+	mflops = flops/timestat.mean*ni;
 	mflops_all.push_back(mflops);
 	if ( mflops_best == 0   ) mflops_best = mflops;
 	if ( mflops_worst== 0   ) mflops_worst= mflops;
@@ -689,6 +693,7 @@ public:
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call   "<< timestat.mean/ni<<std::endl;
       
       }
 
@@ -792,19 +797,18 @@ public:
 	  Dc.M(src,r);
 	}
 	FGrid->Barrier();
-	double t1=usecond();
-	uint64_t ncall = 500;
-
-	FGrid->Broadcast(0,&ncall,sizeof(ncall));
+	uint64_t ni = 100;
+	uint64_t no = 50;
 
 	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
-
 	time_statistics timestat;
-	std::vector<double> t_time(ncall);
-	for(uint64_t i=0;i<ncall;i++){
-	  t0=usecond();
-	  Dc.M(src,r);
-	  t1=usecond();
+	std::vector<double> t_time(no);
+	for(uint64_t i=0;i<no;i++){
+	  double t0=usecond();
+	  for(uint64_t j=0;j<ni;j++){
+	    Dc.M(src,r);
+	  }
+	  double t1=usecond();
 	  t_time[i] = t1-t0;
 	}
 	FGrid->Barrier();
@@ -814,20 +818,21 @@ public:
 	double mf_hi, mf_lo, mf_err;
 	
 	timestat.statistics(t_time);
-	mf_hi = flops/timestat.min;
-	mf_lo = flops/timestat.max;
+	mf_hi = flops/timestat.min*ni;
+	mf_lo = flops/timestat.max*ni;
 	mf_err= flops/timestat.min * timestat.err/timestat.mean;
 
-	mflops = flops/timestat.mean;
+	mflops = flops/timestat.mean*ni;
 	mflops_all.push_back(mflops);
 	if ( mflops_best == 0   ) mflops_best = mflops;
 	if ( mflops_worst== 0   ) mflops_worst= mflops;
 	if ( mflops>mflops_best ) mflops_best = mflops;
 	if ( mflops<mflops_worst) mflops_worst= mflops;
 	
-	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<" "<<timestat.mean<<" us"<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per rank   "<< mflops/NP<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per node   "<< mflops/NN<<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov us per call   "<< timestat.mean/ni<<std::endl;
       
       }
 
@@ -872,7 +877,7 @@ int main (int argc, char ** argv)
   int do_dslash=1;
 
   int sel=4;
-  std::vector<int> L_list({8,12,16,24});
+  std::vector<int> L_list({8,12,16,24,32});
   int selm1=sel-1;
 
   std::vector<double> clover;
diff --git a/systems/sdcc-genoa/bench.slurm b/systems/sdcc-genoa/bench.slurm
new file mode 100644
index 00000000..2c7f6c32
--- /dev/null
+++ b/systems/sdcc-genoa/bench.slurm
@@ -0,0 +1,32 @@
+#!/bin/bash
+#SBATCH --partition lqcd
+#SBATCH --time=00:50:00
+#SBATCH -A lqcdtest
+#SBATCH -q lqcd
+#SBATCH --exclusive
+#SBATCH --nodes=1
+#SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=64
+#SBATCH --qos lqcd
+
+source sourceme.sh
+
+export PLACES=(1:16:4 1:32:2 0:64:1);
+export THR=(16 32 64)
+
+for t in  2 
+do
+   
+export OMP_NUM_THREADS=${THR[$t]}
+export OMP_PLACES=${PLACES[$t]}
+export thr=${THR[$t]}
+
+#for vol in 24.24.24.24 32.32.32.32 48.48.48.96
+for vol in 48.48.48.96
+do
+srun -N1 -n1 ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid $vol --dslash-asm --shm 8192 > $vol.1node.thr$thr
+done
+#srun -N1 -n1 ./benchmarks/Benchmark_usqcd --mpi 1.1.1.1 --grid $vol > usqcd.1node.thr$thr
+done
+
diff --git a/systems/sdcc-genoa/bench2.slurm b/systems/sdcc-genoa/bench2.slurm
new file mode 100644
index 00000000..be21c816
--- /dev/null
+++ b/systems/sdcc-genoa/bench2.slurm
@@ -0,0 +1,36 @@
+#!/bin/bash
+#SBATCH --partition lqcd
+#SBATCH --time=00:50:00
+#SBATCH -A lqcdtest
+#SBATCH -q lqcd
+#SBATCH --exclusive
+#SBATCH --nodes=2
+#SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054
+#SBATCH --ntasks=2
+#SBATCH --cpus-per-task=64
+#SBATCH --qos lqcd
+
+source sourceme.sh
+
+export PLACES=(1:16:4 1:32:2 0:64:1);
+export THR=(16 32 64)
+
+nodes=2
+mpi=1.1.1.2
+
+for t in 2 
+do
+   
+export OMP_NUM_THREADS=${THR[$t]}
+export OMP_PLACES=${PLACES[$t]}
+export thr=${THR[$t]}
+
+#srun -N$nodes -n$nodes ./benchmarks/Benchmark_usqcd --mpi $mpi --grid 32.32.32.32 > usqcd.n$nodes.thr$thr
+
+for vol in 64.64.64.128
+do
+srun -N$nodes -n$nodes ./benchmarks/Benchmark_dwf_fp32 --mpi $mpi --grid $vol --dslash-asm --comms-overlap --shm 8192 > $vol.n$nodes.overlap.thr$thr
+done
+
+done
+
diff --git a/systems/sdcc-genoa/config-command b/systems/sdcc-genoa/config-command
new file mode 100644
index 00000000..d992e1da
--- /dev/null
+++ b/systems/sdcc-genoa/config-command
@@ -0,0 +1,16 @@
+../../configure \
+--enable-comms=mpi-auto \
+--enable-unified=yes \
+--enable-shm=shmopen \
+--enable-shm-fast-path=shmopen \
+--enable-accelerator=none \
+--enable-simd=AVX512 \
+--disable-accelerator-cshift \
+--disable-fermion-reps \
+--disable-gparity \
+CXX=clang++ \
+MPICXX=mpicxx \
+CXXFLAGS="-std=c++17"
+
+
+
diff --git a/systems/sdcc-genoa/sourceme.sh b/systems/sdcc-genoa/sourceme.sh
new file mode 100644
index 00000000..4f37888c
--- /dev/null
+++ b/systems/sdcc-genoa/sourceme.sh
@@ -0,0 +1,4 @@
+source $HOME/spack/share/spack/setup-env.sh
+spack load llvm@17.0.4
+export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-17.0.4-laufdrcip63ivkadmtgoepwmj3dtztdu/lib:$LD_LIBRARY_PATH
+module load openmpi