From a31af313289dcbef91af43820517e8ad89bc78c3 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Mon, 18 May 2026 22:13:30 -0400
Subject: [PATCH] Lattice_reduction_gpu: add GRID_REDUCTION_TIMING
 instrumentation

Uncomment #define GRID_REDUCTION_TIMING to enable per-phase timing output:

  sumD_gpu_reduce_words: pack time (accelerator_for) per R and base
  sumD_gpu_small:        reduceKernel+barrier time and D2H time separately
  sumD_gpu_large:        total wall time across all word groups

This lets us identify whether the large-type bottleneck is in the pack
kernel, the shared-memory reduction kernel, the barrier, or the D2H.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 Grid/lattice/Lattice_reduction_gpu.h | 44 ++++++++++++++++++++++++----
 1 file changed, 39 insertions(+), 5 deletions(-)
diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h
index 96056671..84ef0a1a 100644
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -197,12 +197,16 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Possibly promote to double and sum
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Uncomment to print per-phase timing for every sumD_gpu_small and sumD_gpu_large call.
+// #define GRID_REDUCTION_TIMING
+
 template <class vobj>
-inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites) 
+inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
 {
   typedef typename vobj::scalar_objectD sobj;
   typedef decltype(lat) Iterator;
-  
+
   Integer nsimd= vobj::Nsimd();
   Integer size = osites*nsimd;
 
@@ -211,15 +215,28 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
   GRID_ASSERT(ok);
 
   Integer smemSize = numThreads * sizeof(sobj);
-  // Move out of UVM
-  // Turns out I had messed up the synchronise after move to compute stream
-  // as running this on the default stream fools the synchronise
   deviceVector<sobj> buffer(numBlocks);
   sobj *buffer_v = &buffer[0];
   sobj result;
+
+#ifdef GRID_REDUCTION_TIMING
+  RealD t_kernel = -usecond();
+#endif
   reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
   accelerator_barrier();
+#ifdef GRID_REDUCTION_TIMING
+  t_kernel += usecond();
+  RealD t_d2h = -usecond();
+#endif
   acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
+#ifdef GRID_REDUCTION_TIMING
+  t_d2h += usecond();
+  std::cout << GridLogMessage << "  sumD_gpu_small"
+            << " sizeof(sobj)=" << sizeof(sobj)
+            << " blocks=" << numBlocks << " threads=" << numThreads
+            << " kernel+barrier=" << t_kernel << " us"
+            << " D2H=" << t_d2h << " us" << std::endl;
+#endif
   return result;
 }
 
@@ -242,12 +259,20 @@ inline void sumD_gpu_reduce_words(const vobj *lat, Integer osites,
   deviceVector<Bundle> buf(osites);
   Bundle *buf_p = &buf[0];
 
+#ifdef GRID_REDUCTION_TIMING
+  RealD t_pack = -usecond();
+#endif
   accelerator_for(ss, osites, 1, {
     Bundle b;
     for (int k = 0; k < R; k++)
       b._internal[k] = idat[ss * words + base + k];
     buf_p[ss] = b;
   });
+#ifdef GRID_REDUCTION_TIMING
+  t_pack += usecond();
+  std::cout << GridLogMessage << " sumD_gpu_reduce_words R=" << R
+            << " base=" << base << " pack=" << t_pack << " us" << std::endl;
+#endif
 
   auto sum_bundle = sumD_gpu_small(buf_p, osites);
   for (int k = 0; k < R; k++)
@@ -265,10 +290,19 @@ inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osi
   sobjD ret; zeroit(ret);
   scalarD *ret_p = (scalarD *)&ret;
 
+#ifdef GRID_REDUCTION_TIMING
+  RealD t_large = -usecond();
+#endif
   int w = 0;
   while (w + 12 <= words) { sumD_gpu_reduce_words<12>(lat, osites, ret_p, w); w += 12; }
   while (w +  4 <= words) { sumD_gpu_reduce_words< 4>(lat, osites, ret_p, w); w +=  4; }
   while (w      <  words) { sumD_gpu_reduce_words< 1>(lat, osites, ret_p, w); w +=  1; }
+#ifdef GRID_REDUCTION_TIMING
+  t_large += usecond();
+  std::cout << GridLogMessage << "sumD_gpu_large"
+            << " sizeof(sobjD)=" << sizeof(sobjD)
+            << " words=" << words << " total=" << t_large << " us" << std::endl;
+#endif
 
   return ret;
 }