From d358954a84ff6d3da4291136d24b3eb191ce16cf Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Mon, 18 May 2026 21:08:10 -0400
Subject: [PATCH] sumD_gpu_direct: shared-memory lane reduction with
 acceleratorThreads(1)

Set acceleratorThreads to 1 before the extraction kernel so that
dim3(nsimd,1,1) blocks give exactly one site group per block and
__shared__ sobjD smem[nsimd] is correctly sized without depending on
the runtime acceleratorThreads() value. threadIdx.x (acceleratorSIMTlane)
indexes the SIMD lane for coalesced reads; lane 0 sums smem[0..nsimd-1]
and writes one sobjD per site. CUB then reduces osites elements instead
of osites*nsimd, reducing both store traffic and CUB work by Nsimd.
acceleratorSynchronise() (warp-level) suffices since nsimd < warpSize.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 Grid/lattice/Lattice_reduction_gpu_cub.h | 40 +++++++++++++++---------
 1 file changed, 26 insertions(+), 14 deletions(-)
diff --git a/Grid/lattice/Lattice_reduction_gpu_cub.h b/Grid/lattice/Lattice_reduction_gpu_cub.h
index afea7571..274371f1 100644
--- a/Grid/lattice/Lattice_reduction_gpu_cub.h
+++ b/Grid/lattice/Lattice_reduction_gpu_cub.h
@@ -63,24 +63,36 @@ inline typename vobj::scalar_objectD sumD_gpu_direct(const vobj *lat, Integer os
   typedef typename vobj::scalar_object  sobj;
   typedef typename vobj::scalar_objectD sobjD;
 
-  const Integer nsimd    = vobj::Nsimd();
-  const Integer nlanes   = osites * nsimd;
+  constexpr Integer nsimd = vobj::Nsimd();
 
-  deviceVector<sobjD> per_lane(nlanes);
-  sobjD *per_lane_p = &per_lane[0];
+  deviceVector<sobjD> per_site(osites);
+  sobjD *per_site_p = &per_site[0];
+
+  // Set acceleratorThreads=1 so the block is dim3(nsimd,1,1): one site per block.
+  // smem[nsimd] is then exactly sized; no other site group competes for the same slots.
+  // Restore after the (synchronous) accelerator_for.
+  uint32_t saved_threads = acceleratorThreads();
+  acceleratorThreads(1);
 
 #ifdef GRID_REDUCTION_TIMING
   RealD t_for = -usecond();
 #endif
-  accelerator_for(idx, nlanes, 1, {
-    Integer ss   = idx / nsimd;
-    Integer lane = idx % nsimd;
-    sobj tmp = extractLane(lane, lat[ss]);
+  accelerator_for(ss, osites, nsimd, {
+    int lane = acceleratorSIMTlane(nsimd);
+    __shared__ sobjD smem[nsimd];
+    sobj tmp  = extractLane(lane, lat[ss]);
     sobjD tmpD; tmpD = tmp;
-    per_lane_p[idx] = tmpD;
+    smem[lane] = tmpD;
+    acceleratorSynchronise();
+    if (lane == 0) {
+      sobjD acc = smem[0];
+      for (int l = 1; l < nsimd; l++) acc = acc + smem[l];
+      per_site_p[ss] = acc;
+    }
   });
+  // accelerator_for already issued accelerator_barrier(); restore thread count now.
+  acceleratorThreads(saved_threads);
 #ifdef GRID_REDUCTION_TIMING
-  accelerator_barrier();
   t_for += usecond();
 #endif
 
@@ -90,8 +102,8 @@ inline typename vobj::scalar_objectD sumD_gpu_direct(const vobj *lat, Integer os
   size_t temp_bytes = 0;
 
   gpuError_t gpuErr;
-  gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, per_lane_p, d_out,
-                                        (int)nlanes, gpucub::Sum(), zero, computeStream);
+  gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, per_site_p, d_out,
+                                        (int)osites, gpucub::Sum(), zero, computeStream);
   if (gpuErr != gpuSuccess) {
     std::cout << GridLogError << "sumD_gpu_direct: DeviceReduce size query failed: "
               << gpuErr << std::endl;
@@ -103,8 +115,8 @@ inline typename vobj::scalar_objectD sumD_gpu_direct(const vobj *lat, Integer os
 #ifdef GRID_REDUCTION_TIMING
   RealD t_cub = -usecond();
 #endif
-  gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, per_lane_p, d_out,
-                                        (int)nlanes, gpucub::Sum(), zero, computeStream);
+  gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, per_site_p, d_out,
+                                        (int)osites, gpucub::Sum(), zero, computeStream);
   if (gpuErr != gpuSuccess) {
     std::cout << GridLogError << "sumD_gpu_direct: DeviceReduce failed: "
               << gpuErr << std::endl;