From aee00bdfb5e1bfd01a73d06a612afa3e258f5e50 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Mon, 18 May 2026 16:21:50 -0400
Subject: [PATCH] sumD_gpu_direct: one thread per SIMD lane using extractLane
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replaces one thread per outer site calling Reduce() (sequential Nsimd-wide
loop) with one thread per lane calling extractLane() — O(1) per thread.
CUB now reduces over osites*Nsimd elements. Avoids serial lane reduction
but leaves the per-lane sobjD store stride as a known remaining concern.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 Grid/lattice/Lattice_reduction_gpu_cub.h | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)
diff --git a/Grid/lattice/Lattice_reduction_gpu_cub.h b/Grid/lattice/Lattice_reduction_gpu_cub.h
index 6a732de4..afea7571 100644
--- a/Grid/lattice/Lattice_reduction_gpu_cub.h
+++ b/Grid/lattice/Lattice_reduction_gpu_cub.h
@@ -63,16 +63,21 @@ inline typename vobj::scalar_objectD sumD_gpu_direct(const vobj *lat, Integer os
   typedef typename vobj::scalar_object  sobj;
   typedef typename vobj::scalar_objectD sobjD;
 
-  deviceVector<sobjD> per_site(osites);
-  sobjD *per_site_p = &per_site[0];
+  const Integer nsimd    = vobj::Nsimd();
+  const Integer nlanes   = osites * nsimd;
+
+  deviceVector<sobjD> per_lane(nlanes);
+  sobjD *per_lane_p = &per_lane[0];
 
 #ifdef GRID_REDUCTION_TIMING
   RealD t_for = -usecond();
 #endif
-  accelerator_for(ss, osites, 1, {
-    sobj tmp = Reduce(lat[ss]);
+  accelerator_for(idx, nlanes, 1, {
+    Integer ss   = idx / nsimd;
+    Integer lane = idx % nsimd;
+    sobj tmp = extractLane(lane, lat[ss]);
     sobjD tmpD; tmpD = tmp;
-    per_site_p[ss] = tmpD;
+    per_lane_p[idx] = tmpD;
   });
 #ifdef GRID_REDUCTION_TIMING
   accelerator_barrier();
@@ -85,8 +90,8 @@ inline typename vobj::scalar_objectD sumD_gpu_direct(const vobj *lat, Integer os
   size_t temp_bytes = 0;
 
   gpuError_t gpuErr;
-  gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, per_site_p, d_out,
-                                        (int)osites, gpucub::Sum(), zero, computeStream);
+  gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, per_lane_p, d_out,
+                                        (int)nlanes, gpucub::Sum(), zero, computeStream);
   if (gpuErr != gpuSuccess) {
     std::cout << GridLogError << "sumD_gpu_direct: DeviceReduce size query failed: "
               << gpuErr << std::endl;
@@ -98,8 +103,8 @@ inline typename vobj::scalar_objectD sumD_gpu_direct(const vobj *lat, Integer os
 #ifdef GRID_REDUCTION_TIMING
   RealD t_cub = -usecond();
 #endif
-  gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, per_site_p, d_out,
-                                        (int)osites, gpucub::Sum(), zero, computeStream);
+  gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, per_lane_p, d_out,
+                                        (int)nlanes, gpucub::Sum(), zero, computeStream);
   if (gpuErr != gpuSuccess) {
     std::cout << GridLogError << "sumD_gpu_direct: DeviceReduce failed: "
               << gpuErr << std::endl;