From 26c3c7d8f9f8c30ed6b9e97ac57a3171847f2952 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Mon, 18 May 2026 21:56:45 -0400
Subject: [PATCH] sumD_gpu_large: radix-12 word-bundle reduction replacing
 radix-1

Replace the word-by-word loop (one kernel launch per scalar word) with
sumD_gpu_reduce_words<R> which packs R consecutive vector_type words per
site into iVector<iScalar<vector>,R>, then calls the existing sumD_gpu_small
shared-memory kernel once for the whole bundle.

Dispatch: radix-12 first, radix-4 for the remainder < 12, radix-1 for
any final < 4 words.  For LatticePropagator (144 words = 12x12), this
reduces the kernel-launch count from 144 to 12 -- a 12x reduction.

Bundle::Nsimd() inherits from vector_type so sumD_gpu_small handles SIMD
lane extraction and double-precision promotion identically to the scalar
word case.  sizeof(Bundle::scalar_objectD) = R*16 <= 192 B; well within
sharedMemPerBlock on all supported devices.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 Grid/lattice/Lattice_reduction_gpu.h | 58 ++++++++++++++++++++--------
 1 file changed, 41 insertions(+), 17 deletions(-)
diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h
index 849f5309..96056671 100644
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -223,29 +223,53 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
   return result;
 }
 
+// Pack R consecutive vector_type words of lat[0..osites-1] starting at word
+// 'base' into a Bundle = iVector<iScalar<vector>,R> per site, then reduce
+// with sumD_gpu_small.  Bundle::Nsimd() == vector::Nsimd(), so the existing
+// shared-memory kernel handles SIMD-lane extraction and double-promotion
+// correctly.  sizeof(Bundle::scalar_objectD) = R*sizeof(scalarD) <= 192 B
+// for R<=12, safely within sharedMemPerBlock on all supported devices.
+template<int R, class vobj>
+inline void sumD_gpu_reduce_words(const vobj *lat, Integer osites,
+                                   typename vobj::scalar_typeD *ret_p, int base)
+{
+  typedef typename vobj::vector_type vector;
+  using Bundle = iVector<iScalar<vector>, R>;
+
+  const int words = sizeof(vobj) / sizeof(vector);
+  iScalar<vector> *idat = (iScalar<vector> *)lat;
+
+  deviceVector<Bundle> buf(osites);
+  Bundle *buf_p = &buf[0];
+
+  accelerator_for(ss, osites, 1, {
+    Bundle b;
+    for (int k = 0; k < R; k++)
+      b._internal[k] = idat[ss * words + base + k];
+    buf_p[ss] = b;
+  });
+
+  auto sum_bundle = sumD_gpu_small(buf_p, osites);
+  for (int k = 0; k < R; k++)
+    ret_p[base + k] = TensorRemove(sum_bundle._internal[k]);
+}
+
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
 {
-  typedef typename vobj::vector_type  vector;
-  typedef typename vobj::scalar_typeD scalarD;
-  typedef typename vobj::scalar_objectD sobj;
-  sobj ret;
+  typedef typename vobj::vector_type    vector;
+  typedef typename vobj::scalar_typeD   scalarD;
+  typedef typename vobj::scalar_objectD sobjD;
+
+  const int words = sizeof(vobj) / sizeof(vector);
+  sobjD ret; zeroit(ret);
   scalarD *ret_p = (scalarD *)&ret;
-  
-  const int words = sizeof(vobj)/sizeof(vector);
 
-  deviceVector<vector> buffer(osites);
-  vector *dat = (vector *)lat;
-  vector *buf = &buffer[0];
-  iScalar<vector> *tbuf =(iScalar<vector> *)  &buffer[0];
-  for(int w=0;w<words;w++) {
+  int w = 0;
+  while (w + 12 <= words) { sumD_gpu_reduce_words<12>(lat, osites, ret_p, w); w += 12; }
+  while (w +  4 <= words) { sumD_gpu_reduce_words< 4>(lat, osites, ret_p, w); w +=  4; }
+  while (w      <  words) { sumD_gpu_reduce_words< 1>(lat, osites, ret_p, w); w +=  1; }
 
-    accelerator_for(ss,osites,1,{
-	buf[ss] = dat[ss*words+w];
-      });
-      
-    ret_p[w] = sumD_gpu_small(tbuf,osites);
-  }
   return ret;
 }