From 26c3c7d8f9f8c30ed6b9e97ac57a3171847f2952 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 18 May 2026 21:56:45 -0400 Subject: [PATCH] sumD_gpu_large: radix-12 word-bundle reduction replacing radix-1 Replace the word-by-word loop (one kernel launch per scalar word) with sumD_gpu_reduce_words which packs R consecutive vector_type words per site into iVector,R>, then calls the existing sumD_gpu_small shared-memory kernel once for the whole bundle. Dispatch: radix-12 first, radix-4 for the remainder < 12, radix-1 for any final < 4 words. For LatticePropagator (144 words = 12x12), this reduces the kernel-launch count from 144 to 12 -- a 12x reduction. Bundle::Nsimd() inherits from vector_type so sumD_gpu_small handles SIMD lane extraction and double-precision promotion identically to the scalar word case. sizeof(Bundle::scalar_objectD) = R*16 <= 192 B; well within sharedMemPerBlock on all supported devices. Co-Authored-By: Claude Sonnet 4.6 --- Grid/lattice/Lattice_reduction_gpu.h | 58 ++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 17 deletions(-) diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h index 849f5309..96056671 100644 --- a/Grid/lattice/Lattice_reduction_gpu.h +++ b/Grid/lattice/Lattice_reduction_gpu.h @@ -223,29 +223,53 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi return result; } +// Pack R consecutive vector_type words of lat[0..osites-1] starting at word +// 'base' into a Bundle = iVector,R> per site, then reduce +// with sumD_gpu_small. Bundle::Nsimd() == vector::Nsimd(), so the existing +// shared-memory kernel handles SIMD-lane extraction and double-promotion +// correctly. sizeof(Bundle::scalar_objectD) = R*sizeof(scalarD) <= 192 B +// for R<=12, safely within sharedMemPerBlock on all supported devices. +template +inline void sumD_gpu_reduce_words(const vobj *lat, Integer osites, + typename vobj::scalar_typeD *ret_p, int base) +{ + typedef typename vobj::vector_type vector; + using Bundle = iVector, R>; + + const int words = sizeof(vobj) / sizeof(vector); + iScalar *idat = (iScalar *)lat; + + deviceVector buf(osites); + Bundle *buf_p = &buf[0]; + + accelerator_for(ss, osites, 1, { + Bundle b; + for (int k = 0; k < R; k++) + b._internal[k] = idat[ss * words + base + k]; + buf_p[ss] = b; + }); + + auto sum_bundle = sumD_gpu_small(buf_p, osites); + for (int k = 0; k < R; k++) + ret_p[base + k] = TensorRemove(sum_bundle._internal[k]); +} + template inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites) { - typedef typename vobj::vector_type vector; - typedef typename vobj::scalar_typeD scalarD; - typedef typename vobj::scalar_objectD sobj; - sobj ret; + typedef typename vobj::vector_type vector; + typedef typename vobj::scalar_typeD scalarD; + typedef typename vobj::scalar_objectD sobjD; + + const int words = sizeof(vobj) / sizeof(vector); + sobjD ret; zeroit(ret); scalarD *ret_p = (scalarD *)&ret; - - const int words = sizeof(vobj)/sizeof(vector); - deviceVector buffer(osites); - vector *dat = (vector *)lat; - vector *buf = &buffer[0]; - iScalar *tbuf =(iScalar *) &buffer[0]; - for(int w=0;w(lat, osites, ret_p, w); w += 12; } + while (w + 4 <= words) { sumD_gpu_reduce_words< 4>(lat, osites, ret_p, w); w += 4; } + while (w < words) { sumD_gpu_reduce_words< 1>(lat, osites, ret_p, w); w += 1; } - accelerator_for(ss,osites,1,{ - buf[ss] = dat[ss*words+w]; - }); - - ret_p[w] = sumD_gpu_small(tbuf,osites); - } return ret; }