From aee00bdfb5e1bfd01a73d06a612afa3e258f5e50 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 18 May 2026 16:21:50 -0400 Subject: [PATCH] sumD_gpu_direct: one thread per SIMD lane using extractLane MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces one thread per outer site calling Reduce() (sequential Nsimd-wide loop) with one thread per lane calling extractLane() — O(1) per thread. CUB now reduces over osites*Nsimd elements. Avoids serial lane reduction but leaves the per-lane sobjD store stride as a known remaining concern. Co-Authored-By: Claude Sonnet 4.6 --- Grid/lattice/Lattice_reduction_gpu_cub.h | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/Grid/lattice/Lattice_reduction_gpu_cub.h b/Grid/lattice/Lattice_reduction_gpu_cub.h index 6a732de4..afea7571 100644 --- a/Grid/lattice/Lattice_reduction_gpu_cub.h +++ b/Grid/lattice/Lattice_reduction_gpu_cub.h @@ -63,16 +63,21 @@ inline typename vobj::scalar_objectD sumD_gpu_direct(const vobj *lat, Integer os typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_objectD sobjD; - deviceVector per_site(osites); - sobjD *per_site_p = &per_site[0]; + const Integer nsimd = vobj::Nsimd(); + const Integer nlanes = osites * nsimd; + + deviceVector per_lane(nlanes); + sobjD *per_lane_p = &per_lane[0]; #ifdef GRID_REDUCTION_TIMING RealD t_for = -usecond(); #endif - accelerator_for(ss, osites, 1, { - sobj tmp = Reduce(lat[ss]); + accelerator_for(idx, nlanes, 1, { + Integer ss = idx / nsimd; + Integer lane = idx % nsimd; + sobj tmp = extractLane(lane, lat[ss]); sobjD tmpD; tmpD = tmp; - per_site_p[ss] = tmpD; + per_lane_p[idx] = tmpD; }); #ifdef GRID_REDUCTION_TIMING accelerator_barrier(); @@ -85,8 +90,8 @@ inline typename vobj::scalar_objectD sumD_gpu_direct(const vobj *lat, Integer os size_t temp_bytes = 0; gpuError_t gpuErr; - gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, per_site_p, d_out, - (int)osites, gpucub::Sum(), zero, computeStream); + gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, per_lane_p, d_out, + (int)nlanes, gpucub::Sum(), zero, computeStream); if (gpuErr != gpuSuccess) { std::cout << GridLogError << "sumD_gpu_direct: DeviceReduce size query failed: " << gpuErr << std::endl; @@ -98,8 +103,8 @@ inline typename vobj::scalar_objectD sumD_gpu_direct(const vobj *lat, Integer os #ifdef GRID_REDUCTION_TIMING RealD t_cub = -usecond(); #endif - gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, per_site_p, d_out, - (int)osites, gpucub::Sum(), zero, computeStream); + gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, per_lane_p, d_out, + (int)nlanes, gpucub::Sum(), zero, computeStream); if (gpuErr != gpuSuccess) { std::cout << GridLogError << "sumD_gpu_direct: DeviceReduce failed: " << gpuErr << std::endl;