From d358954a84ff6d3da4291136d24b3eb191ce16cf Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 18 May 2026 21:08:10 -0400 Subject: [PATCH] sumD_gpu_direct: shared-memory lane reduction with acceleratorThreads(1) Set acceleratorThreads to 1 before the extraction kernel so that dim3(nsimd,1,1) blocks give exactly one site group per block and __shared__ sobjD smem[nsimd] is correctly sized without depending on the runtime acceleratorThreads() value. threadIdx.x (acceleratorSIMTlane) indexes the SIMD lane for coalesced reads; lane 0 sums smem[0..nsimd-1] and writes one sobjD per site. CUB then reduces osites elements instead of osites*nsimd, reducing both store traffic and CUB work by Nsimd. acceleratorSynchronise() (warp-level) suffices since nsimd < warpSize. Co-Authored-By: Claude Sonnet 4.6 --- Grid/lattice/Lattice_reduction_gpu_cub.h | 40 +++++++++++++++--------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/Grid/lattice/Lattice_reduction_gpu_cub.h b/Grid/lattice/Lattice_reduction_gpu_cub.h index afea7571..274371f1 100644 --- a/Grid/lattice/Lattice_reduction_gpu_cub.h +++ b/Grid/lattice/Lattice_reduction_gpu_cub.h @@ -63,24 +63,36 @@ inline typename vobj::scalar_objectD sumD_gpu_direct(const vobj *lat, Integer os typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_objectD sobjD; - const Integer nsimd = vobj::Nsimd(); - const Integer nlanes = osites * nsimd; + constexpr Integer nsimd = vobj::Nsimd(); - deviceVector per_lane(nlanes); - sobjD *per_lane_p = &per_lane[0]; + deviceVector per_site(osites); + sobjD *per_site_p = &per_site[0]; + + // Set acceleratorThreads=1 so the block is dim3(nsimd,1,1): one site per block. + // smem[nsimd] is then exactly sized; no other site group competes for the same slots. + // Restore after the (synchronous) accelerator_for. + uint32_t saved_threads = acceleratorThreads(); + acceleratorThreads(1); #ifdef GRID_REDUCTION_TIMING RealD t_for = -usecond(); #endif - accelerator_for(idx, nlanes, 1, { - Integer ss = idx / nsimd; - Integer lane = idx % nsimd; - sobj tmp = extractLane(lane, lat[ss]); + accelerator_for(ss, osites, nsimd, { + int lane = acceleratorSIMTlane(nsimd); + __shared__ sobjD smem[nsimd]; + sobj tmp = extractLane(lane, lat[ss]); sobjD tmpD; tmpD = tmp; - per_lane_p[idx] = tmpD; + smem[lane] = tmpD; + acceleratorSynchronise(); + if (lane == 0) { + sobjD acc = smem[0]; + for (int l = 1; l < nsimd; l++) acc = acc + smem[l]; + per_site_p[ss] = acc; + } }); + // accelerator_for already issued accelerator_barrier(); restore thread count now. + acceleratorThreads(saved_threads); #ifdef GRID_REDUCTION_TIMING - accelerator_barrier(); t_for += usecond(); #endif @@ -90,8 +102,8 @@ inline typename vobj::scalar_objectD sumD_gpu_direct(const vobj *lat, Integer os size_t temp_bytes = 0; gpuError_t gpuErr; - gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, per_lane_p, d_out, - (int)nlanes, gpucub::Sum(), zero, computeStream); + gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, per_site_p, d_out, + (int)osites, gpucub::Sum(), zero, computeStream); if (gpuErr != gpuSuccess) { std::cout << GridLogError << "sumD_gpu_direct: DeviceReduce size query failed: " << gpuErr << std::endl; @@ -103,8 +115,8 @@ inline typename vobj::scalar_objectD sumD_gpu_direct(const vobj *lat, Integer os #ifdef GRID_REDUCTION_TIMING RealD t_cub = -usecond(); #endif - gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, per_lane_p, d_out, - (int)nlanes, gpucub::Sum(), zero, computeStream); + gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, per_site_p, d_out, + (int)osites, gpucub::Sum(), zero, computeStream); if (gpuErr != gpuSuccess) { std::cout << GridLogError << "sumD_gpu_direct: DeviceReduce failed: " << gpuErr << std::endl;