1
0
mirror of https://github.com/paboyle/Grid.git synced 2026-05-28 21:14:16 +01:00

sumD_gpu_direct: shared-memory lane reduction with acceleratorThreads(1)

Set acceleratorThreads to 1 before the extraction kernel so that
dim3(nsimd,1,1) blocks give exactly one site group per block and
__shared__ sobjD smem[nsimd] is correctly sized without depending on
the runtime acceleratorThreads() value. threadIdx.x (acceleratorSIMTlane)
indexes the SIMD lane for coalesced reads; lane 0 sums smem[0..nsimd-1]
and writes one sobjD per site. CUB then reduces osites elements instead
of osites*nsimd, reducing both store traffic and CUB work by Nsimd.
acceleratorSynchronise() (warp-level) suffices since nsimd < warpSize.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Peter Boyle
2026-05-18 21:08:10 -04:00
parent 747c167658
commit 843d6497b2
+26 -14
View File
@@ -63,24 +63,36 @@ inline typename vobj::scalar_objectD sumD_gpu_direct(const vobj *lat, Integer os
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_objectD sobjD;
const Integer nsimd = vobj::Nsimd();
const Integer nlanes = osites * nsimd;
constexpr Integer nsimd = vobj::Nsimd();
deviceVector<sobjD> per_lane(nlanes);
sobjD *per_lane_p = &per_lane[0];
deviceVector<sobjD> per_site(osites);
sobjD *per_site_p = &per_site[0];
// Set acceleratorThreads=1 so the block is dim3(nsimd,1,1): one site per block.
// smem[nsimd] is then exactly sized; no other site group competes for the same slots.
// Restore after the (synchronous) accelerator_for.
uint32_t saved_threads = acceleratorThreads();
acceleratorThreads(1);
#ifdef GRID_REDUCTION_TIMING
RealD t_for = -usecond();
#endif
accelerator_for(idx, nlanes, 1, {
Integer ss = idx / nsimd;
Integer lane = idx % nsimd;
sobj tmp = extractLane(lane, lat[ss]);
accelerator_for(ss, osites, nsimd, {
int lane = acceleratorSIMTlane(nsimd);
__shared__ sobjD smem[nsimd];
sobj tmp = extractLane(lane, lat[ss]);
sobjD tmpD; tmpD = tmp;
per_lane_p[idx] = tmpD;
smem[lane] = tmpD;
acceleratorSynchronise();
if (lane == 0) {
sobjD acc = smem[0];
for (int l = 1; l < nsimd; l++) acc = acc + smem[l];
per_site_p[ss] = acc;
}
});
// accelerator_for already issued accelerator_barrier(); restore thread count now.
acceleratorThreads(saved_threads);
#ifdef GRID_REDUCTION_TIMING
accelerator_barrier();
t_for += usecond();
#endif
@@ -90,8 +102,8 @@ inline typename vobj::scalar_objectD sumD_gpu_direct(const vobj *lat, Integer os
size_t temp_bytes = 0;
gpuError_t gpuErr;
gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, per_lane_p, d_out,
(int)nlanes, gpucub::Sum(), zero, computeStream);
gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, per_site_p, d_out,
(int)osites, gpucub::Sum(), zero, computeStream);
if (gpuErr != gpuSuccess) {
std::cout << GridLogError << "sumD_gpu_direct: DeviceReduce size query failed: "
<< gpuErr << std::endl;
@@ -103,8 +115,8 @@ inline typename vobj::scalar_objectD sumD_gpu_direct(const vobj *lat, Integer os
#ifdef GRID_REDUCTION_TIMING
RealD t_cub = -usecond();
#endif
gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, per_lane_p, d_out,
(int)nlanes, gpucub::Sum(), zero, computeStream);
gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, per_site_p, d_out,
(int)osites, gpucub::Sum(), zero, computeStream);
if (gpuErr != gpuSuccess) {
std::cout << GridLogError << "sumD_gpu_direct: DeviceReduce failed: "
<< gpuErr << std::endl;