mirror of
https://github.com/paboyle/Grid.git
synced 2026-06-21 11:18:15 +01:00
Reinstate large/small dispatch in CUB reduction path; radix-4 word-bundle for large types
rocPRIM's DeviceReduce requires warpSize(64) threads each holding one element in shared memory, so sizeof(T)*64 must fit in sharedMemPerBlock. LatticePropagator::scalar_objectD is 2304 bytes (64*2304 = 147 KB), exceeding the budget and triggering a compile-time static_assert in limit_block_size. Introduce sumD_gpu_direct (the original direct-CUB path, safe for small types) and a new sumD_gpu_large that groups the vobj's vector_type words in bundles of 4, reducing each bundle as WordBundle4<scalarD> (64 bytes, 64*64 = 4 KB — always within budget). If words % 4 != 0, the final partial bundle is zero-padded. sumD_gpu dispatches at compile time via if constexpr on sizeof(sobjD) > 512. For LatticePropagator (144 words) this gives 36 CUB launches instead of 144. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -36,21 +36,47 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Unified lattice reduction using CUB (CUDA/HIP) and sycl::reduction (SYCL).
|
// Unified lattice reduction using CUB (CUDA/HIP) and sycl::reduction (SYCL).
|
||||||
//
|
//
|
||||||
// Strategy: one accelerator_for pass per site to extract SIMD lanes and promote to sobjD,
|
// CUDA/HIP: one accelerator_for pass per site to extract SIMD lanes and promote to sobjD,
|
||||||
// then a single library reduce over the sobjD array. No small/large split is needed:
|
// then CUB/hipCUB DeviceReduce::Reduce over the resulting array.
|
||||||
// CUB DeviceReduce::Reduce and sycl::reduction both handle arbitrary object sizes by
|
//
|
||||||
// tuning block occupancy internally.
|
// rocPRIM's DeviceReduce requires warpSize(64) threads per block, each holding one element
|
||||||
|
// in shared memory: sizeof(T)*64 must fit in sharedMemPerBlock. Large QCD objects such as
|
||||||
|
// LatticePropagator (sobjD = 2304 bytes, 64*2304 = 147 KB) exceed this budget.
|
||||||
|
//
|
||||||
|
// For those types sumD_gpu_large groups the vobj's vector_type words in bundles of 4,
|
||||||
|
// reducing each bundle as a WordBundle4<scalarD> (64 bytes, 64*64 = 4 KB — always safe).
|
||||||
|
// Words that do not fill a complete bundle are zero-padded.
|
||||||
|
//
|
||||||
|
// SYCL: sycl::reduction handles any type size through the runtime, so one path suffices.
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
#if defined(GRID_CUDA) || defined(GRID_HIP)
|
#if defined(GRID_CUDA) || defined(GRID_HIP)
|
||||||
|
|
||||||
|
// Bundles 4 scalar_typeD values for the radix-4 large-type reduction path.
|
||||||
|
// sizeof = 4 * sizeof(scalarD) <= 64 bytes; 64 * 64 = 4096 bytes, safely within
|
||||||
|
// rocPRIM's shared-memory budget on all supported devices.
|
||||||
|
template<class scalarD>
|
||||||
|
struct WordBundle4 {
|
||||||
|
scalarD w[4];
|
||||||
|
accelerator_inline WordBundle4 operator+(const WordBundle4 &rhs) const {
|
||||||
|
WordBundle4 r;
|
||||||
|
r.w[0] = w[0] + rhs.w[0];
|
||||||
|
r.w[1] = w[1] + rhs.w[1];
|
||||||
|
r.w[2] = w[2] + rhs.w[2];
|
||||||
|
r.w[3] = w[3] + rhs.w[3];
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Direct CUB reduction on the full scalar_objectD.
|
||||||
|
// Only safe when sizeof(sobjD)*64 <= device sharedMemPerBlock.
|
||||||
|
// Do not call directly for large composite types (e.g. LatticePropagator).
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
|
inline typename vobj::scalar_objectD sumD_gpu_direct(const vobj *lat, Integer osites)
|
||||||
{
|
{
|
||||||
typedef typename vobj::scalar_object sobj;
|
typedef typename vobj::scalar_object sobj;
|
||||||
typedef typename vobj::scalar_objectD sobjD;
|
typedef typename vobj::scalar_objectD sobjD;
|
||||||
|
|
||||||
// Per-site: sum SIMD lanes (Reduce) and promote to double precision.
|
|
||||||
deviceVector<sobjD> per_site(osites);
|
deviceVector<sobjD> per_site(osites);
|
||||||
sobjD *per_site_p = &per_site[0];
|
sobjD *per_site_p = &per_site[0];
|
||||||
|
|
||||||
@@ -60,7 +86,6 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
|
|||||||
per_site_p[ss] = tmpD;
|
per_site_p[ss] = tmpD;
|
||||||
});
|
});
|
||||||
|
|
||||||
// CUB global reduction over the sobjD array.
|
|
||||||
sobjD zero; zeroit(zero);
|
sobjD zero; zeroit(zero);
|
||||||
sobjD *d_out = static_cast<sobjD *>(acceleratorAllocDevice(sizeof(sobjD)));
|
sobjD *d_out = static_cast<sobjD *>(acceleratorAllocDevice(sizeof(sobjD)));
|
||||||
void *d_temp = nullptr;
|
void *d_temp = nullptr;
|
||||||
@@ -70,7 +95,7 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
|
|||||||
gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, per_site_p, d_out,
|
gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, per_site_p, d_out,
|
||||||
(int)osites, gpucub::Sum(), zero, computeStream);
|
(int)osites, gpucub::Sum(), zero, computeStream);
|
||||||
if (gpuErr != gpuSuccess) {
|
if (gpuErr != gpuSuccess) {
|
||||||
std::cout << GridLogError << "Lattice_reduction_gpu_cub.h: DeviceReduce size query failed: "
|
std::cout << GridLogError << "sumD_gpu_direct: DeviceReduce size query failed: "
|
||||||
<< gpuErr << std::endl;
|
<< gpuErr << std::endl;
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
@@ -80,7 +105,7 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
|
|||||||
gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, per_site_p, d_out,
|
gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, per_site_p, d_out,
|
||||||
(int)osites, gpucub::Sum(), zero, computeStream);
|
(int)osites, gpucub::Sum(), zero, computeStream);
|
||||||
if (gpuErr != gpuSuccess) {
|
if (gpuErr != gpuSuccess) {
|
||||||
std::cout << GridLogError << "Lattice_reduction_gpu_cub.h: DeviceReduce failed: "
|
std::cout << GridLogError << "sumD_gpu_direct: DeviceReduce failed: "
|
||||||
<< gpuErr << std::endl;
|
<< gpuErr << std::endl;
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
@@ -94,15 +119,120 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
// sumD_gpu_small and sumD_gpu_large are preserved as aliases for API compatibility.
|
// Radix-4 word-bundle path for types too large for the direct CUB path.
|
||||||
|
// Treats vobj as words of vector_type; groups them in bundles of 4 and reduces
|
||||||
|
// each bundle as a WordBundle4<scalarD>. If words % 4 != 0, the final partial
|
||||||
|
// bundle is zero-padded so all unused slots contribute zero to the sum.
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
|
inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
|
||||||
{
|
{
|
||||||
return sumD_gpu(lat, osites);
|
typedef typename vobj::vector_type vector;
|
||||||
|
typedef typename vobj::scalar_typeD scalarD;
|
||||||
|
typedef typename vobj::scalar_objectD sobjD;
|
||||||
|
using R4 = WordBundle4<scalarD>;
|
||||||
|
|
||||||
|
const int words = sizeof(vobj) / sizeof(vector);
|
||||||
|
const int nfull = words / 4;
|
||||||
|
const int rem = words % 4;
|
||||||
|
|
||||||
|
sobjD ret; zeroit(ret);
|
||||||
|
scalarD *ret_p = (scalarD *)&ret;
|
||||||
|
|
||||||
|
iScalar<vector> *idat = (iScalar<vector> *)lat;
|
||||||
|
deviceVector<R4> buf(osites);
|
||||||
|
R4 *buf_p = &buf[0];
|
||||||
|
|
||||||
|
R4 zero4;
|
||||||
|
zero4.w[0] = zero4.w[1] = zero4.w[2] = zero4.w[3] = Zero();
|
||||||
|
|
||||||
|
R4 *d_out = static_cast<R4 *>(acceleratorAllocDevice(sizeof(R4)));
|
||||||
|
void *d_temp = nullptr;
|
||||||
|
size_t temp_bytes = 0;
|
||||||
|
|
||||||
|
// Probe workspace size once — type R4 and count osites are fixed across all groups.
|
||||||
|
gpuError_t gpuErr;
|
||||||
|
gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, buf_p, d_out,
|
||||||
|
(int)osites, gpucub::Sum(), zero4, computeStream);
|
||||||
|
if (gpuErr != gpuSuccess) {
|
||||||
|
std::cout << GridLogError << "sumD_gpu_large: DeviceReduce size query failed: "
|
||||||
|
<< gpuErr << std::endl;
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
d_temp = acceleratorAllocDevice(temp_bytes);
|
||||||
|
|
||||||
|
// Full groups of 4 words.
|
||||||
|
for (int g = 0; g < nfull; g++) {
|
||||||
|
int base = 4 * g;
|
||||||
|
accelerator_for(ss, osites, 1, {
|
||||||
|
R4 r4;
|
||||||
|
r4.w[0] = TensorRemove(Reduce(idat[ss * words + base ]));
|
||||||
|
r4.w[1] = TensorRemove(Reduce(idat[ss * words + base + 1]));
|
||||||
|
r4.w[2] = TensorRemove(Reduce(idat[ss * words + base + 2]));
|
||||||
|
r4.w[3] = TensorRemove(Reduce(idat[ss * words + base + 3]));
|
||||||
|
buf_p[ss] = r4;
|
||||||
|
});
|
||||||
|
gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, buf_p, d_out,
|
||||||
|
(int)osites, gpucub::Sum(), zero4, computeStream);
|
||||||
|
if (gpuErr != gpuSuccess) {
|
||||||
|
std::cout << GridLogError << "sumD_gpu_large: DeviceReduce failed (group "
|
||||||
|
<< g << "): " << gpuErr << std::endl;
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
accelerator_barrier();
|
||||||
|
R4 group_result;
|
||||||
|
acceleratorCopyFromDevice(d_out, &group_result, sizeof(R4));
|
||||||
|
ret_p[base ] = group_result.w[0];
|
||||||
|
ret_p[base + 1] = group_result.w[1];
|
||||||
|
ret_p[base + 2] = group_result.w[2];
|
||||||
|
ret_p[base + 3] = group_result.w[3];
|
||||||
|
}
|
||||||
|
|
||||||
|
// Partial last group: zero-pad unused slots so they contribute nothing to the sum.
|
||||||
|
if (rem > 0) {
|
||||||
|
int base = 4 * nfull;
|
||||||
|
accelerator_for(ss, osites, 1, {
|
||||||
|
R4 r4;
|
||||||
|
r4.w[0] = r4.w[1] = r4.w[2] = r4.w[3] = Zero();
|
||||||
|
for (int k = 0; k < rem; k++)
|
||||||
|
r4.w[k] = TensorRemove(Reduce(idat[ss * words + base + k]));
|
||||||
|
buf_p[ss] = r4;
|
||||||
|
});
|
||||||
|
gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, buf_p, d_out,
|
||||||
|
(int)osites, gpucub::Sum(), zero4, computeStream);
|
||||||
|
if (gpuErr != gpuSuccess) {
|
||||||
|
std::cout << GridLogError << "sumD_gpu_large: DeviceReduce failed (partial group): "
|
||||||
|
<< gpuErr << std::endl;
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
accelerator_barrier();
|
||||||
|
R4 partial_result;
|
||||||
|
acceleratorCopyFromDevice(d_out, &partial_result, sizeof(R4));
|
||||||
|
for (int k = 0; k < rem; k++)
|
||||||
|
ret_p[4 * nfull + k] = partial_result.w[k];
|
||||||
|
}
|
||||||
|
|
||||||
|
acceleratorFreeDevice(d_temp);
|
||||||
|
acceleratorFreeDevice(d_out);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Dispatch: direct CUB path for types that fit in the shared-memory budget,
|
||||||
|
// radix-4 word-bundle path for larger types.
|
||||||
|
// Threshold 512 bytes: 64 * 512 = 32768 bytes, within rocPRIM's
|
||||||
|
// ROCPRIM_SHARED_MEMORY_MAX on all supported devices.
|
||||||
|
template<class vobj>
|
||||||
|
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
|
||||||
|
{
|
||||||
|
typedef typename vobj::scalar_objectD sobjD;
|
||||||
|
if constexpr (sizeof(sobjD) > 512) {
|
||||||
|
return sumD_gpu_large(lat, osites);
|
||||||
|
} else {
|
||||||
|
return sumD_gpu_direct(lat, osites);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
|
inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
|
||||||
{
|
{
|
||||||
return sumD_gpu(lat, osites);
|
return sumD_gpu(lat, osites);
|
||||||
}
|
}
|
||||||
@@ -119,7 +249,10 @@ inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
|
|||||||
template<class vobj>
|
template<class vobj>
|
||||||
inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
|
inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
|
||||||
{
|
{
|
||||||
return sum_gpu(lat, osites);
|
typedef typename vobj::scalar_object sobj;
|
||||||
|
sobj result;
|
||||||
|
result = sumD_gpu_large(lat, osites);
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // GRID_CUDA || GRID_HIP
|
#endif // GRID_CUDA || GRID_HIP
|
||||||
|
|||||||
Reference in New Issue
Block a user