mirror of https://github.com/paboyle/Grid.git synced 2025-06-16 23:07:05 +01:00

Adding sliceSumReduction_cub_small/large since hipcub cannot deal with arb. large vobjs

dbollweg
2024-02-16 13:15:02 -05:00
parent b5659d106e
commit 6f3455900e
2 changed files with 65 additions and 22 deletions
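The fix in Lattice_slicesum_gpu.h splits the old sliceSumReduction_cub into a _small path (hand the vobjs to cub/hipcub directly, as before) and a _large path that decomposes each vobj into its vobj::vector_type words, copies one word per site into a scratch buffer, reduces that buffer with the _small path, and writes each per-slice word back into lvSum. A minimal host-side sketch of that word-by-word decomposition, using hypothetical plain types (BigObj, Word, reduce_one_word) in place of Grid's vobj machinery:

#include <array>
#include <cstddef>

// Hypothetical stand-ins: a "large" object made of fixed-width words, and a
// reducer that is only ever handed one word per element, which is the
// situation the new sliceSumReduction_cub_large arranges for hipcub.
struct BigObj { std::array<double, 64> w; };
using Word = double;
constexpr std::size_t Words = sizeof(BigObj) / sizeof(Word);

// Reduce word `word` over `n` objects of one segment (plays the role of the
// _small path, which hipcub handles without exhausting shared memory).
static Word reduce_one_word(const BigObj *data, std::size_t n, std::size_t word) {
  Word acc = 0;
  for (std::size_t i = 0; i < n; ++i) acc += data[i].w[word];
  return acc;
}

// Word-by-word reduction of the whole segment: the reducer never sees more
// than sizeof(Word) bytes per element, however big BigObj is.
static BigObj reduce_large(const BigObj *data, std::size_t n) {
  BigObj out{};
  for (std::size_t w = 0; w < Words; ++w)
    out.w[w] = reduce_one_word(data, n, w);
  return out;
}

The real _large path below does the same thing on the device: the per-word gather is an accelerator_for copy into a Vector<vector> buffer, and the per-word reduction reuses sliceSumReduction_cub_small.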

Grid/lattice/Lattice_slicesum_gpu.h

@@ -1,4 +1,5 @@
#pragma once
#include <type_traits>
#if defined(GRID_CUDA)
#include <cub/cub.cuh>
@@ -26,20 +27,16 @@
NAMESPACE_BEGIN(Grid);
#if defined(GRID_CUDA) || defined(GRID_HIP)
template<class vobj> inline void sliceSumReduction_cub(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
{
typedef typename vobj::scalar_object sobj;
template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
size_t subvol_size = e1*e2;
commVector<vobj> reduction_buffer(rd*subvol_size);
auto rb_p = &reduction_buffer[0];
vobj zero_init;
zeroit(zero_init);
vobj vobj_zero; //Need to provide initial value for reduction operation
zeroit(vobj_zero);
void *temp_storage_array = NULL;
size_t temp_storage_bytes = 0;
vobj *d_out;
@@ -71,8 +68,8 @@ template<class vobj> inline void sliceSumReduction_cub(const Lattice<vobj> &Data
exit(EXIT_FAILURE);
}
//determine temp_storage_array size
gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), vobj_zero, computeStream);
gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
if (gpuErr!=gpuSuccess) {
std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce (setup)! Error: " << gpuErr <<std::endl;
exit(EXIT_FAILURE);
@@ -85,11 +82,9 @@ template<class vobj> inline void sliceSumReduction_cub(const Lattice<vobj> &Data
exit(EXIT_FAILURE);
}
autoView( Data_v, Data, AcceleratorRead);
//prepare buffer for reduction
//use non-blocking accelerator_for to avoid syncs (ok because we submit to same computeStream)
//use 2d accelerator_for to avoid launch latencies found when serially looping over rd
accelerator_for2dNB( s,subvol_size, r,rd, Nsimd,{
int n = s / e2;
@@ -97,12 +92,12 @@ template<class vobj> inline void sliceSumReduction_cub(const Lattice<vobj> &Data
int so=r*ostride; // base offset for start of plane
int ss= so+n*stride+b;
coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data_v[ss]));
coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss]));
});
//issue segmented reductions in computeStream
gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p, d_out, rd, d_offsets, d_offsets+1,::gpucub::Sum(), vobj_zero, computeStream);
gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p, d_out, rd, d_offsets, d_offsets+1,::gpucub::Sum(), zero_init, computeStream);
if (gpuErr!=gpuSuccess) {
std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce! Error: " << gpuErr <<std::endl;
exit(EXIT_FAILURE);
@@ -119,6 +114,48 @@ template<class vobj> inline void sliceSumReduction_cub(const Lattice<vobj> &Data
}
template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
typedef typename vobj::vector_type vector;
const int words = sizeof(vobj)/sizeof(vector);
const int osites = rd*e1*e2;
Vector<vector>buffer(osites);
vector *dat = (vector *)Data;
vector *buf = &buffer[0];
Vector<vector> lvSum_small(rd);
vector *lvSum_ptr = (vector *)&lvSum[0];
for (int w = 0; w < words; w++) {
accelerator_for(ss,osites,1,{
buf[ss] = dat[ss*words+w];
});
sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
for (int r = 0; r < rd; r++) {
lvSum_ptr[w+words*r]=lvSum_small[r];
}
}
}
template<class vobj> inline void sliceSumReduction_cub(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
{
autoView(Data_v, Data, AcceleratorRead);
#if defined(GRID_CUDA)
sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
#elif defined (GRID_HIP) //hipcub cannot deal with large vobjs that don't fit in shared memory, therefore separate into _small/_large.
if constexpr (sizeof(vobj) <= 256) {
sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
}
else {
sliceSumReduction_cub_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
}
#endif
}
#endif
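Both gpucub::DeviceSegmentedReduce::Reduce calls above follow CUB's standard two-phase workspace idiom: the first call passes a NULL temp_storage_array so the library only reports temp_storage_bytes, the second call performs the actual segmented reduction, and the begin/end offsets are given as d_offsets and d_offsets+1 so that segment r spans [d_offsets[r], d_offsets[r+1]). A self-contained CUDA illustration of that idiom with plain int data (all sizes and names here are illustrative, not part of the Grid change):

#include <cub/cub.cuh>
#include <cstdio>
#include <vector>

int main() {
  const int num_segments = 3;
  std::vector<int> h_in      = {1, 2, 3, 4, 5, 6};  // two elements per segment
  std::vector<int> h_offsets = {0, 2, 4, 6};        // num_segments + 1 entries

  int *d_in, *d_out, *d_offsets;
  cudaMalloc(&d_in, h_in.size() * sizeof(int));
  cudaMalloc(&d_out, num_segments * sizeof(int));
  cudaMalloc(&d_offsets, h_offsets.size() * sizeof(int));
  cudaMemcpy(d_in, h_in.data(), h_in.size() * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_offsets, h_offsets.data(), h_offsets.size() * sizeof(int), cudaMemcpyHostToDevice);

  // Phase 1: NULL workspace, CUB only computes the required temp storage size.
  void  *d_temp     = NULL;
  size_t temp_bytes = 0;
  cub::DeviceSegmentedReduce::Reduce(d_temp, temp_bytes, d_in, d_out,
                                     num_segments, d_offsets, d_offsets + 1,
                                     cub::Sum(), 0);
  cudaMalloc(&d_temp, temp_bytes);

  // Phase 2: same call with a real workspace performs the reduction.
  cub::DeviceSegmentedReduce::Reduce(d_temp, temp_bytes, d_in, d_out,
                                     num_segments, d_offsets, d_offsets + 1,
                                     cub::Sum(), 0);

  std::vector<int> h_out(num_segments);
  cudaMemcpy(h_out.data(), d_out, num_segments * sizeof(int), cudaMemcpyDeviceToHost);
  for (int s = 0; s < num_segments; s++) printf("segment %d sum = %d\n", s, h_out[s]);

  cudaFree(d_in); cudaFree(d_out); cudaFree(d_offsets); cudaFree(d_temp);
  return 0;
}

On HIP builds the same pattern applies with hipcub::DeviceSegmentedReduce::Reduce and hipcub::Sum(), which is presumably what Grid's gpucub alias resolves to under GRID_HIP.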