mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-09 23:45:36 +00:00
Merge 461cd045c6
into 9a1ad6a5eb
This commit is contained in:
commit
e5525a55da
@ -1,5 +1,5 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
#include <type_traits>
|
|
||||||
#if defined(GRID_CUDA)
|
#if defined(GRID_CUDA)
|
||||||
|
|
||||||
#include <cub/cub.cuh>
|
#include <cub/cub.cuh>
|
||||||
@ -90,8 +90,61 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V
|
|||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
|
|
||||||
|
#if defined(GRID_SYCL)
|
||||||
|
template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data, Vector <vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
|
||||||
|
{
|
||||||
|
size_t subvol_size = e1*e2;
|
||||||
|
|
||||||
|
vobj *mysum = (vobj *) malloc_shared(rd*sizeof(vobj),*theGridAccelerator);
|
||||||
|
vobj vobj_zero;
|
||||||
|
zeroit(vobj_zero);
|
||||||
|
for (int r = 0; r<rd; r++) {
|
||||||
|
mysum[r] = vobj_zero;
|
||||||
|
}
|
||||||
|
|
||||||
|
commVector<vobj> reduction_buffer(rd*subvol_size);
|
||||||
|
|
||||||
|
auto rb_p = &reduction_buffer[0];
|
||||||
|
|
||||||
|
// autoView(Data_v, Data, AcceleratorRead);
|
||||||
|
|
||||||
|
//prepare reduction buffer
|
||||||
|
accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{
|
||||||
|
|
||||||
|
int n = s / e2;
|
||||||
|
int b = s % e2;
|
||||||
|
int so=r*ostride; // base offset for start of plane
|
||||||
|
int ss= so+n*stride+b;
|
||||||
|
|
||||||
|
coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss]));
|
||||||
|
|
||||||
|
});
|
||||||
|
|
||||||
|
for (int r = 0; r < rd; r++) {
|
||||||
|
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
|
||||||
|
auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>());
|
||||||
|
cgh.parallel_for(cl::sycl::range<1>{subvol_size},
|
||||||
|
Reduction,
|
||||||
|
[=](cl::sycl::id<1> item, auto &sum) {
|
||||||
|
auto s = item[0];
|
||||||
|
sum += rb_p[r*subvol_size+s];
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
theGridAccelerator->wait();
|
||||||
|
for (int r = 0; r < rd; r++) {
|
||||||
|
lvSum[r] = mysum[r];
|
||||||
|
}
|
||||||
|
free(mysum,*theGridAccelerator);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
|
||||||
typedef typename vobj::vector_type vector;
|
typedef typename vobj::vector_type vector;
|
||||||
const int words = sizeof(vobj)/sizeof(vector);
|
const int words = sizeof(vobj)/sizeof(vector);
|
||||||
const int osites = rd*e1*e2;
|
const int osites = rd*e1*e2;
|
||||||
@ -106,7 +159,11 @@ template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, V
|
|||||||
buf[ss] = dat[ss*words+w];
|
buf[ss] = dat[ss*words+w];
|
||||||
});
|
});
|
||||||
|
|
||||||
|
#if defined(GRID_CUDA) || defined(GRID_HIP)
|
||||||
sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
|
sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
|
||||||
|
#elif defined(GRID_SYCL)
|
||||||
|
sliceSumReduction_sycl_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
|
||||||
|
#endif
|
||||||
|
|
||||||
for (int r = 0; r < rd; r++) {
|
for (int r = 0; r < rd; r++) {
|
||||||
lvSum_ptr[w+words*r]=lvSum_small[r];
|
lvSum_ptr[w+words*r]=lvSum_small[r];
|
||||||
@ -117,66 +174,24 @@ template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, V
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj> inline void sliceSumReduction_cub(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
|
template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
|
||||||
{
|
{
|
||||||
autoView(Data_v, Data, AcceleratorRead); //hipcub/cub cannot deal with large vobjs so we split into small/large case.
|
autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case.
|
||||||
if constexpr (sizeof(vobj) <= 256) {
|
if constexpr (sizeof(vobj) <= 256) {
|
||||||
|
|
||||||
|
#if defined(GRID_CUDA) || defined(GRID_HIP)
|
||||||
sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
|
sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
|
||||||
|
#elif defined (GRID_SYCL)
|
||||||
|
sliceSumReduction_sycl_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
|
||||||
|
#endif
|
||||||
|
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
sliceSumReduction_cub_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
|
sliceSumReduction_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
#if defined(GRID_SYCL)
|
|
||||||
template<class vobj> inline void sliceSumReduction_sycl(const Lattice<vobj> &Data, Vector <vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
|
|
||||||
{
|
|
||||||
typedef typename vobj::scalar_object sobj;
|
|
||||||
size_t subvol_size = e1*e2;
|
|
||||||
|
|
||||||
vobj *mysum = (vobj *) malloc_shared(sizeof(vobj),*theGridAccelerator);
|
|
||||||
vobj vobj_zero;
|
|
||||||
zeroit(vobj_zero);
|
|
||||||
|
|
||||||
commVector<vobj> reduction_buffer(rd*subvol_size);
|
|
||||||
|
|
||||||
auto rb_p = &reduction_buffer[0];
|
|
||||||
|
|
||||||
autoView(Data_v, Data, AcceleratorRead);
|
|
||||||
|
|
||||||
//prepare reduction buffer
|
|
||||||
accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{
|
|
||||||
|
|
||||||
int n = s / e2;
|
|
||||||
int b = s % e2;
|
|
||||||
int so=r*ostride; // base offset for start of plane
|
|
||||||
int ss= so+n*stride+b;
|
|
||||||
|
|
||||||
coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data_v[ss]));
|
|
||||||
|
|
||||||
});
|
|
||||||
|
|
||||||
for (int r = 0; r < rd; r++) {
|
|
||||||
mysum[0] = vobj_zero; //dirty hack: cannot pass vobj_zero as identity to sycl::reduction as its not device_copyable
|
|
||||||
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
|
|
||||||
auto Reduction = cl::sycl::reduction(mysum,std::plus<>());
|
|
||||||
cgh.parallel_for(cl::sycl::range<1>{subvol_size},
|
|
||||||
Reduction,
|
|
||||||
[=](cl::sycl::id<1> item, auto &sum) {
|
|
||||||
auto s = item[0];
|
|
||||||
sum += rb_p[r*subvol_size+s];
|
|
||||||
});
|
|
||||||
});
|
|
||||||
theGridAccelerator->wait();
|
|
||||||
lvSum[r] = mysum[0];
|
|
||||||
}
|
|
||||||
|
|
||||||
free(mysum,*theGridAccelerator);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
|
template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
|
||||||
{
|
{
|
||||||
// sum over reduced dimension planes, breaking out orthog dir
|
// sum over reduced dimension planes, breaking out orthog dir
|
||||||
@ -195,13 +210,9 @@ template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data
|
|||||||
|
|
||||||
template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
|
template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
|
||||||
{
|
{
|
||||||
#if defined(GRID_CUDA) || defined(GRID_HIP)
|
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
|
||||||
|
|
||||||
sliceSumReduction_cub(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
|
sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
|
||||||
|
|
||||||
#elif defined(GRID_SYCL)
|
|
||||||
|
|
||||||
sliceSumReduction_sycl(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
|
|
||||||
|
|
||||||
#else
|
#else
|
||||||
sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
|
sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
|
||||||
|
Loading…
Reference in New Issue
Block a user