1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-13 04:37:05 +01:00

refactor slicesum: slicesum uses GPU version by default now

This commit is contained in:
dbollweg
2024-02-09 13:02:28 -05:00
parent 1514b4f137
commit 9514035b87
5 changed files with 289 additions and 324 deletions

View File

@ -1,5 +1,79 @@
#include <Grid/Grid.h>
template<class vobj> inline void sliceSumCPU(const Grid::Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
{
using namespace Grid;
///////////////////////////////////////////////////////
// FIXME precision promoted summation
// may be important for correlation functions
// But easily avoided by using double precision fields
///////////////////////////////////////////////////////
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_object::scalar_type scalar_type;
GridBase *grid = Data.Grid();
assert(grid!=NULL);
const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd();
assert(orthogdim >= 0);
assert(orthogdim < Nd);
int fd=grid->_fdimensions[orthogdim];
int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim];
Vector<vobj> lvSum(rd); // will locally sum vectors first
Vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
ExtractBuffer<sobj> extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node
for(int r=0;r<rd;r++){
lvSum[r]=Zero();
}
int e1= grid->_slice_nblock[orthogdim];
int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim];
int ostride=grid->_ostride[orthogdim];
//Reduce Data down to lvSum
sliceSumReduction_cpu(Data,lvSum,rd, e1,e2,stride,ostride,Nsimd);
// Sum across simd lanes in the plane, breaking out orthog dir.
Coordinate icoor(Nd);
for(int rt=0;rt<rd;rt++){
extract(lvSum[rt],extracted);
for(int idx=0;idx<Nsimd;idx++){
grid->iCoorFromIindex(icoor,idx);
int ldx =rt+icoor[orthogdim]*rd;
lsSum[ldx]=lsSum[ldx]+extracted[idx];
}
}
// sum over nodes.
for(int t=0;t<fd;t++){
int pt = t/ld; // processor plane
int lt = t%ld;
if ( pt == grid->_processor_coor[orthogdim] ) {
result[t]=lsSum[lt];
} else {
result[t]=Zero();
}
}
scalar_type * ptr = (scalar_type *) &result[0];
int words = fd*sizeof(sobj)/sizeof(scalar_type);
grid->GlobalSumVector(ptr, words);
}
int main (int argc, char ** argv) {
@ -26,7 +100,7 @@ int main (int argc, char ** argv) {
//warmup
for (int sweeps = 0; sweeps < 5; sweeps++) {
reduction_result = sliceSumGpu(test_data,0);
reduction_result = sliceSum(test_data,0);
}
int trace_id = traceStart("sliceSum benchmark");
@ -35,23 +109,23 @@ int main (int argc, char ** argv) {
RealD t=-usecond();
tracePush("sliceSum");
sliceSum(test_data,reduction_reference,i);
sliceSumCPU(test_data,reduction_reference,i);
tracePop("sliceSum");
t+=usecond();
std::cout << GridLogMessage << " sliceSum took "<<t<<" usecs"<<std::endl;
std::cout << GridLogMessage << "Orthog. dir. = " << i << std::endl;
std::cout << GridLogMessage << "CPU sliceSum took "<<t<<" usecs"<<std::endl;
RealD tgpu=-usecond();
tracePush("sliceSumGpu");
reduction_result = sliceSumGpu(test_data,i);
reduction_result = sliceSum(test_data,i);
tracePop("sliceSumGpu");
tgpu+=usecond();
std::cout << GridLogMessage <<" sliceSumGpu took "<<tgpu<<" usecs"<<std::endl;
std::cout << GridLogMessage <<"GPU sliceSum took "<<tgpu<<" usecs"<<std::endl<<std::endl;;
for(int t=0;t<reduction_reference.size();t++) {