mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-09 23:45:36 +00:00
First cut at faster GPU slice sum via atomics
This commit is contained in:
parent
4ca1bf7cca
commit
8b91b61b61
126
Grid/lattice/Lattice_slice_gpu.h
Normal file
126
Grid/lattice/Lattice_slice_gpu.h
Normal file
@ -0,0 +1,126 @@
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
// If NOT CUDA or HIP -- we should provide
|
||||
// -- atomicAdd(float *,float)
|
||||
// -- atomicAdd(double *,double)
|
||||
//
|
||||
// Augment CUDA with complex atomics
|
||||
#if !defined(GRID_HIP) || !defined(GRID_CUDA)
|
||||
inline void atomicAdd(float *acc,float elem)
|
||||
{
|
||||
*acc += elem;
|
||||
}
|
||||
inline void atomicAdd(double *acc,double elem)
|
||||
{
|
||||
*acc += elem;
|
||||
}
|
||||
#endif
|
||||
inline void atomicAdd(ComplexD *accum,ComplexD & elem)
|
||||
{
|
||||
double *a_p = (double *)accum;
|
||||
double *e_p = (double *)&elem;
|
||||
for(int w=0;w<2;w++){
|
||||
atomicAdd(&a_p[w],e_p[w]);
|
||||
}
|
||||
}
|
||||
inline void atomicAdd(ComplexF *accum,ComplexF & elem)
|
||||
{
|
||||
float *a_p = (float *)accum;
|
||||
float *e_p = (float *)&elem;
|
||||
for(int w=0;w<2;w++){
|
||||
atomicAdd(&a_p[w],e_p[w]);
|
||||
}
|
||||
}
|
||||
// Augment CUDA with vobj atomics
|
||||
template<class vobj> accelerator_inline void atomicAdd(vobj *accum, vobj & elem)
|
||||
{
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
scalar_type *a_p= (scalar_type *)accum;
|
||||
scalar_type *e_p= (scalar_type *)& elem;
|
||||
for(int w=0;w<vobj::Nsimd();w++){
|
||||
atomicAdd(&a_p[w],e_p[w]);
|
||||
}
|
||||
}
|
||||
// Atomics based slice sum
|
||||
template<class vobj> inline void sliceSumGpu(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
|
||||
{
|
||||
typedef typename vobj::scalar_object sobj;
|
||||
typedef typename vobj::scalar_object::scalar_type scalar_type;
|
||||
GridBase *grid = Data.Grid();
|
||||
assert(grid!=NULL);
|
||||
|
||||
const int Nd = grid->_ndimension;
|
||||
const int Nsimd = grid->Nsimd();
|
||||
|
||||
assert(orthogdim >= 0);
|
||||
assert(orthogdim < Nd);
|
||||
|
||||
int fd=grid->_fdimensions[orthogdim];
|
||||
int ld=grid->_ldimensions[orthogdim];
|
||||
int rd=grid->_rdimensions[orthogdim];
|
||||
|
||||
// Move to device memory and copy in / out
|
||||
Vector<vobj> lvSum(rd); // will locally sum vectors first
|
||||
Vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
|
||||
ExtractBuffer<sobj> extracted(Nsimd); // splitting the SIMD
|
||||
|
||||
result.resize(fd); // And then global sum to return the same vector to every node
|
||||
for(int r=0;r<rd;r++){
|
||||
lvSum[r]=Zero();
|
||||
}
|
||||
|
||||
int e1= grid->_slice_nblock[orthogdim];
|
||||
int e2= grid->_slice_block [orthogdim];
|
||||
int stride=grid->_slice_stride[orthogdim];
|
||||
|
||||
// sum over reduced dimension planes, breaking out orthog dir
|
||||
// Parallel over orthog direction
|
||||
autoView( Data_v, Data, AcceleratorRead);
|
||||
auto lvSum_p=&lvSum[0];
|
||||
int ostride = grid->_ostride[orthogdim];
|
||||
accelerator_for( ree,rd*e1*e2,1, {
|
||||
int b = ree%e2;
|
||||
int re= ree/e2;
|
||||
int n=re%e1;
|
||||
int r=re/e1;
|
||||
int so=r*ostride;
|
||||
int ss=so+n*stride+b;
|
||||
atomicAdd(&lvSum_p[r],Data_v[ss]);
|
||||
});
|
||||
|
||||
// Sum across simd lanes in the plane, breaking out orthog dir.
|
||||
Coordinate icoor(Nd);
|
||||
|
||||
for(int rt=0;rt<rd;rt++){
|
||||
|
||||
extract(lvSum[rt],extracted);
|
||||
|
||||
for(int idx=0;idx<Nsimd;idx++){
|
||||
|
||||
grid->iCoorFromIindex(icoor,idx);
|
||||
|
||||
int ldx =rt+icoor[orthogdim]*rd;
|
||||
|
||||
lsSum[ldx]=lsSum[ldx]+extracted[idx];
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// sum over nodes.
|
||||
for(int t=0;t<fd;t++){
|
||||
int pt = t/ld; // processor plane
|
||||
int lt = t%ld;
|
||||
if ( pt == grid->_processor_coor[orthogdim] ) {
|
||||
result[t]=lsSum[lt];
|
||||
} else {
|
||||
result[t]=Zero();
|
||||
}
|
||||
|
||||
}
|
||||
scalar_type * ptr = (scalar_type *) &result[0];
|
||||
int words = fd*sizeof(sobj)/sizeof(scalar_type);
|
||||
grid->GlobalSumVector(ptr, words);
|
||||
}
|
||||
|
||||
|
||||
NAMESPACE_END(Grid);
|
73
tests/core/Test_slicesum.cc
Normal file
73
tests/core/Test_slicesum.cc
Normal file
@ -0,0 +1,73 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./tests/Test_poisson_fft.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/Grid.h>
|
||||
#include <Grid/lattice/Lattice_slice_gpu.h>
|
||||
|
||||
using namespace Grid;
|
||||
|
||||
int main (int argc, char ** argv)
|
||||
{
|
||||
Grid_init(&argc,&argv);
|
||||
|
||||
int N=16;
|
||||
std::vector<int> latt_size ({N,N,N,N});
|
||||
std::vector<int> simd_layout({vComplexD::Nsimd(),1,1,1});
|
||||
std::vector<int> mpi_layout ({1,1,1,1});
|
||||
|
||||
GridCartesian GRID(latt_size,simd_layout,mpi_layout);
|
||||
|
||||
LatticeComplexD rn(&GRID);
|
||||
|
||||
GridParallelRNG RNG(&GRID);
|
||||
RNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
|
||||
gaussian(RNG,rn);
|
||||
|
||||
std::vector<TComplex> reduced_ref;
|
||||
std::vector<TComplex> reduced_gpu;
|
||||
for(int d=0;d<4;d++){
|
||||
{
|
||||
RealD t=-usecond();
|
||||
sliceSum(rn,reduced_ref,d);
|
||||
t+=usecond();
|
||||
std::cout << " sliceSum took "<<t<<" usecs"<<std::endl;
|
||||
}
|
||||
{
|
||||
RealD t=-usecond();
|
||||
sliceSumGpu(rn,reduced_gpu,d);
|
||||
t+=usecond();
|
||||
std::cout << " sliceSumGpu took "<<t<<" usecs"<<std::endl;
|
||||
}
|
||||
for(int t=0;t<reduced_ref.size();t++){
|
||||
std::cout << t<<" ref "<< reduced_ref[t] <<" opt " << reduced_gpu[t] << " diff "<<reduced_ref[t]-reduced_gpu[t]<<std::endl;
|
||||
TComplex diff = reduced_ref[t]-reduced_gpu[t];
|
||||
assert(abs(TensorRemove(diff)) < 1e-8 );
|
||||
}
|
||||
}
|
||||
Grid_finalize();
|
||||
}
|
Loading…
Reference in New Issue
Block a user