1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-10 07:55:35 +00:00

Faster reductions, important on single node staggered

This commit is contained in:
Azusa Yamaguchi 2018-04-26 10:03:57 +01:00
parent 213f8db6a2
commit eac6ec4b5e
2 changed files with 64 additions and 20 deletions

View File

@ -244,19 +244,11 @@ namespace Grid {
template<class sobj,class vobj> strong_inline template<class sobj,class vobj> strong_inline
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){ RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
ret.checkerboard = x.checkerboard; return axpy_norm_fast(ret,a,x,y);
conformable(ret,x);
conformable(x,y);
axpy(ret,a,x,y);
return norm2(ret);
} }
template<class sobj,class vobj> strong_inline template<class sobj,class vobj> strong_inline
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){ RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
ret.checkerboard = x.checkerboard; return axpby_norm_fast(ret,a,b,x,y);
conformable(ret,x);
conformable(x,y);
axpby(ret,a,b,x,y);
return norm2(ret); // FIXME implement parallel norm in ss loop
} }
} }

View File

@ -33,7 +33,7 @@ namespace Grid {
// Deterministic Reduction operations // Deterministic Reduction operations
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
ComplexD nrm = innerProduct(arg,arg); auto nrm = innerProduct(arg,arg);
return std::real(nrm); return std::real(nrm);
} }
@ -43,11 +43,11 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
{ {
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_typeD vector_type; typedef typename vobj::vector_typeD vector_type;
scalar_type nrm;
GridBase *grid = left._grid; GridBase *grid = left._grid;
const int pad = 8;
std::vector<vector_type,alignedAllocator<vector_type> > sumarray(grid->SumArraySize()); scalar_type nrm;
std::vector<scalar_type,alignedAllocator<vector_type> > sumarray(grid->SumArraySize()*pad);
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){ parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
int nwork, mywork, myoff; int nwork, mywork, myoff;
@ -57,18 +57,70 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
for(int ss=myoff;ss<mywork+myoff; ss++){ for(int ss=myoff;ss<mywork+myoff; ss++){
vnrm = vnrm + innerProductD(left._odata[ss],right._odata[ss]); vnrm = vnrm + innerProductD(left._odata[ss],right._odata[ss]);
} }
sumarray[thr]=TensorRemove(vnrm) ; // All threads sum across SIMD; reduce serial work at end
// one write per cacheline with streaming store
vstream(sumarray[thr*pad],Reduce(TensorRemove(vnrm))) ;
} }
vector_type vvnrm; vvnrm=zero; // sum across threads nrm=0.0;
for(int i=0;i<grid->SumArraySize();i++){ for(int i=0;i<grid->SumArraySize();i++){
vvnrm = vvnrm+sumarray[i]; nrm = nrm+sumarray[i*pad];
} }
nrm = Reduce(vvnrm);// sum across simd
right._grid->GlobalSum(nrm); right._grid->GlobalSum(nrm);
return nrm; return nrm;
} }
/////////////////////////
// Fast axpby_norm
// z = a x + b y
// return norm z
/////////////////////////
template<class sobj,class vobj> strong_inline RealD
axpy_norm_fast(Lattice<vobj> &z,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
sobj one(1.0);
return axpby_norm_fast(z,a,one,x,y);
}
template<class sobj,class vobj> strong_inline RealD
axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
const int pad = 8;
z.checkerboard = x.checkerboard;
conformable(z,x);
conformable(x,y);
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_typeD vector_type;
RealD nrm;
GridBase *grid = x._grid;
Vector<RealD> sumarray(grid->SumArraySize()*pad);
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
int nwork, mywork, myoff;
GridThread::GetWork(x._grid->oSites(),thr,mywork,myoff);
// private to thread; sub summation
decltype(innerProductD(z._odata[0],z._odata[0])) vnrm=zero;
for(int ss=myoff;ss<mywork+myoff; ss++){
vobj tmp = a*x._odata[ss]+b*y._odata[ss];
vnrm = vnrm + innerProductD(tmp,tmp);
vstream(z._odata[ss],tmp);
}
vstream(sumarray[thr*pad],real(Reduce(TensorRemove(vnrm)))) ;
}
nrm = 0.0; // sum across threads; linear in thread count but fast
for(int i=0;i<grid->SumArraySize();i++){
nrm = nrm+sumarray[i*pad];
}
z._grid->GlobalSum(nrm);
return nrm;
}
template<class Op,class T1> template<class Op,class T1>
inline auto sum(const LatticeUnaryExpression<Op,T1> & expr) inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object