mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
Faster reductions, important on single node staggered
This commit is contained in:
parent
213f8db6a2
commit
eac6ec4b5e
@ -244,19 +244,11 @@ namespace Grid {
|
|||||||
|
|
||||||
template<class sobj,class vobj> strong_inline
|
template<class sobj,class vobj> strong_inline
|
||||||
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
|
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
|
||||||
ret.checkerboard = x.checkerboard;
|
return axpy_norm_fast(ret,a,x,y);
|
||||||
conformable(ret,x);
|
|
||||||
conformable(x,y);
|
|
||||||
axpy(ret,a,x,y);
|
|
||||||
return norm2(ret);
|
|
||||||
}
|
}
|
||||||
template<class sobj,class vobj> strong_inline
|
template<class sobj,class vobj> strong_inline
|
||||||
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
|
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
|
||||||
ret.checkerboard = x.checkerboard;
|
return axpby_norm_fast(ret,a,b,x,y);
|
||||||
conformable(ret,x);
|
|
||||||
conformable(x,y);
|
|
||||||
axpby(ret,a,b,x,y);
|
|
||||||
return norm2(ret); // FIXME implement parallel norm in ss loop
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -33,7 +33,7 @@ namespace Grid {
|
|||||||
// Deterministic Reduction operations
|
// Deterministic Reduction operations
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
|
template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
|
||||||
ComplexD nrm = innerProduct(arg,arg);
|
auto nrm = innerProduct(arg,arg);
|
||||||
return std::real(nrm);
|
return std::real(nrm);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -43,11 +43,11 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
|
|||||||
{
|
{
|
||||||
typedef typename vobj::scalar_type scalar_type;
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
typedef typename vobj::vector_typeD vector_type;
|
typedef typename vobj::vector_typeD vector_type;
|
||||||
scalar_type nrm;
|
|
||||||
|
|
||||||
GridBase *grid = left._grid;
|
GridBase *grid = left._grid;
|
||||||
|
const int pad = 8;
|
||||||
|
|
||||||
std::vector<vector_type,alignedAllocator<vector_type> > sumarray(grid->SumArraySize());
|
scalar_type nrm;
|
||||||
|
std::vector<scalar_type,alignedAllocator<vector_type> > sumarray(grid->SumArraySize()*pad);
|
||||||
|
|
||||||
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
|
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
|
||||||
int nwork, mywork, myoff;
|
int nwork, mywork, myoff;
|
||||||
@ -57,18 +57,70 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
|
|||||||
for(int ss=myoff;ss<mywork+myoff; ss++){
|
for(int ss=myoff;ss<mywork+myoff; ss++){
|
||||||
vnrm = vnrm + innerProductD(left._odata[ss],right._odata[ss]);
|
vnrm = vnrm + innerProductD(left._odata[ss],right._odata[ss]);
|
||||||
}
|
}
|
||||||
sumarray[thr]=TensorRemove(vnrm) ;
|
// All threads sum across SIMD; reduce serial work at end
|
||||||
|
// one write per cacheline with streaming store
|
||||||
|
vstream(sumarray[thr*pad],Reduce(TensorRemove(vnrm))) ;
|
||||||
}
|
}
|
||||||
|
|
||||||
vector_type vvnrm; vvnrm=zero; // sum across threads
|
nrm=0.0;
|
||||||
for(int i=0;i<grid->SumArraySize();i++){
|
for(int i=0;i<grid->SumArraySize();i++){
|
||||||
vvnrm = vvnrm+sumarray[i];
|
nrm = nrm+sumarray[i*pad];
|
||||||
}
|
}
|
||||||
nrm = Reduce(vvnrm);// sum across simd
|
|
||||||
right._grid->GlobalSum(nrm);
|
right._grid->GlobalSum(nrm);
|
||||||
return nrm;
|
return nrm;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/////////////////////////
|
||||||
|
// Fast axpby_norm
|
||||||
|
// z = a x + b y
|
||||||
|
// return norm2(z), i.e. the squared norm |z|^2
|
||||||
|
/////////////////////////
|
||||||
|
// Convenience form of the fused update with the coefficient of y fixed to one:
// forwards to axpby_norm_fast(z, a, 1, x, y) and returns its result unchanged.
template<class sobj,class vobj> strong_inline RealD
axpy_norm_fast(Lattice<vobj> &z,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
  // Build the unit coefficient in the caller's scalar type so that both
  // coefficient arguments deduce to the same sobj.
  sobj unit_coeff(1.0);
  return axpby_norm_fast(z,a,unit_coeff,x,y);
}
|
||||||
|
|
||||||
|
// Fused update and reduction: writes z = a*x + b*y site by site and, in the
// same pass, accumulates innerProductD(z,z) so the field is traversed once.
//
//   z     output lattice; its checkerboard is overwritten with x's
//   a, b  scalar coefficients applied to x and y respectively
//   x, y  input lattices; conformable() is invoked on (z,x) and (x,y)
//
// Returns the real part of the accumulated inner product after
// z._grid->GlobalSum() has been applied to it.
template<class sobj,class vobj> strong_inline RealD
axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
  // Stride between the per-thread slots of sumarray.
  // NOTE(review): presumably chosen so each slot sits on its own cache line
  // (avoiding false sharing between threads) -- confirm against sizeof(RealD).
  const int pad = 8;

  // The result adopts x's checkerboard before the conformability checks run.
  z.checkerboard = x.checkerboard;
  conformable(z,x);
  conformable(x,y);

  GridBase *grid = x._grid;

  // One partial sum per work slot, spaced `pad` entries apart.
  Vector<RealD> sumarray(grid->SumArraySize()*pad);

  parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
    // Share of the outer sites handled by this slot: mywork sites starting at myoff.
    int mywork, myoff;
    GridThread::GetWork(grid->oSites(),thr,mywork,myoff);

    // Private to this iteration; partial sum in the type innerProductD returns.
    decltype(innerProductD(z._odata[0],z._odata[0])) vnrm=zero;
    for(int ss=myoff;ss<mywork+myoff; ss++){
      vobj tmp = a*x._odata[ss]+b*y._odata[ss];
      vnrm = vnrm + innerProductD(tmp,tmp);
      vstream(z._odata[ss],tmp);
    }
    // Collapse the partial sum with Reduce() inside the parallel region, so
    // the serial loop below only adds one RealD per slot.
    vstream(sumarray[thr*pad],real(Reduce(TensorRemove(vnrm)))) ;
  }

  // Sum across threads; linear in thread count but fast.
  RealD nrm = 0.0;
  for(int i=0;i<grid->SumArraySize();i++){
    nrm = nrm+sumarray[i*pad];
  }
  // NOTE(review): presumably combines the local result across all ranks of the grid -- confirm.
  z._grid->GlobalSum(nrm);
  return nrm;
}
|
||||||
|
|
||||||
|
|
||||||
template<class Op,class T1>
|
template<class Op,class T1>
|
||||||
inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
|
inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
|
||||||
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object
|
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object
|
||||||
|
Loading…
Reference in New Issue
Block a user