Faster reductions, important on single node staggered

2025-11-19 22:19:32 +00:00 · 2018-04-26 10:03:57 +01:00
parent 213f8db6a2
commit eac6ec4b5e
2 changed files with 64 additions and 20 deletions
--- a/lib/lattice/Lattice_arith.h
+++ b/lib/lattice/Lattice_arith.h
@@ -244,19 +244,11 @@ namespace Grid {
  // axpy_norm: combined "ret = a*x + y" update that returns norm2 of the result.
  // The '+' line in this hunk makes the first statement delegate to
  // axpy_norm_fast, which performs the update and the reduction itself.
  // NOTE(review): the lines after the new return are displayed without '-'
  // markers; the hunk header (19 lines -> 11 lines) suggests they were removed
  // by this commit -- confirm against the repository.
  template<class sobj,class vobj> strong_inline
  RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
-    ret.checkerboard = x.checkerboard;
+    return axpy_norm_fast(ret,a,x,y);
    conformable(ret,x);
    conformable(x,y);
    axpy(ret,a,x,y);
    return norm2(ret);
  }
  // axpby_norm: combined "ret = a*x + b*y" update that returns norm2 of the result.
  // The '+' line in this hunk makes the first statement delegate to
  // axpby_norm_fast, which accumulates the norm inside the same site loop
  // that writes the result.
  // NOTE(review): the lines after the new return are displayed without '-'
  // markers; the hunk header (19 lines -> 11 lines) suggests they were removed
  // by this commit -- confirm against the repository.
  template<class sobj,class vobj> strong_inline
  RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
-    ret.checkerboard = x.checkerboard;
+    return axpby_norm_fast(ret,a,b,x,y);
    conformable(ret,x);
    conformable(x,y);
    axpby(ret,a,b,x,y);
    return norm2(ret); // FIXME implement parallel norm in ss loop
  }
 }
--- a/lib/lattice/Lattice_reduction.h
+++ b/lib/lattice/Lattice_reduction.h
@@ -33,7 +33,7 @@ namespace Grid {
  // Deterministic Reduction operations
  ////////////////////////////////////////////////////////////////////////////////////////////////////
 // norm2: returns the real part of innerProduct(arg,arg) as a RealD.
 // This hunk only changes the declared type of the local 'nrm' from
 // ComplexD to auto; the returned expression is unchanged.
 template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
-  ComplexD nrm = innerProduct(arg,arg);
+  auto nrm = innerProduct(arg,arg);
  return std::real(nrm); 
 }
@@ -43,11 +43,11 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
 {
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_typeD vector_type;
  scalar_type  nrm;
  GridBase *grid = left._grid;
  const int pad = 8;
-  std::vector<vector_type,alignedAllocator<vector_type> > sumarray(grid->SumArraySize());
+  scalar_type  nrm;
  std::vector<scalar_type,alignedAllocator<vector_type> > sumarray(grid->SumArraySize()*pad);
  parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
    int nwork, mywork, myoff;
@@ -57,18 +57,70 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
    for(int ss=myoff;ss<mywork+myoff; ss++){
      vnrm = vnrm + innerProductD(left._odata[ss],right._odata[ss]);
    }
-    sumarray[thr]=TensorRemove(vnrm) ;
+    // All threads sum across SIMD; reduce serial work at end
    // one write per cacheline with streaming store
    vstream(sumarray[thr*pad],Reduce(TensorRemove(vnrm))) ;
  }
-  vector_type vvnrm; vvnrm=zero;  // sum across threads
+  nrm=0.0;
  for(int i=0;i<grid->SumArraySize();i++){
-    vvnrm = vvnrm+sumarray[i];
+    nrm = nrm+sumarray[i*pad];
  } 
  nrm = Reduce(vvnrm);// sum across simd
  right._grid->GlobalSum(nrm);
  return nrm;
 }
 /////////////////////////
 // Fast axpby_norm
 // z = a x + b y
 // return norm2(z), i.e. the squared norm of z (no square root is taken)
 /////////////////////////
 // axpy_norm_fast: computes z = a*x + y and returns the value produced by
 // axpby_norm_fast for that z. Implemented by forwarding to axpby_norm_fast
 // with the second coefficient fixed to one.
 template<class sobj,class vobj> strong_inline RealD 
 axpy_norm_fast(Lattice<vobj> &z,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y) 
 {
  // Unit coefficient for y, built in the caller's scalar type.
  sobj one(1.0);
  return axpby_norm_fast(z,a,one,x,y);
 }
 // axpby_norm_fast: fused update and reduction.
 //   z   : output lattice; every outer site is overwritten with a*x + b*y,
 //         and its checkerboard is set to that of x.
 //   a,b : scalar coefficients applied to x and y respectively.
 //   x,y : input lattices; conformable() is called on (z,x) and (x,y).
 // Returns the sum over all sites of real(innerProductD(tmp,tmp)), where
 // tmp = a*x + b*y at that site, after z._grid->GlobalSum has been applied.
 // No square root is taken, so this is the squared norm of z.
 template<class sobj,class vobj> strong_inline RealD 
 axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y) 
 {
  // Spacing between per-thread slots in sumarray.
  // NOTE(review): presumably chosen so each thread's slot sits on its own
  // cache line (8 entries of RealD) -- confirm sizeof(RealD).
  const int pad = 8;
  z.checkerboard = x.checkerboard;
  conformable(z,x);
  conformable(x,y);
  // NOTE(review): scalar_type and vector_type are not used in this function.
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_typeD vector_type;
  RealD  nrm;
  GridBase *grid = x._grid;
  // One partial sum per thread index, stored at index thr*pad.
  Vector<RealD> sumarray(grid->SumArraySize()*pad);
  parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
    // NOTE(review): nwork is declared but never used below.
    int nwork, mywork, myoff;
    // Fills mywork/myoff; the loop below treats them as the count and the
    // starting index of the outer sites handled by this thread index.
    GridThread::GetWork(x._grid->oSites(),thr,mywork,myoff);
    // private to thread; sub summation
    decltype(innerProductD(z._odata[0],z._odata[0])) vnrm=zero; 
    for(int ss=myoff;ss<mywork+myoff; ss++){
      // Form the site result once, accumulate its inner product with
      // itself, then store it to z so the data is only traversed once.
      vobj tmp = a*x._odata[ss]+b*y._odata[ss];
      vnrm = vnrm + innerProductD(tmp,tmp);
      vstream(z._odata[ss],tmp);
    }
    // Collapse this thread's accumulator to a single real value via
    // TensorRemove and Reduce, and store it in the thread's padded slot.
    vstream(sumarray[thr*pad],real(Reduce(TensorRemove(vnrm)))) ;
  }
  nrm = 0.0; // sum across threads; linear in thread count but fast
  for(int i=0;i<grid->SumArraySize();i++){
    nrm = nrm+sumarray[i*pad];
  } 
  // Combine the local total through the grid's GlobalSum before returning.
  z._grid->GlobalSum(nrm);
  return nrm; 
 }
 template<class Op,class T1>
 inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
  ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object