/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/lattice/Lattice_reduction.h Copyright (C) 2015 Author: Azusa Yamaguchi Author: Peter Boyle Author: paboyle This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ #ifndef GRID_LATTICE_REDUCTION_H #define GRID_LATTICE_REDUCTION_H #include namespace Grid { #ifdef GRID_WARN_SUBOPTIMAL #warning "Optimisation alert all these reduction loops are NOT threaded " #endif //////////////////////////////////////////////////////////////////////////////////////////////////// // Deterministic Reduction operations //////////////////////////////////////////////////////////////////////////////////////////////////// template inline RealD norm2(const Lattice &arg){ auto nrm = innerProduct(arg,arg); return std::real(nrm); } // Double inner product template inline ComplexD innerProduct(const Lattice &left,const Lattice &right) { typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_typeD vector_type; GridBase *grid = left._grid; const int pad = 8; ComplexD inner; Vector sumarray(grid->SumArraySize()*pad); parallel_for(int thr=0;thrSumArraySize();thr++){ int nwork, mywork, myoff; GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff); decltype(innerProductD(left._odata[0],right._odata[0])) vinner=zero; // private to thread; sub summation for(int ss=myoff;ssSumArraySize();i++){ inner = inner+sumarray[i*pad]; } right._grid->GlobalSum(inner); return inner; } ///////////////////////// // Fast axpby_norm // z = a x + b y // return norm z ///////////////////////// template strong_inline RealD axpy_norm_fast(Lattice &z,sobj a,const Lattice &x,const Lattice &y) { sobj one(1.0); return axpby_norm_fast(z,a,one,x,y); } template strong_inline RealD axpby_norm_fast(Lattice &z,sobj a,sobj b,const Lattice &x,const Lattice &y) { const int pad = 8; z.checkerboard = x.checkerboard; conformable(z,x); conformable(x,y); typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_typeD vector_type; RealD nrm; GridBase *grid = x._grid; Vector sumarray(grid->SumArraySize()*pad); parallel_for(int thr=0;thrSumArraySize();thr++){ int nwork, mywork, myoff; GridThread::GetWork(x._grid->oSites(),thr,mywork,myoff); // private to thread; sub summation decltype(innerProductD(z._odata[0],z._odata[0])) vnrm=zero; for(int ss=myoff;ssSumArraySize();i++){ nrm = nrm+sumarray[i*pad]; } z._grid->GlobalSum(nrm); return nrm; } template inline auto sum(const LatticeUnaryExpression & expr) ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object { return sum(closure(expr)); } template inline auto sum(const LatticeBinaryExpression & expr) ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),eval(0,std::get<1>(expr.second))))::scalar_object { return sum(closure(expr)); } template inline auto sum(const LatticeTrinaryExpression & expr) ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)), eval(0,std::get<1>(expr.second)), eval(0,std::get<2>(expr.second)) ))::scalar_object { return sum(closure(expr)); } template inline typename vobj::scalar_object sum(const Lattice &arg) { GridBase *grid=arg._grid; int Nsimd = grid->Nsimd(); std::vector > sumarray(grid->SumArraySize()); for(int i=0;iSumArraySize();i++){ sumarray[i]=zero; } parallel_for(int thr=0;thrSumArraySize();thr++){ int nwork, mywork, myoff; GridThread::GetWork(grid->oSites(),thr,mywork,myoff); vobj vvsum=zero; for(int ss=myoff;ssSumArraySize();i++){ vsum = vsum+sumarray[i]; } typedef typename vobj::scalar_object sobj; sobj ssum=zero; std::vector buf(Nsimd); extract(vsum,buf); for(int i=0;iGlobalSum(ssum); return ssum; } ////////////////////////////////////////////////////////////////////////////////////////////////////////////// // sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc... ////////////////////////////////////////////////////////////////////////////////////////////////////////////// template inline void sliceSum(const Lattice &Data,std::vector &result,int orthogdim) { /////////////////////////////////////////////////////// // FIXME precision promoted summation // may be important for correlation functions // But easily avoided by using double precision fields /////////////////////////////////////////////////////// typedef typename vobj::scalar_object sobj; GridBase *grid = Data._grid; assert(grid!=NULL); const int Nd = grid->_ndimension; const int Nsimd = grid->Nsimd(); assert(orthogdim >= 0); assert(orthogdim < Nd); int fd=grid->_fdimensions[orthogdim]; int ld=grid->_ldimensions[orthogdim]; int rd=grid->_rdimensions[orthogdim]; std::vector > lvSum(rd); // will locally sum vectors first std::vector lsSum(ld,zero); // sum across these down to scalars std::vector extracted(Nsimd); // splitting the SIMD result.resize(fd); // And then global sum to return the same vector to every node for(int r=0;r_slice_nblock[orthogdim]; int e2= grid->_slice_block [orthogdim]; int stride=grid->_slice_stride[orthogdim]; // sum over reduced dimension planes, breaking out orthog dir // Parallel over orthog direction parallel_for(int r=0;r_ostride[orthogdim]; // base offset for start of plane for(int n=0;n icoor(Nd); for(int rt=0;rtiCoorFromIindex(icoor,idx); int ldx =rt+icoor[orthogdim]*rd; lsSum[ldx]=lsSum[ldx]+extracted[idx]; } } // sum over nodes. sobj gsum; for(int t=0;t_processor_coor[orthogdim] ) { gsum=lsSum[lt]; } else { gsum=zero; } grid->GlobalSum(gsum); result[t]=gsum; } } template static void sliceInnerProductVector( std::vector & result, const Lattice &lhs,const Lattice &rhs,int orthogdim) { typedef typename vobj::vector_type vector_type; typedef typename vobj::scalar_type scalar_type; GridBase *grid = lhs._grid; assert(grid!=NULL); conformable(grid,rhs._grid); const int Nd = grid->_ndimension; const int Nsimd = grid->Nsimd(); assert(orthogdim >= 0); assert(orthogdim < Nd); int fd=grid->_fdimensions[orthogdim]; int ld=grid->_ldimensions[orthogdim]; int rd=grid->_rdimensions[orthogdim]; std::vector > lvSum(rd); // will locally sum vectors first std::vector lsSum(ld,scalar_type(0.0)); // sum across these down to scalars std::vector > extracted(Nsimd); // splitting the SIMD result.resize(fd); // And then global sum to return the same vector to every node for IO to file for(int r=0;r_slice_nblock[orthogdim]; int e2= grid->_slice_block [orthogdim]; int stride=grid->_slice_stride[orthogdim]; parallel_for(int r=0;r_ostride[orthogdim]; // base offset for start of plane for(int n=0;n icoor(Nd); for(int rt=0;rt temp; temp._internal = lvSum[rt]; extract(temp,extracted); for(int idx=0;idxiCoorFromIindex(icoor,idx); int ldx =rt+icoor[orthogdim]*rd; lsSum[ldx]=lsSum[ldx]+extracted[idx]._internal; } } // sum over nodes. scalar_type gsum; for(int t=0;t_processor_coor[orthogdim] ) { gsum=lsSum[lt]; } else { gsum=scalar_type(0.0); } grid->GlobalSum(gsum); result[t]=gsum; } } template static void sliceNorm (std::vector &sn,const Lattice &rhs,int Orthog) { typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; int Nblock = rhs._grid->GlobalDimensions()[Orthog]; std::vector ip(Nblock); sn.resize(Nblock); sliceInnerProductVector(ip,rhs,rhs,Orthog); for(int ss=0;ss static void sliceMaddVector(Lattice &R,std::vector &a,const Lattice &X,const Lattice &Y, int orthogdim,RealD scale=1.0) { typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; typedef typename vobj::tensor_reduced tensor_reduced; scalar_type zscale(scale); GridBase *grid = X._grid; int Nsimd =grid->Nsimd(); int Nblock =grid->GlobalDimensions()[orthogdim]; int fd =grid->_fdimensions[orthogdim]; int ld =grid->_ldimensions[orthogdim]; int rd =grid->_rdimensions[orthogdim]; int e1 =grid->_slice_nblock[orthogdim]; int e2 =grid->_slice_block [orthogdim]; int stride =grid->_slice_stride[orthogdim]; std::vector icoor; for(int r=0;r_ostride[orthogdim]; // base offset for start of plane vector_type av; for(int l=0;liCoorFromIindex(icoor,l); int ldx =r+icoor[orthogdim]*rd; scalar_type *as =(scalar_type *)&av; as[l] = scalar_type(a[ldx])*zscale; } tensor_reduced at; at=av; parallel_for_nest2(int n=0;n_ndimension; int nsimd = BlockSolverGrid->Nsimd(); std::vector latt_phys(0); std::vector simd_phys(0); std::vector mpi_phys(0); for(int d=0;d_fdimensions[d]); simd_phys.push_back(BlockSolverGrid->_simd_layout[d]); mpi_phys.push_back(BlockSolverGrid->_processors[d]); } } return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); } */ template static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice &X,const Lattice &Y,int Orthog,RealD scale=1.0) { typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; int Nblock = X._grid->GlobalDimensions()[Orthog]; GridBase *FullGrid = X._grid; // GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); // Lattice Xslice(SliceGrid); // Lattice Rslice(SliceGrid); assert( FullGrid->_simd_layout[Orthog]==1); int nh = FullGrid->_ndimension; // int nl = SliceGrid->_ndimension; int nl = nh-1; //FIXME package in a convenient iterator //Should loop over a plane orthogonal to direction "Orthog" int stride=FullGrid->_slice_stride[Orthog]; int block =FullGrid->_slice_block [Orthog]; int nblock=FullGrid->_slice_nblock[Orthog]; int ostride=FullGrid->_ostride[Orthog]; #pragma omp parallel { std::vector s_x(Nblock); #pragma omp for collapse(2) for(int n=0;n static void sliceMulMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice &X,int Orthog,RealD scale=1.0) { typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; int Nblock = X._grid->GlobalDimensions()[Orthog]; GridBase *FullGrid = X._grid; // GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); // Lattice Xslice(SliceGrid); // Lattice Rslice(SliceGrid); assert( FullGrid->_simd_layout[Orthog]==1); int nh = FullGrid->_ndimension; // int nl = SliceGrid->_ndimension; int nl=1; //FIXME package in a convenient iterator //Should loop over a plane orthogonal to direction "Orthog" int stride=FullGrid->_slice_stride[Orthog]; int block =FullGrid->_slice_block [Orthog]; int nblock=FullGrid->_slice_nblock[Orthog]; int ostride=FullGrid->_ostride[Orthog]; #pragma omp parallel { std::vector s_x(Nblock); #pragma omp for collapse(2) for(int n=0;n static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) { typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; GridBase *FullGrid = lhs._grid; // GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); int Nblock = FullGrid->GlobalDimensions()[Orthog]; // Lattice Lslice(SliceGrid); // Lattice Rslice(SliceGrid); mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); assert( FullGrid->_simd_layout[Orthog]==1); int nh = FullGrid->_ndimension; // int nl = SliceGrid->_ndimension; int nl = nh-1; //FIXME package in a convenient iterator //Should loop over a plane orthogonal to direction "Orthog" int stride=FullGrid->_slice_stride[Orthog]; int block =FullGrid->_slice_block [Orthog]; int nblock=FullGrid->_slice_nblock[Orthog]; int ostride=FullGrid->_ostride[Orthog]; typedef typename vobj::vector_typeD vector_typeD; #pragma omp parallel { std::vector Left(Nblock); std::vector Right(Nblock); Eigen::MatrixXcd mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock); #pragma omp for collapse(2) for(int n=0;nGlobalSum(sum); mat(i,j)=sum; }} return; } } /*END NAMESPACE GRID*/ #endif