/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/lattice/Lattice_reduction.h

    Copyright (C) 2015

Author: Azusa Yamaguchi
Author: Peter Boyle
Author: paboyle

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/*  END LEGAL */
#ifndef GRID_LATTICE_REDUCTION_H
#define GRID_LATTICE_REDUCTION_H

namespace Grid {

// Book-keeping for bitwise reproducibility checks of threaded reductions.
// On a recording pass the per-thread partial sums are saved; on a checking
// pass the current partial sums are compared bitwise against the saved states.
template <class T>
class ReproducibilityState {
 public:
  typedef typename T::vector_type vector_type;
  typedef std::vector<vector_type, alignedAllocator<vector_type> > sum_type;

  unsigned int n_call;          // number of reduction calls seen so far
  bool do_check;                // compare against saved states when true
  bool enable_reprocheck;       // master switch: record/check thread states
  bool success;                 // cleared if any comparison fails
  unsigned int interval;        // check every "interval" calls
  std::vector<sum_type> th_states;

  void reset_counter() { n_call = 0; }

  void reset() {
    th_states.clear();
    do_check = false;
    enable_reprocheck = false;
    success = true;
    n_call = 0;
  }

  ReproducibilityState() : interval(1) { reset(); }

  void check(GridBase* grid, sum_type& sumarray) {
    /////////////////////// Reproducibility section, not threaded on purpose
    if (enable_reprocheck) {
      if (do_check && (n_call % interval) == 0) {
        for (int thread = 0; thread < sumarray.size(); thread++) {
          int words = sizeof(sumarray[thread]) / sizeof(unsigned char);
          unsigned char xors[words];
          bitwise_xor(sumarray[thread], th_states[n_call][thread], xors);
          // OR all words
          unsigned char res = 0;
          for (int w = 0; w < words; w++) res = res | xors[w];

          Grid_unquiesce_nodes();
          int rank = 0;
          while (rank < grid->_Nprocessors) {
            if (rank == grid->ThisRank()) {
              if (res) {
                std::cout << "Reproducibility failure report" << std::endl;
                grid->PrintRankInfo();
                std::cout << "Call: " << n_call << " Thread: " << thread << std::endl;
                std::cout << "Size of states: " << th_states.size() << std::endl;
                std::cout << std::setprecision(GRID_REAL_DIGITS + 1) << std::scientific;
                std::cout << "Saved partial sum  : " << th_states[n_call][thread] << std::endl;
                std::cout << "Current partial sum: " << sumarray[thread] << std::endl;
                std::cout << "Saved state " << std::endl;
                show_binaryrep(th_states[n_call][thread]);
                std::cout << "Current state" << std::endl;
                show_binaryrep(sumarray[thread]);
                std::cout << "XOR result" << std::endl;
                show_binaryrep(xors, words);
                // std::cout << std::defaultfloat; // not supported by some compilers
                std::cout << std::setprecision(6);
                success = false;
              }
            }
            rank++;
            grid->Barrier();
          }
          Grid_quiesce_nodes();
        }
      } else if (!do_check) {
        std::cout << GridLogDebug << "Saving thread state for inner product. Call n. "
                  << n_call << std::endl;
        th_states.resize(n_call + 1);
        th_states[n_call].resize(grid->SumArraySize());
        th_states[n_call] = sumarray;  // save threads state
        // n_call++;
      }
      n_call++;
    }
  }
};
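// Minimal usage sketch for the reproducibility machinery above (illustrative
// only; the field name "psi" and the two-pass record/replay structure are
// assumptions for the example, not a documented interface):
//
//   ReproducibilityState<vobj> repr;
//   repr.enable_reprocheck = true;      // first pass: record per-thread partial sums
//   ComplexD ip1 = innerProduct(psi, psi, repr);
//
//   repr.reset_counter();               // replay the same sequence of calls
//   repr.do_check = true;               // second pass: compare bitwise against saved states
//   ComplexD ip2 = innerProduct(psi, psi, repr);
//   // repr.success is cleared if any thread's partial sum differs from the recording.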
" << n_call << std::endl; th_states.resize(n_call+1); th_states[n_call].resize(grid->SumArraySize()); th_states[n_call] = sumarray; // save threads state //n_call++; } n_call++; } } }; #ifdef GRID_WARN_SUBOPTIMAL #warning "Optimisation alert all these reduction loops are NOT threaded " #endif //////////////////////////////////////////////////////////////////////////////////////////////////// // Deterministic Reduction operations //////////////////////////////////////////////////////////////////////////////////////////////////// template inline RealD norm2(const Lattice &arg){ ReproducibilityState repr; ComplexD nrm = innerProduct(arg, arg, repr); return std::real(nrm); } template inline RealD norm2(const Lattice &arg, ReproducibilityState& rep) { ComplexD nrm = innerProduct(arg, arg, rep); return std::real(nrm); } template inline ComplexD innerProduct(const Lattice &left,const Lattice &right){ ReproducibilityState repr; return innerProduct(left, right, repr); } template inline ComplexD innerProduct(const Lattice &left,const Lattice &right, ReproducibilityState& repr) { typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; scalar_type nrm; GridBase *grid = left._grid; std::vector > sumarray(grid->SumArraySize()); for(int i=0;iSumArraySize();i++){ sumarray[i]=zero; } PARALLEL_FOR_LOOP_STATIC //request statically scheduled threads for reproducibility for(int thr=0;thrSumArraySize();thr++){ int nwork, mywork, myoff; GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff); decltype(innerProduct(left._odata[0],right._odata[0])) vnrm = zero; // private to thread; sub summation for(int ss=myoff;ssSumArraySize();i++){ vvnrm = vvnrm+sumarray[i]; } nrm = Reduce(vvnrm);// sum across simd right._grid->GlobalSum(nrm); return nrm; } template inline auto sum(const LatticeUnaryExpression & expr) ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object { return sum(closure(expr)); } template inline auto sum(const LatticeBinaryExpression & expr) ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),eval(0,std::get<1>(expr.second))))::scalar_object { return sum(closure(expr)); } template inline auto sum(const LatticeTrinaryExpression & expr) ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)), eval(0,std::get<1>(expr.second)), eval(0,std::get<2>(expr.second)) ))::scalar_object { return sum(closure(expr)); } template inline typename vobj::scalar_object sum(const Lattice &arg){ GridBase *grid=arg._grid; int Nsimd = grid->Nsimd(); std::vector > sumarray(grid->SumArraySize()); for(int i=0;iSumArraySize();i++){ sumarray[i]=zero; } PARALLEL_FOR_LOOP for(int thr=0;thrSumArraySize();thr++){ int nwork, mywork, myoff; GridThread::GetWork(grid->oSites(),thr,mywork,myoff); vobj vvsum=zero; for(int ss=myoff;ssSumArraySize();i++){ vsum = vsum+sumarray[i]; } typedef typename vobj::scalar_object sobj; sobj ssum=zero; std::vector buf(Nsimd); extract(vsum,buf); for(int i=0;iGlobalSum(ssum); return ssum; } template inline void sliceSum(const Lattice &Data,std::vector &result,int orthogdim) { typedef typename vobj::scalar_object sobj; GridBase *grid = Data._grid; assert(grid!=NULL); // FIXME // std::cout<SumArraySize()<<" threads "<_ndimension; const int Nsimd = grid->Nsimd(); assert(orthogdim >= 0); assert(orthogdim < Nd); int fd=grid->_fdimensions[orthogdim]; int ld=grid->_ldimensions[orthogdim]; int rd=grid->_rdimensions[orthogdim]; std::vector > lvSum(rd); // will locally sum vectors first std::vector lsSum(ld,zero); // 
template <class vobj>
inline void sliceSum(const Lattice<vobj> &Data, std::vector<typename vobj::scalar_object> &result, int orthogdim) {
  typedef typename vobj::scalar_object sobj;

  GridBase *grid = Data._grid;
  assert(grid != NULL);

  // FIXME
  // std::cout<<GridLogMessage<<"WARNING ! SliceSum is unthreaded "<<grid->SumArraySize()<<" threads "<<std::endl;

  const int Nd = grid->_ndimension;
  const int Nsimd = grid->Nsimd();

  assert(orthogdim >= 0);
  assert(orthogdim < Nd);

  int fd = grid->_fdimensions[orthogdim];
  int ld = grid->_ldimensions[orthogdim];
  int rd = grid->_rdimensions[orthogdim];

  std::vector<vobj, alignedAllocator<vobj> > lvSum(rd);  // will locally sum vectors first
  std::vector<sobj> lsSum(ld, zero);                     // sum across these down to scalars
  std::vector<sobj> extracted(Nsimd);                    // splitting the SIMD

  result.resize(fd);  // And then global sum to return the same vector to every node for IO to file

  for (int r = 0; r < rd; r++) {
    lvSum[r] = zero;
  }

  std::vector<int> coor(Nd);

  // sum over reduced dimension planes, breaking out orthog dir
  for (int ss = 0; ss < grid->oSites(); ss++) {
    Lexicographic::CoorFromIndex(coor, ss, grid->_rdimensions);
    int r = coor[orthogdim];
    lvSum[r] = lvSum[r] + Data._odata[ss];
  }

  // Sum across simd lanes in the plane, breaking out orthog dir.
  std::vector<int> icoor(Nd);

  for (int rt = 0; rt < rd; rt++) {
    extract(lvSum[rt], extracted);

    for (int idx = 0; idx < Nsimd; idx++) {
      grid->iCoorFromIindex(icoor, idx);
      int ldx = rt + icoor[orthogdim] * rd;
      lsSum[ldx] = lsSum[ldx] + extracted[idx];
    }
  }

  // sum over nodes.
  sobj gsum;
  for (int t = 0; t < fd; t++) {
    int pt = t / ld;  // processor plane
    int lt = t % ld;
    if (pt == grid->_processor_coor[orthogdim]) {
      gsum = lsSum[lt];
    } else {
      gsum = zero;
    }

    grid->GlobalSum(gsum);

    result[t] = gsum;
  }
}

}  // namespace Grid
#endif