diff --git a/lib/algorithms/LinearOperator.h b/lib/algorithms/LinearOperator.h
index 0fa039c8..26746e6e 100644
--- a/lib/algorithms/LinearOperator.h
+++ b/lib/algorithms/LinearOperator.h
@@ -308,32 +308,34 @@ namespace Grid {
     public:
       SchurStaggeredOperator (Matrix &Mat): _Mat(Mat){};
       virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
+        GridLogIterative.TimingMode(1);
+        std::cout << GridLogIterative << " HermOpAndNorm "<<std::endl;
@@ ... @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank)
 {
   _ndimension = processors.size();
-  assert(_ndimension = parent._ndimension);
-
+
+  int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
+  std::vector<int> parent_processor_coor(_ndimension,0);
+  std::vector<int> parent_processors    (_ndimension,1);
+
+  // Can make 5d grid from 4d etc...
+  int pad = _ndimension-parent_ndimension;
+  for(int d=0;d<parent_ndimension;d++){
+    parent_processor_coor[pad+d]=parent._processor_coor[d];
+    parent_processors    [pad+d]=parent._processors[d];
+  }
@@ ... @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
   std::vector<int> ssize(_ndimension); // coor of split within parent
   for(int d=0;d<_ndimension;d++){
-    ccoor[d] = parent._processor_coor[d] % processors[d];
-    scoor[d] = parent._processor_coor[d] / processors[d];
-    ssize[d] = parent._processors[d]     / processors[d];
+    ccoor[d] = parent_processor_coor[d] % processors[d];
+    scoor[d] = parent_processor_coor[d] / processors[d];
+    ssize[d] = parent_processors[d]     / processors[d];
   }
   int crank;  // rank within subcomm ; srank is rank of subcomm within blocks of subcomms
   // Mpi uses the reverse Lexico convention to us
@@ -166,38 +176,36 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
   MPI_Comm comm_split;
   if ( Nchild > 1 ) {
-    /*
-    std::cout << GridLogMessage<<"Child communicator of "<< std::hex << parent.communicator << std::dec<<std::endl;
@@ ... @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
   //////////////////////////////////////////////////////////////////////////////////////////////////////
   // Set up from the new split communicator
   //////////////////////////////////////////////////////////////////////////////////////////////////////
   InitFromMPICommunicator(processors,comm_split);
+
+  if(0){
+    std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl;
+    for(int d=0;d<processors.size();d++){
+      std::cout << d << " " << _processor_coor[d] << std::endl;
+    }
+  }
@@ ... @@ void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &proc
   MPI_Comm_rank(communicator,&_processor);
   MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
 
-  if ( communicator_base != communicator_world ) {
+  if ( 0 && (communicator_base != communicator_world) ) {
     std::cout << "InitFromMPICommunicator Cartesian communicator created with a non-world communicator"<<std::endl;
 template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
   half.checkerboard = cb;
-  int ssh=0;
-  //parallel_for
-  for(int ss=0;ss<full._grid->oSites();ss++){
-    std::vector<int> coor;
+
+  parallel_for(int ss=0;ss<full._grid->oSites();ss++){
     int cbos;
-
+    std::vector<int> coor;
     full._grid->oCoorFromOindex(coor,ss);
     cbos=half._grid->CheckerBoard(coor);
 
     if (cbos==cb) {
+      int ssh=half._grid->oIndex(coor);
       half._odata[ssh] = full._odata[ss];
-      ssh++;
     }
   }
 }
 template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
   int cb = half.checkerboard;
-  int ssh=0;
-  //parallel_for
-  for(int ss=0;ss<full._grid->oSites();ss++){
+  parallel_for(int ss=0;ss<full._grid->oSites();ss++){
     std::vector<int> coor;
     int cbos;
 
@@ -77,8 +73,8 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
     cbos=half._grid->CheckerBoard(coor);
 
     if (cbos==cb) {
+      int ssh=half._grid->oIndex(coor);
       full._odata[ss]=half._odata[ssh];
-      ssh++;
     }
   }
 }
@@ -698,30 +694,6 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
 ////////////////////////////////////////////////////////////////////////////////
 // Communicate between grids
 ////////////////////////////////////////////////////////////////////////////////
-//
-// All to all plan
-//
-// Subvolume on fine grid is v.  Vectors a,b,c,d
-//
-///////////////////////////////////////////////////////////////////////////////////////////////////////////
-// SIMPLEST CASE:
-///////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Mesh of nodes (2) ; subdivide to 1 subdivisions
-//
-// Lex ord:
-//          N0 va0 vb0  N1 va1 vb1
-//
-// For each dimension do an all to all
-//
-// full AllToAll(0)
-//          N0 va0 va1  N1 vb0 vb1
-//
-// REARRANGE
-//          N0 va01     N1 vb01
-//
-// Must also rearrange data to get into the NEW lex order of grid at each stage. Some kind of "insert/extract".
-// NB: Easiest to programme if keep in lex order.
-//
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 // SIMPLE CASE:
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -755,9 +727,17 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
 //
 // Must also rearrange data to get into the NEW lex order of grid at each stage. Some kind of "insert/extract".
 // NB: Easiest to programme if keep in lex order.
-//
-/////////////////////////////////////////////////////////
-
+/*
+ * Let chunk = (fvol*nvec)/sP be size of a chunk.          ( Divide lexico vol * nvec into fP/sP = M chunks )
+ *
+ * 2nd A2A (over sP nodes; subdivide the fP into sP chunks of M)
+ *
+ *     node 0     1st chunk of node 0M..(1M-1); 2nd chunk of node 0M..(1M-1)..   data chunk x M x sP = fL / sP * M * sP = fL * M growth
+ *     node 1     1st chunk of node 1M..(2M-1); 2nd chunk of node 1M..(2M-1)..
+ *     node 2     1st chunk of node 2M..(3M-1); 2nd chunk of node 2M..(3M-1)..
+ *     node 3     1st chunk of node 3M..(4M-1); 2nd chunk of node 3M..(4M-1)..
+ * etc...
+ */
 template<class Vobj>
 void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
 {
@@ -816,57 +796,58 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
   int nvec = nvector; // Counts down to 1 as we collapse dims
   std::vector<int> ldims = full_grid->_ldimensions;
-  std::vector<int> lcoor(ndim);
 
   for(int d=ndim-1;d>=0;d--){
 
     if ( ratio[d] != 1 ) {
 
       full_grid ->AllToAll(d,alldata,tmpdata);
-      //      std::cout << GridLogMessage << "Grid_split: dim " <<d<<" "<<split_grid->_processors[d]<<std::endl;
+      if ( split_grid->_processors[d] > 1 ) {
+        alldata=tmpdata;
+        split_grid->AllToAll(d,alldata,tmpdata);
+      }
 
-      auto rdims = ldims; rdims[d]   *= ratio[d];
-      auto rsites= lsites*ratio[d];
-      for(int v=0;v<nvec;v++){
+      auto rdims = ldims;
+      auto M     = ratio[d];
+      auto rsites= lsites*M;    // increases rsites by M
+      nvec      /= M;           // Reduce nvec by subdivision factor
+      rdims[d]  *= M;           // increase local dim by same factor
+
+      int sP = split_grid->_processors[d];
+      int fP = full_grid->_processors[d];
 
-        Lexicographic::CoorFromIndex(lcoor, lsite, ldims);
+      int fvol   = lsites;
+
+      int chunk  = (nvec*fvol)/sP;           assert(chunk*sP == nvec*fvol);
 
-        for(int r=0;r<ratio[d];r++){
+      // Loop over reordered data post A2A
+      for(int c=0;c<chunk;c++){
+        for(int m=0;m<M;m++){
+          for(int s=0;s<sP;s++){
+
+            // addressing; use lexico
+            int lex_r;
+            uint64_t lex_c        = c+chunk*m+chunk*M*s;
+            uint64_t lex_fvol_vec = c+chunk*s;
+            uint64_t lex_fvol     = lex_fvol_vec%fvol;
+            uint64_t lex_vec      = lex_fvol_vec/fvol;
+
+            std::vector<int> coor(ndim);
+            Lexicographic::CoorFromIndex(coor, lex_fvol, ldims);
+            coor[d] += m*ldims[d];
+            Lexicographic::IndexFromCoor(coor, lex_r, rdims);
+            lex_r += lex_vec * rsites;
 
-          int rsite; Lexicographic::IndexFromCoor(rcoor, rsite, rdims);
-          rsite += v * rsites;
+            // LexicoFind coordinate & vector number within split lattice
+            alldata[lex_r] = tmpdata[lex_c];
 
-          int rmul=nvec*lsites;
-          int vmul=     lsites;
-          alldata[rsite] = tmpdata[lsite+r*rmul+v*vmul];
-          //      if ( lsite==0 ) {
-          //        std::cout << "Grid_split: grow alldata["<<rsite<<"] "<<std::endl;
+          }
+        }
+      }
+      ldims[d]*= M;
+      lsites  *= M;
 
-      if ( split_grid->_processors[d] > 1 ) {
-        tmpdata = alldata;
-        split_grid->AllToAll(d,tmpdata,alldata);
-      }
     }
   }
   vectorizeFromLexOrdArray(alldata,split);
@@ -937,59 +918,61 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
   /////////////////////////////////////////////////////////////////
   // Start from split grid and work towards full grid
   /////////////////////////////////////////////////////////////////
-  std::vector<int> lcoor(ndim);
-  std::vector<int> rcoor(ndim);
 
   int nvec = 1;
-  lsites = split_grid->lSites();
-  std::vector<int> ldims = split_grid->_ldimensions;
+  uint64_t rsites = split_grid->lSites();
+  std::vector<int> rdims = split_grid->_ldimensions;
 
-  //  for(int d=ndim-1;d>=0;d--){
   for(int d=0;d<ndim;d++){
 
     if ( ratio[d] != 1 ) {
 
-      if ( split_grid->_processors[d] > 1 ) {
-        tmpdata = alldata;
-        split_grid->AllToAll(d,tmpdata,alldata);
-      }
-
-      //////////////////////////////////////////
-      //Local volume for this dimension is expanded by ratio of processor extents
-      // Number of vectors is decreased by same factor
-      // Rearrange to lexico for bigger volume
-      //////////////////////////////////////////
-      auto rsites= lsites/ratio[d];
-      auto rdims = ldims; rdims[d]/=ratio[d];
-
-      for(int v=0;v<nvec;v++){
-
-        // rsite, rcoor --> smaller local volume
-        // lsite, lcoor --> bigger original (single node?) volume
-        // For loop over each site within smaller subvol
-        for(int rsite=0;rsite<rsites;rsite++){
+      int M  = ratio[d];
+      int sP = split_grid->_processors[d];
+      int fP = full_grid->_processors[d];
+
+      auto ldims = rdims;  ldims[d] /= M;  // Decrease local dims by same factor
+      auto lsites= rsites/M;               // Decreases rsites by M
+
+      int fvol   = lsites;
+      int chunk  = (nvec*fvol)/sP;         assert(chunk*sP == nvec*fvol);
+
+      {
+        // Loop over reordered data post A2A
+        for(int c=0;c<chunk;c++){
+          for(int m=0;m<M;m++){
+            for(int s=0;s<sP;s++){
+
+              // addressing; use lexico
+              int lex_r;
+              uint64_t lex_c        = c+chunk*m+chunk*M*s;
+              uint64_t lex_fvol_vec = c+chunk*s;
+              uint64_t lex_fvol     = lex_fvol_vec%fvol;
+              uint64_t lex_vec      = lex_fvol_vec/fvol;
+
+              std::vector<int> coor(ndim);
+              Lexicographic::CoorFromIndex(coor, lex_fvol, ldims);
+              coor[d] += m*ldims[d];
+              Lexicographic::IndexFromCoor(coor, lex_r, rdims);
+              lex_r += lex_vec * rsites;
+
+              // LexicoFind coordinate & vector number within split lattice
+              tmpdata[lex_c] = alldata[lex_r];
+            }
+          }
+        }
+      }
-      nvec  *= ratio[d];
-      ldims[d]=rdims[d];
-      lsites =rsites;
+      if ( split_grid->_processors[d] > 1 ) {
+        split_grid->AllToAll(d,tmpdata,alldata);
+        tmpdata=alldata;
+      }
       full_grid ->AllToAll(d,tmpdata,alldata);
+      rdims[d]/= M;
+      rsites  /= M;
+      nvec    *= M;  // Increase nvec by subdivision factor
     }
   }
@@ -997,12 +980,12 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
   for(int v=0;v<nvector;v++){
   MdagMLinearOperator<DomainWallFermionR,FermionField> HermOp(Ddwf);
   MdagMLinearOperator<DomainWallFermionR,FermionField> HermOpCk(Dchk);
-  ConjugateGradient<FermionField> CG((1.0e-5),10000);
+  ConjugateGradient<FermionField> CG((1.0e-2),10000);
   s_res = zero;
   CG(HermOp,s_src,s_res);
 
-  std::cout << " s_res norm "<<norm2(s_res)<<std::endl;
   ConjugateGradient<FermionField> CG(1.0e-8,10000);
   SchurRedBlackStaggeredSolve<FermionField> SchurSolver(CG);
+
+  double volume=1.0;
+  for(int mu=0;mu<Nd;mu++){
+    volume=volume*latt_size[mu];
+  }
+
+  // Schur solver: uses DeoDoe => volume * 1146
+  double ncall=CG.IterationsToComplete;
+  double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; //  == 66*16 + 90 == 1146
+
+  std::cout<<
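
Note on the split-communicator change: the new constructor code right-aligns a lower-dimensional parent layout into the child's dimensionality (e.g. a 5d child over a 4d parent) before computing sub-communicator coordinates. The following standalone sketch illustrates only that padding step; the extents and coordinates are made-up examples and none of this is the Grid class itself.

// pad_demo.cpp : g++ -std=c++11 pad_demo.cpp && ./a.out
#include <cassert>
#include <iostream>
#include <vector>

int main() {
  std::vector<int> parent_processors = {2, 2, 2, 4};   // example 4d parent processor grid
  std::vector<int> parent_coor       = {1, 0, 1, 3};   // this rank's coordinate in the parent
  int ndimension = 5;                                  // child wants a 5d layout

  int parent_ndimension = parent_processors.size();
  assert(ndimension >= parent_ndimension);

  std::vector<int> padded_coor (ndimension, 0);        // new leading dims get coordinate 0
  std::vector<int> padded_procs(ndimension, 1);        // ... and processor extent 1
  int pad = ndimension - parent_ndimension;
  for (int d = 0; d < parent_ndimension; d++) {
    padded_coor [pad + d] = parent_coor[d];
    padded_procs[pad + d] = parent_processors[d];
  }

  for (int d = 0; d < ndimension; d++)
    std::cout << "dim " << d << ": procs " << padded_procs[d]
              << " coor " << padded_coor[d] << "\n";
  return 0;
}

With these padded arrays the modulo/division against the requested child layout can be done uniformly in all child dimensions, which is exactly what the ccoor/scoor/ssize loop in the hunk above relies on.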
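Note on the Grid_split/Grid_unsplit rewrite: after each AllToAll the new code rebuilds lexicographic site indices by shifting the coordinate in the collapsed dimension (coor[d] += m*ldims[d]) and re-indexing into the enlarged local volume. The sketch below re-implements the two Lexicographic helpers with the same fastest-index-first convention and checks that index growth for a toy 4x4 volume grown by M=2; it is an illustration of the bookkeeping only, not the library code, and the helper names merely echo the diff.

// lex_demo.cpp : g++ -std=c++11 lex_demo.cpp && ./a.out
#include <cassert>
#include <iostream>
#include <vector>

// Stand-ins for Grid's Lexicographic helpers (fastest index first).
static void CoorFromIndex(std::vector<int> &coor, int index, const std::vector<int> &dims) {
  for (std::size_t d = 0; d < dims.size(); d++) {
    coor[d] = index % dims[d];
    index  /= dims[d];
  }
}
static void IndexFromCoor(const std::vector<int> &coor, int &index, const std::vector<int> &dims) {
  index = 0;
  for (int d = (int)dims.size() - 1; d >= 0; d--) index = index * dims[d] + coor[d];
}

int main() {
  // Toy local volume 4x4, grown by M=2 along dimension d=1,
  // mimicking "coor[d] += m*ldims[d]" followed by IndexFromCoor on rdims.
  std::vector<int> ldims = {4, 4};
  std::vector<int> rdims = ldims;
  const int d = 1, M = 2;
  rdims[d] *= M;

  int lsites = ldims[0] * ldims[1];
  std::vector<int> coor(2);

  for (int m = 0; m < M; m++) {                 // which block of the enlarged dimension
    for (int lsite = 0; lsite < lsites; lsite++) {
      CoorFromIndex(coor, lsite, ldims);        // coordinate in the small volume
      coor[d] += m * ldims[d];                  // shift into block m of the big volume
      int rsite;
      IndexFromCoor(coor, rsite, rdims);        // lexicographic index in the big volume
      assert(rsite == lsite % ldims[0]
                      + (lsite / ldims[0] + m * ldims[d]) * rdims[0]);
    }
  }
  std::cout << "index growth by M=" << M << " along dim " << d << " verified\n";
  return 0;
}

Keeping everything in plain lexicographic order, as the retained comment block recommends, is what lets the chunked copy after the two AllToAll calls be expressed with this simple coordinate shift.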
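Note on the staggered test change: the added lines estimate a flop rate from the CG iteration count, using the per-site count 16*(3*(6+8+8)) + 15*3*2 = 1146 quoted in the diff comment. A minimal sketch of that arithmetic follows; the lattice volume, iteration count and elapsed time are placeholders (the real test presumably times the SchurSolver call), so only the formula itself is taken from the diff.

// flops_demo.cpp : g++ -std=c++11 flops_demo.cpp && ./a.out
#include <cstdio>

int main() {
  const double flops_per_site = 16.0 * (3 * (6 + 8 + 8))  // 1056 flops: SU(3) matrix-vector work
                              + 15.0 * 3 * 2;             //   90 flops: accumulation -> 1146 total
  const double volume = 16.0 * 16 * 16 * 32;              // placeholder 16^3 x 32 lattice
  const double ncall  = 500;                              // placeholder CG iteration count
  const double t_usec = 2.0e6;                            // placeholder elapsed time in microseconds

  const double flops = flops_per_site * volume * ncall;
  std::printf("estimated MFlop/s = %.1f\n", flops / t_usec); // flops per microsecond == MFlop/s
  return 0;
}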