/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/algorithms/GeneralCoarsenedMatrix.h

    Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/*  END LEGAL */
#pragma once

#include <Grid/qcd/QCD.h> // needed for Dagger(Yes|No), Inverse(Yes|No)

#include <Grid/lattice/PaddedCell.h>
#include <Grid/stencil/GeneralLocalStencil.h>

NAMESPACE_BEGIN(Grid);

// Fine Object == (per site) type of fine field
// nbasis      == number of deflation vectors
template<class Fobj,class CComplex,int nbasis>
class GeneralCoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > > {
public:

  typedef GeneralCoarsenedMatrix<Fobj,CComplex,nbasis> GeneralCoarseOp;
  typedef iVector<CComplex,nbasis >           siteVector;
  typedef iMatrix<CComplex,nbasis >           siteMatrix;
  typedef Lattice<iScalar<CComplex> >         CoarseComplexField;
  typedef Lattice<siteVector>                 CoarseVector;
  typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
  typedef iMatrix<CComplex,nbasis >  Cobj;
  typedef iVector<CComplex,nbasis >  Cvec;
  typedef Lattice< CComplex >   CoarseScalar; // used for inner products on fine field
  typedef Lattice<Fobj>         FineField;
  typedef Lattice<CComplex>     FineComplexField;
  typedef CoarseVector Field;

  ////////////////////
  // Data members
  ////////////////////
  int hermitian;
  GridBase      * _FineGrid;
  GridCartesian * _CoarseGrid;
  NonLocalStencilGeometry &geom;
  PaddedCell Cell;
  GeneralLocalStencil Stencil;

  std::vector<CoarseMatrix> _A;
  std::vector<CoarseMatrix> _Adag;
  std::vector<CoarseVector> MultTemporaries;

  ///////////////////////
  // Interface
  ///////////////////////
  GridBase      * Grid(void)       { return _CoarseGrid; };   // this is all the linalg routines need to know
  GridBase      * FineGrid(void)   { return _FineGrid; };     // this is all the linalg routines need to know
  GridCartesian * CoarseGrid(void) { return _CoarseGrid; };   // this is all the linalg routines need to know

  /*
  void ShiftMatrix(RealD shift)
  {
    int Nd=_FineGrid->Nd();
    Coordinate zero_shift(Nd,0);
    for(int p=0;p<geom.npoint;p++){
      if ( zero_shift==geom.shifts[p] ) {
        _A[p] = _A[p]+shift;
      }
    }
  }
  */

  GeneralCoarsenedMatrix(NonLocalStencilGeometry &_geom,GridBase *FineGrid, GridCartesian * CoarseGrid)
    : geom(_geom),
      _FineGrid(FineGrid),
      _CoarseGrid(CoarseGrid),
      hermitian(1),
      Cell(_geom.Depth(),_CoarseGrid),
      Stencil(Cell.grids.back(),geom.shifts) // padded cell stencil
  {
    _A.resize(geom.npoint,CoarseGrid);
    _Adag.resize(geom.npoint,CoarseGrid);
    MultTemporaries.resize(0,CoarseGrid);
  }

  void M (const CoarseVector &in, CoarseVector &out)
  {
    Mult(_A,in,out);
  }
  void Mdag (const CoarseVector &in, CoarseVector &out)
  {
    if ( hermitian ) M(in,out);
    else Mult(_Adag,in,out);
  }

  void Mult (std::vector<CoarseMatrix> &A,const CoarseVector &in, CoarseVector &out)
  {
    RealD tviews=0; RealD ttot=0;  RealD tmult=0;
    RealD texch=0;  RealD text=0;  RealD ttemps=0;
    RealD tcopy=0;  RealD tmult2=0;

    ttot=-usecond();
    conformable(CoarseGrid(),in.Grid());
    conformable(in.Grid(),out.Grid());
    out.Checkerboard() = in.Checkerboard();
    CoarseVector tin=in;

    texch-=usecond();
    CoarseVector pin = Cell.ExchangePeriodic(tin);
    texch+=usecond();

    CoarseVector pout(pin.Grid());

    int npoint = geom.npoint;
    typedef LatticeView<Cobj> Aview;
    typedef LatticeView<Cvec> Vview;

    const int Nsimd = CComplex::Nsimd();

    int64_t osites=pin.Grid()->oSites();

    RealD flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd();
    RealD bytes = 1.0*osites*sizeof(siteMatrix)*npoint
                + 2.0*osites*sizeof(siteVector)*npoint;

    {
      tviews-=usecond();
      autoView( in_v  , pin,  AcceleratorRead);
      autoView( out_v , pout, AcceleratorWriteDiscard);
      autoView( Stencil_v , Stencil, AcceleratorRead);
      tviews+=usecond();

      // Static and prereserve to keep UVM region live and not resized across multiple calls
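      //
      // The accelerator kernels below cannot dereference host-side std::vectors,
      // so the per-point LatticeView handles are first gathered on the host
      // (*_h containers) and then copied into device-resident arrays whose base
      // pointers (Aview_p/Vview_p) are captured by the kernels. Making the
      // deviceVector containers static keeps the device allocation alive across
      // repeated calls to Mult instead of reallocating every time.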
      ttemps-=usecond();
      MultTemporaries.resize(npoint,pin.Grid());
      ttemps+=usecond();

      std::vector<Aview> AcceleratorViewContainer_h;
      std::vector<Vview> AcceleratorVecViewContainer_h;

      tviews-=usecond();
      for(int p=0;p<npoint;p++) {
        AcceleratorViewContainer_h.push_back(              A[p].View(AcceleratorRead));
        AcceleratorVecViewContainer_h.push_back(MultTemporaries[p].View(AcceleratorWrite));
      }
      tviews+=usecond();

      static deviceVector<Aview> AcceleratorViewContainer;    AcceleratorViewContainer.resize(npoint);
      static deviceVector<Vview> AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(npoint);

      auto Aview_p = &AcceleratorViewContainer[0];
      auto Vview_p = &AcceleratorVecViewContainer[0];

      tcopy-=usecond();
      acceleratorCopyToDevice(&AcceleratorViewContainer_h[0],&AcceleratorViewContainer[0],npoint *sizeof(Aview));
      acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],npoint *sizeof(Vview));
      tcopy+=usecond();

      tmult-=usecond();
      accelerator_for(spb, osites*nbasis*npoint, Nsimd, {
        typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
        int32_t ss   = spb/(nbasis*npoint);
        int32_t bp   = spb%(nbasis*npoint);
        int32_t point= bp/nbasis;
        int32_t b    = bp%nbasis;
        auto SE  = Stencil_v.GetEntry(point,ss);
        auto nbr = coalescedReadGeneralPermute(in_v[SE->_offset],SE->_permute,Nd);
        auto res = coalescedRead(Aview_p[point][ss](0,b))*nbr(0);
        for(int bb=1;bb<nbasis;bb++) {
          res = res + coalescedRead(Aview_p[point][ss](bb,b))*nbr(bb);
        }
        coalescedWrite(Vview_p[point][ss](b),res);
      });
      tmult+=usecond();

      // Reduce the per-point partial results into the output vector
      tmult2-=usecond();
      accelerator_for(ss, osites, Nsimd, {
        for(int b=0;b<nbasis;b++) {
          auto res = coalescedRead(Vview_p[0][ss](b));
          for(int point=1;point<npoint;point++){
            res = res + coalescedRead(Vview_p[point][ss](b));
          }
          coalescedWrite(out_v[ss](b),res);
        }
      });
      tmult2+=usecond();

      for(int p=0;p<npoint;p++) AcceleratorViewContainer_h[p].ViewClose();
      for(int p=0;p<npoint;p++) AcceleratorVecViewContainer_h[p].ViewClose();
    }

    text-=usecond();
    out = Cell.Extract(pout);
    text+=usecond();
    ttot+=usecond();

    std::cout << GridLogPerformance<<"Coarse 1rhs Mult exch "<<texch<<" us"<<std::endl;
    std::cout << GridLogPerformance<<"Coarse 1rhs Mult mult "<<tmult<<" us"<<std::endl;
    std::cout << GridLogPerformance<<"Coarse 1rhs Mult ext  "<<text <<" us"<<std::endl;
    std::cout << GridLogPerformance<<"Coarse 1rhs Mult tot  "<<ttot <<" us"<<std::endl;
    std::cout << GridLogPerformance<<"Coarse 1rhs Mult "<< flops/tmult<<" Mflop/s"<<std::endl;
    std::cout << GridLogPerformance<<"Coarse 1rhs Mult "<< bytes/tmult<<" MB/s"<<std::endl;
  };

  void PopulateAdag(void)
  {
    for(int64_t bidx=0;bidx<CoarseGrid()->gSites() ;bidx++){
      Coordinate bcoor;
      CoarseGrid()->GlobalIndexToGlobalCoor(bidx,bcoor);

      for(int p=0;p<geom.npoint;p++){
        Coordinate scoor = bcoor;
        for(int mu=0;mu<bcoor.size();mu++){
          int L = CoarseGrid()->GlobalDimensions()[mu];
          scoor[mu] = (bcoor[mu] - geom.shifts[p][mu] + L) % L; // Modulo arithmetic
        }
        // Flip to poke/peekLocalSite and not too bad
        auto link = peekSite(_A[p],scoor);
        int pp = geom.Reverse(p);
        pokeSite(adj(link),_Adag[pp],bcoor);
      }
    }
  }

  /////////////////////////////////////////////////////////////
  //
  // A) Only reduced flops option is to use a padded cell of depth 4
  //    and apply MpcDagMpc in the padded cell.
  //
  //    Makes for ONE application of MpcDagMpc per vector instead of 30 or 80.
  //    With the effective cell size around (B+8)^4 perhaps 12^4/4^4 ratio
  //    Cost is 81x more, same as stencil size.
  //
  //    But: can eliminate comms and do as local dirichlet.
  //
  //    Local exchange gauge field once.
  //    Apply to all vectors, local only computation.
  //    Must exchange ghost subcells in reverse process of PaddedCell to take inner products.
  //
  // B) Can reduce cost: pad by 1, apply Deo    (4^4+6^4+8^4+8^4 )/ (4x 4^4)
  //                     pad by 2, apply Doe
  //                     pad by 3, apply Deo
  //    then break out 8x directions; cost is ~10x MpcDagMpc per vector
  //
  //    => almost factor of 10 in setup cost, excluding data rearrangement
  //
  //    Intermediates -- ignore the corner terms, leave approximate and force Hermitian
  //    Intermediates -- pad by 2 and apply 1+8+24 = 33 times.
  /////////////////////////////////////////////////////////////

  //////////////////////////////////////////////////////////
  // BFM HDCG style approach: Solve a system of equations to get Aij
  //////////////////////////////////////////////////////////
  /*
   * Here, k,l index which possible shift within the 3^Nd "ball" connected by MdagM.
   *
   *   conj(phases[block]) proj[k][ block*Nvec+j ] = \sum_ball e^{i q_k . delta} < phi_{block,j} | MdagM | phi_{(block+delta),i} >
   *                                               = \sum_ball e^{i q_k . delta} A_ji
   *
   *   Must invert matrix M_k,l = e^{i q_k . delta_l}
   *
   *   Where q_k = delta_k . (2*M_PI/global_nb[mu])
   */
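  // A minimal, stand-alone sketch of the phase-matrix solve described above,
  // kept compiled-out; `shifts` and `clatt` here stand in for geom.shifts and
  // the coarse global dimensions. It assembles M_kl = e^{i q_k . delta_l} with
  // q_k = 2*pi*delta_k/clatt and inverts it with Eigen, mirroring what
  // CoarsenOperator does inline below.
#if 0
  static Eigen::MatrixXcd PhaseMatrixInverse(const std::vector<Coordinate> &shifts,
                                             const Coordinate &clatt)
  {
    const ComplexD ci(0.0,1.0);
    int npoint = shifts.size();
    int Nd     = clatt.size();
    Eigen::MatrixXcd Mkl = Eigen::MatrixXcd::Zero(npoint,npoint);
    for(int k=0;k<npoint;k++){     // Loop over momenta q_k
      for(int l=0;l<npoint;l++){   // Loop over relative shifts delta_l
        ComplexD phase(0.0,0.0);
        for(int mu=0;mu<Nd;mu++){
          RealD TwoPiL = M_PI * 2.0 / clatt[mu];
          phase = phase + TwoPiL*shifts[k][mu]*shifts[l][mu];
        }
        Mkl(k,l) = exp(phase*ci);
      }
    }
    return Mkl.inverse();          // M^{-1} maps momentum projections back to A_ji
  }
#endif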
#if 0
  void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
                       Aggregation<Fobj,CComplex,nbasis> & Subspace)
  {
    std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
    GridBase *grid = FineGrid();

    RealD tproj=0.0;
    RealD teigen=0.0;
    RealD tmat=0.0;
    RealD tphase=0.0;
    RealD tinv=0.0;

    /////////////////////////////////////////////////////////////
    // Orthogonalise the subblocks over the basis
    /////////////////////////////////////////////////////////////
    CoarseScalar InnerProd(CoarseGrid());
    blockOrthogonalise(InnerProd,Subspace.subspace);

    const int npoint = geom.npoint;

    Coordinate clatt = CoarseGrid()->GlobalDimensions();
    int Nd = CoarseGrid()->Nd();

    /*
     * Here, k,l index which possible momentum/shift within the N-points connected by MdagM.
     * Matrix index i is mapped to this shift via
     * geom.shifts[i]
     *
     * conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block]
     *   = \sum_{l in ball} e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} >
     *   = \sum_{l in ball} e^{i q_k . delta_l} A_ji^{b,b+l}
     *   = M_{kl} A_ji^{b,b+l}
     *
     * Must assemble and invert matrix M_k,l = e^{i q_k . delta_l}
     *
     * Where q_k = delta_k . (2*M_PI/global_nb[mu])
     *
     * Then A_ji^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j}
     */
    teigen-=usecond();
    Eigen::MatrixXcd Mkl    = Eigen::MatrixXcd::Zero(npoint,npoint);
    Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint);
    ComplexD ci(0.0,1.0);
    for(int k=0;k<npoint;k++){   // Loop over momenta
      for(int l=0;l<npoint;l++){ // Loop over nbr relative
        ComplexD phase(0.0,0.0);
        for(int mu=0;mu<Nd;mu++){
          RealD TwoPiL =  M_PI * 2.0/ clatt[mu];
          phase=phase+TwoPiL*geom.shifts[k][mu]*geom.shifts[l][mu];
        }
        Mkl(k,l) = exp(phase*ci);
      }
    }
    invMkl = Mkl.inverse();
    teigen+=usecond();

    FineField phaV(grid);  // Phased block basis vector
    FineField MphaV(grid); // Matrix applied
    CoarseVector coarseInner(CoarseGrid());

    std::vector<CoarseVector> ComputeProj(npoint,CoarseGrid());
    std::vector<CoarseVector>          FT(npoint,CoarseGrid());
    for(int i=0;i<nbasis;i++){   // Loop over basis vectors
      for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
        /////////////////////////////////////////////////////
        // Stick a phase on every block
        /////////////////////////////////////////////////////
        tphase-=usecond();
        CoarseComplexField coor(CoarseGrid());
        CoarseComplexField pha(CoarseGrid()); pha=Zero();
        for(int mu=0;mu<Nd;mu++){
          LatticeCoordinate(coor,mu);
          RealD TwoPiL =  M_PI * 2.0/ clatt[mu];
          pha = pha + (TwoPiL * geom.shifts[p][mu]) * coor;
        }
        pha = exp(pha*ci);
        phaV= Zero();
        blockZAXPY(phaV,pha,Subspace.subspace[i],phaV);
        tphase+=usecond();

        /////////////////////////////////////////////////////////////////////
        // Multiply phased subspace vector by matrix and project to subspace.
        // Remove local bulk phase to leave relative phases.
        /////////////////////////////////////////////////////////////////////
        tmat-=usecond();
        linop.Op(phaV,MphaV);
        tmat+=usecond();

        tproj-=usecond();
        blockProject(coarseInner,MphaV,Subspace.subspace);
        coarseInner = conjugate(pha) * coarseInner;
        ComputeProj[p] = coarseInner;
        tproj+=usecond();
      }

      tinv-=usecond();
      for(int k=0;k<npoint;k++){   // Loop over momenta in npoint
        FT[k] = Zero();
        for(int l=0;l<npoint;l++){ // Loop over nbr relative
          FT[k]= FT[k]+ invMkl(l,k)*ComputeProj[l];
        }

        int osites=CoarseGrid()->oSites();
        autoView( A_v  , _A[k], AcceleratorWrite);
        autoView( FT_v , FT[k], AcceleratorRead);
        accelerator_for(sss, osites, 1, {
          for(int j=0;j<nbasis;j++){
            A_v[sss](i,j) = FT_v[sss](j);
          }
        });
      }
      tinv+=usecond();
    }

    // Only needed if nonhermitian
    if ( ! hermitian ) {
      PopulateAdag();
    }

    ExchangeCoarseLinks();
    std::cout << GridLogMessage<<"CoarsenOperator eigen "<<teigen<<" us"<<std::endl;
    std::cout << GridLogMessage<<"CoarsenOperator phase "<<tphase<<" us"<<std::endl;
    std::cout << GridLogMessage<<"CoarsenOperator mat   "<<tmat  <<" us"<<std::endl;
    std::cout << GridLogMessage<<"CoarsenOperator proj  "<<tproj <<" us"<<std::endl;
    std::cout << GridLogMessage<<"CoarsenOperator inv   "<<tinv  <<" us"<<std::endl;
  }
#else
  virtual void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
                               Aggregation<Fobj,CComplex,nbasis> & Subspace)
  {
    std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
    GridBase *grid = FineGrid();

    RealD tproj=0.0;
    RealD teigen=0.0;
    RealD tmat=0.0;
    RealD tphase=0.0;
    RealD tphaseBZ=0.0;
    RealD tinv=0.0;

    /////////////////////////////////////////////////////////////
    // Orthogonalise the subblocks over the basis
    /////////////////////////////////////////////////////////////
    CoarseScalar InnerProd(CoarseGrid());
    blockOrthogonalise(InnerProd,Subspace.subspace);

    //    for(int s=0;s<Subspace.subspace.size();s++){
    //      std::cout << " subspace norm "<<norm2(Subspace.subspace[s])<<std::endl;
    //    }
    const int npoint = geom.npoint;

    Coordinate clatt = CoarseGrid()->GlobalDimensions();
    int Nd = CoarseGrid()->Nd();
    /*
     * Here, k,l index which possible momentum/shift within the N-points connected by MdagM.
     * Matrix index i is mapped to this shift via
     * geom.shifts[i]
     *
     * conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block]
     *   = \sum_{l in ball} e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} >
     *   = \sum_{l in ball} e^{i q_k . delta_l} A_ji^{b,b+l}
     *   = M_{kl} A_ji^{b,b+l}
     *
     * Must assemble and invert matrix M_k,l = e^{i q_k . delta_l}
     *
     * Where q_k = delta_k . (2*M_PI/global_nb[mu])
     *
     * Then A_ji^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j}
     */
    teigen-=usecond();
    Eigen::MatrixXcd Mkl    = Eigen::MatrixXcd::Zero(npoint,npoint);
    Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint);
    ComplexD ci(0.0,1.0);
    for(int k=0;k<npoint;k++){   // Loop over momenta
      for(int l=0;l<npoint;l++){ // Loop over nbr relative
        ComplexD phase(0.0,0.0);
        for(int mu=0;mu<Nd;mu++){
          RealD TwoPiL =  M_PI * 2.0/ clatt[mu];
          phase=phase+TwoPiL*geom.shifts[k][mu]*geom.shifts[l][mu];
        }
        Mkl(k,l) = exp(phase*ci);
      }
    }
    invMkl = Mkl.inverse();
    teigen+=usecond();

    FineField phaV(grid);  // Phased block basis vector
    FineField MphaV(grid); // Matrix applied
    std::vector<FineComplexField> phaF(npoint,grid);
    std::vector<CoarseComplexField> pha(npoint,CoarseGrid());

    CoarseVector coarseInner(CoarseGrid());

    typedef typename CComplex::scalar_type SComplex;
    FineComplexField one(grid); one=SComplex(1.0);
    FineComplexField zz(grid);  zz = Zero();
    tphase=-usecond();
    for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
      /////////////////////////////////////////////////////
      // Stick a phase on every block
      /////////////////////////////////////////////////////
      CoarseComplexField coor(CoarseGrid());
      pha[p]=Zero();
      for(int mu=0;mu<Nd;mu++){
        LatticeCoordinate(coor,mu);
        RealD TwoPiL =  M_PI * 2.0/ clatt[mu];
        pha[p] = pha[p] + (TwoPiL * geom.shifts[p][mu]) * coor;
      }
      pha[p] = exp(pha[p]*ci);

      blockZAXPY(phaF[p],pha[p],one,zz);
    }
    tphase+=usecond();

    // Could save on temporary storage here
    std::vector<CoarseVector> ComputeProj(npoint,CoarseGrid());
    std::vector<CoarseVector>          FT(npoint,CoarseGrid());
    for(int i=0;i<nbasis;i++){   // Loop over basis vectors
      for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
        tphaseBZ-=usecond();
        phaV = phaF[p]*Subspace.subspace[i];
        tphaseBZ+=usecond();

        /////////////////////////////////////////////////////////////////////
        // Multiply phased subspace vector by matrix and project to subspace.
        // Remove local bulk phase to leave relative phases.
        /////////////////////////////////////////////////////////////////////
        tmat-=usecond();
        linop.Op(phaV,MphaV);
        tmat+=usecond();

        tproj-=usecond();
        blockProject(coarseInner,MphaV,Subspace.subspace);
        coarseInner = conjugate(pha[p]) * coarseInner;
        ComputeProj[p] = coarseInner;
        tproj+=usecond();
      }

      tinv-=usecond();
      for(int k=0;k<npoint;k++){   // Loop over momenta in npoint
        FT[k] = Zero();
        for(int l=0;l<npoint;l++){ // Loop over nbr relative
          FT[k]= FT[k]+ invMkl(l,k)*ComputeProj[l];
        }

        int osites=CoarseGrid()->oSites();
        autoView( A_v  , _A[k], AcceleratorWrite);
        autoView( FT_v , FT[k], AcceleratorRead);
        accelerator_for(sss, osites, 1, {
          for(int j=0;j<nbasis;j++){
            A_v[sss](i,j) = FT_v[sss](j);
          }
        });
      }
      tinv+=usecond();
    }

    // Only needed if nonhermitian
    if ( ! hermitian ) {
      PopulateAdag();
    }

    ExchangeCoarseLinks();
    std::cout << GridLogMessage<<"CoarsenOperator eigen   "<<teigen <<" us"<<std::endl;
    std::cout << GridLogMessage<<"CoarsenOperator phase   "<<tphase <<" us"<<std::endl;
    std::cout << GridLogMessage<<"CoarsenOperator phaseBZ "<<tphaseBZ<<" us"<<std::endl;
    std::cout << GridLogMessage<<"CoarsenOperator mat     "<<tmat   <<" us"<<std::endl;
    std::cout << GridLogMessage<<"CoarsenOperator proj    "<<tproj  <<" us"<<std::endl;
    std::cout << GridLogMessage<<"CoarsenOperator inv     "<<tinv   <<" us"<<std::endl;
  }
#endif

  void ExchangeCoarseLinks(void){
    for(int p=0;p<geom.npoint;p++){
      _A[p] = Cell.ExchangePeriodic(_A[p]);
      //      _Adag[p]= Cell.ExchangePeriodic(_Adag[p]);
    }
  }
  virtual  void Mdiag   (const Field &in, Field &out){ assert(0);};
  virtual  void Mdir    (const Field &in, Field &out,int dir, int disp){assert(0);};
  virtual  void MdirAll (const Field &in, std::vector<Field> &out){assert(0);};
};

NAMESPACE_END(Grid);
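/*
 * Usage sketch (illustrative only; the geometry class, aggregation setup and
 * parameter values below are placeholders modelled on the Grid multigrid tests,
 * not a fixed API):
 *
 *   const int nbasis = 40;
 *   typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
 *   typedef LittleDiracOperator::CoarseVector CoarseVector;
 *
 *   NextToNearestStencilGeometry4D geom(Coarse4d);         // ball of coarse shifts
 *   Aggregation<vSpinColourVector,vTComplex,nbasis> Aggregates(Coarse4d,FGrid,cb);
 *   Aggregates.CreateSubspace(RNG,HermOp,nbasis);          // build near-null vectors
 *
 *   LittleDiracOperator LittleDiracOp(geom,FGrid,Coarse4d);
 *   LittleDiracOp.CoarsenOperator(HermOp,Aggregates);      // solve for the A_ji stencil
 *
 *   CoarseVector c_src(Coarse4d), c_res(Coarse4d);
 *   LittleDiracOp.M(c_src,c_res);                          // apply coarse operator
 */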