From ee0d460c8e7237cb44489d16cf3f7bb6b4f5708d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 27 Feb 2024 11:41:44 -0500 Subject: [PATCH] Blas based block project & deflate for multiRHS --- .../deflation/MultiRHSBlockProject.h | 512 ++++++++++++++++++ Grid/algorithms/deflation/MultiRHSDeflation.h | 234 ++++++++ 2 files changed, 746 insertions(+) create mode 100644 Grid/algorithms/deflation/MultiRHSBlockProject.h create mode 100644 Grid/algorithms/deflation/MultiRHSDeflation.h diff --git a/Grid/algorithms/deflation/MultiRHSBlockProject.h b/Grid/algorithms/deflation/MultiRHSBlockProject.h new file mode 100644 index 00000000..59499cbb --- /dev/null +++ b/Grid/algorithms/deflation/MultiRHSBlockProject.h @@ -0,0 +1,512 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: MultiRHSDeflation.h + + Copyright (C) 2023 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + +NAMESPACE_BEGIN(Grid); + + +/* + MultiRHS block projection + + Import basis -> nblock x nbasis x (block x internal) + Import vector of fine lattice objects -> nblock x nrhs x (block x internal) + + => coarse_(nrhs x nbasis )^block = via batched GEMM + +//template +//inline void blockProject(Lattice > &coarseData, +// const VLattice &fineData, +// const VLattice &Basis) +*/ + +template +class MultiRHSBlockProject +{ +public: + + typedef typename Field::scalar_type scalar; + typedef typename Field::scalar_object scalar_object; + typedef Field Fermion; + + int nbasis; + GridBase *coarse_grid; + GridBase *fine_grid; + uint64_t block_vol; + uint64_t fine_vol; + uint64_t coarse_vol; + uint64_t words; + + // Row major layout "C" order: + // BLAS_V[coarse_vol][nbasis][block_vol][words] + // BLAS_F[coarse_vol][nrhs][block_vol][words] + // BLAS_C[coarse_vol][nrhs][nbasis] + /* + * in Fortran column major notation (cuBlas order) + * + * Vxb = [v1(x)][..][vn(x)] ... x coarse vol + * + * Fxr = [r1(x)][..][rm(x)] ... x coarse vol + * + * Block project: + * C_br = V^dag F x coarse vol + * + * Block promote: + * F_xr = Vxb Cbr x coarse_vol + */ + deviceVector BLAS_V; // words * block_vol * nbasis x coarse_vol + deviceVector BLAS_F; // nrhs x fine_vol * words -- the sources + deviceVector BLAS_C; // nrhs x coarse_vol * nbasis -- the coarse coeffs + + RealD blasNorm2(deviceVector &blas) + { + scalar ss(0.0); + std::vector tmp(blas.size()); + acceleratorCopyFromDevice(&blas[0],&tmp[0],blas.size()*sizeof(scalar)); + for(int64_t s=0;sGlobalSum(ss); + return real(ss); + } + + MultiRHSBlockProject(){}; + ~MultiRHSBlockProject(){ Deallocate(); }; + + void Deallocate(void) + { + nbasis=0; + coarse_grid=nullptr; + fine_grid=nullptr; + fine_vol=0; + block_vol=0; + coarse_vol=0; + words=0; + BLAS_V.resize(0); + BLAS_F.resize(0); + BLAS_C.resize(0); + } + void Allocate(int _nbasis,GridBase *_fgrid,GridBase *_cgrid) + { + nbasis=_nbasis; + + fine_grid=_fgrid; + coarse_grid=_cgrid; + + fine_vol = fine_grid->lSites(); + coarse_vol = coarse_grid->lSites(); + block_vol = fine_vol/coarse_vol; + + words = sizeof(scalar_object)/sizeof(scalar); + + BLAS_V.resize (fine_vol * words * nbasis ); + } + void ImportFineGridVectors(std::vector &vecs, deviceVector &blas) + { + int nvec = vecs.size(); + typedef typename Field::vector_object vobj; + std::cout << " BlockProjector importing "<_ndimension; + assert(block_vol == fine_grid->oSites() / coarse_grid->oSites()); + + Coordinate block_r (_ndimension); + for(int d=0 ; d<_ndimension;d++){ + block_r[d] = fine_grid->_rdimensions[d] / coarse_grid->_rdimensions[d]; + } + + uint64_t sz = blas.size(); + + acceleratorMemSet(&blas[0],0,blas.size()*sizeof(scalar)); + + Coordinate fine_rdimensions = fine_grid->_rdimensions; + Coordinate coarse_rdimensions = coarse_grid->_rdimensions; + int64_t bv= block_vol; + for(int v=0;voSites(); + + // loop over fine sites + const int Nsimd = vobj::Nsimd(); + // std::cout << "sz "<oSites() * block_vol * nvec * words<oSites() * block_vol * nvec * words); + uint64_t lwords= words; // local variable for copy in to GPU + accelerator_for(sf,osites,Nsimd,{ +#ifdef GRID_SIMT + { + int lane=acceleratorSIMTlane(Nsimd); // buffer lane +#else + for(int lane=0;lane &vecs, deviceVector &blas) + { + typedef typename Field::vector_object vobj; + + int nvec = vecs.size(); + + assert(vecs[0].Grid()==fine_grid); + + subdivides(coarse_grid,fine_grid); // require they map + + int _ndimension = coarse_grid->_ndimension; + assert(block_vol == fine_grid->oSites() / coarse_grid->oSites()); + + Coordinate block_r (_ndimension); + for(int d=0 ; d<_ndimension;d++){ + block_r[d] = fine_grid->_rdimensions[d] / coarse_grid->_rdimensions[d]; + } + Coordinate fine_rdimensions = fine_grid->_rdimensions; + Coordinate coarse_rdimensions = coarse_grid->_rdimensions; + + // std::cout << " export fine Blas norm "<oSites(); + uint64_t lwords = words; + // std::cout << " Nsimd is "< + void ImportCoarseGridVectors(std::vector > &vecs, deviceVector &blas) + { + int nvec = vecs.size(); + typedef typename vobj::scalar_object coarse_scalar_object; + + std::cout << " BlockProjector importing coarse grid "<_ndimension; + + uint64_t sz = blas.size(); + + Coordinate coarse_rdimensions = coarse_grid->_rdimensions; + + for(int v=0;voSites(); + + // loop over fine sites + const int Nsimd = vobj::Nsimd(); + uint64_t cwords=sizeof(typename vobj::scalar_object)/sizeof(scalar); + assert(cwords==nbasis); + + accelerator_for(sc,osites,Nsimd,{ +#ifdef GRID_SIMT + { + int lane=acceleratorSIMTlane(Nsimd); // buffer lane +#else + for(int lane=0;lane + void ExportCoarseGridVectors(std::vector > &vecs, deviceVector &blas) + { + int nvec = vecs.size(); + typedef typename vobj::scalar_object coarse_scalar_object; + std::cout << " BlockProjector importing coarse grid "<_ndimension; + + uint64_t sz = blas.size(); + + Coordinate coarse_rdimensions = coarse_grid->_rdimensions; + + // std::cout << " export coarsee Blas norm "<oSites(); + + // loop over fine sites + const int Nsimd = vobj::Nsimd(); + uint64_t cwords=sizeof(typename vobj::scalar_object)/sizeof(scalar); + assert(cwords==nbasis); + + accelerator_for(sc,osites,Nsimd,{ + // Wrap in a macro "FOR_ALL_LANES(lane,{ ... }); +#ifdef GRID_SIMT + { + int lane=acceleratorSIMTlane(Nsimd); // buffer lane +#else + for(int lane=0;lane &vecs) + { + // std::cout << " BlockProjector Import basis size "< + void blockProject(std::vector &fine,std::vector< Lattice > & coarse) + { + int nrhs=fine.size(); + int _nbasis = sizeof(typename cobj::scalar_object)/sizeof(scalar); + assert(nbasis==_nbasis); + + BLAS_F.resize (fine_vol * words * nrhs ); + BLAS_C.resize (coarse_vol * nbasis * nrhs ); + + ///////////////////////////////////////////// + // Copy in the multi-rhs sources to same data layout + ///////////////////////////////////////////// + // std::cout << "BlockProject import fine"< Vd(coarse_vol); + deviceVector Fd(coarse_vol); + deviceVector Cd(coarse_vol); + + // std::cout << "BlockProject pointers"< + void blockPromote(std::vector &fine,std::vector > & coarse) + { + int nrhs=fine.size(); + int _nbasis = sizeof(typename cobj::scalar_object)/sizeof(scalar); + assert(nbasis==_nbasis); + + BLAS_F.resize (fine_vol * words * nrhs ); + BLAS_C.resize (coarse_vol * nbasis * nrhs ); + + ImportCoarseGridVectors(coarse, BLAS_C); + + GridBLAS BLAS; + + deviceVector Vd(coarse_vol); + deviceVector Fd(coarse_vol); + deviceVector Cd(coarse_vol); + + for(int c=0;c + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + +NAMESPACE_BEGIN(Grid); + + +/* Need helper object for BLAS accelerated mrhs projection + + i) MultiRHS Deflation + + Import Evecs -> nev x vol x internal + Import vector of Lattice objects -> nrhs x vol x internal + => Cij (nrhs x Nev) via GEMM. + => Guess (nrhs x vol x internal) = C x evecs (via GEMM) + Export + + + ii) MultiRHS block projection + + Import basis -> nblock x nbasis x (block x internal) + Import vector of fine lattice objects -> nblock x nrhs x (block x internal) + + => coarse_(nrhs x nbasis )^block = via batched GEMM + + iii) Alternate interface: + Import higher dim Lattice object-> vol x nrhs layout + +*/ +template +class MultiRHSDeflation +{ +public: + + typedef typename Field::scalar_type scalar; + typedef typename Field::scalar_object scalar_object; + + int nev; + std::vector eval; + GridBase *grid; + uint64_t vol; + uint64_t words; + + deviceVector BLAS_E; // nev x vol -- the eigenbasis (up to a 1/sqrt(lambda)) + deviceVector BLAS_R; // nrhs x vol -- the sources + deviceVector BLAS_G; // nrhs x vol -- the guess + deviceVector BLAS_C; // nrhs x nev -- the coefficients + + MultiRHSDeflation(){}; + ~MultiRHSDeflation(){ Deallocate(); }; + + void Deallocate(void) + { + nev=0; + grid=nullptr; + vol=0; + words=0; + BLAS_E.resize(0); + BLAS_R.resize(0); + BLAS_C.resize(0); + BLAS_G.resize(0); + } + void Allocate(int _nev,GridBase *_grid) + { + nev=_nev; + grid=_grid; + vol = grid->lSites(); + words = sizeof(scalar_object)/sizeof(scalar); + eval.resize(nev); + BLAS_E.resize (vol * words * nev ); + std::cout << GridLogMessage << " Allocate for "< &evec,std::vector &_eval) + { + ImportEigenBasis(evec,_eval,0,evec.size()); + } + // Could use to import a batch of eigenvectors + void ImportEigenBasis(std::vector &evec,std::vector &_eval, int _ev0, int _nev) + { + assert(_ev0+_nev<=evec.size()); + + Allocate(_nev,evec[0].Grid()); + + // Imports a sub-batch of eigenvectors, _ev0, ..., _ev0+_nev-1 + for(int e=0;e &source,std::vector & guess) + { + int nrhs = source.size(); + assert(source.size()==guess.size()); + assert(grid == guess[0].Grid()); + conformable(guess[0],source[0]); + + int64_t vw = vol * words; + + std::cout << GridLogMessage << "MultiRHSDelation for "<