From 4397b1c442b699e5b03f374ebd49d0070260fa1d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 7 Aug 2025 15:51:01 +0000 Subject: [PATCH] Debugged momentum projection for A2A Meson Field --- Grid/algorithms/blas/MomentumProject.h | 282 +++++++++++++++++++++++++ 1 file changed, 282 insertions(+) create mode 100644 Grid/algorithms/blas/MomentumProject.h diff --git a/Grid/algorithms/blas/MomentumProject.h b/Grid/algorithms/blas/MomentumProject.h new file mode 100644 index 00000000..888257af --- /dev/null +++ b/Grid/algorithms/blas/MomentumProject.h @@ -0,0 +1,282 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: MomentumProject.h + + Copyright (C) 2025 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + +NAMESPACE_BEGIN(Grid); +/* + MultiMomProject + + Import vectors -> nxyz x (ncomponent x nt) + Import complex phases -> nmom x nxy + + apply = via (possibly batched) GEMM +*/ +template +class MomentumProject +{ +public: + + typedef typename Field::scalar_type scalar; + typedef typename Field::scalar_object scalar_object; + + GridBase *grid; + uint64_t nmom; + uint64_t nxyz; + uint64_t nt; + uint64_t nbtw; + uint64_t words; + + deviceVector BLAS_V; // + deviceVector BLAS_M; // + deviceVector BLAS_P; // + + MomentumProject(){}; + ~MomentumProject(){ Deallocate(); }; + + void Deallocate(void) + { + grid=nullptr; + nmom=0; + nxyz=0; + nt=0; + nbtw=0; + words=0; + BLAS_V.resize(0); + BLAS_M.resize(0); + BLAS_P.resize(0); + } + void Allocate(int _nmom,GridBase *_grid) + { + grid=_grid; + Coordinate ldims = grid->LocalDimensions(); + + nmom=_nmom; + nt = ldims[grid->Nd()-1]; + nxyz = grid->lSites()/nt; + words = sizeof(scalar_object)/sizeof(scalar); + nbtw = nt * words; + + BLAS_V.resize (nxyz * nt * words ); + BLAS_M.resize (nmom * nxyz ); + BLAS_P.resize (nmom * nt * words ); + } + void ImportMomenta(const std::vector &momenta) + { + GRID_ASSERT(momenta.size()==nmom); + // might as well just make the momenta here + typedef typename Field::vector_object vobj; + + int nd = grid->_ndimension; + + uint64_t sz = BLAS_M.size(); + + GRID_ASSERT(momenta.size()==nmom) + GRID_ASSERT(momenta[0].Grid()==grid); + GRID_ASSERT(sz = nxyz * nmom); + + Coordinate rdimensions = grid->_rdimensions; + Coordinate ldims = grid->LocalDimensions(); + int64_t osites = grid->oSites(); + Coordinate simd = grid->_simd_layout; + const int Nsimd = vobj::Nsimd(); + uint64_t lwords = words; // local variable for copy in to GPU + int64_t Nxyz = nxyz; + auto blasData_p = &BLAS_M[0]; + for(int m=0;m_ndimension; + + uint64_t sz = BLAS_V.size(); + + GRID_ASSERT(sz = nxyz * words * nt); + + Coordinate rdimensions = grid->_rdimensions; + Coordinate ldims= grid->LocalDimensions(); + int64_t osites = grid->oSites(); + Coordinate simd = grid->_simd_layout; + const int Nsimd = vobj::Nsimd(); + uint64_t lwords= words; // local variable for copy in to GPU + + auto blasData_p = &BLAS_V[0]; + autoView( Data , vec, AcceleratorRead); + auto Data_p = &Data[0]; + + int64_t nwords = words;// for capture + int64_t Nt = nt;// for capture + + accelerator_for(sf,osites,Nsimd,{ +#ifdef GRID_SIMT + { + int lane=acceleratorSIMTlane(Nsimd); // buffer lane +#else + for(int lane=0;lane &projection) + { + projection.resize(nmom*nt); + acceleratorCopyFromDevice(&BLAS_P[0],(scalar *)&projection[0],BLAS_P.size()*sizeof(scalar)); + // Could decide on a layout late? + } + + // Row major layout "C" order: + // BLAS_V[slice_vol][nt][words] + // BLAS_M[nmom][slice_vol] + // BLAS_P[nmom][nt][words] + // + // Fortran Column major BLAS layout is V_(w,t)_xyz + // Fortran Column major BLAS layout is M_xyz,mom + // Fortran Column major BLAS layout is P_(w,t),mom + // + // Projected + // + // P = (V * M)_(w,t),mom + // + void Project(Field &data,std::vector< typename Field::scalar_object > & projected_gdata) + { + this->ImportVector(data); + + std::vector< typename Field::scalar_object > projected_planes; + + deviceVector Vd(1); + deviceVector Md(1); + deviceVector Pd(1); + + scalar * Vh = & BLAS_V[0]; + scalar * Mh = & BLAS_M[0]; + scalar * Ph = & BLAS_P[0]; + + acceleratorPut(Vd[0],Vh); + acceleratorPut(Md[0],Mh); + acceleratorPut(Pd[0],Ph); + + GridBLAS BLAS; + + ///////////////////////////////////////// + // P_im = VMmx . Vxi + ///////////////////////////////////////// + BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N, + words*nt,nmom,nxyz, + scalar(1.0), + Vd, + Md, + scalar(0.0), // wipe out result + Pd); + BLAS.synchronise(); + + ExportMomentumProjection(projected_planes); // resizes + + ///////////////////////////////// + // Reduce across MPI ranks + ///////////////////////////////// + int nd = grid->Nd(); + int gt = grid->GlobalDimensions()[nd-1]; + int lt = grid->LocalDimensions()[nd-1]; + projected_gdata.resize(gt*nmom); + for(int t=0;tLocalStarts()[nd-1]; + projected_gdata[t+st + gt*m] = projected_planes[t+lt*m]; + }} + grid->GlobalSumVector((scalar *)&projected_gdata[0],gt*nmom*words); + } +}; + +NAMESPACE_END(Grid);