/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid Source file: MomentumProject.h Copyright (C) 2025 Author: Peter Boyle This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ #pragma once NAMESPACE_BEGIN(Grid); /* MultiMomProject Import vectors -> nxyz x (ncomponent x nt) Import complex phases -> nmom x nxy apply = via (possibly batched) GEMM */ template class MomentumProject { public: typedef typename Field::scalar_type scalar; typedef typename Field::scalar_object scalar_object; GridBase *grid; uint64_t nmom; uint64_t nxyz; uint64_t nt; uint64_t nbtw; uint64_t words; deviceVector BLAS_V; // deviceVector BLAS_M; // deviceVector BLAS_P; // MomentumProject(){}; ~MomentumProject(){ Deallocate(); }; void Deallocate(void) { grid=nullptr; nmom=0; nxyz=0; nt=0; nbtw=0; words=0; BLAS_V.resize(0); BLAS_M.resize(0); BLAS_P.resize(0); } void Allocate(int _nmom,GridBase *_grid) { grid=_grid; Coordinate ldims = grid->LocalDimensions(); nmom=_nmom; nt = ldims[grid->Nd()-1]; nxyz = grid->lSites()/nt; words = sizeof(scalar_object)/sizeof(scalar); nbtw = nt * words; BLAS_V.resize (nxyz * nt * words ); BLAS_M.resize (nmom * nxyz ); BLAS_P.resize (nmom * nt * words ); } void ImportMomenta(const std::vector &momenta) { GRID_ASSERT(momenta.size()==nmom); // might as well just make the momenta here typedef typename Field::vector_object vobj; int nd = grid->_ndimension; uint64_t sz = BLAS_M.size(); GRID_ASSERT(momenta.size()==nmom) GRID_ASSERT(momenta[0].Grid()==grid); GRID_ASSERT(sz = nxyz * nmom); Coordinate rdimensions = grid->_rdimensions; Coordinate ldims = grid->LocalDimensions(); int64_t osites = grid->oSites(); Coordinate simd = grid->_simd_layout; const int Nsimd = vobj::Nsimd(); uint64_t lwords = words; // local variable for copy in to GPU int64_t Nxyz = nxyz; auto blasData_p = &BLAS_M[0]; for(int m=0;m_ndimension; uint64_t sz = BLAS_V.size(); GRID_ASSERT(sz = nxyz * words * nt); Coordinate rdimensions = grid->_rdimensions; Coordinate ldims= grid->LocalDimensions(); int64_t osites = grid->oSites(); Coordinate simd = grid->_simd_layout; const int Nsimd = vobj::Nsimd(); uint64_t lwords= words; // local variable for copy in to GPU auto blasData_p = &BLAS_V[0]; autoView( Data , vec, AcceleratorRead); auto Data_p = &Data[0]; int64_t nwords = words;// for capture int64_t Nt = nt;// for capture accelerator_for(sf,osites,Nsimd,{ #ifdef GRID_SIMT { int lane=acceleratorSIMTlane(Nsimd); // buffer lane #else for(int lane=0;lane &projection) { projection.resize(nmom*nt); acceleratorCopyFromDevice(&BLAS_P[0],(scalar *)&projection[0],BLAS_P.size()*sizeof(scalar)); // Could decide on a layout late? } // Row major layout "C" order: // BLAS_V[slice_vol][nt][words] // BLAS_M[nmom][slice_vol] // BLAS_P[nmom][nt][words] // // Fortran Column major BLAS layout is V_(w,t)_xyz // Fortran Column major BLAS layout is M_xyz,mom // Fortran Column major BLAS layout is P_(w,t),mom // // Projected // // P = (V * M)_(w,t),mom // void Project(Field &data,std::vector< typename Field::scalar_object > & projected_gdata) { double t_import=0; double t_export=0; double t_gemm =0; double t_allreduce=0; t_import-=usecond(); this->ImportVector(data); std::vector< typename Field::scalar_object > projected_planes; deviceVector Vd(1); deviceVector Md(1); deviceVector Pd(1); scalar * Vh = & BLAS_V[0]; scalar * Mh = & BLAS_M[0]; scalar * Ph = & BLAS_P[0]; acceleratorPut(Vd[0],Vh); acceleratorPut(Md[0],Mh); acceleratorPut(Pd[0],Ph); t_import+=usecond(); GridBLAS BLAS; ///////////////////////////////////////// // P_im = VMmx . Vxi ///////////////////////////////////////// t_gemm-=usecond(); BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N, words*nt,nmom,nxyz, scalar(1.0), Vd, Md, scalar(0.0), // wipe out result Pd); BLAS.synchronise(); t_gemm+=usecond(); t_export-=usecond(); ExportMomentumProjection(projected_planes); // resizes t_export+=usecond(); ///////////////////////////////// // Reduce across MPI ranks ///////////////////////////////// int nd = grid->Nd(); int gt = grid->GlobalDimensions()[nd-1]; int lt = grid->LocalDimensions()[nd-1]; projected_gdata.resize(gt*nmom); for(int t=0;tLocalStarts()[nd-1]; projected_gdata[t+st + gt*m] = projected_planes[t+lt*m]; }} t_allreduce-=usecond(); grid->GlobalSumVector((scalar *)&projected_gdata[0],gt*nmom*words); t_allreduce+=usecond(); std::cout << GridLogPerformance<<" MomentumProject t_import "<