From aa5047a9e481f80b4334d2c7ceb9777d5f7438b2 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 24 Oct 2023 10:49:55 -0400 Subject: [PATCH] Faster blockProject blockPromote --- Grid/lattice/Lattice_transfer.h | 100 ++++++++++++++++++++++++++------ 1 file changed, 81 insertions(+), 19 deletions(-) diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index a2e4982e..39fcb376 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -276,18 +276,33 @@ inline void blockProject(Lattice > &coarseData, autoView( coarseData_ , coarseData, AcceleratorWrite); autoView( ip_ , ip, AcceleratorWrite); + RealD t_IP=0; + RealD t_co=0; + RealD t_za=0; for(int v=0;v + t_IP+=usecond(); + t_co-=usecond(); accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), { convertType(coarseData_[sc](v),ip_[sc]); }); + t_co+=usecond(); // improve numerical stability of projection // |fine> = |fine> - |basis> ip=-ip; + t_za-=usecond(); blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed); + t_za+=usecond(); } + // std::cout << GridLogPerformance << " blockProject : blockInnerProduct : "< inline void batchBlockProject(std::vector>> &coarseData, const std::vector> &fineData, @@ -393,8 +408,15 @@ template Lattice coarse_inner(coarse); // Precision promotion + RealD t; + t=-usecond(); fine_inner = localInnerProductD(fineX,fineY); + // t+=usecond(); std::cout << GridLogPerformance << " blockInnerProduct : localInnerProductD "< convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss])); }); } + // t+=usecond(); std::cout << GridLogPerformance << " blockInnerProduct : convertType "< &ip,Lattice &fineX) template inline void blockSum(Lattice &coarseData,const Lattice &fineData) { + const int maxsubsec=256; + typedef iVector vSubsec; + GridBase * fine = fineData.Grid(); GridBase * coarse= coarseData.Grid(); @@ -463,37 +489,62 @@ inline void blockSum(Lattice &coarseData,const Lattice &fineData) autoView( coarseData_ , coarseData, AcceleratorWrite); autoView( fineData_ , fineData, AcceleratorRead); - auto coarseData_p = &coarseData_[0]; - auto fineData_p = &fineData_[0]; + auto coarseData_p = &coarseData_[0]; + auto fineData_p = &fineData_[0]; Coordinate fine_rdimensions = fine->_rdimensions; Coordinate coarse_rdimensions = coarse->_rdimensions; vobj zz = Zero(); - - accelerator_for(sc,coarse->oSites(),vobj::Nsimd(),{ + // Somewhat lazy calculation + // Find the biggest power of two subsection divisor less than or equal to maxsubsec + int subsec=maxsubsec; + int subvol; + subvol=blockVol/subsec; + while(subvol*subsec!=blockVol){ + subsec = subsec/2; + subvol=blockVol/subsec; + }; + + Lattice coarseTmp(coarse); + autoView( coarseTmp_, coarseTmp, AcceleratorWriteDiscard); + auto coarseTmp_p= &coarseTmp_[0]; + + // Sum within subsecs in a first kernel + accelerator_for(sce,subsec*coarse->oSites(),vobj::Nsimd(),{ + + int sc=sce/subsec; + int e=sce%subsec; + // One thread per sub block Coordinate coor_c(_ndimension); Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions); // Block coordinate auto cd = coalescedRead(zz); - - for(int sb=0;sboSites(),vobj::Nsimd(),{ + auto cd = coalescedRead(coarseTmp_p[sc](0)); + for(int e=1;e &ip,std::vector > blockOrthonormalize(ip,Basis); } -#if 0 +#ifdef GRID_ACCELERATED // TODO: CPU optimized version here template inline void blockPromote(const Lattice > &coarseData, @@ -576,26 +627,37 @@ inline void blockPromote(const Lattice > &coarseData, autoView( fineData_ , fineData, AcceleratorWrite); autoView( coarseData_ , coarseData, AcceleratorRead); + typedef LatticeView Vview; + std::vector AcceleratorVecViewContainer_h; + for(int v=0;v AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(nbasis); + acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],nbasis *sizeof(Vview)); + auto Basis_p = &AcceleratorVecViewContainer[0]; // Loop with a cache friendly loop ordering - accelerator_for(sf,fine->oSites(),1,{ + Coordinate frdimensions=fine->_rdimensions; + Coordinate crdimensions=coarse->_rdimensions; + accelerator_for(sf,fine->oSites(),vobj::Nsimd(),{ int sc; Coordinate coor_c(_ndimension); Coordinate coor_f(_ndimension); - Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); + Lexicographic::CoorFromIndex(coor_f,sf,frdimensions); for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; - Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); + Lexicographic::IndexFromCoor(coor_c,sc,crdimensions); - for(int i=0;i inline void blockPromote(const Lattice > &coarseData, Lattice &fineData,