From 9aafd204683487795733c0fbe1456e9b84f50179 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 17 Dec 2019 05:01:39 -0500 Subject: [PATCH] Simple block project promote runs faster on GPU --- Grid/lattice/Lattice_transfer.h | 72 +++++++++++++++++++++++++++------ 1 file changed, 60 insertions(+), 12 deletions(-) diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index 9e4003b0..0041f47a 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -1,5 +1,4 @@ /************************************************************************************* - Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/lattice/Lattice_transfer.h @@ -83,12 +82,35 @@ template inline void setCheckerboard(Lattice &full,const Latti }); } - template inline void blockProject(Lattice > &coarseData, + const Lattice &fineData, + const std::vector > &Basis) +{ + GridBase * fine = fineData.Grid(); + GridBase * coarse= coarseData.Grid(); + + Lattice ip(coarse); + + // auto fineData_ = fineData.View(); + auto coarseData_ = coarseData.View(); + auto ip_ = ip.View(); + for(int v=0;voSites(), vobj::Nsimd(), { + coalescedWrite(coarseData_[sc](v),ip_(sc)); + }); + } +} + +template +inline void blockProject1(Lattice > &coarseData, const Lattice &fineData, const std::vector > &Basis) { + typedef iVector coarseSiteData; + coarseSiteData elide; + typedef decltype(coalescedRead(elide)) ScalarComplex; GridBase * fine = fineData.Grid(); GridBase * coarse= coarseData.Grid(); int _ndimension = coarse->_ndimension; @@ -116,11 +138,17 @@ inline void blockProject(Lattice > &coarseData, // To make this lock free, loop over coars parallel, and then loop over fine associated with coarse. // Otherwise do fine inner product per site, and make the update atomic //////////////////////////////////////////////////////////////////////////////////////////////////////// - accelerator_for( sc, coarse->oSites(), { + accelerator_for( sci, nbasis*coarse->oSites(), vobj::Nsimd(), { + + auto sc=sci/nbasis; + auto i=sci%nbasis; + auto Basis_ = Basis[i].View(); Coordinate coor_c(_ndimension); Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate - coarseData_[sc]=Zero(); + + int sf; + decltype(innerProduct(Basis_(sf),fineData_(sf))) reduce=Zero(); for(int sb=0;sb > &coarseData, Lexicographic::CoorFromIndex(coor_b,sb,block_r); for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d]+coor_b[d]; Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions); - - for(int i=0;i &ip,std::vector > } } +#if 0 template inline void blockPromote(const Lattice > &coarseData, Lattice &fineData, @@ -349,13 +375,35 @@ inline void blockPromote(const Lattice > &coarseData, for(int i=0;i +inline void blockPromote(const Lattice > &coarseData, + Lattice &fineData, + const std::vector > &Basis) +{ + GridBase * fine = fineData.Grid(); + GridBase * coarse= coarseData.Grid(); + + fineData=Zero(); + for(int i=0;i > ip = PeekIndex<0>(coarseData,i); + Lattice cip(coarse); + auto cip_ = cip.View(); + auto ip_ = ip.View(); + accelerator_for(sc,coarse->oSites(),1,{ + cip_[sc] = ip_[sc](); + }); + blockZAXPY(fineData,cip,Basis[i],fineData); + } +} +#endif // Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars. // Simd layouts need not match since we use peek/poke Local