diff --git a/lib/lattice/Lattice_ET.h b/lib/lattice/Lattice_ET.h index 947d7da2..d5d8ec8d 100644 --- a/lib/lattice/Lattice_ET.h +++ b/lib/lattice/Lattice_ET.h @@ -52,9 +52,9 @@ accelerator_inline vobj predicatedWhere(const iobj &predicate, const vobj &iftru const int Nsimd = vobj::vector_type::Nsimd(); - std::vector mask(Nsimd); - std::vector truevals(Nsimd); - std::vector falsevals(Nsimd); + ExtractBuffer mask(Nsimd); + ExtractBuffer truevals(Nsimd); + ExtractBuffer falsevals(Nsimd); extract(iftrue, truevals); extract(iffalse, falsevals); diff --git a/lib/lattice/Lattice_base.h b/lib/lattice/Lattice_base.h index 7940e87d..54f74a94 100644 --- a/lib/lattice/Lattice_base.h +++ b/lib/lattice/Lattice_base.h @@ -382,11 +382,13 @@ public: }; // class Lattice template std::ostream& operator<< (std::ostream& stream, const Lattice &o){ - std::vector gcoor; typedef typename vobj::scalar_object sobj; - sobj ss; for(int g=0;g_gsites;g++){ + + Coordinate gcoor; o.Grid()->GlobalIndexToGlobalCoor(g,gcoor); + + sobj ss; peekSite(ss,o,gcoor); stream<<"["; for(int d=0;d = 0> inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs) { typedef typename vsimd::scalar_type scalar; - std::vector vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation - std::vector vrhs(vsimd::Nsimd()); - std::vector vpred(vsimd::Nsimd()); + ExtractBuffer vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation + ExtractBuffer vrhs(vsimd::Nsimd()); + ExtractBuffer vpred(vsimd::Nsimd()); vInteger ret; extract(lhs,vlhs); extract(rhs,vrhs); @@ -153,8 +153,8 @@ template = 0> inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs) { typedef typename vsimd::scalar_type scalar; - std::vector vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation - std::vector vpred(vsimd::Nsimd()); + ExtractBuffer vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation + ExtractBuffer vpred(vsimd::Nsimd()); vInteger ret; extract(lhs,vlhs); for(int s=0;s = 0> inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs) { typedef typename vsimd::scalar_type scalar; - std::vector vrhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation - std::vector vpred(vsimd::Nsimd()); + ExtractBuffer vrhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation + ExtractBuffer vpred(vsimd::Nsimd()); vInteger ret; extract(rhs,vrhs); for(int s=0;s inline void LatticeCoordinate(Lattice &l,int mu) GridBase *grid = l.Grid(); int Nsimd = grid->iSites(); - std::vector gcoor; - std::vector mergebuf(Nsimd); + Coordinate gcoor; + ExtractBuffer mergebuf(Nsimd); vector_type vI; for(int o=0;ooSites();o++){ diff --git a/lib/lattice/Lattice_peekpoke.h b/lib/lattice/Lattice_peekpoke.h index d52c2754..487b9e44 100644 --- a/lib/lattice/Lattice_peekpoke.h +++ b/lib/lattice/Lattice_peekpoke.h @@ -84,7 +84,7 @@ void PokeIndex(Lattice &lhs,const Lattice(lhs[0] // Poke a scalar object into the SIMD array ////////////////////////////////////////////////////// template -void pokeSite(const sobj &s,Lattice &l,const std::vector &site){ +void pokeSite(const sobj &s,Lattice &l,const Coordinate &site){ GridBase *grid=l.Grid(); @@ -101,9 +101,8 @@ void pokeSite(const sobj &s,Lattice &l,const std::vector &site){ grid->GlobalCoorToRankIndex(rank,odx,idx,site); grid->Broadcast(grid->BossRank(),s); - std::vector buf(Nsimd); - // extract-modify-merge cycle is easiest way and this is not perf critical + ExtractBuffer buf(Nsimd); if ( rank == grid->ThisRank() ) { extract(l[odx],buf); buf[idx] = s; @@ -118,7 +117,7 @@ void pokeSite(const sobj &s,Lattice &l,const std::vector &site){ // Peek a scalar object from the SIMD array ////////////////////////////////////////////////////////// template -void peekSite(sobj &s,const Lattice &l,const std::vector &site){ +void peekSite(sobj &s,const Lattice &l,const Coordinate &site){ GridBase *grid=l.Grid(); @@ -132,7 +131,7 @@ void peekSite(sobj &s,const Lattice &l,const std::vector &site){ int rank,odx,idx; grid->GlobalCoorToRankIndex(rank,odx,idx,site); - std::vector buf(Nsimd); + ExtractBuffer buf(Nsimd); extract(l[odx],buf); s = buf[idx]; @@ -147,7 +146,7 @@ void peekSite(sobj &s,const Lattice &l,const std::vector &site){ // Peek a scalar object from the SIMD array ////////////////////////////////////////////////////////// template -void peekLocalSite(sobj &s,const Lattice &l,std::vector &site){ +void peekLocalSite(sobj &s,const Lattice &l,Coordinate &site){ GridBase *grid = l.Grid(); @@ -175,7 +174,7 @@ void peekLocalSite(sobj &s,const Lattice &l,std::vector &site){ }; template -void pokeLocalSite(const sobj &s,Lattice &l,std::vector &site){ +void pokeLocalSite(const sobj &s,Lattice &l,Coordinate &site){ GridBase *grid=l.Grid(); diff --git a/lib/lattice/Lattice_reduction.h b/lib/lattice/Lattice_reduction.h index cfcf5ade..35ea1e95 100644 --- a/lib/lattice/Lattice_reduction.h +++ b/lib/lattice/Lattice_reduction.h @@ -123,7 +123,7 @@ inline typename vobj::scalar_object sum(const Lattice &arg) typedef typename vobj::scalar_object sobj; sobj ssum; zeroit(ssum); - std::vector buf(Nsimd); + ExtractBuffer buf(Nsimd); extract(vsum,buf); for(int i=0;i inline void sliceSum(const Lattice &Data,std::vector< std::vector > lvSum(rd); // will locally sum vectors first std::vector lsSum(ld,Zero()); // sum across these down to scalars - std::vector extracted(Nsimd); // splitting the SIMD + ExtractBuffer extracted(Nsimd); // splitting the SIMD result.resize(fd); // And then global sum to return the same vector to every node for(int r=0;r inline void sliceSum(const Lattice &Data,std::vector< }); // Sum across simd lanes in the plane, breaking out orthog dir. - std::vector icoor(Nd); + Coordinate icoor(Nd); for(int rt=0;rt & result, const Latti std::vector > lvSum(rd); // will locally sum vectors first std::vector lsSum(ld,scalar_type(0.0)); // sum across these down to scalars - std::vector > extracted(Nsimd); // splitting the SIMD + ExtractBuffer > extracted(Nsimd); // splitting the SIMD result.resize(fd); // And then global sum to return the same vector to every node for IO to file for(int r=0;r & result, const Latti }); // Sum across simd lanes in the plane, breaking out orthog dir. - std::vector icoor(Nd); + Coordinate icoor(Nd); for(int rt=0;rt temp; @@ -341,7 +341,7 @@ static void sliceMaddVector(Lattice &R,std::vector &a,const Lattice int e2 =grid->_slice_block [orthogdim]; int stride =grid->_slice_stride[orthogdim]; - std::vector icoor; + Coordinate icoor; for(int r=0;r buf(Nsimd); + ExtractBuffer buf(Nsimd); for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times int sm = multiplicity * ss + m; // Maps the generator site to the fine site @@ -392,8 +392,8 @@ public: int rank; int o_idx; int i_idx; - std::vector gcoor; + Coordinate gcoor; _grid->GlobalIndexToGlobalCoor(gidx,gcoor); _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); @@ -456,8 +456,8 @@ public: uint32_t the_number; // who - std::vector gcoor; int rank,o_idx,i_idx; + Coordinate gcoor; _grid->GlobalIndexToGlobalCoor(gsite,gcoor); _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); diff --git a/lib/lattice/Lattice_transfer.h b/lib/lattice/Lattice_transfer.h index 1ee91a52..ba23b0cb 100644 --- a/lib/lattice/Lattice_transfer.h +++ b/lib/lattice/Lattice_transfer.h @@ -53,7 +53,7 @@ template inline void pickCheckerboard(int cb,Lattice &half,con thread_loop( (int ss=0;ssoSites();ss++),{ int cbos; - std::vector coor; + Coordinate coor; full.Grid()->oCoorFromOindex(coor,ss); cbos=half.Grid()->CheckerBoard(coor); @@ -66,7 +66,7 @@ template inline void pickCheckerboard(int cb,Lattice &half,con template inline void setCheckerboard(Lattice &full,const Lattice &half){ int cb = half.Checkerboard(); thread_loop( (int ss=0;ssoSites();ss++), { - std::vector coor; + Coordinate coor; int cbos; full.Grid()->oCoorFromOindex(coor,ss); @@ -96,7 +96,7 @@ inline void blockProject(Lattice > &coarseData, conformable(Basis[i],fineData); } - std::vector block_r (_ndimension); + Coordinate block_r (_ndimension); for(int d=0 ; d<_ndimension;d++){ block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; @@ -109,8 +109,8 @@ inline void blockProject(Lattice > &coarseData, thread_loop( (int sf=0;sfoSites();sf++),{ int sc; - std::vector coor_c(_ndimension); - std::vector coor_f(_ndimension); + Coordinate coor_c(_ndimension); + Coordinate coor_f(_ndimension); Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); @@ -143,7 +143,7 @@ inline void blockZAXPY(Lattice &fineZ, int _ndimension = coarse->_ndimension; - std::vector block_r (_ndimension); + Coordinate block_r (_ndimension); // FIXME merge with subdivide checking routine as this is redundant for(int d=0 ; d<_ndimension;d++){ @@ -154,8 +154,8 @@ inline void blockZAXPY(Lattice &fineZ, thread_loop( (int sf=0;sfoSites();sf++),{ int sc; - std::vector coor_c(_ndimension); - std::vector coor_f(_ndimension); + Coordinate coor_c(_ndimension); + Coordinate coor_f(_ndimension); Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; @@ -209,7 +209,7 @@ inline void blockSum(Lattice &coarseData,const Lattice &fineData) int _ndimension = coarse->_ndimension; - std::vector block_r (_ndimension); + Coordinate block_r (_ndimension); for(int d=0 ; d<_ndimension;d++){ block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; @@ -221,8 +221,8 @@ inline void blockSum(Lattice &coarseData,const Lattice &fineData) thread_region { int sc; - std::vector coor_c(_ndimension); - std::vector coor_f(_ndimension); + Coordinate coor_c(_ndimension); + Coordinate coor_f(_ndimension); thread_loop_in_region( (int sf=0;sfoSites();sf++),{ @@ -240,7 +240,7 @@ inline void blockSum(Lattice &coarseData,const Lattice &fineData) } template -inline void blockPick(GridBase *coarse,const Lattice &unpicked,Lattice &picked,std::vector coor) +inline void blockPick(GridBase *coarse,const Lattice &unpicked,Lattice &picked,Coordinate coor) { GridBase * fine = unpicked.Grid(); @@ -301,7 +301,7 @@ inline void blockPromote(const Lattice > &coarseData, conformable(Basis[i].Grid(),fine); } - std::vector block_r (_ndimension); + Coordinate block_r (_ndimension); for(int d=0 ; d<_ndimension;d++){ block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; @@ -310,8 +310,8 @@ inline void blockPromote(const Lattice > &coarseData, // Loop with a cache friendly loop ordering thread_region { int sc; - std::vector coor_c(_ndimension); - std::vector coor_f(_ndimension); + Coordinate coor_c(_ndimension); + Coordinate coor_f(_ndimension); thread_loop_in_region( (int sf=0;sfoSites();sf++),{ @@ -355,7 +355,7 @@ void localConvert(const Lattice &in,Lattice &out) sobj s; ssobj ss; - std::vector lcoor(ni); + Coordinate lcoor(ni); ig->LocalIndexToLocalCoor(idx,lcoor); peekLocalSite(s,in,lcoor); ss=s; @@ -391,8 +391,8 @@ void InsertSlice(const Lattice &lowDim,Lattice & higherDim,int slice // the above should guarantee that the operations are local thread_loop( (int idx=0;idxlSites();idx++),{ sobj s; - std::vector lcoor(nl); - std::vector hcoor(nh); + Coordinate lcoor(nl); + Coordinate hcoor(nh); lg->LocalIndexToLocalCoor(idx,lcoor); int ddl=0; hcoor[orthog] = slice; @@ -432,8 +432,8 @@ void ExtractSlice(Lattice &lowDim,const Lattice & higherDim,int slic // the above should guarantee that the operations are local thread_loop((int idx=0;idxlSites();idx++),{ sobj s; - std::vector lcoor(nl); - std::vector hcoor(nh); + Coordinate lcoor(nl); + Coordinate hcoor(nh); lg->LocalIndexToLocalCoor(idx,lcoor); int ddl=0; hcoor[orthog] = slice; @@ -471,8 +471,8 @@ void InsertSliceLocal(const Lattice &lowDim, Lattice & higherDim,int // the above should guarantee that the operations are local thread_loop( (int idx=0;idxlSites();idx++),{ sobj s; - std::vector lcoor(nl); - std::vector hcoor(nh); + Coordinate lcoor(nl); + Coordinate hcoor(nh); lg->LocalIndexToLocalCoor(idx,lcoor); if( lcoor[orthog] == slice_lo ) { hcoor=lcoor; @@ -506,8 +506,8 @@ void ExtractSliceLocal(Lattice &lowDim, Lattice & higherDim,int slic // the above should guarantee that the operations are local thread_loop( (int idx=0;idxlSites();idx++),{ sobj s; - std::vector lcoor(nl); - std::vector hcoor(nh); + Coordinate lcoor(nl); + Coordinate hcoor(nh); lg->LocalIndexToLocalCoor(idx,lcoor); if( lcoor[orthog] == slice_lo ) { hcoor=lcoor; @@ -533,14 +533,14 @@ void Replicate(Lattice &coarse,Lattice & fine) assert(cg->_ndimension==fg->_ndimension); - std::vector ratio(cg->_ndimension); + Coordinate ratio(cg->_ndimension); for(int d=0;d_ndimension;d++){ ratio[d] = fg->_fdimensions[d]/cg->_fdimensions[d]; } - std::vector fcoor(nd); - std::vector ccoor(nd); + Coordinate fcoor(nd); + Coordinate ccoor(nd); for(int g=0;ggSites();g++){ fg->GlobalIndexToGlobalCoor(g,fcoor); @@ -569,7 +569,7 @@ unvectorizeToLexOrdArray(std::vector &out, const Lattice &in) int ndim = in_grid->Nd(); int in_nsimd = vtype::Nsimd(); - std::vector > in_icoor(in_nsimd); + std::vector in_icoor(in_nsimd); for(int lane=0; lane < in_nsimd; lane++){ in_icoor[lane].resize(ndim); @@ -579,12 +579,12 @@ unvectorizeToLexOrdArray(std::vector &out, const Lattice &in) //loop over outer index thread_loop( (int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++),{ //Assemble vector of pointers to output elements - std::vector out_ptrs(in_nsimd); + ExtractPointerArray out_ptrs(in_nsimd); - std::vector in_ocoor(ndim); + Coordinate in_ocoor(ndim); in_grid->oCoorFromOindex(in_ocoor, in_oidx); - std::vector lcoor(in_grid->Nd()); + Coordinate lcoor(in_grid->Nd()); for(int lane=0; lane < in_nsimd; lane++){ for(int mu=0;mu &out, const Lattice &in) //Unpack into those ptrs const vobj & in_vobj = in[in_oidx]; - extract1(in_vobj, out_ptrs, 0); + extract(in_vobj, out_ptrs, 0); }); } //Copy SIMD-vectorized lattice to array of scalar objects in lexicographic order @@ -612,10 +612,10 @@ vectorizeFromLexOrdArray( std::vector &in, Lattice &out) GridBase* grid = out.Grid(); assert(in.size()==grid->lSites()); - int ndim = grid->Nd(); - int nsimd = vtype::Nsimd(); + const int ndim = grid->Nd(); + constexpr int nsimd = vtype::Nsimd(); - std::vector > icoor(nsimd); + std::vector icoor(nsimd); for(int lane=0; lane < nsimd; lane++){ icoor[lane].resize(ndim); @@ -624,12 +624,11 @@ vectorizeFromLexOrdArray( std::vector &in, Lattice &out) thread_loop( (uint64_t oidx = 0; oidx < grid->oSites(); oidx++),{ //Assemble vector of pointers to output elements - std::vector ptrs(nsimd); + ExtractPointerArray ptrs(nsimd); - std::vector ocoor(ndim); + Coordinate ocoor(ndim); + Coordinate lcoor(ndim); grid->oCoorFromOindex(ocoor, oidx); - - std::vector lcoor(grid->Nd()); for(int lane=0; lane < nsimd; lane++){ @@ -644,7 +643,7 @@ vectorizeFromLexOrdArray( std::vector &in, Lattice &out) //pack from those ptrs vobj vecobj; - merge1(vecobj, ptrs, 0); + merge(vecobj, ptrs, 0); out[oidx] = vecobj; }); } @@ -664,7 +663,7 @@ void precisionChange(Lattice &out, const Lattice &in){ int ndim = out.Grid()->Nd(); int out_nsimd = out_grid->Nsimd(); - std::vector > out_icoor(out_nsimd); + std::vector out_icoor(out_nsimd); for(int lane=0; lane < out_nsimd; lane++){ out_icoor[lane].resize(ndim); @@ -675,12 +674,12 @@ void precisionChange(Lattice &out, const Lattice &in){ unvectorizeToLexOrdArray(in_slex_conv, in); thread_loop( (uint64_t out_oidx=0;out_oidxoSites();out_oidx++),{ - std::vector out_ocoor(ndim); + Coordinate out_ocoor(ndim); out_grid->oCoorFromOindex(out_ocoor, out_oidx); - std::vector ptrs(out_nsimd); + ExtractPointerArray ptrs(out_nsimd); - std::vector lcoor(out_grid->Nd()); + Coordinate lcoor(out_grid->Nd()); for(int lane=0; lane < out_nsimd; lane++){ for(int mu=0;mu > & full,Lattice & split) assert(nvector*split_nproc==full_nproc); assert(nvector == full_vecs); - std::vector ratio(ndim); + Coordinate ratio(ndim); for(int d=0;d_processors[d]/ split_grid->_processors[d]; } @@ -797,7 +796,7 @@ void Grid_split(std::vector > & full,Lattice & split) } int nvec = nvector; // Counts down to 1 as we collapse dims - std::vector ldims = full_grid->_ldimensions; + Coordinate ldims = full_grid->_ldimensions; for(int d=ndim-1;d>=0;d--){ @@ -824,7 +823,7 @@ void Grid_split(std::vector > & full,Lattice & split) // Loop over reordered data post A2A thread_loop( (int c=0;c coor(ndim); + Coordinate coor(ndim); for(int m=0;m > & full,Lattice & split) assert(nvector*split_nproc==full_nproc); assert(nvector == full_vecs); - std::vector ratio(ndim); + Coordinate ratio(ndim); for(int d=0;d_processors[d]/ split_grid->_processors[d]; } @@ -923,7 +922,7 @@ void Grid_unsplit(std::vector > & full,Lattice & split) int nvec = 1; uint64_t rsites = split_grid->lSites(); - std::vector rdims = split_grid->_ldimensions; + Coordinate rdims = split_grid->_ldimensions; for(int d=0;d > & full,Lattice & split) { // Loop over reordered data post A2A thread_loop( (int c=0;c coor(ndim); + Coordinate coor(ndim); for(int m=0;m