From bb89a82a07be478d50060efd5c76cf71622af870 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 29 Mar 2021 20:01:15 +0200 Subject: [PATCH 1/3] Staggered coalseced read --- Grid/qcd/action/fermion/StaggeredImpl.h | 16 ++-- Grid/qcd/action/fermion/WilsonImpl.h | 35 ++++++++- .../StaggeredKernelsImplementation.h | 76 ++++++++++--------- 3 files changed, 82 insertions(+), 45 deletions(-) diff --git a/Grid/qcd/action/fermion/StaggeredImpl.h b/Grid/qcd/action/fermion/StaggeredImpl.h index 8adf45a4..f44d12f4 100644 --- a/Grid/qcd/action/fermion/StaggeredImpl.h +++ b/Grid/qcd/action/fermion/StaggeredImpl.h @@ -72,19 +72,23 @@ public: StaggeredImpl(const ImplParams &p = ImplParams()) : Params(p){}; - static accelerator_inline void multLink(SiteSpinor &phi, + template + static accelerator_inline void multLink(_Spinor &phi, const SiteDoubledGaugeField &U, - const SiteSpinor &chi, + const _Spinor &chi, int mu) { - mult(&phi(), &U(mu), &chi()); + auto UU = coalescedRead(U(mu)); + mult(&phi(), &UU, &chi()); } - static accelerator_inline void multLinkAdd(SiteSpinor &phi, + template + static accelerator_inline void multLinkAdd(_Spinor &phi, const SiteDoubledGaugeField &U, - const SiteSpinor &chi, + const _Spinor &chi, int mu) { - mac(&phi(), &U(mu), &chi()); + auto UU = coalescedRead(U(mu)); + mac(&phi(), &UU, &chi()); } template diff --git a/Grid/qcd/action/fermion/WilsonImpl.h b/Grid/qcd/action/fermion/WilsonImpl.h index 94676b6b..2ff6feba 100644 --- a/Grid/qcd/action/fermion/WilsonImpl.h +++ b/Grid/qcd/action/fermion/WilsonImpl.h @@ -184,18 +184,22 @@ public: mat = TraceIndex(P); } - inline void extractLinkField(std::vector &mat, DoubledGaugeField &Uds){ + inline void extractLinkField(std::vector &mat, DoubledGaugeField &Uds) + { for (int mu = 0; mu < Nd; mu++) mat[mu] = PeekIndex(Uds, mu); } - - inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField Ã,int mu){ - + inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField Ã,int mu) + { +#undef USE_OLD_INSERT_FORCE int Ls=Btilde.Grid()->_fdimensions[0]; + autoView( mat_v , mat, AcceleratorWrite); +#ifdef USE_OLD_INSERT_FORCE GaugeLinkField tmp(mat.Grid()); tmp = Zero(); { + const int Nsimd = SiteSpinor::Nsimd(); autoView( tmp_v , tmp, AcceleratorWrite); autoView( Btilde_v , Btilde, AcceleratorRead); autoView( Atilde_v , Atilde, AcceleratorRead); @@ -208,6 +212,29 @@ public: }); } PokeIndex(mat,tmp,mu); +#else + { + const int Nsimd = SiteSpinor::Nsimd(); + autoView( Btilde_v , Btilde, AcceleratorRead); + autoView( Atilde_v , Atilde, AcceleratorRead); + accelerator_for(sss,mat.Grid()->oSites(),Nsimd,{ + int sU=sss; + typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType; + ColorMatrixType sum; + zeroit(sum); + for(int s=0;s_is_local ) { \ - if (SE->_permute) { \ - chi_p = χ \ - permute(chi, in[SE->_offset], ptype); \ - } else { \ - chi_p = &in[SE->_offset]; \ - } \ + int perm= SE->_permute; \ + chi = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);\ } else { \ - chi_p = &buf[SE->_offset]; \ + chi = coalescedRead(buf[SE->_offset],lane); \ } \ - multLink(Uchi, U[sU], *chi_p, Dir); + acceleratorSynchronise(); \ + multLink(Uchi, U[sU], chi, Dir); #define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink) \ SE = st.GetEntry(ptype, Dir+skew, sF); \ if (SE->_is_local ) { \ - if (SE->_permute) { \ - chi_p = χ \ - permute(chi, in[SE->_offset], ptype); \ - } else { \ - chi_p = &in[SE->_offset]; \ - } \ + int perm= SE->_permute; \ + chi = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);\ } else if ( st.same_node[Dir] ) { \ - chi_p = &buf[SE->_offset]; \ + chi = coalescedRead(buf[SE->_offset],lane); \ } \ if (SE->_is_local || st.same_node[Dir] ) { \ - multLink(Uchi, U[sU], *chi_p, Dir); \ + multLink(Uchi, U[sU], chi, Dir); \ } #define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \ SE = st.GetEntry(ptype, Dir+skew, sF); \ if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \ nmu++; \ - chi_p = &buf[SE->_offset]; \ - multLink(Uchi, U[sU], *chi_p, Dir); \ + chi = coalescedRead(buf[SE->_offset],lane); \ + multLink(Uchi, U[sU], chi, Dir); \ } template @@ -84,12 +77,14 @@ void StaggeredKernels::DhopSiteGeneric(StencilView &st, SiteSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dag) { - const SiteSpinor *chi_p; - SiteSpinor chi; - SiteSpinor Uchi; + typedef decltype(coalescedRead(in[0])) calcSpinor; + calcSpinor chi; + calcSpinor Uchi; StencilEntry *SE; int ptype; int skew; + const int Nsimd = SiteHalfSpinor::Nsimd(); + const int lane=acceleratorSIMTlane(Nsimd); // for(int s=0;s::DhopSiteGeneric(StencilView &st, if ( dag ) { Uchi = - Uchi; } - vstream(out[sF], Uchi); + coalescedWrite(out[sF], Uchi,lane); } }; @@ -130,13 +125,16 @@ template accelerator_inline void StaggeredKernels::DhopSiteGenericInt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF, int sU, - const FermionFieldView &in, FermionFieldView &out,int dag) { - const SiteSpinor *chi_p; - SiteSpinor chi; - SiteSpinor Uchi; + const FermionFieldView &in, FermionFieldView &out,int dag) +{ + typedef decltype(coalescedRead(in[0])) calcSpinor; + calcSpinor chi; + calcSpinor Uchi; StencilEntry *SE; int ptype; int skew ; + const int Nsimd = SiteHalfSpinor::Nsimd(); + const int lane=acceleratorSIMTlane(Nsimd); // for(int s=0;s::DhopSiteGenericInt(StencilView &st, if ( dag ) { Uchi = - Uchi; } - vstream(out[sF], Uchi); + coalescedWrite(out[sF], Uchi,lane); } }; @@ -178,14 +176,17 @@ template accelerator_inline void StaggeredKernels::DhopSiteGenericExt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF, int sU, - const FermionFieldView &in, FermionFieldView &out,int dag) { - const SiteSpinor *chi_p; - // SiteSpinor chi; - SiteSpinor Uchi; + const FermionFieldView &in, FermionFieldView &out,int dag) +{ + typedef decltype(coalescedRead(in[0])) calcSpinor; + calcSpinor chi; + calcSpinor Uchi; StencilEntry *SE; int ptype; int nmu=0; int skew ; + const int Nsimd = SiteHalfSpinor::Nsimd(); + const int lane=acceleratorSIMTlane(Nsimd); // for(int s=0;s::DhopSiteGenericExt(StencilView &st, GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd); } - if ( nmu ) { - if ( dag ) { - out[sF] = out[sF] - Uchi; + if ( nmu ) { + auto _out = coalescedRead(out[sF],lane); + if ( dag ) { + coalescedWrite(out[sF], _out-Uchi,lane); } else { - out[sF] = out[sF] + Uchi; + coalescedWrite(out[sF], _out+Uchi,lane); } } } @@ -261,6 +263,8 @@ void StaggeredKernels::DhopImproved(StencilImpl &st, LebesgueOrder &lo, GridBase *FGrid=in.Grid(); GridBase *UGrid=U.Grid(); typedef StaggeredKernels ThisKernel; + const int Nsimd = SiteHalfSpinor::Nsimd(); + const int lane=acceleratorSIMTlane(Nsimd); autoView( UUU_v , UUU, AcceleratorRead); autoView( U_v , U, AcceleratorRead); autoView( in_v , in, AcceleratorRead); @@ -301,6 +305,8 @@ void StaggeredKernels::DhopNaive(StencilImpl &st, LebesgueOrder &lo, GridBase *FGrid=in.Grid(); GridBase *UGrid=U.Grid(); typedef StaggeredKernels ThisKernel; + const int Nsimd = SiteHalfSpinor::Nsimd(); + const int lane=acceleratorSIMTlane(Nsimd); autoView( UUU_v , U, AcceleratorRead); autoView( U_v , U, AcceleratorRead); autoView( in_v , in, AcceleratorRead); From e9479929570e3b6e000f60b6368d7981666fac7c Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 29 Mar 2021 20:04:06 +0200 Subject: [PATCH 2/3] Improved force terms --- Grid/tensors/Tensor_outer.h | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/Grid/tensors/Tensor_outer.h b/Grid/tensors/Tensor_outer.h index 4902c22f..a32a2a91 100644 --- a/Grid/tensors/Tensor_outer.h +++ b/Grid/tensors/Tensor_outer.h @@ -34,6 +34,16 @@ NAMESPACE_BEGIN(Grid); // outerProduct Scalar x Scalar -> Scalar // Vector x Vector -> Matrix /////////////////////////////////////////////////////////////////////////////////////// +template = 0> +accelerator_inline CC outerProduct(const CC &l, const CC& r) +{ + return l*conj(r); +} +template = 0> +accelerator_inline RR outerProduct(const RR &l, const RR& r) +{ + return l*r; +} template accelerator_inline auto outerProduct (const iVector& lhs,const iVector& rhs) -> iMatrix @@ -57,17 +67,6 @@ auto outerProduct (const iScalar& lhs,const iScalar& rhs) -> iScalar = 0> -accelerator_inline CC outerProduct(const CC &l, const CC& r) -{ - return l*conj(r); -} -template = 0> -accelerator_inline RR outerProduct(const RR &l, const RR& r) -{ - return l*r; -} - NAMESPACE_END(Grid); #endif From a7fb25adf66996f5de6861228774c803ef87bd4a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 29 Mar 2021 21:44:14 +0200 Subject: [PATCH 3/3] Make Cshift fields static to avoid repeated reallocaate overhead --- Grid/cshift/Cshift_mpi.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Grid/cshift/Cshift_mpi.h b/Grid/cshift/Cshift_mpi.h index 375d004e..7e93e260 100644 --- a/Grid/cshift/Cshift_mpi.h +++ b/Grid/cshift/Cshift_mpi.h @@ -122,8 +122,8 @@ template void Cshift_comms(Lattice &ret,const Lattice &r assert(shift_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; - cshiftVector send_buf(buffer_size); - cshiftVector recv_buf(buffer_size); + static cshiftVector send_buf; send_buf.resize(buffer_size); + static cshiftVector recv_buf; recv_buf.resize(buffer_size); int cb= (cbmask==0x2)? Odd : Even; int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); @@ -198,8 +198,8 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice_slice_nblock[dimension]*grid->_slice_block[dimension]; // int words = sizeof(vobj)/sizeof(vector_type); - std::vector > send_buf_extract(Nsimd); - std::vector > recv_buf_extract(Nsimd); + static std::vector > send_buf_extract; send_buf_extract.resize(Nsimd); + static std::vector > recv_buf_extract; recv_buf_extract.resize(Nsimd); scalar_object * recv_buf_extract_mpi; scalar_object * send_buf_extract_mpi; @@ -294,8 +294,8 @@ template void Cshift_comms(Lattice &ret,const Lattice &r assert(shift_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; - cshiftVector send_buf_v(buffer_size); - cshiftVector recv_buf_v(buffer_size); + static cshiftVector send_buf_v; send_buf_v.resize(buffer_size); + static cshiftVector recv_buf_v; recv_buf_v.resize(buffer_size); vobj *send_buf; vobj *recv_buf; { @@ -381,8 +381,8 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice_slice_nblock[dimension]*grid->_slice_block[dimension]; // int words = sizeof(vobj)/sizeof(vector_type); - std::vector > send_buf_extract(Nsimd); - std::vector > recv_buf_extract(Nsimd); + static std::vector > send_buf_extract; send_buf_extract.resize(Nsimd); + static std::vector > recv_buf_extract; recv_buf_extract.resize(Nsimd); scalar_object * recv_buf_extract_mpi; scalar_object * send_buf_extract_mpi; {