From 736bf3c8664f83b613a8160734eb1fc58e3c0400 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 22 Apr 2017 11:33:50 +0100 Subject: [PATCH] Major rework of stencil. Half precision and MPI3 now working. --- lib/qcd/action/fermion/WilsonCompressor.h | 42 +- lib/qcd/action/fermion/WilsonFermion5D.cc | 52 +- lib/qcd/action/fermion/WilsonKernels.cc | 680 ++++++------------- lib/qcd/action/fermion/WilsonKernels.h | 69 +- lib/qcd/action/fermion/WilsonKernelsHand.cc | 618 ++++------------- lib/simd/Grid_qpx.h | 8 + lib/simd/Grid_sse4.h | 2 +- lib/simd/Grid_vector_types.h | 8 +- lib/stencil/Stencil.h | 692 ++++++++++---------- 9 files changed, 787 insertions(+), 1384 deletions(-) diff --git a/lib/qcd/action/fermion/WilsonCompressor.h b/lib/qcd/action/fermion/WilsonCompressor.h index f1ad390d..59fcc1f8 100644 --- a/lib/qcd/action/fermion/WilsonCompressor.h +++ b/lib/qcd/action/fermion/WilsonCompressor.h @@ -241,13 +241,18 @@ public: typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; + std::vector same_node; + WilsonStencil(GridBase *grid, int npoints, int checkerboard, const std::vector &directions, const std::vector &distances) - : CartesianStencil (grid,npoints,checkerboard,directions,distances) - { /*Do nothing*/ }; + : CartesianStencil (grid,npoints,checkerboard,directions,distances) , + same_node(npoints) + { + assert(npoints==8);// or 10 if do naive DWF 5d red black ? + }; template < class compressor> void HaloExchangeOpt(const Lattice &source,compressor &compress) @@ -257,6 +262,7 @@ public: this->CommunicateBegin(reqs); this->CommunicateComplete(reqs); this->CommsMerge(compress); + this->CommsMergeSHM(compress); } template @@ -295,23 +301,23 @@ public: int face_idx=0; if ( dag ) { // std::cout << " Optimised Dagger compress " <HaloGatherDir(source,XpCompress,Xp,face_idx); - this->HaloGatherDir(source,YpCompress,Yp,face_idx); - this->HaloGatherDir(source,ZpCompress,Zp,face_idx); - this->HaloGatherDir(source,TpCompress,Tp,face_idx); - this->HaloGatherDir(source,XmCompress,Xm,face_idx); - this->HaloGatherDir(source,YmCompress,Ym,face_idx); - this->HaloGatherDir(source,ZmCompress,Zm,face_idx); - this->HaloGatherDir(source,TmCompress,Tm,face_idx); + same_node[Xp]=this->HaloGatherDir(source,XpCompress,Xp,face_idx); + same_node[Yp]=this->HaloGatherDir(source,YpCompress,Yp,face_idx); + same_node[Zp]=this->HaloGatherDir(source,ZpCompress,Zp,face_idx); + same_node[Tp]=this->HaloGatherDir(source,TpCompress,Tp,face_idx); + same_node[Xm]=this->HaloGatherDir(source,XmCompress,Xm,face_idx); + same_node[Ym]=this->HaloGatherDir(source,YmCompress,Ym,face_idx); + same_node[Zm]=this->HaloGatherDir(source,ZmCompress,Zm,face_idx); + same_node[Tm]=this->HaloGatherDir(source,TmCompress,Tm,face_idx); } else { - this->HaloGatherDir(source,XmCompress,Xp,face_idx); - this->HaloGatherDir(source,YmCompress,Yp,face_idx); - this->HaloGatherDir(source,ZmCompress,Zp,face_idx); - this->HaloGatherDir(source,TmCompress,Tp,face_idx); - this->HaloGatherDir(source,XpCompress,Xm,face_idx); - this->HaloGatherDir(source,YpCompress,Ym,face_idx); - this->HaloGatherDir(source,ZpCompress,Zm,face_idx); - this->HaloGatherDir(source,TpCompress,Tm,face_idx); + same_node[Xp]=this->HaloGatherDir(source,XmCompress,Xp,face_idx); + same_node[Yp]=this->HaloGatherDir(source,YmCompress,Yp,face_idx); + same_node[Zp]=this->HaloGatherDir(source,ZmCompress,Zp,face_idx); + same_node[Tp]=this->HaloGatherDir(source,TmCompress,Tp,face_idx); + same_node[Xm]=this->HaloGatherDir(source,XpCompress,Xm,face_idx); + same_node[Ym]=this->HaloGatherDir(source,YpCompress,Ym,face_idx); + same_node[Zm]=this->HaloGatherDir(source,ZpCompress,Zm,face_idx); + same_node[Tm]=this->HaloGatherDir(source,TpCompress,Tm,face_idx); } this->face_table_computed=1; assert(this->u_comm_offset==this->_unified_buffer_size); diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index d021b35c..9d325ed5 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -118,48 +118,6 @@ WilsonFermion5D::WilsonFermion5D(GaugeField &_Umu, // Allocate the required comms buffer ImportGauge(_Umu); } - /* -template -WilsonFermion5D::WilsonFermion5D(int simd,GaugeField &_Umu, - GridCartesian &FiveDimGrid, - GridRedBlackCartesian &FiveDimRedBlackGrid, - GridCartesian &FourDimGrid, - RealD _M5,const ImplParams &p) : -{ - int nsimd = Simd::Nsimd(); - - // some assertions - assert(FiveDimGrid._ndimension==5); - assert(FiveDimRedBlackGrid._ndimension==5); - assert(FiveDimRedBlackGrid._checker_dim==0); // Checkerboard the s-direction - assert(FourDimGrid._ndimension==4); - - // Dimension zero of the five-d is the Ls direction - Ls=FiveDimGrid._fdimensions[0]; - assert(FiveDimGrid._processors[0] ==1); - assert(FiveDimGrid._simd_layout[0] ==nsimd); - - assert(FiveDimRedBlackGrid._fdimensions[0]==Ls); - assert(FiveDimRedBlackGrid._processors[0] ==1); - assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd); - - // Other dimensions must match the decomposition of the four-D fields - for(int d=0;d<4;d++){ - assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]); - assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]); - - assert(FourDimGrid._simd_layout[d]=1); - assert(FiveDimRedBlackGrid._simd_layout[d+1]==1); - - assert(FiveDimGrid._fdimensions[d+1] ==FourDimGrid._fdimensions[d]); - assert(FiveDimGrid._processors[d+1] ==FourDimGrid._processors[d]); - assert(FiveDimGrid._simd_layout[d+1] ==FourDimGrid._simd_layout[d]); - } - - { - } -} - */ template void WilsonFermion5D::Report(void) @@ -415,6 +373,10 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg DhopFaceTime+=usecond(); std::vector > reqs; + // Rely on async comms; start comms before merge of local data + st.CommunicateBegin(reqs); + st.CommsMergeSHM(compressor); + #pragma omp parallel { int nthreads = omp_get_num_threads(); @@ -426,7 +388,6 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg if ( me == 0 ) { DhopCommTime-=usecond(); - st.CommunicateBegin(reqs); st.CommunicateComplete(reqs); DhopCommTime+=usecond(); } else { @@ -442,10 +403,13 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg st.CommsMerge(compressor); DhopFaceTime+=usecond(); + // Load imbalance alert. Should use dynamic schedule OMP for loop + // Perhaps create a list of only those sites with face work, and + // load balance process the list. #pragma omp parallel { int nthreads = omp_get_num_threads(); - int me = omp_get_thread_num(); + int me = omp_get_thread_num(); int myoff, mywork; GridThread::GetWork(len,me,mywork,myoff,nthreads); diff --git a/lib/qcd/action/fermion/WilsonKernels.cc b/lib/qcd/action/fermion/WilsonKernels.cc index 6e72e089..6ed350cf 100644 --- a/lib/qcd/action/fermion/WilsonKernels.cc +++ b/lib/qcd/action/fermion/WilsonKernels.cc @@ -36,62 +36,78 @@ namespace QCD { int WilsonKernelsStatic::Opt = WilsonKernelsStatic::OptGeneric; int WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsAndCompute; -#ifdef QPX -#include -#include -#include -#include -#endif - -void bgq_l1p_optimisation(int mode) -{ -#ifdef QPX -#undef L1P_CFG_PF_USR -#define L1P_CFG_PF_USR (0x3fde8000108ll) /* (64 bit reg, 23 bits wide, user/unpriv) */ - - uint64_t cfg_pf_usr; - if ( mode ) { - cfg_pf_usr = - L1P_CFG_PF_USR_ifetch_depth(0) - | L1P_CFG_PF_USR_ifetch_max_footprint(1) - | L1P_CFG_PF_USR_pf_stream_est_on_dcbt - | L1P_CFG_PF_USR_pf_stream_establish_enable - | L1P_CFG_PF_USR_pf_stream_optimistic - | L1P_CFG_PF_USR_pf_adaptive_throttle(0xF) ; - // if ( sizeof(Float) == sizeof(double) ) { - cfg_pf_usr |= L1P_CFG_PF_USR_dfetch_depth(2)| L1P_CFG_PF_USR_dfetch_max_footprint(3) ; - // } else { - // cfg_pf_usr |= L1P_CFG_PF_USR_dfetch_depth(1)| L1P_CFG_PF_USR_dfetch_max_footprint(2) ; - // } - } else { - cfg_pf_usr = L1P_CFG_PF_USR_dfetch_depth(1) - | L1P_CFG_PF_USR_dfetch_max_footprint(2) - | L1P_CFG_PF_USR_ifetch_depth(0) - | L1P_CFG_PF_USR_ifetch_max_footprint(1) - | L1P_CFG_PF_USR_pf_stream_est_on_dcbt - | L1P_CFG_PF_USR_pf_stream_establish_enable - | L1P_CFG_PF_USR_pf_stream_optimistic - | L1P_CFG_PF_USR_pf_stream_prefetch_enable; - } - *((uint64_t *)L1P_CFG_PF_USR) = cfg_pf_usr; - -#endif - -} - - template WilsonKernels::WilsonKernels(const ImplParams &p) : Base(p){}; //////////////////////////////////////////// // Generic implementation; move to different file? //////////////////////////////////////////// + +#define GENERIC_STENCIL_LEG(Dir,spProj,Recon) \ + SE = st.GetEntry(ptype, Dir, sF); \ + if (SE->_is_local) { \ + chi_p = χ \ + if (SE->_permute) { \ + spProj(tmp, in._odata[SE->_offset]); \ + permute(chi, tmp, ptype); \ + } else { \ + spProj(chi, in._odata[SE->_offset]); \ + } \ + } else { \ + chi_p = &buf[SE->_offset]; \ + } \ + Impl::multLink(Uchi, U._odata[sU], *chi_p, Dir, SE, st); \ + Recon(result, Uchi); + +#define GENERIC_STENCIL_LEG_INT(Dir,spProj,Recon) \ + SE = st.GetEntry(ptype, Dir, sF); \ + if (SE->_is_local) { \ + chi_p = χ \ + if (SE->_permute) { \ + spProj(tmp, in._odata[SE->_offset]); \ + permute(chi, tmp, ptype); \ + } else { \ + spProj(chi, in._odata[SE->_offset]); \ + } \ + } else if ( st.same_node[Dir] ) { \ + chi_p = &buf[SE->_offset]; \ + } \ + if (SE->_is_local || st.same_node[Dir] ) { \ + Impl::multLink(Uchi, U._odata[sU], *chi_p, Dir, SE, st); \ + Recon(result, Uchi); \ + } +#define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon) \ + SE = st.GetEntry(ptype, Dir, sF); \ + if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \ + chi_p = &buf[SE->_offset]; \ + Impl::multLink(Uchi, U._odata[sU], *chi_p, Dir, SE, st); \ + Recon(result, Uchi); \ + nmu++; \ + } + +#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon) \ + if (gamma == Dir) { \ + if (SE->_is_local && SE->_permute) { \ + spProj(tmp, in._odata[SE->_offset]); \ + permute(chi, tmp, ptype); \ + } else if (SE->_is_local) { \ + spProj(chi, in._odata[SE->_offset]); \ + } else { \ + chi = buf[SE->_offset]; \ + } \ + Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); \ + Recon(result, Uchi); \ + } + + //////////////////////////////////////////////////////////////////// + // All legs kernels ; comms then compute + //////////////////////////////////////////////////////////////////// template void WilsonKernels::GenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - SiteHalfSpinor *buf, int sF, - int sU, const FermionField &in, FermionField &out, - int interior,int exterior) { + SiteHalfSpinor *buf, int sF, + int sU, const FermionField &in, FermionField &out) +{ SiteHalfSpinor tmp; SiteHalfSpinor chi; SiteHalfSpinor *chi_p; @@ -100,174 +116,22 @@ void WilsonKernels::GenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, StencilEntry *SE; int ptype; - /////////////////////////// - // Xp - /////////////////////////// - SE = st.GetEntry(ptype, Xp, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjXp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjXp(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Xp, SE, st); - spReconXp(result, Uchi); - - /////////////////////////// - // Yp - /////////////////////////// - SE = st.GetEntry(ptype, Yp, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjYp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjYp(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Yp, SE, st); - accumReconYp(result, Uchi); - - /////////////////////////// - // Zp - /////////////////////////// - SE = st.GetEntry(ptype, Zp, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjZp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjZp(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Zp, SE, st); - accumReconZp(result, Uchi); - - /////////////////////////// - // Tp - /////////////////////////// - SE = st.GetEntry(ptype, Tp, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjTp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjTp(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Tp, SE, st); - accumReconTp(result, Uchi); - - /////////////////////////// - // Xm - /////////////////////////// - SE = st.GetEntry(ptype, Xm, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjXm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjXm(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Xm, SE, st); - accumReconXm(result, Uchi); - - /////////////////////////// - // Ym - /////////////////////////// - SE = st.GetEntry(ptype, Ym, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjYm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjYm(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Ym, SE, st); - accumReconYm(result, Uchi); - - /////////////////////////// - // Zm - /////////////////////////// - SE = st.GetEntry(ptype, Zm, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjZm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjZm(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Zm, SE, st); - accumReconZm(result, Uchi); - - /////////////////////////// - // Tm - /////////////////////////// - SE = st.GetEntry(ptype, Tm, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjTm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjTm(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Tm, SE, st); - accumReconTm(result, Uchi); - + GENERIC_STENCIL_LEG(Xp,spProjXp,spReconXp); + GENERIC_STENCIL_LEG(Yp,spProjYp,accumReconYp); + GENERIC_STENCIL_LEG(Zp,spProjZp,accumReconZp); + GENERIC_STENCIL_LEG(Tp,spProjTp,accumReconTp); + GENERIC_STENCIL_LEG(Xm,spProjXm,accumReconXm); + GENERIC_STENCIL_LEG(Ym,spProjYm,accumReconYm); + GENERIC_STENCIL_LEG(Zm,spProjZm,accumReconZm); + GENERIC_STENCIL_LEG(Tm,spProjTm,accumReconTm); vstream(out._odata[sF], result); }; -// Need controls to do interior, exterior, or both template void WilsonKernels::GenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - SiteHalfSpinor *buf, int sF, - int sU, const FermionField &in, FermionField &out,int interior,int exterior) { + SiteHalfSpinor *buf, int sF, + int sU, const FermionField &in, FermionField &out) +{ SiteHalfSpinor tmp; SiteHalfSpinor chi; SiteHalfSpinor *chi_p; @@ -276,168 +140,123 @@ void WilsonKernels::GenericDhopSite(StencilImpl &st, LebesgueOrder &lo, Do StencilEntry *SE; int ptype; - /////////////////////////// - // Xp - /////////////////////////// - SE = st.GetEntry(ptype, Xm, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjXp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjXp(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Xm, SE, st); - spReconXp(result, Uchi); - - /////////////////////////// - // Yp - /////////////////////////// - SE = st.GetEntry(ptype, Ym, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjYp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjYp(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Ym, SE, st); - accumReconYp(result, Uchi); - - /////////////////////////// - // Zp - /////////////////////////// - SE = st.GetEntry(ptype, Zm, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjZp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjZp(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Zm, SE, st); - accumReconZp(result, Uchi); - - /////////////////////////// - // Tp - /////////////////////////// - SE = st.GetEntry(ptype, Tm, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjTp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjTp(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Tm, SE, st); - accumReconTp(result, Uchi); - - /////////////////////////// - // Xm - /////////////////////////// - SE = st.GetEntry(ptype, Xp, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjXm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjXm(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Xp, SE, st); - accumReconXm(result, Uchi); - - /////////////////////////// - // Ym - /////////////////////////// - SE = st.GetEntry(ptype, Yp, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjYm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjYm(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Yp, SE, st); - accumReconYm(result, Uchi); - - /////////////////////////// - // Zm - /////////////////////////// - SE = st.GetEntry(ptype, Zp, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjZm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjZm(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Zp, SE, st); - accumReconZm(result, Uchi); - - /////////////////////////// - // Tm - /////////////////////////// - SE = st.GetEntry(ptype, Tp, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjTm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjTm(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Tp, SE, st); - accumReconTm(result, Uchi); - + GENERIC_STENCIL_LEG(Xm,spProjXp,spReconXp); + GENERIC_STENCIL_LEG(Ym,spProjYp,accumReconYp); + GENERIC_STENCIL_LEG(Zm,spProjZp,accumReconZp); + GENERIC_STENCIL_LEG(Tm,spProjTp,accumReconTp); + GENERIC_STENCIL_LEG(Xp,spProjXm,accumReconXm); + GENERIC_STENCIL_LEG(Yp,spProjYm,accumReconYm); + GENERIC_STENCIL_LEG(Zp,spProjZm,accumReconZm); + GENERIC_STENCIL_LEG(Tp,spProjTm,accumReconTm); vstream(out._odata[sF], result); }; + //////////////////////////////////////////////////////////////////// + // Interior kernels + //////////////////////////////////////////////////////////////////// +template +void WilsonKernels::GenericDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + SiteHalfSpinor *buf, int sF, + int sU, const FermionField &in, FermionField &out) +{ + SiteHalfSpinor tmp; + SiteHalfSpinor chi; + SiteHalfSpinor *chi_p; + SiteHalfSpinor Uchi; + SiteSpinor result; + StencilEntry *SE; + int ptype; + + result=zero; + GENERIC_STENCIL_LEG_INT(Xp,spProjXp,accumReconXp); + GENERIC_STENCIL_LEG_INT(Yp,spProjYp,accumReconYp); + GENERIC_STENCIL_LEG_INT(Zp,spProjZp,accumReconZp); + GENERIC_STENCIL_LEG_INT(Tp,spProjTp,accumReconTp); + GENERIC_STENCIL_LEG_INT(Xm,spProjXm,accumReconXm); + GENERIC_STENCIL_LEG_INT(Ym,spProjYm,accumReconYm); + GENERIC_STENCIL_LEG_INT(Zm,spProjZm,accumReconZm); + GENERIC_STENCIL_LEG_INT(Tm,spProjTm,accumReconTm); + vstream(out._odata[sF], result); +}; + +template +void WilsonKernels::GenericDhopSiteInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + SiteHalfSpinor *buf, int sF, + int sU, const FermionField &in, FermionField &out) +{ + SiteHalfSpinor tmp; + SiteHalfSpinor chi; + SiteHalfSpinor *chi_p; + SiteHalfSpinor Uchi; + SiteSpinor result; + StencilEntry *SE; + int ptype; + result=zero; + GENERIC_STENCIL_LEG_INT(Xm,spProjXp,accumReconXp); + GENERIC_STENCIL_LEG_INT(Ym,spProjYp,accumReconYp); + GENERIC_STENCIL_LEG_INT(Zm,spProjZp,accumReconZp); + GENERIC_STENCIL_LEG_INT(Tm,spProjTp,accumReconTp); + GENERIC_STENCIL_LEG_INT(Xp,spProjXm,accumReconXm); + GENERIC_STENCIL_LEG_INT(Yp,spProjYm,accumReconYm); + GENERIC_STENCIL_LEG_INT(Zp,spProjZm,accumReconZm); + GENERIC_STENCIL_LEG_INT(Tp,spProjTm,accumReconTm); + vstream(out._odata[sF], result); +}; +//////////////////////////////////////////////////////////////////// +// Exterior kernels +//////////////////////////////////////////////////////////////////// +template +void WilsonKernels::GenericDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + SiteHalfSpinor *buf, int sF, + int sU, const FermionField &in, FermionField &out) +{ + SiteHalfSpinor tmp; + SiteHalfSpinor chi; + SiteHalfSpinor *chi_p; + SiteHalfSpinor Uchi; + SiteSpinor result; + StencilEntry *SE; + int ptype; + int nmu=0; + result=zero; + GENERIC_STENCIL_LEG_EXT(Xp,spProjXp,accumReconXp); + GENERIC_STENCIL_LEG_EXT(Yp,spProjYp,accumReconYp); + GENERIC_STENCIL_LEG_EXT(Zp,spProjZp,accumReconZp); + GENERIC_STENCIL_LEG_EXT(Tp,spProjTp,accumReconTp); + GENERIC_STENCIL_LEG_EXT(Xm,spProjXm,accumReconXm); + GENERIC_STENCIL_LEG_EXT(Ym,spProjYm,accumReconYm); + GENERIC_STENCIL_LEG_EXT(Zm,spProjZm,accumReconZm); + GENERIC_STENCIL_LEG_EXT(Tm,spProjTm,accumReconTm); + if ( nmu ) { + out._odata[sF] = out._odata[sF] + result; + } +}; + +template +void WilsonKernels::GenericDhopSiteExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + SiteHalfSpinor *buf, int sF, + int sU, const FermionField &in, FermionField &out) +{ + SiteHalfSpinor tmp; + SiteHalfSpinor chi; + SiteHalfSpinor *chi_p; + SiteHalfSpinor Uchi; + SiteSpinor result; + StencilEntry *SE; + int ptype; + int nmu=0; + result=zero; + GENERIC_STENCIL_LEG_EXT(Xm,spProjXp,accumReconXp); + GENERIC_STENCIL_LEG_EXT(Ym,spProjYp,accumReconYp); + GENERIC_STENCIL_LEG_EXT(Zm,spProjZp,accumReconZp); + GENERIC_STENCIL_LEG_EXT(Tm,spProjTp,accumReconTp); + GENERIC_STENCIL_LEG_EXT(Xp,spProjXm,accumReconXm); + GENERIC_STENCIL_LEG_EXT(Yp,spProjYm,accumReconYm); + GENERIC_STENCIL_LEG_EXT(Zp,spProjZm,accumReconZm); + GENERIC_STENCIL_LEG_EXT(Tp,spProjTm,accumReconTm); + if ( nmu ) { + out._odata[sF] = out._odata[sF] + result; + } +}; template void WilsonKernels::DhopDir( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int sF, @@ -451,119 +270,14 @@ void WilsonKernels::DhopDir( StencilImpl &st, DoubledGaugeField &U,SiteHal int ptype; SE = st.GetEntry(ptype, dir, sF); - - // Xp - if (gamma == Xp) { - if (SE->_is_local && SE->_permute) { - spProjXp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else if (SE->_is_local) { - spProjXp(chi, in._odata[SE->_offset]); - } else { - chi = buf[SE->_offset]; - } - Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); - spReconXp(result, Uchi); - } - - // Yp - if (gamma == Yp) { - if (SE->_is_local && SE->_permute) { - spProjYp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else if (SE->_is_local) { - spProjYp(chi, in._odata[SE->_offset]); - } else { - chi = buf[SE->_offset]; - } - Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); - spReconYp(result, Uchi); - } - - // Zp - if (gamma == Zp) { - if (SE->_is_local && SE->_permute) { - spProjZp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else if (SE->_is_local) { - spProjZp(chi, in._odata[SE->_offset]); - } else { - chi = buf[SE->_offset]; - } - Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); - spReconZp(result, Uchi); - } - - // Tp - if (gamma == Tp) { - if (SE->_is_local && SE->_permute) { - spProjTp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else if (SE->_is_local) { - spProjTp(chi, in._odata[SE->_offset]); - } else { - chi = buf[SE->_offset]; - } - Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); - spReconTp(result, Uchi); - } - - // Xm - if (gamma == Xm) { - if (SE->_is_local && SE->_permute) { - spProjXm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else if (SE->_is_local) { - spProjXm(chi, in._odata[SE->_offset]); - } else { - chi = buf[SE->_offset]; - } - Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); - spReconXm(result, Uchi); - } - - // Ym - if (gamma == Ym) { - if (SE->_is_local && SE->_permute) { - spProjYm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else if (SE->_is_local) { - spProjYm(chi, in._odata[SE->_offset]); - } else { - chi = buf[SE->_offset]; - } - Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); - spReconYm(result, Uchi); - } - - // Zm - if (gamma == Zm) { - if (SE->_is_local && SE->_permute) { - spProjZm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else if (SE->_is_local) { - spProjZm(chi, in._odata[SE->_offset]); - } else { - chi = buf[SE->_offset]; - } - Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); - spReconZm(result, Uchi); - } - - // Tm - if (gamma == Tm) { - if (SE->_is_local && SE->_permute) { - spProjTm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else if (SE->_is_local) { - spProjTm(chi, in._odata[SE->_offset]); - } else { - chi = buf[SE->_offset]; - } - Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); - spReconTm(result, Uchi); - } - + GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp); + GENERIC_DHOPDIR_LEG(Yp,spProjYp,spReconYp); + GENERIC_DHOPDIR_LEG(Zp,spProjZp,spReconZp); + GENERIC_DHOPDIR_LEG(Tp,spProjTp,spReconTp); + GENERIC_DHOPDIR_LEG(Xm,spProjXm,spReconXm); + GENERIC_DHOPDIR_LEG(Ym,spProjYm,spReconYm); + GENERIC_DHOPDIR_LEG(Zm,spProjZm,spReconZm); + GENERIC_DHOPDIR_LEG(Tm,spProjTm,spReconTm); vstream(out._odata[sF], result); } diff --git a/lib/qcd/action/fermion/WilsonKernels.h b/lib/qcd/action/fermion/WilsonKernels.h index 20ee87f2..e0aa08b0 100644 --- a/lib/qcd/action/fermion/WilsonKernels.h +++ b/lib/qcd/action/fermion/WilsonKernels.h @@ -34,8 +34,6 @@ directory namespace Grid { namespace QCD { -void bgq_l1p_optimisation(int mode); - //////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Helper routines that implement Wilson stencil for a single site. // Common to both the WilsonFermion and WilsonFermion5D @@ -44,9 +42,8 @@ class WilsonKernelsStatic { public: enum { OptGeneric, OptHandUnroll, OptInlineAsm }; enum { CommsAndCompute, CommsThenCompute }; - // S-direction is INNERMOST and takes no part in the parity. - static int Opt; // these are a temporary hack - static int Comms; // these are a temporary hack + static int Opt; + static int Comms; }; template class WilsonKernels : public FermionOperator , public WilsonKernelsStatic { @@ -75,7 +72,7 @@ public: case OptHandUnroll: for (int site = 0; site < Ns; site++) { for (int s = 0; s < Ls; s++) { - if( exterior) WilsonKernels::HandDhopSite(st,lo,U,buf,sF,sU,in,out,interior,exterior); + if( exterior) WilsonKernels::HandDhopSite(st,lo,U,buf,sF,sU,in,out); sF++; } sU++; @@ -84,7 +81,10 @@ public: case OptGeneric: for (int site = 0; site < Ns; site++) { for (int s = 0; s < Ls; s++) { - if( exterior) WilsonKernels::GenericDhopSite(st,lo,U,buf,sF,sU,in,out,interior,exterior); + if(interior&&exterior) WilsonKernels::GenericDhopSite(st,lo,U,buf,sF,sU,in,out); + else if (interior) WilsonKernels::GenericDhopSiteInt(st,lo,U,buf,sF,sU,in,out); + else if (exterior) WilsonKernels::GenericDhopSiteExt(st,lo,U,buf,sF,sU,in,out); + else assert(0); sF++; } sU++; @@ -99,11 +99,14 @@ public: template typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1 ) { + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1 ) { // no kernel choice for (int site = 0; site < Ns; site++) { for (int s = 0; s < Ls; s++) { - if( exterior) WilsonKernels::GenericDhopSite(st, lo, U, buf, sF, sU, in, out,interior,exterior); + if(interior&&exterior) WilsonKernels::GenericDhopSite(st,lo,U,buf,sF,sU,in,out); + else if (interior) WilsonKernels::GenericDhopSiteInt(st,lo,U,buf,sF,sU,in,out); + else if (exterior) WilsonKernels::GenericDhopSiteExt(st,lo,U,buf,sF,sU,in,out); + else assert(0); sF++; } sU++; @@ -113,8 +116,8 @@ public: template typename std::enable_if::type DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) { - + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) +{ bgq_l1p_optimisation(1); switch(Opt) { #if defined(AVX512) || defined (QPX) @@ -128,7 +131,7 @@ public: case OptHandUnroll: for (int site = 0; site < Ns; site++) { for (int s = 0; s < Ls; s++) { - if( exterior) WilsonKernels::HandDhopSiteDag(st,lo,U,buf,sF,sU,in,out,interior,exterior); + if( exterior) WilsonKernels::HandDhopSiteDag(st,lo,U,buf,sF,sU,in,out); sF++; } sU++; @@ -137,7 +140,10 @@ public: case OptGeneric: for (int site = 0; site < Ns; site++) { for (int s = 0; s < Ls; s++) { - if( exterior) WilsonKernels::GenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out,interior,exterior); + if(interior&&exterior) WilsonKernels::GenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out); + else if (interior) WilsonKernels::GenericDhopSiteDagInt(st,lo,U,buf,sF,sU,in,out); + else if (exterior) WilsonKernels::GenericDhopSiteDagExt(st,lo,U,buf,sF,sU,in,out); + else assert(0); sF++; } sU++; @@ -156,7 +162,10 @@ public: for (int site = 0; site < Ns; site++) { for (int s = 0; s < Ls; s++) { - if( exterior) WilsonKernels::GenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out,interior,exterior); + if(interior&&exterior) WilsonKernels::GenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out); + else if (interior) WilsonKernels::GenericDhopSiteDagInt(st,lo,U,buf,sF,sU,in,out); + else if (exterior) WilsonKernels::GenericDhopSiteDagExt(st,lo,U,buf,sF,sU,in,out); + else assert(0); sF++; } sU++; @@ -169,35 +178,47 @@ public: private: // Specialised variants void GenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionField &in, FermionField &out,int interior,int exterior); + int sF, int sU, const FermionField &in, FermionField &out); void GenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionField &in, FermionField &out,int interior,int exterior); + int sF, int sU, const FermionField &in, FermionField &out); + + void GenericDhopSiteInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); + + void GenericDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); + + void GenericDhopSiteExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); + + void GenericDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); void AsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); + int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); void AsmDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out); + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out); void AsmDhopSiteInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); + int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); void AsmDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out); + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out); void AsmDhopSiteExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); + int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); void AsmDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out); + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out); void HandDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionField &in, FermionField &out,int interior,int exterior); + int sF, int sU, const FermionField &in, FermionField &out); void HandDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionField &in, FermionField &out,int interior,int exterior); + int sF, int sU, const FermionField &in, FermionField &out); public: diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/lib/qcd/action/fermion/WilsonKernelsHand.cc index b7923b9f..f12d94c7 100644 --- a/lib/qcd/action/fermion/WilsonKernelsHand.cc +++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc @@ -307,55 +307,106 @@ Author: paboyle result_31-= UChi_11; \ result_32-= UChi_12; -namespace Grid { -namespace QCD { +#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \ + SE=st.GetEntry(ptype,DIR,ss); \ + offset = SE->_offset; \ + local = SE->_is_local; \ + perm = SE->_permute; \ + if ( local ) { \ + LOAD_CHIMU; \ + PROJ; \ + if ( perm) { \ + PERMUTE_DIR(PERM); \ + } \ + } else { \ + LOAD_CHI; \ + } \ + { \ + MULT_2SPIN(DIR); \ + } \ + RECON; + +#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON) \ + SE=st.GetEntry(ptype,DIR,ss); \ + offset = SE->_offset; \ + local = SE->_is_local; \ + perm = SE->_permute; \ + if ( local ) { \ + LOAD_CHIMU; \ + PROJ; \ + if ( perm) { \ + PERMUTE_DIR(PERM); \ + } \ + } else { \ + if ( st.same_node[DIR] ) { \ + LOAD_CHI; \ + } \ + } \ + if (local || st.same_node[DIR] ) { \ + MULT_2SPIN(DIR); \ + RECON; \ + } + +#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON) \ + SE=st.GetEntry(ptype,Dir,ss); \ + offset = SE->_offset; \ + local = SE->_is_local; \ + perm = SE->_permute; \ + if((!SE->_is_local)&&(!st.same_node[Dir]) ) { \ + LOAD_CHI; \ + MULT_2SPIN(DIR); \ + RECON; \ + } + +#define HAND_RESULT(ss) \ + { \ + SiteSpinor & ref (out._odata[ss]); \ + vstream(ref()(0)(0),result_00); \ + vstream(ref()(0)(1),result_01); \ + vstream(ref()(0)(2),result_02); \ + vstream(ref()(1)(0),result_10); \ + vstream(ref()(1)(1),result_11); \ + vstream(ref()(1)(2),result_12); \ + vstream(ref()(2)(0),result_20); \ + vstream(ref()(2)(1),result_21); \ + vstream(ref()(2)(2),result_22); \ + vstream(ref()(3)(0),result_30); \ + vstream(ref()(3)(1),result_31); \ + vstream(ref()(3)(2),result_32); \ + } -template void -WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, - int ss,int sU,const FermionField &in, FermionField &out,int interior,int exterior) -{ - typedef typename Simd::scalar_type S; - typedef typename Simd::vector_type V; - - REGISTER Simd result_00; // 12 regs on knc - REGISTER Simd result_01; - REGISTER Simd result_02; - - REGISTER Simd result_10; - REGISTER Simd result_11; - REGISTER Simd result_12; - - REGISTER Simd result_20; - REGISTER Simd result_21; - REGISTER Simd result_22; - - REGISTER Simd result_30; - REGISTER Simd result_31; - REGISTER Simd result_32; // 20 left - - REGISTER Simd Chi_00; // two spinor; 6 regs - REGISTER Simd Chi_01; - REGISTER Simd Chi_02; - - REGISTER Simd Chi_10; - REGISTER Simd Chi_11; - REGISTER Simd Chi_12; // 14 left - - REGISTER Simd UChi_00; // two spinor; 6 regs - REGISTER Simd UChi_01; - REGISTER Simd UChi_02; - - REGISTER Simd UChi_10; - REGISTER Simd UChi_11; - REGISTER Simd UChi_12; // 8 left - - REGISTER Simd U_00; // two rows of U matrix - REGISTER Simd U_10; - REGISTER Simd U_20; - REGISTER Simd U_01; - REGISTER Simd U_11; - REGISTER Simd U_21; // 2 reg left. +#define HAND_DECLARATIONS(a) \ + Simd result_00; \ + Simd result_01; \ + Simd result_02; \ + Simd result_10; \ + Simd result_11; \ + Simd result_12; \ + Simd result_20; \ + Simd result_21; \ + Simd result_22; \ + Simd result_30; \ + Simd result_31; \ + Simd result_32; \ + Simd Chi_00; \ + Simd Chi_01; \ + Simd Chi_02; \ + Simd Chi_10; \ + Simd Chi_11; \ + Simd Chi_12; \ + Simd UChi_00; \ + Simd UChi_01; \ + Simd UChi_02; \ + Simd UChi_10; \ + Simd UChi_11; \ + Simd UChi_12; \ + Simd U_00; \ + Simd U_10; \ + Simd U_20; \ + Simd U_01; \ + Simd U_11; \ + Simd U_21; #define Chimu_00 Chi_00 #define Chimu_01 Chi_01 @@ -370,430 +421,54 @@ WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGauge #define Chimu_31 UChi_11 #define Chimu_32 UChi_12 +namespace Grid { +namespace QCD { + +template void +WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionField &in, FermionField &out) +{ +// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + HAND_DECLARATIONS(ignore); int offset,local,perm, ptype; StencilEntry *SE; - // Xp - SE=st.GetEntry(ptype,Xp,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - XM_PROJ; - if ( perm) { - PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Xp); - } - XM_RECON; - - // Yp - SE=st.GetEntry(ptype,Yp,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - YM_PROJ; - if ( perm) { - PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Yp); - } - YM_RECON_ACCUM; - - - // Zp - SE=st.GetEntry(ptype,Zp,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - ZM_PROJ; - if ( perm) { - PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Zp); - } - ZM_RECON_ACCUM; - - // Tp - SE=st.GetEntry(ptype,Tp,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - TM_PROJ; - if ( perm) { - PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Tp); - } - TM_RECON_ACCUM; - - // Xm - SE=st.GetEntry(ptype,Xm,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - XP_PROJ; - if ( perm) { - PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Xm); - } - XP_RECON_ACCUM; - - - // Ym - SE=st.GetEntry(ptype,Ym,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - YP_PROJ; - if ( perm) { - PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Ym); - } - YP_RECON_ACCUM; - - // Zm - SE=st.GetEntry(ptype,Zm,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - ZP_PROJ; - if ( perm) { - PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Zm); - } - ZP_RECON_ACCUM; - - // Tm - SE=st.GetEntry(ptype,Tm,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - TP_PROJ; - if ( perm) { - PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Tm); - } - TP_RECON_ACCUM; - - { - SiteSpinor & ref (out._odata[ss]); - vstream(ref()(0)(0),result_00); - vstream(ref()(0)(1),result_01); - vstream(ref()(0)(2),result_02); - vstream(ref()(1)(0),result_10); - vstream(ref()(1)(1),result_11); - vstream(ref()(1)(2),result_12); - vstream(ref()(2)(0),result_20); - vstream(ref()(2)(1),result_21); - vstream(ref()(2)(2),result_22); - vstream(ref()(3)(0),result_30); - vstream(ref()(3)(1),result_31); - vstream(ref()(3)(2),result_32); - } + HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON); + HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM); + HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); + HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM); + HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM); + HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM); + HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM); + HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM); + HAND_RESULT(ss); } template void WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, - int ss,int sU,const FermionField &in, FermionField &out,int interior,int exterior) + int ss,int sU,const FermionField &in, FermionField &out) { - // std::cout << "Hand op Dhop "<_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - XP_PROJ; - if ( perm) { - PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - - { - MULT_2SPIN(Xp); - } - XP_RECON; - - // Yp - SE=st.GetEntry(ptype,Yp,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - YP_PROJ; - if ( perm) { - PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Yp); - } - YP_RECON_ACCUM; - - - // Zp - SE=st.GetEntry(ptype,Zp,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - ZP_PROJ; - if ( perm) { - PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Zp); - } - ZP_RECON_ACCUM; - - // Tp - SE=st.GetEntry(ptype,Tp,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - TP_PROJ; - if ( perm) { - PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Tp); - } - TP_RECON_ACCUM; - - // Xm - SE=st.GetEntry(ptype,Xm,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - XM_PROJ; - if ( perm) { - PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Xm); - } - XM_RECON_ACCUM; - - // Ym - SE=st.GetEntry(ptype,Ym,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - YM_PROJ; - if ( perm) { - PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Ym); - } - YM_RECON_ACCUM; - - // Zm - SE=st.GetEntry(ptype,Zm,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - ZM_PROJ; - if ( perm) { - PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Zm); - } - ZM_RECON_ACCUM; - - // Tm - SE=st.GetEntry(ptype,Tm,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - TM_PROJ; - if ( perm) { - PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Tm); - } - TM_RECON_ACCUM; - - { - SiteSpinor & ref (out._odata[ss]); - vstream(ref()(0)(0),result_00); - vstream(ref()(0)(1),result_01); - vstream(ref()(0)(2),result_02); - vstream(ref()(1)(0),result_10); - vstream(ref()(1)(1),result_11); - vstream(ref()(1)(2),result_12); - vstream(ref()(2)(0),result_20); - vstream(ref()(2)(1),result_21); - vstream(ref()(2)(2),result_22); - vstream(ref()(3)(0),result_30); - vstream(ref()(3)(1),result_31); - vstream(ref()(3)(2),result_32); - } + HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON); + HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM); + HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); + HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM); + HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM); + HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM); + HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM); + HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM); + HAND_RESULT(ss); } //////////////////////////////////////////////// @@ -801,74 +476,71 @@ void WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,Doub //////////////////////////////////////////////// template<> void WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - SiteHalfSpinor *buf, - int sF,int sU,const FermionField &in, FermionField &out,int internal,int external) + SiteHalfSpinor *buf, + int sF,int sU,const FermionField &in, FermionField &out) { assert(0); } template<> void WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - SiteHalfSpinor *buf, - int sF,int sU,const FermionField &in, FermionField &out,int internal,int external) + SiteHalfSpinor *buf, + int sF,int sU,const FermionField &in, FermionField &out) { assert(0); } template<> void WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, - int sF,int sU,const FermionField &in, FermionField &out,int internal,int external) + int sF,int sU,const FermionField &in, FermionField &out) { assert(0); } template<> void WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, - int sF,int sU,const FermionField &in, FermionField &out,int internal,int external) + int sF,int sU,const FermionField &in, FermionField &out) { assert(0); } template<> void WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - SiteHalfSpinor *buf, - int sF,int sU,const FermionField &in, FermionField &out,int internal,int external) + SiteHalfSpinor *buf, + int sF,int sU,const FermionField &in, FermionField &out) { assert(0); } template<> void WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - SiteHalfSpinor *buf, - int sF,int sU,const FermionField &in, FermionField &out,int internal,int external) + SiteHalfSpinor *buf, + int sF,int sU,const FermionField &in, FermionField &out) { assert(0); } template<> void WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, - int sF,int sU,const FermionField &in, FermionField &out,int internal,int external) + int sF,int sU,const FermionField &in, FermionField &out) { assert(0); } template<> void WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, - int sF,int sU,const FermionField &in, FermionField &out,int internal,int external) + int sF,int sU,const FermionField &in, FermionField &out) { assert(0); } - - ////////////// Wilson ; uses this implementation ///////////////////// -// Need Nc=3 though // #define INSTANTIATE_THEM(A) \ template void WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\ - int ss,int sU,const FermionField &in, FermionField &out,int interior,int exterior); \ -template void WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\ - int ss,int sU,const FermionField &in, FermionField &out,int interior,int exterior); + int ss,int sU,const FermionField &in, FermionField &out); \ +template void WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \ + int ss,int sU,const FermionField &in, FermionField &out); INSTANTIATE_THEM(WilsonImplF); INSTANTIATE_THEM(WilsonImplD); diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h index 757c84c9..cbca9118 100644 --- a/lib/simd/Grid_qpx.h +++ b/lib/simd/Grid_qpx.h @@ -33,6 +33,14 @@ #include "Grid_generic_types.h" // Definitions for simulated integer SIMD. namespace Grid { + +#ifdef QPX +#include +#include +#include +#include +#endif + namespace Optimization { typedef struct { diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h index 59ad996a..2fb2df76 100644 --- a/lib/simd/Grid_sse4.h +++ b/lib/simd/Grid_sse4.h @@ -411,7 +411,7 @@ namespace Optimization { hp[3] = sfw_float_to_half(fp[3]); return ret; } - static inline __m128i Grid_mm_cvtph_ps(__m128i h,int discard) { + static inline __m128 Grid_mm_cvtph_ps(__m128i h,int discard) { __m128 ret=_mm_setzero_ps(); float *fp = (float *)&ret; Grid_half *hp = (Grid_half *)&h; diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index ac17d1e0..de4a13b5 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -53,12 +53,14 @@ directory #if defined IMCI #include "Grid_imci.h" #endif -#if defined QPX -#include "Grid_qpx.h" -#endif #ifdef NEONv8 #include "Grid_neon.h" #endif +#if defined QPX +#include "Grid_qpx.h" +#endif + +#include "l1p.h" namespace Grid { diff --git a/lib/stencil/Stencil.h b/lib/stencil/Stencil.h index 81138fc6..32d1255a 100644 --- a/lib/stencil/Stencil.h +++ b/lib/stencil/Stencil.h @@ -36,35 +36,16 @@ // gather to a point stencil code. CSHIFT is not the best way, so need // additional stencil support. // - // Stencil based code will pre-exchange haloes and use a table lookup for neighbours. + // Stencil based code will exchange haloes and use a table lookup for neighbours. // This will be done with generality to allow easier efficient implementations. - // Overlap of comms and compute could be semi-automated by tabulating off-node connected, - // and - // - // Lattice could also allocate haloes which get used for stencil code. - // - // Grid could create a neighbour index table for a given stencil. - // - // Could also implement CovariantCshift, to fuse the loops and enhance performance. - // - // - // General stencil computation: - // + // Overlap of comms and compute is enabled by tabulating off-node connected, + // // Generic services // 0) Prebuild neighbour tables // 1) Compute sizes of all haloes/comms buffers; allocate them. - // // 2) Gather all faces, and communicate. // 3) Loop over result sites, giving nbr index/offnode info for each // - // Could take a - // SpinProjectFaces - // start comms - // complete comms - // Reconstruct Umu - // - // Approach. - // ////////////////////////////////////////////////////////////////////////////////////////// namespace Grid { @@ -108,27 +89,17 @@ void Gather_plane_exchange_table(std::vector >& table,const L } } -struct StencilEntry { - uint64_t _offset; - uint64_t _byte_offset; - uint16_t _is_local; - uint16_t _permute; - uint32_t _around_the_world; //256 bits, 32 bytes, 1/2 cacheline -}; + struct StencilEntry { + uint64_t _offset; + uint64_t _byte_offset; + uint16_t _is_local; + uint16_t _permute; + uint32_t _around_the_world; //256 bits, 32 bytes, 1/2 cacheline + }; -////////////////////////////////////////////////////////////////////////////////// -//Lattice object type, compressed object type. -// -//These only need to be known outside of halo exchange subset of methods -//because byte offsets are precomputed -// -//It might help/be cleaner to add a "mobj" for the mpi transferred object to this list -//for the on the wire representation. -// -//This would clean up the "casts" in the WilsonCompressor.h file -// -//Other compressors (SimpleCompressor) retains mobj == cobj so no issue -////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////// +// The Stencil Class itself +//////////////////////////////////////// template class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in. public: @@ -139,6 +110,77 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal typedef typename cobj::scalar_type scalar_type; typedef typename cobj::scalar_object scalar_object; + /////////////////////////////////////////// + // Helper structs + /////////////////////////////////////////// + struct Packet { + void * send_buf; + void * recv_buf; + Integer to_rank; + Integer from_rank; + Integer bytes; + }; + struct Merge { + cobj * mpointer; + std::vector rpointers; + std::vector vpointers; + Integer buffer_size; + Integer type; + }; + struct Decompress { + cobj * kernel_p; + cobj * mpi_p; + Integer buffer_size; + }; + + //////////////////////////////////////// + // Basic Grid and stencil info + //////////////////////////////////////// + int face_table_computed; + std::vector > > face_table ; + + + int _checkerboard; + int _npoints; // Move to template param? + GridBase * _grid; + + // npoints of these + std::vector _directions; + std::vector _distances; + std::vector _comm_buf_size; + std::vector _permute_type; + + Vector _entries; + std::vector Packets; + std::vector Mergers; + std::vector MergersSHM; + std::vector Decompressions; + std::vector DecompressionsSHM; + + /////////////////////////////////////////////////////////// + // Unified Comms buffers for all directions + /////////////////////////////////////////////////////////// + // Vectors that live on the symmetric heap in case of SHMEM + // These are used; either SHM objects or refs to the above symmetric heap vectors + // depending on comms target + cobj* u_recv_buf_p; + cobj* u_send_buf_p; + std::vector u_simd_send_buf; + std::vector u_simd_recv_buf; + + int u_comm_offset; + int _unified_buffer_size; +/* + std::vector compute2sites; + const std::vector &Compute2Sites(void) { return compute2sites; } + void CalculateCompute2Sites(void) { + for(int i=0;i< _entries.size();i++){ + + } + } +*/ + cobj *CommBuf(void) { return u_recv_buf_p; } + ///////////////////////////////////////// // Timing info; ugly; possibly temporary ///////////////////////////////////////// @@ -152,193 +194,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal double splicetime; double nosplicetime; double calls; - - void ZeroCounters(void) { - gathertime = 0.; - commtime = 0.; - halogtime = 0.; - mergetime = 0.; - decompresstime = 0.; - gathermtime = 0.; - splicetime = 0.; - nosplicetime = 0.; - comms_bytes = 0.; - calls = 0.; - }; - - void Report(void) { -#define AVERAGE(A) _grid->GlobalSum(A);A/=NP; -#define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<_Nprocessors; - RealD NN = _grid->NodeCount(); - - _grid->GlobalSum(commtime); commtime/=NP; - if ( calls > 0. ) { - std::cout << GridLogMessage << " Stencil calls "<1.0){ - PRINTIT(comms_bytes); - PRINTIT(commtime); - std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"< Packets; - - int face_table_computed; - std::vector > > face_table ; - - void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){ - Packet p; - p.send_buf = xmit; - p.recv_buf = rcv; - p.to_rank = to; - p.from_rank= from; - p.bytes = bytes; - Packets.push_back(p); - } - - void CommunicateBegin(std::vector > &reqs) - { - reqs.resize(Packets.size()); - commtime-=usecond(); - for(int i=0;iStencilSendToRecvFromBegin(reqs[i], - Packets[i].send_buf, - Packets[i].to_rank, - Packets[i].recv_buf, - Packets[i].from_rank, - Packets[i].bytes); - } - commtime+=usecond(); - } - void CommunicateComplete(std::vector > &reqs) - { - commtime-=usecond(); - for(int i=0;iStencilSendToRecvFromComplete(reqs[i]); - } - _grid->StencilBarrier();// Synch shared memory on a single nodes - commtime+=usecond(); - } - - /////////////////////////////////////////// - // Simd merge queue for asynch comms - /////////////////////////////////////////// - struct Merge { - cobj * mpointer; - std::vector rpointers; - std::vector vpointers; - Integer buffer_size; - Integer type; - }; - - std::vector Mergers; - - struct Decompress { - cobj * kernel_p; - cobj * mpi_p; - Integer buffer_size; - }; - - void Prepare(void) - { - Decompressions.resize(0); - Mergers.resize(0); - Packets.resize(0); - calls++; - } - std::vector Decompressions; - - void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size) { - Decompress d; - d.kernel_p = k_p; - d.mpi_p = m_p; - d.buffer_size = buffer_size; - Decompressions.push_back(d); - } - - void AddMerge(cobj *merge_p,std::vector &rpointers,Integer buffer_size,Integer type) { - Merge m; - m.type = type; - m.mpointer = merge_p; - m.vpointers= rpointers; - m.buffer_size = buffer_size; - Mergers.push_back(m); - } - - template - void CommsMerge(decompressor decompress) { - - // Also do a precision convert possibly on a receive buffer - for(int i=0;i _directions; - std::vector _distances; - std::vector _comm_buf_size; - std::vector _permute_type; - - // npoints x Osites() of these - // Flat vector, change layout for cache friendly. - Vector _entries; - - void PrecomputeByteOffsets(void){ - for(int i=0;i<_entries.size();i++){ - if( _entries[i]._is_local ) { - _entries[i]._byte_offset = _entries[i]._offset*sizeof(vobj); - } else { - _entries[i]._byte_offset = _entries[i]._offset*sizeof(cobj); - } - } - }; inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; @@ -362,23 +221,196 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal if (local) return base + _entries[ent]._byte_offset; else return cbase + _entries[ent]._byte_offset; } - - /////////////////////////////////////////////////////////// - // Unified Comms buffers for all directions - /////////////////////////////////////////////////////////// - // Vectors that live on the symmetric heap in case of SHMEM - // These are used; either SHM objects or refs to the above symmetric heap vectors - // depending on comms target - cobj* u_recv_buf_p; - cobj* u_send_buf_p; - std::vector u_simd_send_buf; - std::vector u_simd_recv_buf; - int u_comm_offset; - int _unified_buffer_size; - - cobj *CommBuf(void) { return u_recv_buf_p; } + ////////////////////////////////////////// + // Comms packet queue for asynch thread + ////////////////////////////////////////// + void CommunicateBegin(std::vector > &reqs) + { + reqs.resize(Packets.size()); + commtime-=usecond(); + for(int i=0;iStencilSendToRecvFromBegin(reqs[i], + Packets[i].send_buf, + Packets[i].to_rank, + Packets[i].recv_buf, + Packets[i].from_rank, + Packets[i].bytes); + } + commtime+=usecond(); + } + void CommunicateComplete(std::vector > &reqs) + { + commtime-=usecond(); + for(int i=0;iStencilSendToRecvFromComplete(reqs[i]); + } + _grid->StencilBarrier();// Synch shared memory on a single nodes + commtime+=usecond(); + } + + template void HaloExchange(const Lattice &source,compressor &compress) + { + std::vector > reqs; + Prepare(); + HaloGather(source,compress); + CommunicateBegin(reqs); + CommunicateComplete(reqs); + CommsMergeSHM(compress); + CommsMerge(compress); + } + + template int HaloGatherDir(const Lattice &source,compressor &compress,int point,int & face_idx) + { + int dimension = _directions[point]; + int displacement = _distances[point]; + + int fd = _grid->_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + + // Map to always positive shift modulo global full dimension. + int shift = (displacement+fd)%fd; + + assert (source.checkerboard== _checkerboard); + + // the permute type + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); + + int same_node = 1; + // Gather phase + int sshift [2]; + if ( comm_dim ) { + sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); + sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); + if ( sshift[0] == sshift[1] ) { + if (splice_dim) { + splicetime-=usecond(); + same_node = same_node && GatherSimd(source,dimension,shift,0x3,compress,face_idx); + splicetime+=usecond(); + } else { + nosplicetime-=usecond(); + same_node = same_node && Gather(source,dimension,shift,0x3,compress,face_idx); + nosplicetime+=usecond(); + } + } else { + if(splice_dim){ + splicetime-=usecond(); + // if checkerboard is unfavourable take two passes + // both with block stride loop iteration + same_node = same_node && GatherSimd(source,dimension,shift,0x1,compress,face_idx); + same_node = same_node && GatherSimd(source,dimension,shift,0x2,compress,face_idx); + splicetime+=usecond(); + } else { + nosplicetime-=usecond(); + same_node = same_node && Gather(source,dimension,shift,0x1,compress,face_idx); + same_node = same_node && Gather(source,dimension,shift,0x2,compress,face_idx); + nosplicetime+=usecond(); + } + } + } + return same_node; + } + + template + void HaloGather(const Lattice &source,compressor &compress) + { + _grid->StencilBarrier();// Synch shared memory on a single nodes + + // conformable(source._grid,_grid); + assert(source._grid==_grid); + halogtime-=usecond(); + + u_comm_offset=0; + + // Gather all comms buffers + int face_idx=0; + for(int point = 0 ; point < _npoints; point++) { + compress.Point(point); + HaloGatherDir(source,compress,point,face_idx); + } + face_table_computed=1; + + assert(u_comm_offset==_unified_buffer_size); + halogtime+=usecond(); + } + + ///////////////////////// + // Implementation + ///////////////////////// + void Prepare(void) + { + Decompressions.resize(0); + DecompressionsSHM.resize(0); + Mergers.resize(0); + MergersSHM.resize(0); + Packets.resize(0); + calls++; + } + void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){ + Packet p; + p.send_buf = xmit; + p.recv_buf = rcv; + p.to_rank = to; + p.from_rank= from; + p.bytes = bytes; + Packets.push_back(p); + } + void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size,std::vector &dv) { + Decompress d; + d.kernel_p = k_p; + d.mpi_p = m_p; + d.buffer_size = buffer_size; + dv.push_back(d); + } + void AddMerge(cobj *merge_p,std::vector &rpointers,Integer buffer_size,Integer type,std::vector &mv) { + Merge m; + m.type = type; + m.mpointer = merge_p; + m.vpointers= rpointers; + m.buffer_size = buffer_size; + mv.push_back(m); + } + template void CommsMerge(decompressor decompress) { CommsMerge(decompress,Mergers,Decompressions); } + template void CommsMergeSHM(decompressor decompress) { CommsMerge(decompress,MergersSHM,DecompressionsSHM);} + + template + void CommsMerge(decompressor decompress,std::vector &mm,std::vector &dd) { + + for(int i=0;i void HaloExchange(const Lattice &source,compressor &compress) - { - std::vector > reqs; - Prepare(); - HaloGather(source,compress); - CommunicateBegin(reqs); - CommunicateComplete(reqs); - CommsMerge(compress); - } - - template void HaloGatherDir(const Lattice &source,compressor &compress,int point,int & face_idx) - { - int dimension = _directions[point]; - int displacement = _distances[point]; - - int fd = _grid->_fdimensions[dimension]; - int rd = _grid->_rdimensions[dimension]; - - - // Map to always positive shift modulo global full dimension. - int shift = (displacement+fd)%fd; - - // int checkerboard = _grid->CheckerBoardDestination(source.checkerboard,shift); - assert (source.checkerboard== _checkerboard); - - // the permute type - int simd_layout = _grid->_simd_layout[dimension]; - int comm_dim = _grid->_processors[dimension] >1 ; - int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); - - // Gather phase - int sshift [2]; - if ( comm_dim ) { - sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); - sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); - if ( sshift[0] == sshift[1] ) { - if (splice_dim) { - splicetime-=usecond(); - GatherSimd(source,dimension,shift,0x3,compress,face_idx); - splicetime+=usecond(); - } else { - nosplicetime-=usecond(); - Gather(source,dimension,shift,0x3,compress,face_idx); - nosplicetime+=usecond(); - } - } else { - if(splice_dim){ - splicetime-=usecond(); - GatherSimd(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes - GatherSimd(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration - splicetime+=usecond(); - } else { - nosplicetime-=usecond(); - Gather(source,dimension,shift,0x1,compress,face_idx); - Gather(source,dimension,shift,0x2,compress,face_idx); - nosplicetime+=usecond(); - } - } - } - } - template - void HaloGather(const Lattice &source,compressor &compress) - { - _grid->StencilBarrier();// Synch shared memory on a single nodes - - // conformable(source._grid,_grid); - assert(source._grid==_grid); - halogtime-=usecond(); - - u_comm_offset=0; - - // Gather all comms buffers - int face_idx=0; - for(int point = 0 ; point < _npoints; point++) { - compress.Point(point); - HaloGatherDir(source,compress,point,face_idx); - } - face_table_computed=1; - - assert(u_comm_offset==_unified_buffer_size); - halogtime+=usecond(); - } - - template - void Gather(const Lattice &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx) + int Gather(const Lattice &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx) { typedef typename cobj::vector_type vector_type; typedef typename cobj::scalar_type scalar_type; @@ -804,6 +752,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal int cb= (cbmask==0x2)? Odd : Even; int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); + int shm_receive_only = 1; for(int x=0;xShmBufferTranslate(xmit_to_rank,u_recv_buf_p); + cobj *send_buf; + cobj *recv_buf; + if ( compress.DecompressionStep() ) { + recv_buf=u_simd_recv_buf[0]; + } else { + recv_buf=u_recv_buf_p; + } + + send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,recv_buf); if ( send_buf==NULL ) { send_buf = u_send_buf_p; + shm_receive_only = 0; } + + // Find out if we get the direct copy. + void *success = (void *) _grid->ShmBufferTranslate(recv_from_rank,u_send_buf_p); + if (success==NULL) { + // we found a packet that comes from MPI and contributes to this leg of stencil + shm_receive_only = 0; + } - cobj *recv_buf; gathertime-=usecond(); assert(send_buf!=NULL); Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so); face_idx++; gathertime+=usecond(); if ( compress.DecompressionStep() ) { - - recv_buf = &u_simd_recv_buf[0][u_comm_offset]; - AddDecompress(&u_recv_buf_p[u_comm_offset], - &recv_buf[u_comm_offset], - words); + if ( shm_receive_only ) { // Early decompress before MPI is finished is possible + AddDecompress(&u_recv_buf_p[u_comm_offset], + &recv_buf[u_comm_offset], + words,DecompressionsSHM); + } else { // Decompress after MPI is finished + AddDecompress(&u_recv_buf_p[u_comm_offset], + &recv_buf[u_comm_offset], + words,Decompressions); + } AddPacket((void *)&send_buf[u_comm_offset], (void *)&recv_buf[u_comm_offset], @@ -868,10 +836,11 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal u_comm_offset+=words; } } + return shm_receive_only; } template - void GatherSimd(const Lattice &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx) + int GatherSimd(const Lattice &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx) { const int Nsimd = _grid->Nsimd(); @@ -917,12 +886,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); // loop over outer coord planes orthog to dim + int shm_receive_only = 1; for(int x=0;x= rd ); if ( any_offnode ) { - for(int i=0;iShmBufferTranslate(recv_from_rank,sp); if (shm==NULL) { shm = rp; + shm_receive_only = 0; // we found a packet that comes from MPI and contributes to this + // leg of stencil } - // if Direct, StencilSendToRecvFrom will suppress copy to a peer on node // assuming above pointer flip + rpointers[i] = shm; + AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes); - rpointers[i] = shm; } else { @@ -983,12 +954,57 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal } } - AddMerge(&u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type); + if ( shm_receive_only ) { + AddMerge(&u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,MergersSHM); + } else { + AddMerge(&u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,Mergers); + } u_comm_offset +=buffer_size; } } + return shm_receive_only; } + + void ZeroCounters(void) { + gathertime = 0.; + commtime = 0.; + halogtime = 0.; + mergetime = 0.; + decompresstime = 0.; + gathermtime = 0.; + splicetime = 0.; + nosplicetime = 0.; + comms_bytes = 0.; + calls = 0.; + }; + + void Report(void) { +#define AVERAGE(A) _grid->GlobalSum(A);A/=NP; +#define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<_Nprocessors; + RealD NN = _grid->NodeCount(); + + _grid->GlobalSum(commtime); commtime/=NP; + if ( calls > 0. ) { + std::cout << GridLogMessage << " Stencil calls "<1.0){ + PRINTIT(comms_bytes); + PRINTIT(commtime); + std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<