From c9fadf97a52087610c461af0ab6ac09d0eb2a21a Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Wed, 17 Feb 2016 18:16:45 -0600
Subject: [PATCH] Simplify the compressor interface again.

---
 lib/AlignedAllocator.h                       |   8 +
 lib/Stencil.h                                |  37 ++--
 lib/algorithms/CoarsenedMatrix.h             |   2 +-
 lib/cshift/Cshift_common.h                   |  17 +-
 lib/qcd/action/fermion/FermionOperatorImpl.h |   4 +-
 lib/qcd/action/fermion/WilsonCompressor.h    | 174 ++++++++++++++++++-
 lib/qcd/action/fermion/WilsonFermion5D.cc    |   8 +-
 lib/tensors/Tensor_extract_merge.h           |  24 +--
 8 files changed, 228 insertions(+), 46 deletions(-)

diff --git a/lib/AlignedAllocator.h b/lib/AlignedAllocator.h
index 32cc3894..6faf6302 100644
--- a/lib/AlignedAllocator.h
+++ b/lib/AlignedAllocator.h
@@ -108,6 +108,14 @@ public:
 #endif
 #endif
+      _Tp tmp;
+#undef  FIRST_TOUCH_OPTIMISE
+#ifdef  FIRST_TOUCH_OPTIMISE
+#pragma omp parallel for
+      for(int i=0;i<__n;i++){
+        ptr[i]=tmp;
+      }
+#endif
       return ptr;
     }

diff --git a/lib/Stencil.h b/lib/Stencil.h
index 0fb0fb5f..c2f083ff 100644
--- a/lib/Stencil.h
+++ b/lib/Stencil.h
@@ -77,7 +77,7 @@ namespace Grid {
     int _around_the_world;
   };

-  template<class vobj,class cobj, class compressor>
+  template<class vobj,class cobj>
   class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
   public:
@@ -580,6 +580,7 @@ PARALLEL_FOR_LOOP

   }

+  template<class compressor>
   std::thread HaloExchangeBegin(const Lattice<vobj> &source,compressor &compress) {
     Mergers.resize(0);
     Packets.resize(0);
@@ -587,6 +588,7 @@ PARALLEL_FOR_LOOP
     return std::thread([&] { this->Communicate(); });
   }

+  template<class compressor>
   void HaloExchange(const Lattice<vobj> &source,compressor &compress)
   {
     auto thr = HaloExchangeBegin(source,compress);
@@ -601,20 +603,9 @@ PARALLEL_FOR_LOOP
     jointime+=usecond();
   }

-  void HaloGather(const Lattice<vobj> &source,compressor &compress)
+  template<class compressor>
+  void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point)
   {
-    // conformable(source._grid,_grid);
-    assert(source._grid==_grid);
-    halogtime-=usecond();
-
-    assert (comm_buf.size() == _unified_buffer_size );
-    u_comm_offset=0;
-
-    // Gather all comms buffers
-    for(int point = 0 ; point < _npoints; point++) {
-
-      compress.Point(point);
-
     int dimension    = _directions[point];
     int displacement = _distances[point];
@@ -662,12 +653,29 @@ PARALLEL_FOR_LOOP
         }
       }
     }
+  }
+
+  template<class compressor>
+  void HaloGather(const Lattice<vobj> &source,compressor &compress)
+  {
+    // conformable(source._grid,_grid);
+    assert(source._grid==_grid);
+    halogtime-=usecond();
+
+    assert (comm_buf.size() == _unified_buffer_size );
+    u_comm_offset=0;
+
+    // Gather all comms buffers
+    for(int point = 0 ; point < _npoints; point++) {
+      compress.Point(point);
+      HaloGatherDir(source,compress,point);
     }
     assert(u_comm_offset==_unified_buffer_size);
     halogtime+=usecond();
   }

+  template<class compressor>
   void Gather(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor & compress)
   {
     typedef typename cobj::vector_type vector_type;
@@ -728,6 +736,7 @@ PARALLEL_FOR_LOOP

   }

+  template<class compressor>
   void GatherSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress)
   {
     const int Nsimd = _grid->Nsimd();

diff --git a/lib/algorithms/CoarsenedMatrix.h b/lib/algorithms/CoarsenedMatrix.h
index 740d6163..43164a1c 100644
--- a/lib/algorithms/CoarsenedMatrix.h
+++ b/lib/algorithms/CoarsenedMatrix.h
@@ -200,7 +200,7 @@ namespace Grid {
     ////////////////////
     Geometry geom;
     GridBase *       _grid;
-    CartesianStencil<siteVector,siteVector,SimpleCompressor<siteVector> > Stencil;
+    CartesianStencil<siteVector,siteVector> Stencil;

     std::vector<CoarseMatrix> A;
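The interface change running through this patch drops the geometry arguments (dimension, plane, osite, grid) from the compressor call operator: a compressor is now just a Point(int) hint plus a unary transform on the site object, applied per site by the gather routines in the next file. A minimal sketch of that contract, mirroring the SimpleCompressor shown below (PassThroughCompressor is an illustrative name, not part of the patch):

    template<class vobj>
    class PassThroughCompressor {
    public:
      void Point(int) {};                  // direction hint; unused here
      vobj operator() (const vobj &arg) {  // site in, (possibly projected) site out
        return arg;
      }
    };

Because the stencil methods above are now templated on the compressor per call rather than per class, any type of this shape can be passed, including a different one for each direction.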
diff --git a/lib/cshift/Cshift_common.h b/lib/cshift/Cshift_common.h
index 70febc2e..c2c95703 100644
--- a/lib/cshift/Cshift_common.h
+++ b/lib/cshift/Cshift_common.h
@@ -35,7 +35,7 @@ class SimpleCompressor {
 public:
   void Point(int) {};

-  vobj operator() (const vobj &arg,int dimension,int plane,int osite,GridBase *grid) {
+  vobj operator() (const vobj &arg) {
     return arg;
   }
 };
@@ -63,7 +63,7 @@ PARALLEL_NESTED_LOOP2
       for(int b=0;b<e2;b++){
        int o  = n*rhs._grid->_slice_stride[dimension];
        int bo = n*rhs._grid->_slice_block[dimension];
-       buffer[off+bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
+       buffer[off+bo+b]=compress(rhs._odata[so+o+b]);
       }
     }
   } else {
@@ -73,7 +73,7 @@ PARALLEL_NESTED_LOOP2
        int o   = n*rhs._grid->_slice_stride[dimension];
        int ocb = 1<<rhs._grid->CheckerBoardFromOindex(o+b); // Could easily be a table lookup
        if ( ocb &cbmask ) {
-         buffer[off+bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
+         buffer[off+bo++]=compress(rhs._odata[so+o+b]);
        }
       }
     }
@@ -97,16 +97,17 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_object *> pointers,int dimension,int plane,int cbmask,compressor &compress)
   int e1=rhs._grid->_slice_nblock[dimension];
   int e2=rhs._grid->_slice_block[dimension];
-
+  int n1=rhs._grid->_slice_stride[dimension];
+  int n2=rhs._grid->_slice_block[dimension];
   if ( cbmask ==0x3){
PARALLEL_NESTED_LOOP2
     for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
-       int o      = n*rhs._grid->_slice_stride[dimension];
-       int offset = b+n*rhs._grid->_slice_block[dimension];
+       int o      = n*n1;
+       int offset = b+n*n2;

-       cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
+       cobj temp =compress(rhs._odata[so+o+b]);
        extract(temp,pointers,offset);
       }
@@ -121,7 +122,7 @@ PARALLEL_NESTED_LOOP2
        int offset = b+n*rhs._grid->_slice_block[dimension];

        if ( ocb & cbmask ) {
-         cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
+         cobj temp =compress(rhs._odata[so+o+b]);
          extract(temp,pointers,offset);
        }
       }
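The gather loops above call compress() once per site, so making the compressor a compile-time template parameter, rather than dispatching on a runtime direction inside the call, is what lets the call inline away entirely. A standalone sketch of the effect, with gather_face and the Proj functors as illustrative stand-ins for Gather_plane_simple and the spin projectors (not Grid code):

    #include <vector>

    struct ProjXp { double operator()(double s) const { return 2.0*s; } }; // stand-in for spProjXp
    struct ProjXm { double operator()(double s) const { return 0.5*s; } }; // stand-in for spProjXm

    // The compressor type is fixed at instantiation, so the call below
    // inlines into the loop body with no per-site branch.
    template<class compressor>
    void gather_face(const std::vector<double> &in,std::vector<double> &buf,compressor compress){
      for(std::size_t b=0;b<buf.size();b++){
        buf[b] = compress(in[b]);
      }
    }

Each instantiation, gather_face(in,buf,ProjXp()) or gather_face(in,buf,ProjXm()), compiles to its own specialised loop; the per-direction Wilson compressors defined below exist to give the real gathers the same property.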
diff --git a/lib/qcd/action/fermion/FermionOperatorImpl.h b/lib/qcd/action/fermion/FermionOperatorImpl.h
index 05994a39..3ecb87f7 100644
--- a/lib/qcd/action/fermion/FermionOperatorImpl.h
+++ b/lib/qcd/action/fermion/FermionOperatorImpl.h
@@ -130,7 +130,7 @@ namespace Grid {
       typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
       typedef WilsonImplParams ImplParams;
-      typedef CartesianStencil<SiteSpinor,SiteHalfSpinor,Compressor> StencilImpl;
+      typedef WilsonStencil<SiteSpinor,SiteHalfSpinor> StencilImpl;

       ImplParams Params;
@@ -205,7 +205,7 @@ PARALLEL_FOR_LOOP
       typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;

       typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
-      typedef CartesianStencil<SiteSpinor,SiteHalfSpinor,Compressor> StencilImpl;
+      typedef WilsonStencil<SiteSpinor,SiteHalfSpinor> StencilImpl;

       typedef GparityWilsonImplParams ImplParams;

diff --git a/lib/qcd/action/fermion/WilsonCompressor.h b/lib/qcd/action/fermion/WilsonCompressor.h
index c97f7d98..b4296a01 100644
--- a/lib/qcd/action/fermion/WilsonCompressor.h
+++ b/lib/qcd/action/fermion/WilsonCompressor.h
@@ -48,12 +48,7 @@ namespace QCD {
       mu=p;
     };

-    virtual SiteHalfSpinor operator () (const SiteSpinor &in,int dim,int plane,int osite,GridBase *grid) {
-      return spinproject(in);
-    }
-
-    SiteHalfSpinor spinproject(const SiteSpinor &in)
-    {
+    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
       SiteHalfSpinor ret;
       int mudag=mu;
       if (!dag) {
@@ -92,6 +87,173 @@ namespace QCD {
     }
   };

+  /////////////////////////
+  // optimised versions
+  /////////////////////////
+
+  template<class SiteHalfSpinor,class SiteSpinor>
+  class WilsonXpCompressor {
+  public:
+    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
+      SiteHalfSpinor ret;
+      spProjXp(ret,in);
+      return ret;
+    }
+  };
+  template<class SiteHalfSpinor,class SiteSpinor>
+  class WilsonYpCompressor {
+  public:
+    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
+      SiteHalfSpinor ret;
+      spProjYp(ret,in);
+      return ret;
+    }
+  };
+  template<class SiteHalfSpinor,class SiteSpinor>
+  class WilsonZpCompressor {
+  public:
+    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
+      SiteHalfSpinor ret;
+      spProjZp(ret,in);
+      return ret;
+    }
+  };
+  template<class SiteHalfSpinor,class SiteSpinor>
+  class WilsonTpCompressor {
+  public:
+    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
+      SiteHalfSpinor ret;
+      spProjTp(ret,in);
+      return ret;
+    }
+  };
+
+  template<class SiteHalfSpinor,class SiteSpinor>
+  class WilsonXmCompressor {
+  public:
+    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
+      SiteHalfSpinor ret;
+      spProjXm(ret,in);
+      return ret;
+    }
+  };
+  template<class SiteHalfSpinor,class SiteSpinor>
+  class WilsonYmCompressor {
+  public:
+    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
+      SiteHalfSpinor ret;
+      spProjYm(ret,in);
+      return ret;
+    }
+  };
+  template<class SiteHalfSpinor,class SiteSpinor>
+  class WilsonZmCompressor {
+  public:
+    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
+      SiteHalfSpinor ret;
+      spProjZm(ret,in);
+      return ret;
+    }
+  };
+  template<class SiteHalfSpinor,class SiteSpinor>
+  class WilsonTmCompressor {
+  public:
+    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
+      SiteHalfSpinor ret;
+      spProjTm(ret,in);
+      return ret;
+    }
+  };
+
+  // Fast comms buffer manipulation which should inline right through (avoid direction
+  // dependent logic that prevents inlining)
+  template<class vobj,class cobj>
+  class WilsonStencil : public CartesianStencil<vobj,cobj> {
+  public:
+
+    WilsonStencil(GridBase *grid,
+                  int npoints,
+                  int checkerboard,
+                  const std::vector<int> &directions,
+                  const std::vector<int> &distances) : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances)
+    { };
+
+    template < class compressor>
+    std::thread HaloExchangeOptBegin(const Lattice<vobj> &source,compressor &compress) {
+      this->Mergers.resize(0);
+      this->Packets.resize(0);
+      this->HaloGatherOpt(source,compress);
+      return std::thread([&] { this->Communicate(); });
+    }
+
+    template < class compressor>
+    void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)
+    {
+      auto thr = this->HaloExchangeOptBegin(source,compress);
+      this->HaloExchangeOptComplete(thr);
+    }
+
+    void HaloExchangeOptComplete(std::thread &thr)
+    {
+      this->CommsMerge(); // spins
+      this->jointime-=usecond();
+      thr.join();
+      this->jointime+=usecond();
+    }
+
+    template < class compressor>
+    void HaloGatherOpt(const Lattice<vobj> &source,compressor &compress)
+    {
+      // conformable(source._grid,_grid);
+      assert(source._grid==this->_grid);
+      this->halogtime-=usecond();
+
+      assert (this->comm_buf.size() == this->_unified_buffer_size );
+      this->u_comm_offset=0;
+
+      int dag = compress.dag;
+      static std::vector<int> dirs(8);
+      for(int mu=0;mu<4;mu++){
+        if ( dag ) {
+          dirs[mu]  =mu;
+          dirs[mu+4]=mu+4;
+        } else {
+          dirs[mu]  =mu+4;
+          dirs[mu+4]=mu;
+        }
+      }
+
+      WilsonXpCompressor<cobj,vobj> XpCompress;
+      this->HaloGatherDir(source,XpCompress,dirs[0]);
+
+      WilsonYpCompressor<cobj,vobj> YpCompress;
+      this->HaloGatherDir(source,YpCompress,dirs[1]);
+
+      WilsonZpCompressor<cobj,vobj> ZpCompress;
+      this->HaloGatherDir(source,ZpCompress,dirs[2]);
+
+      WilsonTpCompressor<cobj,vobj> TpCompress;
+      this->HaloGatherDir(source,TpCompress,dirs[3]);
+
+      WilsonXmCompressor<cobj,vobj> XmCompress;
+      this->HaloGatherDir(source,XmCompress,dirs[4]);
+
+      WilsonYmCompressor<cobj,vobj> YmCompress;
+      this->HaloGatherDir(source,YmCompress,dirs[5]);
+
+      WilsonZmCompressor<cobj,vobj> ZmCompress;
+      this->HaloGatherDir(source,ZmCompress,dirs[6]);
+
+      WilsonTmCompressor<cobj,vobj> TmCompress;
+      this->HaloGatherDir(source,TmCompress,dirs[7]);
+
+      assert(this->u_comm_offset==this->_unified_buffer_size);
+      this->halogtime+=usecond();
+    }
+
+  };
+
 }} // namespace close
 #endif
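The dirs table in HaloGatherOpt above applies the identity ordering for the daggered operator and otherwise swaps each direction with its opposite, so the spin projector for direction mu is applied to the face gathered at the opposing stencil point. A standalone check of the permutation (illustrative only):

    #include <cstdio>

    int main(void){
      for(int dag=0;dag<2;dag++){
        int dirs[8];
        for(int mu=0;mu<4;mu++){
          if ( dag ) { dirs[mu]=mu;   dirs[mu+4]=mu+4; }
          else       { dirs[mu]=mu+4; dirs[mu+4]=mu;   }
        }
        std::printf("dag=%d dirs:",dag);
        for(int p=0;p<8;p++) std::printf(" %d",dirs[p]);
        std::printf("\n");
      }
      return 0;
    }

This prints 4 5 6 7 0 1 2 3 for dag=0 and the identity 0 1 2 3 4 5 6 7 for dag=1, matching the loop above.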
diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc
index 12bd58d5..bbb96629 100644
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -304,8 +304,8 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
   int nwork = U._grid->oSites();

   commtime -=usecond();
-  auto handle = st.HaloExchangeBegin(in,compressor);
-  st.HaloExchangeComplete(handle);
+  auto handle = st.HaloExchangeOptBegin(in,compressor);
+  st.HaloExchangeOptComplete(handle);
   commtime +=usecond();

   jointime -=usecond();
@@ -440,7 +440,7 @@ void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, Le
   int nwork = U._grid->oSites();

   commtime -=usecond();
-  auto handle = st.HaloExchangeBegin(in,compressor);
+  auto handle = st.HaloExchangeOptBegin(in,compressor);
   commtime +=usecond();

   // Dhop takes the 4d grid from U, and makes a 5d index for fermion
@@ -498,7 +498,7 @@ PARALLEL_FOR_LOOP
   dslashtime +=usecond();

   jointime -=usecond();
-  st.HaloExchangeComplete(handle);
+  st.HaloExchangeOptComplete(handle);
   jointime +=usecond();

   local = false;
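DhopInternalCommsOverlapCompute depends on the Begin/Complete split: HaloExchangeOptBegin returns a std::thread running Communicate() while the caller works through sites that need no halo data. A minimal standalone sketch of that pattern (the three helper names are hypothetical, not Grid functions):

    #include <thread>

    static void communicate()      { /* MPI exchange of the gathered faces */ }
    static void compute_interior() { /* stencil sites with no halo dependence */ }
    static void compute_exterior() { /* stencil sites that consume the halo */ }

    void dhop_overlapped(void){
      std::thread comms([]{ communicate(); }); // HaloExchangeOptBegin returns such a thread
      compute_interior();                      // overlaps with the communication
      comms.join();                            // HaloExchangeOptComplete merges, then joins
      compute_exterior();
    }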
diff --git a/lib/tensors/Tensor_extract_merge.h b/lib/tensors/Tensor_extract_merge.h
index 4eb72254..93318d9c 100644
--- a/lib/tensors/Tensor_extract_merge.h
+++ b/lib/tensors/Tensor_extract_merge.h
@@ -44,8 +44,8 @@
 template<class vsimd,class scalar>
 inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const vsimd >::type * y,
                     std::vector<scalar *> &extracted,int offset){
   // FIXME: bounce off memory is painful
+  static const int Nsimd=vsimd::Nsimd();
   int Nextr=extracted.size();
-  int Nsimd=vsimd::Nsimd();
   int s=Nsimd/Nextr;

   scalar*buf = (scalar *)y;
@@ -59,8 +59,8 @@
 template<class vsimd,class scalar>
 inline void merge(typename std::enable_if<!isGridTensor<vsimd>::value, vsimd >::type * y,
                   std::vector<scalar *> &extracted,int offset){
+  static const int Nsimd=vsimd::Nsimd();
   int Nextr=extracted.size();
-  int Nsimd=vsimd::Nsimd();
   int s=Nsimd/Nextr; // can have sparse occupation of simd vector if simd_layout does not fill it
   // replicate n-fold. Use to allow Integer masks to
   // predicate floating point of various width assignments and maintain conformable.
@@ -85,6 +85,7 @@ inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const v
   scalar *buf = (scalar *)&y;
   for(int i=0;i<Nextr;i++){
     extracted[i]=buf[i*s];
+#ifdef PARANOID
     for(int ii=1;ii<s;ii++){
@@ -96,6 +97,7 @@ inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const v
       }
       assert(buf[i*s]==buf[i*s+ii]);
     }
+#endif
   }
 };
@@ -106,7 +108,7 @@ inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const v
 template<class vsimd,class scalar>
 inline void merge(typename std::enable_if<!isGridTensor<vsimd>::value, vsimd >::type &y,std::vector<scalar> &extracted){
   int Nextr=extracted.size();
-  int Nsimd=vsimd::Nsimd();
+  static const int Nsimd=vsimd::Nsimd();
   int s=Nsimd/Nextr;

   scalar *buf = (scalar *)&y;
@@ -125,9 +127,9 @@ template<class vobj> inline void extract(const vobj &vec,std::vector<typename vobj::scalar_object> &extracted)
   typedef typename vobj::scalar_type scalar_type ;
   typedef typename vobj::vector_type vector_type ;

-  const int words=sizeof(vobj)/sizeof(vector_type);
-  const int Nsimd=vobj::vector_type::Nsimd();
+  static const int words=sizeof(vobj)/sizeof(vector_type);
+  static const int Nsimd=vobj::vector_type::Nsimd();

   std::vector<scalar_type *> pointers(Nextr);
@@ -148,8 +150,8 @@ void extract(const vobj &vec,std::vector<typename vobj::scalar_object *> &extrac
   typedef typename vobj::scalar_type scalar_type ;
   typedef typename vobj::vector_type vector_type ;

-  const int words=sizeof(vobj)/sizeof(vector_type);
-  const int Nsimd=vobj::vector_type::Nsimd();
+  static const int words=sizeof(vobj)/sizeof(vector_type);
+  static const int Nsimd=vobj::vector_type::Nsimd();

   int Nextr=extracted.size();
   int s = Nsimd/Nextr;
@@ -172,8 +174,8 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted)
   typedef typename vobj::scalar_type scalar_type ;
   typedef typename vobj::vector_type vector_type ;

-  const int Nsimd=vobj::vector_type::Nsimd();
-  const int words=sizeof(vobj)/sizeof(vector_type);
+  static const int Nsimd=vobj::vector_type::Nsimd();
+  static const int words=sizeof(vobj)/sizeof(vector_type);

   int Nextr = extracted.size();
   int splat=Nsimd/Nextr;
@@ -224,8 +226,8 @@ void merge1(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
   typedef typename vobj::scalar_type scalar_type ;
   typedef typename vobj::vector_type vector_type ;

-  const int Nsimd=vobj::vector_type::Nsimd();
-  const int words=sizeof(vobj)/sizeof(vector_type);
+  static const int Nsimd=vobj::vector_type::Nsimd();
+  static const int words=sizeof(vobj)/sizeof(vector_type);

   scalar_type *vp = (scalar_type *)&vec;
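The const to static const changes in this file make Nsimd and words integral compile-time constants, so the extract/merge loop bounds are visible to the optimiser and the replication loops can unroll. A standalone sketch of the merge pattern with a constant lane count (MyVec and merge_sketch are illustrative stand-ins, not Grid types):

    #include <vector>

    struct MyVec {
      static const int Nsimd = 8;   // compile-time lane count, standing in for vsimd::Nsimd()
      float lane[Nsimd];
    };

    inline void merge_sketch(MyVec &y,std::vector<float *> &extracted,int offset){
      static const int Nsimd = MyVec::Nsimd;  // constant bound: the loops below can unroll
      int Nextr = (int)extracted.size();
      int s     = Nsimd/Nextr;                // sparse SIMD occupancy replicated s-fold
      float *buf = (float *)&y;
      for(int i=0;i<Nextr;i++){
        for(int ii=0;ii<s;ii++){
          buf[i*s+ii] = extracted[i][offset]; // replicate each scalar across its lanes
        }
      }
    }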