diff --git a/TODO b/TODO index a2b7bf03..672879cd 100644 --- a/TODO +++ b/TODO @@ -2,21 +2,20 @@ TODO: --------------- Peter's work list: -1)- Half-precision comms <-- started -- SIMD is prepared 2)- Precision conversion and sort out localConvert <-- - 3)- Remove DenseVector, DenseMatrix; Use Eigen instead. <-- started 4)- Binary I/O speed up & x-strips - -- Profile CG, BlockCG, etc... Flop count/rate -- PARTIAL, time but no flop/s yet -- Physical propagator interface -- Conserved currents -- GaugeFix into central location - -- Multigrid Wilson and DWF, compare to other Multigrid implementations -- HDCR resume Recent DONE +-- Cut down the exterior overhead <-- DONE +-- Interior legs from SHM comms <-- DONE +-- Half-precision comms <-- DONE -- Merge high precision reduction into develop -- multiRHS DWF; benchmark on Cori/BNL for comms elimination -- slice* linalg routines for multiRHS, BlockCG diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index 686d00a1..7009980c 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -152,9 +152,6 @@ int main (int argc, char ** argv) RealD NP = UGrid->_Nprocessors; - std::cout << GridLogMessage << "Creating action operator " << std::endl; - DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); - std::cout << GridLogMessage<< "*****************************************************************" <Barrier(); Dw.ZeroCounters(); Dw.Dhop(src,result,0); + std::cout<Barrier(); + DwH.ZeroCounters(); + DwH.Dhop(src,result,0); + double t0=usecond(); + for(int i=0;iBarrier(); + + double volume=Ls; for(int mu=0;mu namespace Grid { -template -class SimpleCompressor { -public: - void Point(int) {}; - - vobj operator() (const vobj &arg) { - return arg; - } -}; - /////////////////////////////////////////////////////////////////// -// Gather for when there is no need to SIMD split with compression +// Gather for when there is no need to SIMD split /////////////////////////////////////////////////////////////////// -template void -Gather_plane_simple (const Lattice &rhs,commVector &buffer,int dimension,int plane,int cbmask,compressor &compress, int off=0) +template void +Gather_plane_simple (const Lattice &rhs,commVector &buffer,int dimension,int plane,int cbmask, int off=0) { int rd = rhs._grid->_rdimensions[dimension]; @@ -62,7 +52,7 @@ Gather_plane_simple (const Lattice &rhs,commVector &buffer,int dimen for(int b=0;b &rhs,commVector &buffer,int dimen } } parallel_for(int i=0;i void -Gather_plane_extract(const Lattice &rhs,std::vector pointers,int dimension,int plane,int cbmask,compressor &compress) +template void +Gather_plane_extract(const Lattice &rhs,std::vector pointers,int dimension,int plane,int cbmask) { int rd = rhs._grid->_rdimensions[dimension]; @@ -109,8 +98,8 @@ Gather_plane_extract(const Lattice &rhs,std::vector(temp,pointers,offset); + vobj temp =rhs._odata[so+o+b]; + extract(temp,pointers,offset); } } @@ -127,32 +116,14 @@ Gather_plane_extract(const Lattice &rhs,std::vector(temp,pointers,offset); + vobj temp =rhs._odata[so+o+b]; + extract(temp,pointers,offset); } } } } } -////////////////////////////////////////////////////// -// Gather for when there is no need to SIMD split -////////////////////////////////////////////////////// -template void Gather_plane_simple (const Lattice &rhs,commVector &buffer, int dimension,int plane,int cbmask) -{ - SimpleCompressor dontcompress; - Gather_plane_simple (rhs,buffer,dimension,plane,cbmask,dontcompress); -} - 
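[Editor's illustrative sketch, not part of the patch] The Benchmark_dwf.cc hunk above adds a timed loop over the half-precision-comms operator (DwH.Dhop) alongside the existing Dw benchmark, using usecond() around repeated applications and deriving a flop rate from the lattice volume. A minimal, self-contained version of that timing/flop-rate pattern follows; the kernel callable, ncall and the 1344 flops-per-site figure are stand-ins (1344 is the conventional count quoted for the 4d Wilson dslash), not values taken from the patch.

#include <chrono>
#include <cstdio>
#include <functional>

// Illustrative only: "dhop" stands in for Dw.Dhop / DwH.Dhop.
double benchmark_dhop(const std::function<void()> &dhop, double sites, int ncall) {
  auto t0 = std::chrono::high_resolution_clock::now();
  for (int i = 0; i < ncall; i++) dhop();                    // repeated applications
  auto t1 = std::chrono::high_resolution_clock::now();
  double usecs = std::chrono::duration<double, std::micro>(t1 - t0).count();
  double flops = 1344.0 * sites * ncall;                     // assumed flop count per site
  std::printf("mflop/s = %f\n", flops / usecs);
  return flops / usecs;
}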
-////////////////////////////////////////////////////// -// Gather for when there *is* need to SIMD split -////////////////////////////////////////////////////// -template void Gather_plane_extract(const Lattice &rhs,std::vector pointers,int dimension,int plane,int cbmask) -{ - SimpleCompressor dontcompress; - Gather_plane_extract(rhs,pointers,dimension,plane,cbmask,dontcompress); -} - ////////////////////////////////////////////////////// // Scatter for when there is no need to SIMD split ////////////////////////////////////////////////////// @@ -200,7 +171,7 @@ template void Scatter_plane_simple (Lattice &rhs,commVector void Scatter_plane_merge(Lattice &rhs,std::vector pointers,int dimension,int plane,int cbmask) +template void Scatter_plane_merge(Lattice &rhs,std::vector pointers,int dimension,int plane,int cbmask) { int rd = rhs._grid->_rdimensions[dimension]; diff --git a/lib/cshift/Cshift_mpi.h b/lib/cshift/Cshift_mpi.h index b2a44961..a66b49bf 100644 --- a/lib/cshift/Cshift_mpi.h +++ b/lib/cshift/Cshift_mpi.h @@ -154,13 +154,7 @@ template void Cshift_comms(Lattice &ret,const Lattice &r recv_from_rank, bytes); grid->Barrier(); - /* - for(int i=0;i void Cshift_comms_simd(Lattice &ret,const LatticeBarrier(); rpointers[i] = &recv_buf_extract[i][0]; } else { diff --git a/lib/qcd/action/fermion/CayleyFermion5Dcache.cc b/lib/qcd/action/fermion/CayleyFermion5Dcache.cc index 1f7d4903..dd6ec7bf 100644 --- a/lib/qcd/action/fermion/CayleyFermion5Dcache.cc +++ b/lib/qcd/action/fermion/CayleyFermion5Dcache.cc @@ -237,6 +237,13 @@ void CayleyFermion5D::MooeeInvDag (const FermionField &psi, FermionField & INSTANTIATE_DPERP(GparityWilsonImplD); INSTANTIATE_DPERP(ZWilsonImplF); INSTANTIATE_DPERP(ZWilsonImplD); + + INSTANTIATE_DPERP(WilsonImplFH); + INSTANTIATE_DPERP(WilsonImplDF); + INSTANTIATE_DPERP(GparityWilsonImplFH); + INSTANTIATE_DPERP(GparityWilsonImplDF); + INSTANTIATE_DPERP(ZWilsonImplFH); + INSTANTIATE_DPERP(ZWilsonImplDF); #endif }} diff --git a/lib/qcd/action/fermion/CayleyFermion5Ddense.cc b/lib/qcd/action/fermion/CayleyFermion5Ddense.cc index 16fb47bb..6b53d540 100644 --- a/lib/qcd/action/fermion/CayleyFermion5Ddense.cc +++ b/lib/qcd/action/fermion/CayleyFermion5Ddense.cc @@ -137,6 +137,20 @@ template void CayleyFermion5D::MooeeInternal(const FermionField &ps template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); + +INSTANTIATE_DPERP(GparityWilsonImplFH); +INSTANTIATE_DPERP(GparityWilsonImplDF); +INSTANTIATE_DPERP(WilsonImplFH); +INSTANTIATE_DPERP(WilsonImplDF); +INSTANTIATE_DPERP(ZWilsonImplFH); +INSTANTIATE_DPERP(ZWilsonImplDF); + +template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); +template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); +template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); +template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); +template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); +template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); #endif }} diff --git 
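[Editor's illustrative sketch, not part of the patch] The blocks of INSTANTIATE_DPERP(...) lines above extend the explicit-instantiation lists in the CayleyFermion5D translation units to the new half-comms implementation policies (the ...FH and ...DF variants). The toy below shows the macro-driven explicit instantiation idiom in isolation; all names are hypothetical and it is not the actual INSTANTIATE_DPERP definition.

// Toy version of macro-driven explicit instantiation (hypothetical names).
template <class Impl> struct Fermion5D { void MooeeInv() { /* ... */ } };

#define INSTANTIATE_TOY(Impl) template struct Fermion5D<Impl>;

struct ImplFH {};   // stand-in for e.g. WilsonImplFH
struct ImplDF {};   // stand-in for e.g. WilsonImplDF

INSTANTIATE_TOY(ImplFH)
INSTANTIATE_TOY(ImplDF)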
a/lib/qcd/action/fermion/CayleyFermion5Dssp.cc b/lib/qcd/action/fermion/CayleyFermion5Dssp.cc index a83b962e..cb9b2957 100644 --- a/lib/qcd/action/fermion/CayleyFermion5Dssp.cc +++ b/lib/qcd/action/fermion/CayleyFermion5Dssp.cc @@ -37,7 +37,6 @@ namespace Grid { namespace QCD { // FIXME -- make a version of these routines with site loop outermost for cache reuse. - // Pminus fowards // Pplus backwards template @@ -152,6 +151,13 @@ void CayleyFermion5D::MooeeInvDag (const FermionField &psi, FermionField & INSTANTIATE_DPERP(GparityWilsonImplD); INSTANTIATE_DPERP(ZWilsonImplF); INSTANTIATE_DPERP(ZWilsonImplD); + + INSTANTIATE_DPERP(WilsonImplFH); + INSTANTIATE_DPERP(WilsonImplDF); + INSTANTIATE_DPERP(GparityWilsonImplFH); + INSTANTIATE_DPERP(GparityWilsonImplDF); + INSTANTIATE_DPERP(ZWilsonImplFH); + INSTANTIATE_DPERP(ZWilsonImplDF); #endif } diff --git a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc index 566624e3..653e6ab3 100644 --- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc +++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc @@ -808,10 +808,21 @@ INSTANTIATE_DPERP(DomainWallVec5dImplF); INSTANTIATE_DPERP(ZDomainWallVec5dImplD); INSTANTIATE_DPERP(ZDomainWallVec5dImplF); +INSTANTIATE_DPERP(DomainWallVec5dImplDF); +INSTANTIATE_DPERP(DomainWallVec5dImplFH); +INSTANTIATE_DPERP(ZDomainWallVec5dImplDF); +INSTANTIATE_DPERP(ZDomainWallVec5dImplFH); + template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); +template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); +template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); +template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); +template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); + + }} diff --git a/lib/qcd/action/fermion/Fermion.h b/lib/qcd/action/fermion/Fermion.h index ac42f177..94fbae6c 100644 --- a/lib/qcd/action/fermion/Fermion.h +++ b/lib/qcd/action/fermion/Fermion.h @@ -89,6 +89,10 @@ typedef WilsonFermion WilsonFermionR; typedef WilsonFermion WilsonFermionF; typedef WilsonFermion WilsonFermionD; +typedef WilsonFermion WilsonFermionRL; +typedef WilsonFermion WilsonFermionFH; +typedef WilsonFermion WilsonFermionDF; + typedef WilsonFermion WilsonAdjFermionR; typedef WilsonFermion WilsonAdjFermionF; typedef WilsonFermion WilsonAdjFermionD; @@ -105,27 +109,50 @@ typedef DomainWallFermion DomainWallFermionR; typedef DomainWallFermion DomainWallFermionF; typedef DomainWallFermion DomainWallFermionD; +typedef DomainWallFermion DomainWallFermionRL; +typedef DomainWallFermion DomainWallFermionFH; +typedef DomainWallFermion DomainWallFermionDF; + typedef MobiusFermion MobiusFermionR; typedef MobiusFermion MobiusFermionF; typedef MobiusFermion MobiusFermionD; +typedef MobiusFermion MobiusFermionRL; +typedef MobiusFermion MobiusFermionFH; +typedef MobiusFermion MobiusFermionDF; + typedef ZMobiusFermion ZMobiusFermionR; typedef ZMobiusFermion ZMobiusFermionF; typedef ZMobiusFermion ZMobiusFermionD; +typedef ZMobiusFermion ZMobiusFermionRL; 
+typedef ZMobiusFermion ZMobiusFermionFH; +typedef ZMobiusFermion ZMobiusFermionDF; + // Ls vectorised typedef DomainWallFermion DomainWallFermionVec5dR; typedef DomainWallFermion DomainWallFermionVec5dF; typedef DomainWallFermion DomainWallFermionVec5dD; +typedef DomainWallFermion DomainWallFermionVec5dRL; +typedef DomainWallFermion DomainWallFermionVec5dFH; +typedef DomainWallFermion DomainWallFermionVec5dDF; + typedef MobiusFermion MobiusFermionVec5dR; typedef MobiusFermion MobiusFermionVec5dF; typedef MobiusFermion MobiusFermionVec5dD; +typedef MobiusFermion MobiusFermionVec5dRL; +typedef MobiusFermion MobiusFermionVec5dFH; +typedef MobiusFermion MobiusFermionVec5dDF; + typedef ZMobiusFermion ZMobiusFermionVec5dR; typedef ZMobiusFermion ZMobiusFermionVec5dF; typedef ZMobiusFermion ZMobiusFermionVec5dD; +typedef ZMobiusFermion ZMobiusFermionVec5dRL; +typedef ZMobiusFermion ZMobiusFermionVec5dFH; +typedef ZMobiusFermion ZMobiusFermionVec5dDF; typedef ScaledShamirFermion ScaledShamirFermionR; typedef ScaledShamirFermion ScaledShamirFermionF; @@ -166,17 +193,35 @@ typedef OverlapWilsonPartialFractionZolotarevFermion OverlapWilsonP typedef WilsonFermion GparityWilsonFermionR; typedef WilsonFermion GparityWilsonFermionF; typedef WilsonFermion GparityWilsonFermionD; + +typedef WilsonFermion GparityWilsonFermionRL; +typedef WilsonFermion GparityWilsonFermionFH; +typedef WilsonFermion GparityWilsonFermionDF; + typedef DomainWallFermion GparityDomainWallFermionR; typedef DomainWallFermion GparityDomainWallFermionF; typedef DomainWallFermion GparityDomainWallFermionD; +typedef DomainWallFermion GparityDomainWallFermionRL; +typedef DomainWallFermion GparityDomainWallFermionFH; +typedef DomainWallFermion GparityDomainWallFermionDF; + typedef WilsonTMFermion GparityWilsonTMFermionR; typedef WilsonTMFermion GparityWilsonTMFermionF; typedef WilsonTMFermion GparityWilsonTMFermionD; + +typedef WilsonTMFermion GparityWilsonTMFermionRL; +typedef WilsonTMFermion GparityWilsonTMFermionFH; +typedef WilsonTMFermion GparityWilsonTMFermionDF; + typedef MobiusFermion GparityMobiusFermionR; typedef MobiusFermion GparityMobiusFermionF; typedef MobiusFermion GparityMobiusFermionD; +typedef MobiusFermion GparityMobiusFermionRL; +typedef MobiusFermion GparityMobiusFermionFH; +typedef MobiusFermion GparityMobiusFermionDF; + typedef ImprovedStaggeredFermion ImprovedStaggeredFermionR; typedef ImprovedStaggeredFermion ImprovedStaggeredFermionF; typedef ImprovedStaggeredFermion ImprovedStaggeredFermionD; diff --git a/lib/qcd/action/fermion/FermionCore.h b/lib/qcd/action/fermion/FermionCore.h index 74d94d67..17006961 100644 --- a/lib/qcd/action/fermion/FermionCore.h +++ b/lib/qcd/action/fermion/FermionCore.h @@ -55,7 +55,14 @@ Author: Peter Boyle template class A; \ template class A; \ template class A; \ - template class A; + template class A; \ + template class A; \ + template class A; \ + template class A; \ + template class A; \ + template class A; \ + template class A; + #define AdjointFermOpTemplateInstantiate(A) \ template class A; \ @@ -69,7 +76,11 @@ Author: Peter Boyle template class A; \ template class A; \ template class A; \ - template class A; + template class A; \ + template class A; \ + template class A; \ + template class A; \ + template class A; #define FermOpTemplateInstantiate(A) \ FermOp4dVecTemplateInstantiate(A) \ diff --git a/lib/qcd/action/fermion/FermionOperatorImpl.h b/lib/qcd/action/fermion/FermionOperatorImpl.h index 141d808d..f6b47063 100644 --- a/lib/qcd/action/fermion/FermionOperatorImpl.h 
+++ b/lib/qcd/action/fermion/FermionOperatorImpl.h @@ -35,7 +35,6 @@ directory namespace Grid { namespace QCD { - ////////////////////////////////////////////// // Template parameter class constructs to package // externally control Fermion implementations @@ -89,7 +88,53 @@ namespace QCD { // // } ////////////////////////////////////////////// - + + template struct SamePrecisionMapper { + typedef T HigherPrecVector ; + typedef T LowerPrecVector ; + }; + template struct LowerPrecisionMapper { }; + template <> struct LowerPrecisionMapper { + typedef vRealF HigherPrecVector ; + typedef vRealH LowerPrecVector ; + }; + template <> struct LowerPrecisionMapper { + typedef vRealD HigherPrecVector ; + typedef vRealF LowerPrecVector ; + }; + template <> struct LowerPrecisionMapper { + typedef vComplexF HigherPrecVector ; + typedef vComplexH LowerPrecVector ; + }; + template <> struct LowerPrecisionMapper { + typedef vComplexD HigherPrecVector ; + typedef vComplexF LowerPrecVector ; + }; + + struct CoeffReal { + public: + typedef RealD _Coeff_t; + static const int Nhcs = 2; + template using PrecisionMapper = SamePrecisionMapper; + }; + struct CoeffRealHalfComms { + public: + typedef RealD _Coeff_t; + static const int Nhcs = 1; + template using PrecisionMapper = LowerPrecisionMapper; + }; + struct CoeffComplex { + public: + typedef ComplexD _Coeff_t; + static const int Nhcs = 2; + template using PrecisionMapper = SamePrecisionMapper; + }; + struct CoeffComplexHalfComms { + public: + typedef ComplexD _Coeff_t; + static const int Nhcs = 1; + template using PrecisionMapper = LowerPrecisionMapper; + }; //////////////////////////////////////////////////////////////////////// // Implementation dependent fermion types @@ -114,37 +159,40 @@ namespace QCD { ///////////////////////////////////////////////////////////////////////////// // Single flavour four spinors with colour index ///////////////////////////////////////////////////////////////////////////// - template + template class WilsonImpl : public PeriodicGaugeImpl > { - public: static const int Dimension = Representation::Dimension; + static const bool LsVectorised=false; + static const int Nhcs = Options::Nhcs; + typedef PeriodicGaugeImpl > Gimpl; + INHERIT_GIMPL_TYPES(Gimpl); //Necessary? constexpr bool is_fundamental() const{return Dimension == Nc ? 
1 : 0;} - const bool LsVectorised=false; - typedef _Coeff_t Coeff_t; - - INHERIT_GIMPL_TYPES(Gimpl); + typedef typename Options::_Coeff_t Coeff_t; + typedef typename Options::template PrecisionMapper::LowerPrecVector SimdL; template using iImplSpinor = iScalar, Ns> >; template using iImplPropagator = iScalar, Ns> >; template using iImplHalfSpinor = iScalar, Nhs> >; + template using iImplHalfCommSpinor = iScalar, Nhcs> >; template using iImplDoubledGaugeField = iVector >, Nds>; typedef iImplSpinor SiteSpinor; typedef iImplPropagator SitePropagator; typedef iImplHalfSpinor SiteHalfSpinor; + typedef iImplHalfCommSpinor SiteHalfCommSpinor; typedef iImplDoubledGaugeField SiteDoubledGaugeField; typedef Lattice FermionField; typedef Lattice PropagatorField; typedef Lattice DoubledGaugeField; - typedef WilsonCompressor Compressor; + typedef WilsonCompressor Compressor; typedef WilsonImplParams ImplParams; typedef WilsonStencil StencilImpl; @@ -209,31 +257,34 @@ namespace QCD { //////////////////////////////////////////////////////////////////////////////////// // Single flavour four spinors with colour index, 5d redblack //////////////////////////////////////////////////////////////////////////////////// - -template +template class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { public: - - static const int Dimension = Nrepresentation; - const bool LsVectorised=true; - typedef _Coeff_t Coeff_t; + typedef PeriodicGaugeImpl > Gimpl; - INHERIT_GIMPL_TYPES(Gimpl); + + static const int Dimension = Nrepresentation; + static const bool LsVectorised=true; + static const int Nhcs = Options::Nhcs; + + typedef typename Options::_Coeff_t Coeff_t; + typedef typename Options::template PrecisionMapper::LowerPrecVector SimdL; template using iImplSpinor = iScalar, Ns> >; template using iImplPropagator = iScalar, Ns> >; template using iImplHalfSpinor = iScalar, Nhs> >; + template using iImplHalfCommSpinor = iScalar, Nhcs> >; template using iImplDoubledGaugeField = iVector >, Nds>; template using iImplGaugeField = iVector >, Nd>; template using iImplGaugeLink = iScalar > >; - typedef iImplSpinor SiteSpinor; - typedef iImplPropagator SitePropagator; - typedef iImplHalfSpinor SiteHalfSpinor; - typedef Lattice FermionField; - typedef Lattice PropagatorField; - + typedef iImplSpinor SiteSpinor; + typedef iImplPropagator SitePropagator; + typedef iImplHalfSpinor SiteHalfSpinor; + typedef iImplHalfCommSpinor SiteHalfCommSpinor; + typedef Lattice FermionField; + typedef Lattice PropagatorField; ///////////////////////////////////////////////// // Make the doubled gauge field a *scalar* @@ -241,9 +292,9 @@ class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres typedef iImplDoubledGaugeField SiteDoubledGaugeField; // This is a scalar typedef iImplGaugeField SiteScalarGaugeField; // scalar typedef iImplGaugeLink SiteScalarGaugeLink; // scalar - typedef Lattice DoubledGaugeField; + typedef Lattice DoubledGaugeField; - typedef WilsonCompressor Compressor; + typedef WilsonCompressor Compressor; typedef WilsonImplParams ImplParams; typedef WilsonStencil StencilImpl; @@ -311,35 +362,37 @@ class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres //////////////////////////////////////////////////////////////////////////////////////// // Flavour doubled spinors; is Gparity the only? what about C*? 
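[Editor's illustrative sketch, not part of the patch] The Options policies introduced above (CoeffReal, CoeffRealHalfComms, CoeffComplex, CoeffComplexHalfComms) select, per fermion implementation, both the coefficient type and a PrecisionMapper that maps the working SIMD vector type onto the (possibly lower-precision) type used for communication buffers, which each Impl then exposes as SimdL. A minimal sketch of that trait/policy mechanism, using built-in float/double in place of Grid's vRealF/vRealD/vComplex types; every name here is a stand-in.

// Stand-alone sketch; assumed names only.
template <typename T> struct SamePrecMapper {
  typedef T HigherPrecVector;
  typedef T LowerPrecVector;
};
template <typename T> struct LowerPrecMapper;             // primary left undefined
template <> struct LowerPrecMapper<double> {
  typedef double HigherPrecVector;
  typedef float  LowerPrecVector;                          // comms-buffer precision
};

struct CoeffFull      { template <typename T> using PrecisionMapper = SamePrecMapper<T>;  };
struct CoeffHalfComms { template <typename T> using PrecisionMapper = LowerPrecMapper<T>; };

template <class Simd, class Options>
struct ImplSketch {
  // SimdL is the vector type actually shipped over MPI by the compressor.
  typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL;
};

static_assert(sizeof(ImplSketch<double, CoeffHalfComms>::SimdL) == sizeof(float),
              "a half-comms implementation communicates in reduced precision");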
//////////////////////////////////////////////////////////////////////////////////////// - -template +template class GparityWilsonImpl : public ConjugateGaugeImpl > { public: static const int Dimension = Nrepresentation; + static const int Nhcs = Options::Nhcs; + static const bool LsVectorised=false; - const bool LsVectorised=false; - - typedef _Coeff_t Coeff_t; typedef ConjugateGaugeImpl< GaugeImplTypes > Gimpl; - INHERIT_GIMPL_TYPES(Gimpl); + + typedef typename Options::_Coeff_t Coeff_t; + typedef typename Options::template PrecisionMapper::LowerPrecVector SimdL; - template using iImplSpinor = iVector, Ns>, Ngp>; - template using iImplPropagator = iVector, Ns>, Ngp >; - template using iImplHalfSpinor = iVector, Nhs>, Ngp>; + template using iImplSpinor = iVector, Ns>, Ngp>; + template using iImplPropagator = iVector, Ns>, Ngp>; + template using iImplHalfSpinor = iVector, Nhs>, Ngp>; + template using iImplHalfCommSpinor = iVector, Nhcs>, Ngp>; template using iImplDoubledGaugeField = iVector >, Nds>, Ngp>; - - typedef iImplSpinor SiteSpinor; - typedef iImplPropagator SitePropagator; - typedef iImplHalfSpinor SiteHalfSpinor; + + typedef iImplSpinor SiteSpinor; + typedef iImplPropagator SitePropagator; + typedef iImplHalfSpinor SiteHalfSpinor; + typedef iImplHalfCommSpinor SiteHalfCommSpinor; typedef iImplDoubledGaugeField SiteDoubledGaugeField; typedef Lattice FermionField; typedef Lattice PropagatorField; typedef Lattice DoubledGaugeField; - typedef WilsonCompressor Compressor; + typedef WilsonCompressor Compressor; typedef WilsonStencil StencilImpl; typedef GparityWilsonImplParams ImplParams; @@ -356,8 +409,8 @@ class GparityWilsonImpl : public ConjugateGaugeImpl - class StaggeredImpl : public PeriodicGaugeImpl > { +///////////////////////////////////////////////////////////////////////////// +// Single flavour one component spinors with colour index +///////////////////////////////////////////////////////////////////////////// +template +class StaggeredImpl : public PeriodicGaugeImpl > { public: typedef RealD _Coeff_t ; static const int Dimension = Representation::Dimension; + static const bool LsVectorised=false; typedef PeriodicGaugeImpl > Gimpl; //Necessary? constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;} - const bool LsVectorised=false; typedef _Coeff_t Coeff_t; INHERIT_GIMPL_TYPES(Gimpl); @@ -641,8 +692,6 @@ class GparityWilsonImpl : public ConjugateGaugeImpl > Gimpl; //Necessary? constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;} - - const bool LsVectorised=true; - typedef _Coeff_t Coeff_t; INHERIT_GIMPL_TYPES(Gimpl); @@ -823,43 +870,61 @@ class GparityWilsonImpl : public ConjugateGaugeImpl WilsonImplR; // Real.. whichever prec +typedef WilsonImpl WilsonImplF; // Float +typedef WilsonImpl WilsonImplD; // Double +typedef WilsonImpl WilsonImplRL; // Real.. whichever prec +typedef WilsonImpl WilsonImplFH; // Float +typedef WilsonImpl WilsonImplDF; // Double - typedef WilsonImpl WilsonImplR; // Real.. whichever prec - typedef WilsonImpl WilsonImplF; // Float - typedef WilsonImpl WilsonImplD; // Double +typedef WilsonImpl ZWilsonImplR; // Real.. whichever prec +typedef WilsonImpl ZWilsonImplF; // Float +typedef WilsonImpl ZWilsonImplD; // Double - typedef WilsonImpl ZWilsonImplR; // Real.. whichever prec - typedef WilsonImpl ZWilsonImplF; // Float - typedef WilsonImpl ZWilsonImplD; // Double +typedef WilsonImpl ZWilsonImplRL; // Real.. 
whichever prec +typedef WilsonImpl ZWilsonImplFH; // Float +typedef WilsonImpl ZWilsonImplDF; // Double - typedef WilsonImpl WilsonAdjImplR; // Real.. whichever prec - typedef WilsonImpl WilsonAdjImplF; // Float - typedef WilsonImpl WilsonAdjImplD; // Double +typedef WilsonImpl WilsonAdjImplR; // Real.. whichever prec +typedef WilsonImpl WilsonAdjImplF; // Float +typedef WilsonImpl WilsonAdjImplD; // Double - typedef WilsonImpl WilsonTwoIndexSymmetricImplR; // Real.. whichever prec - typedef WilsonImpl WilsonTwoIndexSymmetricImplF; // Float - typedef WilsonImpl WilsonTwoIndexSymmetricImplD; // Double +typedef WilsonImpl WilsonTwoIndexSymmetricImplR; // Real.. whichever prec +typedef WilsonImpl WilsonTwoIndexSymmetricImplF; // Float +typedef WilsonImpl WilsonTwoIndexSymmetricImplD; // Double - typedef DomainWallVec5dImpl DomainWallVec5dImplR; // Real.. whichever prec - typedef DomainWallVec5dImpl DomainWallVec5dImplF; // Float - typedef DomainWallVec5dImpl DomainWallVec5dImplD; // Double +typedef DomainWallVec5dImpl DomainWallVec5dImplR; // Real.. whichever prec +typedef DomainWallVec5dImpl DomainWallVec5dImplF; // Float +typedef DomainWallVec5dImpl DomainWallVec5dImplD; // Double - typedef DomainWallVec5dImpl ZDomainWallVec5dImplR; // Real.. whichever prec - typedef DomainWallVec5dImpl ZDomainWallVec5dImplF; // Float - typedef DomainWallVec5dImpl ZDomainWallVec5dImplD; // Double +typedef DomainWallVec5dImpl DomainWallVec5dImplRL; // Real.. whichever prec +typedef DomainWallVec5dImpl DomainWallVec5dImplFH; // Float +typedef DomainWallVec5dImpl DomainWallVec5dImplDF; // Double - typedef GparityWilsonImpl GparityWilsonImplR; // Real.. whichever prec - typedef GparityWilsonImpl GparityWilsonImplF; // Float - typedef GparityWilsonImpl GparityWilsonImplD; // Double +typedef DomainWallVec5dImpl ZDomainWallVec5dImplR; // Real.. whichever prec +typedef DomainWallVec5dImpl ZDomainWallVec5dImplF; // Float +typedef DomainWallVec5dImpl ZDomainWallVec5dImplD; // Double + +typedef DomainWallVec5dImpl ZDomainWallVec5dImplRL; // Real.. whichever prec +typedef DomainWallVec5dImpl ZDomainWallVec5dImplFH; // Float +typedef DomainWallVec5dImpl ZDomainWallVec5dImplDF; // Double + +typedef GparityWilsonImpl GparityWilsonImplR; // Real.. whichever prec +typedef GparityWilsonImpl GparityWilsonImplF; // Float +typedef GparityWilsonImpl GparityWilsonImplD; // Double + +typedef GparityWilsonImpl GparityWilsonImplRL; // Real.. whichever prec +typedef GparityWilsonImpl GparityWilsonImplFH; // Float +typedef GparityWilsonImpl GparityWilsonImplDF; // Double - typedef StaggeredImpl StaggeredImplR; // Real.. whichever prec - typedef StaggeredImpl StaggeredImplF; // Float - typedef StaggeredImpl StaggeredImplD; // Double +typedef StaggeredImpl StaggeredImplR; // Real.. whichever prec +typedef StaggeredImpl StaggeredImplF; // Float +typedef StaggeredImpl StaggeredImplD; // Double - typedef StaggeredVec5dImpl StaggeredVec5dImplR; // Real.. whichever prec - typedef StaggeredVec5dImpl StaggeredVec5dImplF; // Float - typedef StaggeredVec5dImpl StaggeredVec5dImplD; // Double +typedef StaggeredVec5dImpl StaggeredVec5dImplR; // Real.. 
whichever prec +typedef StaggeredVec5dImpl StaggeredVec5dImplF; // Float +typedef StaggeredVec5dImpl StaggeredVec5dImplD; // Double }} diff --git a/lib/qcd/action/fermion/WilsonCompressor.h b/lib/qcd/action/fermion/WilsonCompressor.h index c3b4dffb..96cbe1ec 100644 --- a/lib/qcd/action/fermion/WilsonCompressor.h +++ b/lib/qcd/action/fermion/WilsonCompressor.h @@ -33,228 +33,321 @@ Author: paboyle namespace Grid { namespace QCD { - template - class WilsonCompressor { - public: - int mu; - int dag; +///////////////////////////////////////////////////////////////////////////////////////////// +// optimised versions supporting half precision too +///////////////////////////////////////////////////////////////////////////////////////////// - WilsonCompressor(int _dag){ - mu=0; - dag=_dag; - assert((dag==0)||(dag==1)); - } - void Point(int p) { - mu=p; - }; +template +class WilsonCompressorTemplate; - inline SiteHalfSpinor operator () (const SiteSpinor &in) { - SiteHalfSpinor ret; - int mudag=mu; - if (!dag) { - mudag=(mu+Nd)%(2*Nd); - } - switch(mudag) { - case Xp: - spProjXp(ret,in); - break; - case Yp: - spProjYp(ret,in); - break; - case Zp: - spProjZp(ret,in); - break; - case Tp: - spProjTp(ret,in); - break; - case Xm: - spProjXm(ret,in); - break; - case Ym: - spProjYm(ret,in); - break; - case Zm: - spProjZm(ret,in); - break; - case Tm: - spProjTm(ret,in); - break; - default: - assert(0); - break; - } - return ret; - } - }; - ///////////////////////// - // optimised versions - ///////////////////////// +template +class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector, + typename std::enable_if::value>::type > +{ + public: + + int mu,dag; - template - class WilsonXpCompressor { - public: - inline SiteHalfSpinor operator () (const SiteSpinor &in) { - SiteHalfSpinor ret; - spProjXp(ret,in); - return ret; - } - }; - template - class WilsonYpCompressor { - public: - inline SiteHalfSpinor operator () (const SiteSpinor &in) { - SiteHalfSpinor ret; - spProjYp(ret,in); - return ret; - } - }; - template - class WilsonZpCompressor { - public: - inline SiteHalfSpinor operator () (const SiteSpinor &in) { - SiteHalfSpinor ret; - spProjZp(ret,in); - return ret; - } - }; - template - class WilsonTpCompressor { - public: - inline SiteHalfSpinor operator () (const SiteSpinor &in) { - SiteHalfSpinor ret; - spProjTp(ret,in); - return ret; - } - }; + void Point(int p) { mu=p; }; - template - class WilsonXmCompressor { - public: - inline SiteHalfSpinor operator () (const SiteSpinor &in) { - SiteHalfSpinor ret; - spProjXm(ret,in); - return ret; - } - }; - template - class WilsonYmCompressor { - public: - inline SiteHalfSpinor operator () (const SiteSpinor &in) { - SiteHalfSpinor ret; - spProjYm(ret,in); - return ret; - } - }; - template - class WilsonZmCompressor { - public: - inline SiteHalfSpinor operator () (const SiteSpinor &in) { - SiteHalfSpinor ret; - spProjZm(ret,in); - return ret; - } - }; - template - class WilsonTmCompressor { - public: - inline SiteHalfSpinor operator () (const SiteSpinor &in) { - SiteHalfSpinor ret; - spProjTm(ret,in); - return ret; - } - }; + WilsonCompressorTemplate(int _dag=0){ + dag = _dag; + } - // Fast comms buffer manipulation which should inline right through (avoid direction - // dependent logic that prevents inlining - template - class WilsonStencil : public CartesianStencil { - public: + typedef _Spinor SiteSpinor; + typedef _Hspinor SiteHalfSpinor; + typedef _HCspinor SiteHalfCommSpinor; + typedef typename SiteHalfCommSpinor::vector_type vComplexLow; + 
typedef typename SiteHalfSpinor::vector_type vComplexHigh; + constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh); - typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; + inline int CommDatumSize(void) { + return sizeof(SiteHalfCommSpinor); + } - WilsonStencil(GridBase *grid, + /*****************************************************/ + /* Compress includes precision change if mpi data is not same */ + /*****************************************************/ + inline void Compress(SiteHalfSpinor *buf,Integer o,const SiteSpinor &in) { + projector::Proj(buf[o],in,mu,dag); + } + + /*****************************************************/ + /* Exchange includes precision change if mpi data is not same */ + /*****************************************************/ + inline void Exchange(SiteHalfSpinor *mp, + SiteHalfSpinor *vp0, + SiteHalfSpinor *vp1, + Integer type,Integer o){ + exchange(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type); + } + + /*****************************************************/ + /* Have a decompression step if mpi data is not same */ + /*****************************************************/ + inline void Decompress(SiteHalfSpinor *out, + SiteHalfSpinor *in, Integer o) { + assert(0); + } + + /*****************************************************/ + /* Compress Exchange */ + /*****************************************************/ + inline void CompressExchange(SiteHalfSpinor *out0, + SiteHalfSpinor *out1, + const SiteSpinor *in, + Integer j,Integer k, Integer m,Integer type){ + SiteHalfSpinor temp1, temp2,temp3,temp4; + projector::Proj(temp1,in[k],mu,dag); + projector::Proj(temp2,in[m],mu,dag); + exchange(out0[j],out1[j],temp1,temp2,type); + } + + /*****************************************************/ + /* Pass the info to the stencil */ + /*****************************************************/ + inline bool DecompressionStep(void) { return false; } + +}; + +template +class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector, + typename std::enable_if::value>::type > +{ + public: + + int mu,dag; + + void Point(int p) { mu=p; }; + + WilsonCompressorTemplate(int _dag=0){ + dag = _dag; + } + + typedef _Spinor SiteSpinor; + typedef _Hspinor SiteHalfSpinor; + typedef _HCspinor SiteHalfCommSpinor; + typedef typename SiteHalfCommSpinor::vector_type vComplexLow; + typedef typename SiteHalfSpinor::vector_type vComplexHigh; + constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh); + + inline int CommDatumSize(void) { + return sizeof(SiteHalfCommSpinor); + } + + /*****************************************************/ + /* Compress includes precision change if mpi data is not same */ + /*****************************************************/ + inline void Compress(SiteHalfSpinor *buf,Integer o,const SiteSpinor &in) { + SiteHalfSpinor hsp; + SiteHalfCommSpinor *hbuf = (SiteHalfCommSpinor *)buf; + projector::Proj(hsp,in,mu,dag); + precisionChange((vComplexLow *)&hbuf[o],(vComplexHigh *)&hsp,Nw); + } + + /*****************************************************/ + /* Exchange includes precision change if mpi data is not same */ + /*****************************************************/ + inline void Exchange(SiteHalfSpinor *mp, + SiteHalfSpinor *vp0, + SiteHalfSpinor *vp1, + Integer type,Integer o){ + SiteHalfSpinor vt0,vt1; + SiteHalfCommSpinor *vpp0 = (SiteHalfCommSpinor *)vp0; + SiteHalfCommSpinor *vpp1 = (SiteHalfCommSpinor *)vp1; + precisionChange((vComplexHigh *)&vt0,(vComplexLow *)&vpp0[o],Nw); + precisionChange((vComplexHigh *)&vt1,(vComplexLow 
*)&vpp1[o],Nw); + exchange(mp[2*o],mp[2*o+1],vt0,vt1,type); + } + + /*****************************************************/ + /* Have a decompression step if mpi data is not same */ + /*****************************************************/ + inline void Decompress(SiteHalfSpinor *out, + SiteHalfSpinor *in, Integer o){ + SiteHalfCommSpinor *hin=(SiteHalfCommSpinor *)in; + precisionChange((vComplexHigh *)&out[o],(vComplexLow *)&hin[o],Nw); + } + + /*****************************************************/ + /* Compress Exchange */ + /*****************************************************/ + inline void CompressExchange(SiteHalfSpinor *out0, + SiteHalfSpinor *out1, + const SiteSpinor *in, + Integer j,Integer k, Integer m,Integer type){ + SiteHalfSpinor temp1, temp2,temp3,temp4; + SiteHalfCommSpinor *hout0 = (SiteHalfCommSpinor *)out0; + SiteHalfCommSpinor *hout1 = (SiteHalfCommSpinor *)out1; + projector::Proj(temp1,in[k],mu,dag); + projector::Proj(temp2,in[m],mu,dag); + exchange(temp3,temp4,temp1,temp2,type); + precisionChange((vComplexLow *)&hout0[j],(vComplexHigh *)&temp3,Nw); + precisionChange((vComplexLow *)&hout1[j],(vComplexHigh *)&temp4,Nw); + } + + /*****************************************************/ + /* Pass the info to the stencil */ + /*****************************************************/ + inline bool DecompressionStep(void) { return true; } + +}; + +#define DECLARE_PROJ(Projector,Compressor,spProj) \ + class Projector { \ + public: \ + template \ + static void Proj(hsp &result,const fsp &in,int mu,int dag){ \ + spProj(result,in); \ + } \ + }; \ +template using Compressor = WilsonCompressorTemplate; + +DECLARE_PROJ(WilsonXpProjector,WilsonXpCompressor,spProjXp); +DECLARE_PROJ(WilsonYpProjector,WilsonYpCompressor,spProjYp); +DECLARE_PROJ(WilsonZpProjector,WilsonZpCompressor,spProjZp); +DECLARE_PROJ(WilsonTpProjector,WilsonTpCompressor,spProjTp); +DECLARE_PROJ(WilsonXmProjector,WilsonXmCompressor,spProjXm); +DECLARE_PROJ(WilsonYmProjector,WilsonYmCompressor,spProjYm); +DECLARE_PROJ(WilsonZmProjector,WilsonZmCompressor,spProjZm); +DECLARE_PROJ(WilsonTmProjector,WilsonTmCompressor,spProjTm); + +class WilsonProjector { + public: + template + static void Proj(hsp &result,const fsp &in,int mu,int dag){ + int mudag=dag? 
mu : (mu+Nd)%(2*Nd); + switch(mudag) { + case Xp: spProjXp(result,in); break; + case Yp: spProjYp(result,in); break; + case Zp: spProjZp(result,in); break; + case Tp: spProjTp(result,in); break; + case Xm: spProjXm(result,in); break; + case Ym: spProjYm(result,in); break; + case Zm: spProjZm(result,in); break; + case Tm: spProjTm(result,in); break; + default: assert(0); break; + } + } +}; +template using WilsonCompressor = WilsonCompressorTemplate; + +// Fast comms buffer manipulation which should inline right through (avoid direction +// dependent logic that prevents inlining +template +class WilsonStencil : public CartesianStencil { +public: + + typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; + + std::vector same_node; + std::vector surface_list; + + WilsonStencil(GridBase *grid, int npoints, int checkerboard, const std::vector &directions, - const std::vector &distances) : CartesianStencil (grid,npoints,checkerboard,directions,distances) - { }; - - template < class compressor> - void HaloExchangeOpt(const Lattice &source,compressor &compress) - { - std::vector > reqs; - HaloExchangeOptGather(source,compress); - this->CommunicateBegin(reqs); - this->calls++; - this->CommunicateComplete(reqs); - this->CommsMerge(); - } - - template < class compressor> - void HaloExchangeOptGather(const Lattice &source,compressor &compress) - { - this->calls++; - this->Mergers.resize(0); - this->Packets.resize(0); - this->HaloGatherOpt(source,compress); - } - - - template < class compressor> - void HaloGatherOpt(const Lattice &source,compressor &compress) - { - this->_grid->StencilBarrier(); - // conformable(source._grid,_grid); - assert(source._grid==this->_grid); - this->halogtime-=usecond(); - - this->u_comm_offset=0; - - int dag = compress.dag; - - WilsonXpCompressor XpCompress; - WilsonYpCompressor YpCompress; - WilsonZpCompressor ZpCompress; - WilsonTpCompressor TpCompress; - WilsonXmCompressor XmCompress; - WilsonYmCompressor YmCompress; - WilsonZmCompressor ZmCompress; - WilsonTmCompressor TmCompress; - - // Gather all comms buffers - // for(int point = 0 ; point < _npoints; point++) { - // compress.Point(point); - // HaloGatherDir(source,compress,point,face_idx); - // } - int face_idx=0; - if ( dag ) { - // std::cout << " Optimised Dagger compress " <HaloGatherDir(source,XpCompress,Xp,face_idx); - this->HaloGatherDir(source,YpCompress,Yp,face_idx); - this->HaloGatherDir(source,ZpCompress,Zp,face_idx); - this->HaloGatherDir(source,TpCompress,Tp,face_idx); - this->HaloGatherDir(source,XmCompress,Xm,face_idx); - this->HaloGatherDir(source,YmCompress,Ym,face_idx); - this->HaloGatherDir(source,ZmCompress,Zm,face_idx); - this->HaloGatherDir(source,TmCompress,Tm,face_idx); - } else { - this->HaloGatherDir(source,XmCompress,Xp,face_idx); - this->HaloGatherDir(source,YmCompress,Yp,face_idx); - this->HaloGatherDir(source,ZmCompress,Zp,face_idx); - this->HaloGatherDir(source,TmCompress,Tp,face_idx); - this->HaloGatherDir(source,XpCompress,Xm,face_idx); - this->HaloGatherDir(source,YpCompress,Ym,face_idx); - this->HaloGatherDir(source,ZpCompress,Zm,face_idx); - this->HaloGatherDir(source,TpCompress,Tm,face_idx); - } - this->face_table_computed=1; - assert(this->u_comm_offset==this->_unified_buffer_size); - this->halogtime+=usecond(); - } - + const std::vector &distances) + : CartesianStencil (grid,npoints,checkerboard,directions,distances) , + same_node(npoints) + { + surface_list.resize(0); }; + void BuildSurfaceList(int Ls,int vol4){ + + // find same node for SHM + // Here we know the distance 
is 1 for WilsonStencil + for(int point=0;point_npoints;point++){ + same_node[point] = this->SameNode(point); + // std::cout << " dir " <_npoints;point++){ + if( (!this->GetNodeLocal(site*Ls,point)) && (!same_node[point]) ){ + local = 0; + } + } + if(local == 0) { + surface_list.push_back(site); + } + } + } + + template < class compressor> + void HaloExchangeOpt(const Lattice &source,compressor &compress) + { + std::vector > reqs; + this->HaloExchangeOptGather(source,compress); + this->CommunicateBegin(reqs); + this->CommunicateComplete(reqs); + this->CommsMerge(compress); + this->CommsMergeSHM(compress); + } + + template + void HaloExchangeOptGather(const Lattice &source,compressor &compress) + { + this->Prepare(); + this->HaloGatherOpt(source,compress); + } + + template + void HaloGatherOpt(const Lattice &source,compressor &compress) + { + // Strategy. Inherit types from Compressor. + // Use types to select the write direction by directon compressor + typedef typename compressor::SiteSpinor SiteSpinor; + typedef typename compressor::SiteHalfSpinor SiteHalfSpinor; + typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor; + + this->_grid->StencilBarrier(); + + assert(source._grid==this->_grid); + this->halogtime-=usecond(); + + this->u_comm_offset=0; + + WilsonXpCompressor XpCompress; + WilsonYpCompressor YpCompress; + WilsonZpCompressor ZpCompress; + WilsonTpCompressor TpCompress; + WilsonXmCompressor XmCompress; + WilsonYmCompressor YmCompress; + WilsonZmCompressor ZmCompress; + WilsonTmCompressor TmCompress; + + int dag = compress.dag; + int face_idx=0; + if ( dag ) { + // std::cout << " Optimised Dagger compress " <HaloGatherDir(source,XpCompress,Xp,face_idx)); + assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx)); + assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx)); + assert(same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx)); + assert(same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx)); + assert(same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx)); + assert(same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx)); + assert(same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx)); + } else { + assert(same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx)); + assert(same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx)); + assert(same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx)); + assert(same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx)); + assert(same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx)); + assert(same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx)); + assert(same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx)); + assert(same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx)); + } + this->face_table_computed=1; + assert(this->u_comm_offset==this->_unified_buffer_size); + this->halogtime+=usecond(); + } + + }; }} // namespace close #endif diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 88bc425a..fb20108b 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -117,49 +117,20 @@ WilsonFermion5D::WilsonFermion5D(GaugeField &_Umu, // Allocate the required comms buffer ImportGauge(_Umu); + + // Build lists of exterior only nodes + int LLs = FiveDimGrid._rdimensions[0]; + int vol4; + vol4=FourDimGrid.oSites(); + Stencil.BuildSurfaceList(LLs,vol4); 
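[Editor's illustrative sketch, not part of the patch] In the WilsonCompressorTemplate specialisation above for unequal internal and communication types, Compress projects the spinor and then precisionChange()s it into the lower-precision comm buffer, and Decompress widens it again after the receive. The toy below captures just that narrowing/widening step, with float/double standing in for Grid's vComplexF/vComplexD vector words (Nw words per half spinor); these helpers are not the Grid precisionChange() overloads.

#include <cstddef>

// Illustrative stand-ins only.
template <typename Lo, typename Hi>
void precision_change(Lo *out, const Hi *in, std::size_t nw) {
  for (std::size_t i = 0; i < nw; i++) out[i] = static_cast<Lo>(in[i]);
}

void compress_site(float *comm_buf, const double *half_spinor, std::size_t Nw) {
  precision_change(comm_buf, half_spinor, Nw);    // narrow before MPI send
}
void decompress_site(double *half_spinor, const float *comm_buf, std::size_t Nw) {
  precision_change(half_spinor, comm_buf, Nw);    // widen after MPI receive
}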
+ vol4=FourDimRedBlackGrid.oSites(); + StencilEven.BuildSurfaceList(LLs,vol4); + StencilOdd.BuildSurfaceList(LLs,vol4); + + std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size() + <<" " << StencilEven.surface_list.size()< -WilsonFermion5D::WilsonFermion5D(int simd,GaugeField &_Umu, - GridCartesian &FiveDimGrid, - GridRedBlackCartesian &FiveDimRedBlackGrid, - GridCartesian &FourDimGrid, - RealD _M5,const ImplParams &p) : -{ - int nsimd = Simd::Nsimd(); - - // some assertions - assert(FiveDimGrid._ndimension==5); - assert(FiveDimRedBlackGrid._ndimension==5); - assert(FiveDimRedBlackGrid._checker_dim==0); // Checkerboard the s-direction - assert(FourDimGrid._ndimension==4); - - // Dimension zero of the five-d is the Ls direction - Ls=FiveDimGrid._fdimensions[0]; - assert(FiveDimGrid._processors[0] ==1); - assert(FiveDimGrid._simd_layout[0] ==nsimd); - - assert(FiveDimRedBlackGrid._fdimensions[0]==Ls); - assert(FiveDimRedBlackGrid._processors[0] ==1); - assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd); - - // Other dimensions must match the decomposition of the four-D fields - for(int d=0;d<4;d++){ - assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]); - assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]); - - assert(FourDimGrid._simd_layout[d]=1); - assert(FiveDimRedBlackGrid._simd_layout[d+1]==1); - - assert(FiveDimGrid._fdimensions[d+1] ==FourDimGrid._fdimensions[d]); - assert(FiveDimGrid._processors[d+1] ==FourDimGrid._processors[d]); - assert(FiveDimGrid._simd_layout[d+1] ==FourDimGrid._simd_layout[d]); - } - - { - } -} - */ template void WilsonFermion5D::Report(void) @@ -396,6 +367,7 @@ void WilsonFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, DhopTotalTime+=usecond(); } + template void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo, DoubledGaugeField & U, @@ -409,12 +381,17 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg int LLs = in._grid->_rdimensions[0]; int len = U._grid->oSites(); - + DhopFaceTime-=usecond(); st.HaloExchangeOptGather(in,compressor); DhopFaceTime+=usecond(); std::vector > reqs; + // Rely on async comms; start comms before merge of local data + st.CommunicateBegin(reqs); + st.CommsMergeSHM(compressor); + + // Perhaps use omp task and region #pragma omp parallel { int nthreads = omp_get_num_threads(); @@ -426,7 +403,6 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg if ( me == 0 ) { DhopCommTime-=usecond(); - st.CommunicateBegin(reqs); st.CommunicateComplete(reqs); DhopCommTime+=usecond(); } else { @@ -439,28 +415,37 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg } DhopFaceTime-=usecond(); - st.CommsMerge(); + st.CommsMerge(compressor); DhopFaceTime+=usecond(); -#pragma omp parallel - { - int nthreads = omp_get_num_threads(); - int me = omp_get_thread_num(); - int myoff, mywork; - - GridThread::GetWork(len,me,mywork,myoff,nthreads); - int sF = LLs * myoff; - - // Exterior links in stencil - if ( me==0 ) DhopComputeTime2-=usecond(); - if (dag == DaggerYes) Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,0,1); - else Kernels::DhopSite (st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,0,1); - if ( me==0 ) DhopComputeTime2+=usecond(); - }// end parallel region + // Load imbalance alert. Should use dynamic schedule OMP for loop + // Perhaps create a list of only those sites with face work, and + // load balance process the list. 
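[Editor's illustrative sketch, not part of the patch] DhopInternalOverlappedComms above now posts the communications, merges the shared-memory (same-node) faces, runs the interior sites while MPI is in flight, and only then completes comms and sweeps the exterior surface_list sites; the comment above flags the load imbalance of that final sweep. Below is a self-contained toy of the overlap structure, with std::async faking the asynchronous comms and trivial kernels standing in for DhopSite; none of these names are Grid API.

#include <cstddef>
#include <future>
#include <vector>

struct ToyStencil {
  std::vector<int> surface_list;      // sites with at least one off-node leg
  std::future<void> req;
  void CommunicateBegin()    { req = std::async(std::launch::async, [] { /* MPI_Isend/Irecv */ }); }
  void CommunicateComplete() { req.get(); }                   // MPI_Waitall stand-in
};

void dhop_overlapped(ToyStencil &st, std::vector<double> &out) {
  st.CommunicateBegin();                                      // post comms first
  for (std::size_t s = 0; s < out.size(); s++) out[s] += 1.0; // interior work overlaps comms
  st.CommunicateComplete();                                   // wait for halos
  for (int s : st.surface_list) out[s] += 1.0;                // exterior work on surface sites only
}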
+ DhopComputeTime2-=usecond(); + if (dag == DaggerYes) { + int sz=st.surface_list.size(); + parallel_for (int ss = 0; ss < sz; ss++) { + int sU = st.surface_list[ss]; + int sF = LLs * sU; + Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1); + } + } else { + int sz=st.surface_list.size(); + parallel_for (int ss = 0; ss < sz; ss++) { + int sU = st.surface_list[ss]; + int sF = LLs * sU; + Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1); + } + } + DhopComputeTime2+=usecond(); #else assert(0); #endif + } + + + template void WilsonFermion5D::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo, DoubledGaugeField & U, @@ -679,7 +664,6 @@ void WilsonFermion5D::MomentumSpacePropagatorHw(FermionField &out,const Fe } - FermOpTemplateInstantiate(WilsonFermion5D); GparityFermOpTemplateInstantiate(WilsonFermion5D); diff --git a/lib/qcd/action/fermion/WilsonKernels.cc b/lib/qcd/action/fermion/WilsonKernels.cc index 6e72e089..03c066b0 100644 --- a/lib/qcd/action/fermion/WilsonKernels.cc +++ b/lib/qcd/action/fermion/WilsonKernels.cc @@ -33,52 +33,8 @@ directory namespace Grid { namespace QCD { - int WilsonKernelsStatic::Opt = WilsonKernelsStatic::OptGeneric; - int WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsAndCompute; - -#ifdef QPX -#include -#include -#include -#include -#endif - -void bgq_l1p_optimisation(int mode) -{ -#ifdef QPX -#undef L1P_CFG_PF_USR -#define L1P_CFG_PF_USR (0x3fde8000108ll) /* (64 bit reg, 23 bits wide, user/unpriv) */ - - uint64_t cfg_pf_usr; - if ( mode ) { - cfg_pf_usr = - L1P_CFG_PF_USR_ifetch_depth(0) - | L1P_CFG_PF_USR_ifetch_max_footprint(1) - | L1P_CFG_PF_USR_pf_stream_est_on_dcbt - | L1P_CFG_PF_USR_pf_stream_establish_enable - | L1P_CFG_PF_USR_pf_stream_optimistic - | L1P_CFG_PF_USR_pf_adaptive_throttle(0xF) ; - // if ( sizeof(Float) == sizeof(double) ) { - cfg_pf_usr |= L1P_CFG_PF_USR_dfetch_depth(2)| L1P_CFG_PF_USR_dfetch_max_footprint(3) ; - // } else { - // cfg_pf_usr |= L1P_CFG_PF_USR_dfetch_depth(1)| L1P_CFG_PF_USR_dfetch_max_footprint(2) ; - // } - } else { - cfg_pf_usr = L1P_CFG_PF_USR_dfetch_depth(1) - | L1P_CFG_PF_USR_dfetch_max_footprint(2) - | L1P_CFG_PF_USR_ifetch_depth(0) - | L1P_CFG_PF_USR_ifetch_max_footprint(1) - | L1P_CFG_PF_USR_pf_stream_est_on_dcbt - | L1P_CFG_PF_USR_pf_stream_establish_enable - | L1P_CFG_PF_USR_pf_stream_optimistic - | L1P_CFG_PF_USR_pf_stream_prefetch_enable; - } - *((uint64_t *)L1P_CFG_PF_USR) = cfg_pf_usr; - -#endif - -} - +int WilsonKernelsStatic::Opt = WilsonKernelsStatic::OptGeneric; +int WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsAndCompute; template WilsonKernels::WilsonKernels(const ImplParams &p) : Base(p){}; @@ -86,12 +42,72 @@ WilsonKernels::WilsonKernels(const ImplParams &p) : Base(p){}; //////////////////////////////////////////// // Generic implementation; move to different file? 
//////////////////////////////////////////// + +#define GENERIC_STENCIL_LEG(Dir,spProj,Recon) \ + SE = st.GetEntry(ptype, Dir, sF); \ + if (SE->_is_local) { \ + chi_p = χ \ + if (SE->_permute) { \ + spProj(tmp, in._odata[SE->_offset]); \ + permute(chi, tmp, ptype); \ + } else { \ + spProj(chi, in._odata[SE->_offset]); \ + } \ + } else { \ + chi_p = &buf[SE->_offset]; \ + } \ + Impl::multLink(Uchi, U._odata[sU], *chi_p, Dir, SE, st); \ + Recon(result, Uchi); + +#define GENERIC_STENCIL_LEG_INT(Dir,spProj,Recon) \ + SE = st.GetEntry(ptype, Dir, sF); \ + if (SE->_is_local) { \ + chi_p = χ \ + if (SE->_permute) { \ + spProj(tmp, in._odata[SE->_offset]); \ + permute(chi, tmp, ptype); \ + } else { \ + spProj(chi, in._odata[SE->_offset]); \ + } \ + } else if ( st.same_node[Dir] ) { \ + chi_p = &buf[SE->_offset]; \ + } \ + if (SE->_is_local || st.same_node[Dir] ) { \ + Impl::multLink(Uchi, U._odata[sU], *chi_p, Dir, SE, st); \ + Recon(result, Uchi); \ + } +#define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon) \ + SE = st.GetEntry(ptype, Dir, sF); \ + if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \ + chi_p = &buf[SE->_offset]; \ + Impl::multLink(Uchi, U._odata[sU], *chi_p, Dir, SE, st); \ + Recon(result, Uchi); \ + nmu++; \ + } + +#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon) \ + if (gamma == Dir) { \ + if (SE->_is_local && SE->_permute) { \ + spProj(tmp, in._odata[SE->_offset]); \ + permute(chi, tmp, ptype); \ + } else if (SE->_is_local) { \ + spProj(chi, in._odata[SE->_offset]); \ + } else { \ + chi = buf[SE->_offset]; \ + } \ + Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); \ + Recon(result, Uchi); \ + } + + //////////////////////////////////////////////////////////////////// + // All legs kernels ; comms then compute + //////////////////////////////////////////////////////////////////// template void WilsonKernels::GenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - SiteHalfSpinor *buf, int sF, - int sU, const FermionField &in, FermionField &out, - int interior,int exterior) { + SiteHalfSpinor *buf, int sF, + int sU, const FermionField &in, FermionField &out) +{ SiteHalfSpinor tmp; SiteHalfSpinor chi; SiteHalfSpinor *chi_p; @@ -100,174 +116,22 @@ void WilsonKernels::GenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, StencilEntry *SE; int ptype; - /////////////////////////// - // Xp - /////////////////////////// - SE = st.GetEntry(ptype, Xp, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjXp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjXp(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Xp, SE, st); - spReconXp(result, Uchi); - - /////////////////////////// - // Yp - /////////////////////////// - SE = st.GetEntry(ptype, Yp, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjYp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjYp(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Yp, SE, st); - accumReconYp(result, Uchi); - - /////////////////////////// - // Zp - /////////////////////////// - SE = st.GetEntry(ptype, Zp, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjZp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjZp(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Zp, SE, st); - 
accumReconZp(result, Uchi); - - /////////////////////////// - // Tp - /////////////////////////// - SE = st.GetEntry(ptype, Tp, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjTp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjTp(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Tp, SE, st); - accumReconTp(result, Uchi); - - /////////////////////////// - // Xm - /////////////////////////// - SE = st.GetEntry(ptype, Xm, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjXm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjXm(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Xm, SE, st); - accumReconXm(result, Uchi); - - /////////////////////////// - // Ym - /////////////////////////// - SE = st.GetEntry(ptype, Ym, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjYm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjYm(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Ym, SE, st); - accumReconYm(result, Uchi); - - /////////////////////////// - // Zm - /////////////////////////// - SE = st.GetEntry(ptype, Zm, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjZm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjZm(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Zm, SE, st); - accumReconZm(result, Uchi); - - /////////////////////////// - // Tm - /////////////////////////// - SE = st.GetEntry(ptype, Tm, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjTm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjTm(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Tm, SE, st); - accumReconTm(result, Uchi); - + GENERIC_STENCIL_LEG(Xp,spProjXp,spReconXp); + GENERIC_STENCIL_LEG(Yp,spProjYp,accumReconYp); + GENERIC_STENCIL_LEG(Zp,spProjZp,accumReconZp); + GENERIC_STENCIL_LEG(Tp,spProjTp,accumReconTp); + GENERIC_STENCIL_LEG(Xm,spProjXm,accumReconXm); + GENERIC_STENCIL_LEG(Ym,spProjYm,accumReconYm); + GENERIC_STENCIL_LEG(Zm,spProjZm,accumReconZm); + GENERIC_STENCIL_LEG(Tm,spProjTm,accumReconTm); vstream(out._odata[sF], result); }; -// Need controls to do interior, exterior, or both template void WilsonKernels::GenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - SiteHalfSpinor *buf, int sF, - int sU, const FermionField &in, FermionField &out,int interior,int exterior) { + SiteHalfSpinor *buf, int sF, + int sU, const FermionField &in, FermionField &out) +{ SiteHalfSpinor tmp; SiteHalfSpinor chi; SiteHalfSpinor *chi_p; @@ -276,168 +140,123 @@ void WilsonKernels::GenericDhopSite(StencilImpl &st, LebesgueOrder &lo, Do StencilEntry *SE; int ptype; - /////////////////////////// - // Xp - /////////////////////////// - SE = st.GetEntry(ptype, Xm, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjXp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjXp(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Xm, SE, st); - spReconXp(result, Uchi); - - 
/////////////////////////// - // Yp - /////////////////////////// - SE = st.GetEntry(ptype, Ym, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjYp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjYp(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Ym, SE, st); - accumReconYp(result, Uchi); - - /////////////////////////// - // Zp - /////////////////////////// - SE = st.GetEntry(ptype, Zm, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjZp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjZp(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Zm, SE, st); - accumReconZp(result, Uchi); - - /////////////////////////// - // Tp - /////////////////////////// - SE = st.GetEntry(ptype, Tm, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjTp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjTp(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Tm, SE, st); - accumReconTp(result, Uchi); - - /////////////////////////// - // Xm - /////////////////////////// - SE = st.GetEntry(ptype, Xp, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjXm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjXm(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Xp, SE, st); - accumReconXm(result, Uchi); - - /////////////////////////// - // Ym - /////////////////////////// - SE = st.GetEntry(ptype, Yp, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjYm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjYm(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Yp, SE, st); - accumReconYm(result, Uchi); - - /////////////////////////// - // Zm - /////////////////////////// - SE = st.GetEntry(ptype, Zp, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjZm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjZm(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Zp, SE, st); - accumReconZm(result, Uchi); - - /////////////////////////// - // Tm - /////////////////////////// - SE = st.GetEntry(ptype, Tp, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjTm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjTm(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Tp, SE, st); - accumReconTm(result, Uchi); - + GENERIC_STENCIL_LEG(Xm,spProjXp,spReconXp); + GENERIC_STENCIL_LEG(Ym,spProjYp,accumReconYp); + GENERIC_STENCIL_LEG(Zm,spProjZp,accumReconZp); + GENERIC_STENCIL_LEG(Tm,spProjTp,accumReconTp); + GENERIC_STENCIL_LEG(Xp,spProjXm,accumReconXm); + GENERIC_STENCIL_LEG(Yp,spProjYm,accumReconYm); + GENERIC_STENCIL_LEG(Zp,spProjZm,accumReconZm); + GENERIC_STENCIL_LEG(Tp,spProjTm,accumReconTm); vstream(out._odata[sF], result); }; + //////////////////////////////////////////////////////////////////// + // Interior kernels + //////////////////////////////////////////////////////////////////// 
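[Editor's illustrative sketch, not part of the patch] The GENERIC_STENCIL_LEG_INT / _EXT macros above split each direction's work into an interior pass (neighbour is local or reachable through shared memory) and an exterior pass (genuinely off-node), with nmu counting exterior contributions so the result is only accumulated into out[sF] when some halo data actually arrived. A reduced sketch of that guard logic, with all types as stand-ins:

struct ToyEntry { bool is_local; };

// Interior pass: take the leg only when no MPI data is needed.
template <class Spinor>
void leg_interior(const ToyEntry &SE, bool same_node, Spinor &result, const Spinor &chi) {
  if (SE.is_local || same_node) result += chi;                // multLink + recon stand-in
}

// Exterior pass: the complementary condition; count legs so the caller only
// accumulates into the output when at least one off-node leg contributed.
template <class Spinor>
void leg_exterior(const ToyEntry &SE, bool same_node, Spinor &result, const Spinor &chi, int &nmu) {
  if (!SE.is_local && !same_node) { result += chi; nmu++; }
}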
+template +void WilsonKernels::GenericDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + SiteHalfSpinor *buf, int sF, + int sU, const FermionField &in, FermionField &out) +{ + SiteHalfSpinor tmp; + SiteHalfSpinor chi; + SiteHalfSpinor *chi_p; + SiteHalfSpinor Uchi; + SiteSpinor result; + StencilEntry *SE; + int ptype; + + result=zero; + GENERIC_STENCIL_LEG_INT(Xp,spProjXp,accumReconXp); + GENERIC_STENCIL_LEG_INT(Yp,spProjYp,accumReconYp); + GENERIC_STENCIL_LEG_INT(Zp,spProjZp,accumReconZp); + GENERIC_STENCIL_LEG_INT(Tp,spProjTp,accumReconTp); + GENERIC_STENCIL_LEG_INT(Xm,spProjXm,accumReconXm); + GENERIC_STENCIL_LEG_INT(Ym,spProjYm,accumReconYm); + GENERIC_STENCIL_LEG_INT(Zm,spProjZm,accumReconZm); + GENERIC_STENCIL_LEG_INT(Tm,spProjTm,accumReconTm); + vstream(out._odata[sF], result); +}; + +template +void WilsonKernels::GenericDhopSiteInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + SiteHalfSpinor *buf, int sF, + int sU, const FermionField &in, FermionField &out) +{ + SiteHalfSpinor tmp; + SiteHalfSpinor chi; + SiteHalfSpinor *chi_p; + SiteHalfSpinor Uchi; + SiteSpinor result; + StencilEntry *SE; + int ptype; + result=zero; + GENERIC_STENCIL_LEG_INT(Xm,spProjXp,accumReconXp); + GENERIC_STENCIL_LEG_INT(Ym,spProjYp,accumReconYp); + GENERIC_STENCIL_LEG_INT(Zm,spProjZp,accumReconZp); + GENERIC_STENCIL_LEG_INT(Tm,spProjTp,accumReconTp); + GENERIC_STENCIL_LEG_INT(Xp,spProjXm,accumReconXm); + GENERIC_STENCIL_LEG_INT(Yp,spProjYm,accumReconYm); + GENERIC_STENCIL_LEG_INT(Zp,spProjZm,accumReconZm); + GENERIC_STENCIL_LEG_INT(Tp,spProjTm,accumReconTm); + vstream(out._odata[sF], result); +}; +//////////////////////////////////////////////////////////////////// +// Exterior kernels +//////////////////////////////////////////////////////////////////// +template +void WilsonKernels::GenericDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + SiteHalfSpinor *buf, int sF, + int sU, const FermionField &in, FermionField &out) +{ + SiteHalfSpinor tmp; + SiteHalfSpinor chi; + SiteHalfSpinor *chi_p; + SiteHalfSpinor Uchi; + SiteSpinor result; + StencilEntry *SE; + int ptype; + int nmu=0; + result=zero; + GENERIC_STENCIL_LEG_EXT(Xp,spProjXp,accumReconXp); + GENERIC_STENCIL_LEG_EXT(Yp,spProjYp,accumReconYp); + GENERIC_STENCIL_LEG_EXT(Zp,spProjZp,accumReconZp); + GENERIC_STENCIL_LEG_EXT(Tp,spProjTp,accumReconTp); + GENERIC_STENCIL_LEG_EXT(Xm,spProjXm,accumReconXm); + GENERIC_STENCIL_LEG_EXT(Ym,spProjYm,accumReconYm); + GENERIC_STENCIL_LEG_EXT(Zm,spProjZm,accumReconZm); + GENERIC_STENCIL_LEG_EXT(Tm,spProjTm,accumReconTm); + if ( nmu ) { + out._odata[sF] = out._odata[sF] + result; + } +}; + +template +void WilsonKernels::GenericDhopSiteExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + SiteHalfSpinor *buf, int sF, + int sU, const FermionField &in, FermionField &out) +{ + SiteHalfSpinor tmp; + SiteHalfSpinor chi; + SiteHalfSpinor *chi_p; + SiteHalfSpinor Uchi; + SiteSpinor result; + StencilEntry *SE; + int ptype; + int nmu=0; + result=zero; + GENERIC_STENCIL_LEG_EXT(Xm,spProjXp,accumReconXp); + GENERIC_STENCIL_LEG_EXT(Ym,spProjYp,accumReconYp); + GENERIC_STENCIL_LEG_EXT(Zm,spProjZp,accumReconZp); + GENERIC_STENCIL_LEG_EXT(Tm,spProjTp,accumReconTp); + GENERIC_STENCIL_LEG_EXT(Xp,spProjXm,accumReconXm); + GENERIC_STENCIL_LEG_EXT(Yp,spProjYm,accumReconYm); + GENERIC_STENCIL_LEG_EXT(Zp,spProjZm,accumReconZm); + GENERIC_STENCIL_LEG_EXT(Tp,spProjTm,accumReconTm); + if ( nmu ) { + out._odata[sF] = out._odata[sF] + result; + } +}; template void 
WilsonKernels::DhopDir( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int sF, @@ -451,119 +270,14 @@ void WilsonKernels::DhopDir( StencilImpl &st, DoubledGaugeField &U,SiteHal int ptype; SE = st.GetEntry(ptype, dir, sF); - - // Xp - if (gamma == Xp) { - if (SE->_is_local && SE->_permute) { - spProjXp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else if (SE->_is_local) { - spProjXp(chi, in._odata[SE->_offset]); - } else { - chi = buf[SE->_offset]; - } - Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); - spReconXp(result, Uchi); - } - - // Yp - if (gamma == Yp) { - if (SE->_is_local && SE->_permute) { - spProjYp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else if (SE->_is_local) { - spProjYp(chi, in._odata[SE->_offset]); - } else { - chi = buf[SE->_offset]; - } - Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); - spReconYp(result, Uchi); - } - - // Zp - if (gamma == Zp) { - if (SE->_is_local && SE->_permute) { - spProjZp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else if (SE->_is_local) { - spProjZp(chi, in._odata[SE->_offset]); - } else { - chi = buf[SE->_offset]; - } - Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); - spReconZp(result, Uchi); - } - - // Tp - if (gamma == Tp) { - if (SE->_is_local && SE->_permute) { - spProjTp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else if (SE->_is_local) { - spProjTp(chi, in._odata[SE->_offset]); - } else { - chi = buf[SE->_offset]; - } - Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); - spReconTp(result, Uchi); - } - - // Xm - if (gamma == Xm) { - if (SE->_is_local && SE->_permute) { - spProjXm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else if (SE->_is_local) { - spProjXm(chi, in._odata[SE->_offset]); - } else { - chi = buf[SE->_offset]; - } - Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); - spReconXm(result, Uchi); - } - - // Ym - if (gamma == Ym) { - if (SE->_is_local && SE->_permute) { - spProjYm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else if (SE->_is_local) { - spProjYm(chi, in._odata[SE->_offset]); - } else { - chi = buf[SE->_offset]; - } - Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); - spReconYm(result, Uchi); - } - - // Zm - if (gamma == Zm) { - if (SE->_is_local && SE->_permute) { - spProjZm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else if (SE->_is_local) { - spProjZm(chi, in._odata[SE->_offset]); - } else { - chi = buf[SE->_offset]; - } - Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); - spReconZm(result, Uchi); - } - - // Tm - if (gamma == Tm) { - if (SE->_is_local && SE->_permute) { - spProjTm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else if (SE->_is_local) { - spProjTm(chi, in._odata[SE->_offset]); - } else { - chi = buf[SE->_offset]; - } - Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); - spReconTm(result, Uchi); - } - + GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp); + GENERIC_DHOPDIR_LEG(Yp,spProjYp,spReconYp); + GENERIC_DHOPDIR_LEG(Zp,spProjZp,spReconZp); + GENERIC_DHOPDIR_LEG(Tp,spProjTp,spReconTp); + GENERIC_DHOPDIR_LEG(Xm,spProjXm,spReconXm); + GENERIC_DHOPDIR_LEG(Ym,spProjYm,spReconYm); + GENERIC_DHOPDIR_LEG(Zm,spProjZm,spReconZm); + GENERIC_DHOPDIR_LEG(Tm,spProjTm,spReconTm); vstream(out._odata[sF], result); } diff --git a/lib/qcd/action/fermion/WilsonKernels.h b/lib/qcd/action/fermion/WilsonKernels.h index 20ee87f2..2cf52660 100644 --- a/lib/qcd/action/fermion/WilsonKernels.h +++ 
b/lib/qcd/action/fermion/WilsonKernels.h @@ -34,8 +34,6 @@ directory namespace Grid { namespace QCD { -void bgq_l1p_optimisation(int mode); - //////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Helper routines that implement Wilson stencil for a single site. // Common to both the WilsonFermion and WilsonFermion5D @@ -44,9 +42,8 @@ class WilsonKernelsStatic { public: enum { OptGeneric, OptHandUnroll, OptInlineAsm }; enum { CommsAndCompute, CommsThenCompute }; - // S-direction is INNERMOST and takes no part in the parity. - static int Opt; // these are a temporary hack - static int Comms; // these are a temporary hack + static int Opt; + static int Comms; }; template class WilsonKernels : public FermionOperator , public WilsonKernelsStatic { @@ -66,7 +63,7 @@ public: switch(Opt) { #if defined(AVX512) || defined (QPX) case OptInlineAsm: - if(interior&&exterior) WilsonKernels::AsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out); + if(interior&&exterior) WilsonKernels::AsmDhopSite (st,lo,U,buf,sF,sU,Ls,Ns,in,out); else if (interior) WilsonKernels::AsmDhopSiteInt(st,lo,U,buf,sF,sU,Ls,Ns,in,out); else if (exterior) WilsonKernels::AsmDhopSiteExt(st,lo,U,buf,sF,sU,Ls,Ns,in,out); else assert(0); @@ -75,7 +72,9 @@ public: case OptHandUnroll: for (int site = 0; site < Ns; site++) { for (int s = 0; s < Ls; s++) { - if( exterior) WilsonKernels::HandDhopSite(st,lo,U,buf,sF,sU,in,out,interior,exterior); + if(interior&&exterior) WilsonKernels::HandDhopSite(st,lo,U,buf,sF,sU,in,out); + else if (interior) WilsonKernels::HandDhopSiteInt(st,lo,U,buf,sF,sU,in,out); + else if (exterior) WilsonKernels::HandDhopSiteExt(st,lo,U,buf,sF,sU,in,out); sF++; } sU++; @@ -84,7 +83,10 @@ public: case OptGeneric: for (int site = 0; site < Ns; site++) { for (int s = 0; s < Ls; s++) { - if( exterior) WilsonKernels::GenericDhopSite(st,lo,U,buf,sF,sU,in,out,interior,exterior); + if(interior&&exterior) WilsonKernels::GenericDhopSite(st,lo,U,buf,sF,sU,in,out); + else if (interior) WilsonKernels::GenericDhopSiteInt(st,lo,U,buf,sF,sU,in,out); + else if (exterior) WilsonKernels::GenericDhopSiteExt(st,lo,U,buf,sF,sU,in,out); + else assert(0); sF++; } sU++; @@ -99,11 +101,14 @@ public: template typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1 ) { + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1 ) { // no kernel choice for (int site = 0; site < Ns; site++) { for (int s = 0; s < Ls; s++) { - if( exterior) WilsonKernels::GenericDhopSite(st, lo, U, buf, sF, sU, in, out,interior,exterior); + if(interior&&exterior) WilsonKernels::GenericDhopSite(st,lo,U,buf,sF,sU,in,out); + else if (interior) WilsonKernels::GenericDhopSiteInt(st,lo,U,buf,sF,sU,in,out); + else if (exterior) WilsonKernels::GenericDhopSiteExt(st,lo,U,buf,sF,sU,in,out); + else assert(0); sF++; } sU++; @@ -113,13 +118,13 @@ public: template typename std::enable_if::type DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) { - + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) +{ bgq_l1p_optimisation(1); switch(Opt) { 
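// The interior/exterior flags appear to give the caller a three-way contract:
// interior&&exterior performs the full stencil in one pass (the old behaviour),
// interior-only applies just the legs whose neighbours are local or reachable
// through shared memory, so it can run while MPI transfers are still in flight,
// and exterior-only applies the remaining off-node legs once communication has
// completed, accumulating into the output field. This reading is inferred from
// the *Int/*Ext kernels and the st.same_node[] tests later in this patch; the
// driver that sequences the two passes is not shown here.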
#if defined(AVX512) || defined (QPX) case OptInlineAsm: - if(interior&&exterior) WilsonKernels::AsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out); + if(interior&&exterior) WilsonKernels::AsmDhopSiteDag (st,lo,U,buf,sF,sU,Ls,Ns,in,out); else if (interior) WilsonKernels::AsmDhopSiteDagInt(st,lo,U,buf,sF,sU,Ls,Ns,in,out); else if (exterior) WilsonKernels::AsmDhopSiteDagExt(st,lo,U,buf,sF,sU,Ls,Ns,in,out); else assert(0); @@ -128,7 +133,10 @@ public: case OptHandUnroll: for (int site = 0; site < Ns; site++) { for (int s = 0; s < Ls; s++) { - if( exterior) WilsonKernels::HandDhopSiteDag(st,lo,U,buf,sF,sU,in,out,interior,exterior); + if(interior&&exterior) WilsonKernels::HandDhopSiteDag(st,lo,U,buf,sF,sU,in,out); + else if (interior) WilsonKernels::HandDhopSiteDagInt(st,lo,U,buf,sF,sU,in,out); + else if (exterior) WilsonKernels::HandDhopSiteDagExt(st,lo,U,buf,sF,sU,in,out); + else assert(0); sF++; } sU++; @@ -137,7 +145,10 @@ public: case OptGeneric: for (int site = 0; site < Ns; site++) { for (int s = 0; s < Ls; s++) { - if( exterior) WilsonKernels::GenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out,interior,exterior); + if(interior&&exterior) WilsonKernels::GenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out); + else if (interior) WilsonKernels::GenericDhopSiteDagInt(st,lo,U,buf,sF,sU,in,out); + else if (exterior) WilsonKernels::GenericDhopSiteDagExt(st,lo,U,buf,sF,sU,in,out); + else assert(0); sF++; } sU++; @@ -156,7 +167,10 @@ public: for (int site = 0; site < Ns; site++) { for (int s = 0; s < Ls; s++) { - if( exterior) WilsonKernels::GenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out,interior,exterior); + if(interior&&exterior) WilsonKernels::GenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out); + else if (interior) WilsonKernels::GenericDhopSiteDagInt(st,lo,U,buf,sF,sU,in,out); + else if (exterior) WilsonKernels::GenericDhopSiteDagExt(st,lo,U,buf,sF,sU,in,out); + else assert(0); sF++; } sU++; @@ -169,36 +183,60 @@ public: private: // Specialised variants void GenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionField &in, FermionField &out,int interior,int exterior); + int sF, int sU, const FermionField &in, FermionField &out); void GenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionField &in, FermionField &out,int interior,int exterior); + int sF, int sU, const FermionField &in, FermionField &out); + + void GenericDhopSiteInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); + + void GenericDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); + + void GenericDhopSiteExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); + + void GenericDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); void AsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); + int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); void AsmDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, int Ls, int Ns, const FermionField 
&in, FermionField &out); + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out); void AsmDhopSiteInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); + int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); void AsmDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out); + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out); void AsmDhopSiteExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); + int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); void AsmDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out); + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out); void HandDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionField &in, FermionField &out,int interior,int exterior); + int sF, int sU, const FermionField &in, FermionField &out); void HandDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionField &in, FermionField &out,int interior,int exterior); + int sF, int sU, const FermionField &in, FermionField &out); + void HandDhopSiteInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); + + void HandDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); + + void HandDhopSiteExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); + + void HandDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); + public: WilsonKernels(const ImplParams &p = ImplParams()); diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc index 365be69a..cd5d2430 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc +++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc @@ -112,5 +112,16 @@ INSTANTIATE_ASM(DomainWallVec5dImplD); INSTANTIATE_ASM(ZDomainWallVec5dImplF); INSTANTIATE_ASM(ZDomainWallVec5dImplD); +INSTANTIATE_ASM(WilsonImplFH); +INSTANTIATE_ASM(WilsonImplDF); +INSTANTIATE_ASM(ZWilsonImplFH); +INSTANTIATE_ASM(ZWilsonImplDF); +INSTANTIATE_ASM(GparityWilsonImplFH); +INSTANTIATE_ASM(GparityWilsonImplDF); +INSTANTIATE_ASM(DomainWallVec5dImplFH); +INSTANTIATE_ASM(DomainWallVec5dImplDF); +INSTANTIATE_ASM(ZDomainWallVec5dImplFH); +INSTANTIATE_ASM(ZDomainWallVec5dImplDF); + }} diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmAvx512.h b/lib/qcd/action/fermion/WilsonKernelsAsmAvx512.h index 1839e9bc..948c16a2 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsmAvx512.h +++ b/lib/qcd/action/fermion/WilsonKernelsAsmAvx512.h @@ -71,6 +71,16 @@ WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,Doub int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void 
+WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +template<> void +WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR @@ -84,6 +94,16 @@ WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,D int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR @@ -97,6 +117,16 @@ template<> void WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include + +template<> void +WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include ///////////////////////////////////////////////////////////////// // XYZT vectorised, dag Kernel, single @@ -115,6 +145,16 @@ WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,D int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR @@ -128,6 +168,16 @@ WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & l int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR @@ -141,6 +191,16 @@ WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & l int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + 
+template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef MAYBEPERM #undef MULT_2SPIN #define MAYBEPERM(A,B) @@ -162,6 +222,15 @@ WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR @@ -174,6 +243,15 @@ WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrd int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR @@ -189,6 +267,16 @@ WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrd int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + ///////////////////////////////////////////////////////////////// // Ls vectorised, dag Kernel, single ///////////////////////////////////////////////////////////////// @@ -205,6 +293,15 @@ WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrd int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR @@ -217,6 +314,15 @@ WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,Lebesgue int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef 
INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR @@ -229,6 +335,15 @@ WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,Lebesgue int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef COMPLEX_SIGNS #undef MAYBEPERM #undef MULT_2SPIN @@ -269,6 +384,15 @@ WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,Doub int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR @@ -281,6 +405,15 @@ WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,D int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR @@ -293,6 +426,15 @@ WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,D int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + ///////////////////////////////////////////////////////////////// // XYZT vectorised, dag Kernel, single ///////////////////////////////////////////////////////////////// @@ -309,6 +451,15 @@ WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,D int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR @@ -321,6 +472,15 @@ WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & l int ss,int ssU,int Ls,int Ns,const FermionField &in, 
FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR @@ -333,6 +493,15 @@ WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & l int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef MAYBEPERM #undef MULT_2SPIN #define MAYBEPERM(A,B) @@ -354,6 +523,15 @@ WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR @@ -366,6 +544,15 @@ WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrd int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR @@ -380,6 +567,15 @@ WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrd int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + ///////////////////////////////////////////////////////////////// // Ls vectorised, dag Kernel, single ///////////////////////////////////////////////////////////////// @@ -396,6 +592,15 @@ WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrd int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, 
FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR @@ -408,6 +613,15 @@ WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,Lebesgue int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR @@ -420,6 +634,15 @@ WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,Lebesgue int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef COMPLEX_SIGNS #undef MAYBEPERM #undef MULT_2SPIN diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h index 34aba472..db8651ab 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h +++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h @@ -39,24 +39,26 @@ //////////////////////////////////////////////////////////////////////////////// #ifdef INTERIOR_AND_EXTERIOR -#define ZERO_NMU(A) -#define INTERIOR_BLOCK_XP(a,b,PERMUTE_DIR,PROJMEM,RECON) INTERIOR_BLOCK(a,b,PERMUTE_DIR,PROJMEM,RECON) -#define EXTERIOR_BLOCK_XP(a,b,RECON) EXTERIOR_BLOCK(a,b,RECON) +#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + basep = st.GetPFInfo(nent,plocal); nent++; \ + if ( local ) { \ + LOAD64(%r10,isigns); \ + PROJ(base); \ + MAYBEPERM(PERMUTE_DIR,perm); \ + } else { \ + LOAD_CHI(base); \ + } \ + base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ + PREFETCH_CHIMU(base); \ + MULT_2SPIN_DIR_PF(Dir,basep); \ + LOAD64(%r10,isigns); \ + RECON; \ -#define INTERIOR_BLOCK(a,b,PERMUTE_DIR,PROJMEM,RECON) \ - LOAD64(%r10,isigns); \ - PROJMEM(base); \ - MAYBEPERM(PERMUTE_DIR,perm); - -#define EXTERIOR_BLOCK(a,b,RECON) \ - LOAD_CHI(base); - -#define COMMON_BLOCK(a,b,RECON) \ - base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; \ - PREFETCH_CHIMU(base); \ - MULT_2SPIN_DIR_PF(a,basep); \ - LOAD64(%r10,isigns); \ - RECON; +#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + PF_GAUGE(Xp); \ + PREFETCH1_CHIMU(base); \ + ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) #define RESULT(base,basep) SAVE_RESULT(base,basep); @@ -67,62 +69,62 @@ //////////////////////////////////////////////////////////////////////////////// #ifdef INTERIOR -#define COMMON_BLOCK(a,b,RECON) -#define ZERO_NMU(A) +#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + basep = st.GetPFInfo(nent,plocal); nent++; \ + if ( local ) { \ + LOAD64(%r10,isigns); \ + PROJ(base); \ + MAYBEPERM(PERMUTE_DIR,perm); \ 
+ }else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \ + if ( local || st.same_node[Dir] ) { \ + MULT_2SPIN_DIR_PF(Dir,basep); \ + LOAD64(%r10,isigns); \ + RECON; \ + } \ + base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ + PREFETCH_CHIMU(base); \ -// No accumulate for DIR0 -#define EXTERIOR_BLOCK_XP(a,b,RECON) \ - ZERO_PSI; \ - base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; - -#define EXTERIOR_BLOCK(a,b,RECON) \ - base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; - -#define INTERIOR_BLOCK_XP(a,b,PERMUTE_DIR,PROJMEM,RECON) INTERIOR_BLOCK(a,b,PERMUTE_DIR,PROJMEM,RECON) - -#define INTERIOR_BLOCK(a,b,PERMUTE_DIR,PROJMEM,RECON) \ - LOAD64(%r10,isigns); \ - PROJMEM(base); \ - MAYBEPERM(PERMUTE_DIR,perm); \ - base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; \ - PREFETCH_CHIMU(base); \ - MULT_2SPIN_DIR_PF(a,basep); \ - LOAD64(%r10,isigns); \ - RECON; +#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + PF_GAUGE(Xp); \ + PREFETCH1_CHIMU(base); \ + { ZERO_PSI; } \ + ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) #define RESULT(base,basep) SAVE_RESULT(base,basep); #endif - //////////////////////////////////////////////////////////////////////////////// // Post comms kernel //////////////////////////////////////////////////////////////////////////////// #ifdef EXTERIOR -#define ZERO_NMU(A) nmu=0; -#define INTERIOR_BLOCK_XP(a,b,PERMUTE_DIR,PROJMEM,RECON) \ - ZERO_PSI; base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; +#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + if((!local)&&(!st.same_node[Dir]) ) { \ + LOAD_CHI(base); \ + MULT_2SPIN_DIR_PF(Dir,base); \ + LOAD64(%r10,isigns); \ + RECON; \ + nmu++; \ + } -#define EXTERIOR_BLOCK_XP(a,b,RECON) EXTERIOR_BLOCK(a,b,RECON) +#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + nmu=0; \ + { ZERO_PSI;} \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + if((!local)&&(!st.same_node[Dir]) ) { \ + LOAD_CHI(base); \ + MULT_2SPIN_DIR_PF(Dir,base); \ + LOAD64(%r10,isigns); \ + RECON; \ + nmu++; \ + } -#define INTERIOR_BLOCK(a,b,PERMUTE_DIR,PROJMEM,RECON) \ - base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; - -#define EXTERIOR_BLOCK(a,b,RECON) \ - nmu++; \ - LOAD_CHI(base); \ - MULT_2SPIN_DIR_PF(a,base); \ - base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; \ - LOAD64(%r10,isigns); \ - RECON; - -#define COMMON_BLOCK(a,b,RECON) - -#define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);} +#define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);} #endif - { int nmu; int local,perm, ptype; @@ -134,11 +136,15 @@ MASK_REGS; int nmax=U._grid->oSites(); for(int site=0;site=nmax) ssn=0; int sUn=lo.Reorder(ssn); -#ifndef EXTERIOR LOCK_GAUGE(0); +#else + int sU =ssU; + int ssn=ssU+1; if(ssn>=nmax) ssn=0; + int sUn=ssn; #endif for(int s=0;s #define REGISTER #define LOAD_CHIMU \ - const SiteSpinor & ref (in._odata[offset]); \ + {const SiteSpinor & ref (in._odata[offset]); \ Chimu_00=ref()(0)(0);\ Chimu_01=ref()(0)(1);\ Chimu_02=ref()(0)(2);\ @@ -43,20 +43,20 @@ Author: paboyle Chimu_22=ref()(2)(2);\ Chimu_30=ref()(3)(0);\ Chimu_31=ref()(3)(1);\ - Chimu_32=ref()(3)(2); + Chimu_32=ref()(3)(2);} #define LOAD_CHI\ - const SiteHalfSpinor &ref(buf[offset]); \ + {const SiteHalfSpinor &ref(buf[offset]); \ Chi_00 = ref()(0)(0);\ Chi_01 = ref()(0)(1);\ Chi_02 = ref()(0)(2);\ Chi_10 = ref()(1)(0);\ Chi_11 = ref()(1)(1);\ - Chi_12 = ref()(1)(2); + Chi_12 = ref()(1)(2);} 
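LOAD_CHIMU, LOAD_CHI and (just below) MULT_2SPIN are re-wrapped in braces here, presumably because each of them declares a local reference named ref: once the new HAND_STENCIL_LEG macros expand them back to back inside a single scope, the extra braces are what keep the repeated declarations legal. A minimal illustration of the issue, with toy names rather than Grid code:

#define LOAD_UNSCOPED(x)   const float &ref(x); sum += ref;
#define LOAD_SCOPED(x)   { const float &ref(x); sum += ref; }

float a = 1.f, b = 2.f, sum = 0.f;
// LOAD_UNSCOPED(a); LOAD_UNSCOPED(b);  // would not compile: 'ref' redefined in one scope
LOAD_SCOPED(a);  LOAD_SCOPED(b);        // fine: each 'ref' lives in its own block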
// To splat or not to splat depends on the implementation #define MULT_2SPIN(A)\ - auto & ref(U._odata[sU](A)); \ + {auto & ref(U._odata[sU](A)); \ Impl::loadLinkElement(U_00,ref()(0,0)); \ Impl::loadLinkElement(U_10,ref()(1,0)); \ Impl::loadLinkElement(U_20,ref()(2,0)); \ @@ -83,7 +83,7 @@ Author: paboyle UChi_01+= U_10*Chi_02;\ UChi_11+= U_10*Chi_12;\ UChi_02+= U_20*Chi_02;\ - UChi_12+= U_20*Chi_12; + UChi_12+= U_20*Chi_12;} #define PERMUTE_DIR(dir) \ @@ -307,55 +307,132 @@ Author: paboyle result_31-= UChi_11; \ result_32-= UChi_12; -namespace Grid { -namespace QCD { +#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \ + SE=st.GetEntry(ptype,DIR,ss); \ + offset = SE->_offset; \ + local = SE->_is_local; \ + perm = SE->_permute; \ + if ( local ) { \ + LOAD_CHIMU; \ + PROJ; \ + if ( perm) { \ + PERMUTE_DIR(PERM); \ + } \ + } else { \ + LOAD_CHI; \ + } \ + MULT_2SPIN(DIR); \ + RECON; + +#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON) \ + SE=st.GetEntry(ptype,DIR,ss); \ + offset = SE->_offset; \ + local = SE->_is_local; \ + perm = SE->_permute; \ + if ( local ) { \ + LOAD_CHIMU; \ + PROJ; \ + if ( perm) { \ + PERMUTE_DIR(PERM); \ + } \ + } else if ( st.same_node[DIR] ) { \ + LOAD_CHI; \ + } \ + if (local || st.same_node[DIR] ) { \ + MULT_2SPIN(DIR); \ + RECON; \ + } + +#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON) \ + SE=st.GetEntry(ptype,DIR,ss); \ + offset = SE->_offset; \ + if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \ + LOAD_CHI; \ + MULT_2SPIN(DIR); \ + RECON; \ + nmu++; \ + } + +#define HAND_RESULT(ss) \ + { \ + SiteSpinor & ref (out._odata[ss]); \ + vstream(ref()(0)(0),result_00); \ + vstream(ref()(0)(1),result_01); \ + vstream(ref()(0)(2),result_02); \ + vstream(ref()(1)(0),result_10); \ + vstream(ref()(1)(1),result_11); \ + vstream(ref()(1)(2),result_12); \ + vstream(ref()(2)(0),result_20); \ + vstream(ref()(2)(1),result_21); \ + vstream(ref()(2)(2),result_22); \ + vstream(ref()(3)(0),result_30); \ + vstream(ref()(3)(1),result_31); \ + vstream(ref()(3)(2),result_32); \ + } + +#define HAND_RESULT_EXT(ss) \ + if (nmu){ \ + SiteSpinor & ref (out._odata[ss]); \ + ref()(0)(0)+=result_00; \ + ref()(0)(1)+=result_01; \ + ref()(0)(2)+=result_02; \ + ref()(1)(0)+=result_10; \ + ref()(1)(1)+=result_11; \ + ref()(1)(2)+=result_12; \ + ref()(2)(0)+=result_20; \ + ref()(2)(1)+=result_21; \ + ref()(2)(2)+=result_22; \ + ref()(3)(0)+=result_30; \ + ref()(3)(1)+=result_31; \ + ref()(3)(2)+=result_32; \ + } -template void -WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, - int ss,int sU,const FermionField &in, FermionField &out,int interior,int exterior) -{ - typedef typename Simd::scalar_type S; - typedef typename Simd::vector_type V; +#define HAND_DECLARATIONS(a) \ + Simd result_00; \ + Simd result_01; \ + Simd result_02; \ + Simd result_10; \ + Simd result_11; \ + Simd result_12; \ + Simd result_20; \ + Simd result_21; \ + Simd result_22; \ + Simd result_30; \ + Simd result_31; \ + Simd result_32; \ + Simd Chi_00; \ + Simd Chi_01; \ + Simd Chi_02; \ + Simd Chi_10; \ + Simd Chi_11; \ + Simd Chi_12; \ + Simd UChi_00; \ + Simd UChi_01; \ + Simd UChi_02; \ + Simd UChi_10; \ + Simd UChi_11; \ + Simd UChi_12; \ + Simd U_00; \ + Simd U_10; \ + Simd U_20; \ + Simd U_01; \ + Simd U_11; \ + Simd U_21; - REGISTER Simd result_00; // 12 regs on knc - REGISTER Simd result_01; - REGISTER Simd result_02; - - REGISTER Simd result_10; - REGISTER Simd result_11; - REGISTER Simd result_12; - - REGISTER Simd result_20; - REGISTER Simd result_21; - REGISTER 
Simd result_22; - - REGISTER Simd result_30; - REGISTER Simd result_31; - REGISTER Simd result_32; // 20 left - - REGISTER Simd Chi_00; // two spinor; 6 regs - REGISTER Simd Chi_01; - REGISTER Simd Chi_02; - - REGISTER Simd Chi_10; - REGISTER Simd Chi_11; - REGISTER Simd Chi_12; // 14 left - - REGISTER Simd UChi_00; // two spinor; 6 regs - REGISTER Simd UChi_01; - REGISTER Simd UChi_02; - - REGISTER Simd UChi_10; - REGISTER Simd UChi_11; - REGISTER Simd UChi_12; // 8 left - - REGISTER Simd U_00; // two rows of U matrix - REGISTER Simd U_10; - REGISTER Simd U_20; - REGISTER Simd U_01; - REGISTER Simd U_11; - REGISTER Simd U_21; // 2 reg left. +#define ZERO_RESULT \ + result_00=zero; \ + result_01=zero; \ + result_02=zero; \ + result_10=zero; \ + result_11=zero; \ + result_12=zero; \ + result_20=zero; \ + result_21=zero; \ + result_22=zero; \ + result_30=zero; \ + result_31=zero; \ + result_32=zero; #define Chimu_00 Chi_00 #define Chimu_01 Chi_01 @@ -370,475 +447,225 @@ WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGauge #define Chimu_31 UChi_11 #define Chimu_32 UChi_12 +namespace Grid { +namespace QCD { + +template void +WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionField &in, FermionField &out) +{ +// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + HAND_DECLARATIONS(ignore); int offset,local,perm, ptype; StencilEntry *SE; - // Xp - SE=st.GetEntry(ptype,Xp,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - XM_PROJ; - if ( perm) { - PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Xp); - } - XM_RECON; - - // Yp - SE=st.GetEntry(ptype,Yp,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - YM_PROJ; - if ( perm) { - PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Yp); - } - YM_RECON_ACCUM; - - - // Zp - SE=st.GetEntry(ptype,Zp,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - ZM_PROJ; - if ( perm) { - PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Zp); - } - ZM_RECON_ACCUM; - - // Tp - SE=st.GetEntry(ptype,Tp,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - TM_PROJ; - if ( perm) { - PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Tp); - } - TM_RECON_ACCUM; - - // Xm - SE=st.GetEntry(ptype,Xm,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - XP_PROJ; - if ( perm) { - PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Xm); - } - XP_RECON_ACCUM; - - - // Ym - SE=st.GetEntry(ptype,Ym,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - YP_PROJ; - if ( perm) { - PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... 
- } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Ym); - } - YP_RECON_ACCUM; - - // Zm - SE=st.GetEntry(ptype,Zm,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - ZP_PROJ; - if ( perm) { - PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Zm); - } - ZP_RECON_ACCUM; - - // Tm - SE=st.GetEntry(ptype,Tm,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - TP_PROJ; - if ( perm) { - PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Tm); - } - TP_RECON_ACCUM; - - { - SiteSpinor & ref (out._odata[ss]); - vstream(ref()(0)(0),result_00); - vstream(ref()(0)(1),result_01); - vstream(ref()(0)(2),result_02); - vstream(ref()(1)(0),result_10); - vstream(ref()(1)(1),result_11); - vstream(ref()(1)(2),result_12); - vstream(ref()(2)(0),result_20); - vstream(ref()(2)(1),result_21); - vstream(ref()(2)(2),result_22); - vstream(ref()(3)(0),result_30); - vstream(ref()(3)(1),result_31); - vstream(ref()(3)(2),result_32); - } + HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON); + HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM); + HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); + HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM); + HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM); + HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM); + HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM); + HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM); + HAND_RESULT(ss); } template void WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, - int ss,int sU,const FermionField &in, FermionField &out,int interior,int exterior) + int ss,int sU,const FermionField &in, FermionField &out) { - // std::cout << "Hand op Dhop "<_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - XP_PROJ; - if ( perm) { - PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } + HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON); + HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM); + HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); + HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM); + HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM); + HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM); + HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM); + HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM); + HAND_RESULT(ss); +} - { - MULT_2SPIN(Xp); - } - XP_RECON; +template void +WilsonKernels::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionField &in, FermionField &out) +{ +// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; - // Yp - SE=st.GetEntry(ptype,Yp,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - YP_PROJ; - if ( perm) { - PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... 
- } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Yp); - } - YP_RECON_ACCUM; + HAND_DECLARATIONS(ignore); + int offset,local,perm, ptype; + StencilEntry *SE; + ZERO_RESULT; + HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM); + HAND_RESULT(ss); +} - // Zp - SE=st.GetEntry(ptype,Zp,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - ZP_PROJ; - if ( perm) { - PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Zp); - } - ZP_RECON_ACCUM; +template +void WilsonKernels::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionField &in, FermionField &out) +{ + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; - // Tp - SE=st.GetEntry(ptype,Tp,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - TP_PROJ; - if ( perm) { - PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Tp); - } - TP_RECON_ACCUM; - - // Xm - SE=st.GetEntry(ptype,Xm,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - XM_PROJ; - if ( perm) { - PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Xm); - } - XM_RECON_ACCUM; - - // Ym - SE=st.GetEntry(ptype,Ym,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - YM_PROJ; - if ( perm) { - PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Ym); - } - YM_RECON_ACCUM; + HAND_DECLARATIONS(ignore); - // Zm - SE=st.GetEntry(ptype,Zm,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; + StencilEntry *SE; + int offset,local,perm, ptype; + ZERO_RESULT; + HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM); + HAND_RESULT(ss); +} - if ( local ) { - LOAD_CHIMU; - ZM_PROJ; - if ( perm) { - PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Zm); - } - ZM_RECON_ACCUM; +template void +WilsonKernels::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionField &in, FermionField &out) +{ +// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... 
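// Note on this exterior variant: each HAND_STENCIL_LEG_EXT (see the macro
// earlier in this file) only does work when the neighbour is neither local nor
// on the same shared-memory node, i.e. when (!SE->_is_local) && (!st.same_node[DIR]);
// nmu counts how many such legs contributed, and HAND_RESULT_EXT adds the
// partial sum into out._odata[ss] only when nmu is non-zero, leaving purely
// interior sites untouched.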
+ typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; - // Tm - SE=st.GetEntry(ptype,Tm,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; + HAND_DECLARATIONS(ignore); - if ( local ) { - LOAD_CHIMU; - TM_PROJ; - if ( perm) { - PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Tm); - } - TM_RECON_ACCUM; + int offset,local,perm, ptype; + StencilEntry *SE; + int nmu=0; + ZERO_RESULT; + HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM); + HAND_RESULT_EXT(ss); +} - { - SiteSpinor & ref (out._odata[ss]); - vstream(ref()(0)(0),result_00); - vstream(ref()(0)(1),result_01); - vstream(ref()(0)(2),result_02); - vstream(ref()(1)(0),result_10); - vstream(ref()(1)(1),result_11); - vstream(ref()(1)(2),result_12); - vstream(ref()(2)(0),result_20); - vstream(ref()(2)(1),result_21); - vstream(ref()(2)(2),result_22); - vstream(ref()(3)(0),result_30); - vstream(ref()(3)(1),result_31); - vstream(ref()(3)(2),result_32); - } +template +void WilsonKernels::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionField &in, FermionField &out) +{ + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + HAND_DECLARATIONS(ignore); + + StencilEntry *SE; + int offset,local,perm, ptype; + int nmu=0; + ZERO_RESULT; + HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM); + HAND_RESULT_EXT(ss); } //////////////////////////////////////////////// // Specialise Gparity to simple implementation //////////////////////////////////////////////// -template<> void -WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - SiteHalfSpinor *buf, - int sF,int sU,const FermionField &in, FermionField &out,int internal,int external) -{ - assert(0); -} - -template<> void -WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - SiteHalfSpinor *buf, - int sF,int sU,const FermionField &in, FermionField &out,int internal,int external) -{ - assert(0); -} - -template<> void -WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, - int sF,int sU,const FermionField &in, FermionField &out,int internal,int external) -{ - assert(0); -} - -template<> void -WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, - int sF,int sU,const FermionField &in, FermionField &out,int internal,int external) -{ - assert(0); -} - +#define HAND_SPECIALISE_EMPTY(IMPL) \ + template<> void \ + WilsonKernels::HandDhopSite(StencilImpl &st, \ + LebesgueOrder &lo, \ + DoubledGaugeField &U, \ + SiteHalfSpinor *buf, \ + int sF,int sU, \ + const FermionField &in, \ + 
FermionField &out){ assert(0); } \ + template<> void \ + WilsonKernels::HandDhopSiteDag(StencilImpl &st, \ + LebesgueOrder &lo, \ + DoubledGaugeField &U, \ + SiteHalfSpinor *buf, \ + int sF,int sU, \ + const FermionField &in, \ + FermionField &out){ assert(0); } \ + template<> void \ + WilsonKernels::HandDhopSiteInt(StencilImpl &st, \ + LebesgueOrder &lo, \ + DoubledGaugeField &U, \ + SiteHalfSpinor *buf, \ + int sF,int sU, \ + const FermionField &in, \ + FermionField &out){ assert(0); } \ + template<> void \ + WilsonKernels::HandDhopSiteExt(StencilImpl &st, \ + LebesgueOrder &lo, \ + DoubledGaugeField &U, \ + SiteHalfSpinor *buf, \ + int sF,int sU, \ + const FermionField &in, \ + FermionField &out){ assert(0); } \ + template<> void \ + WilsonKernels::HandDhopSiteDagInt(StencilImpl &st, \ + LebesgueOrder &lo, \ + DoubledGaugeField &U, \ + SiteHalfSpinor *buf, \ + int sF,int sU, \ + const FermionField &in, \ + FermionField &out){ assert(0); } \ + template<> void \ + WilsonKernels::HandDhopSiteDagExt(StencilImpl &st, \ + LebesgueOrder &lo, \ + DoubledGaugeField &U, \ + SiteHalfSpinor *buf, \ + int sF,int sU, \ + const FermionField &in, \ + FermionField &out){ assert(0); } \ + HAND_SPECIALISE_EMPTY(GparityWilsonImplF); + HAND_SPECIALISE_EMPTY(GparityWilsonImplD); + HAND_SPECIALISE_EMPTY(GparityWilsonImplFH); + HAND_SPECIALISE_EMPTY(GparityWilsonImplDF); ////////////// Wilson ; uses this implementation ///////////////////// -// Need Nc=3 though // #define INSTANTIATE_THEM(A) \ template void WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\ - int ss,int sU,const FermionField &in, FermionField &out,int interior,int exterior); \ -template void WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\ - int ss,int sU,const FermionField &in, FermionField &out,int interior,int exterior); + int ss,int sU,const FermionField &in, FermionField &out); \ +template void WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \ + int ss,int sU,const FermionField &in, FermionField &out);\ +template void WilsonKernels::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\ + int ss,int sU,const FermionField &in, FermionField &out); \ +template void WilsonKernels::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \ + int ss,int sU,const FermionField &in, FermionField &out); \ +template void WilsonKernels::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\ + int ss,int sU,const FermionField &in, FermionField &out); \ +template void WilsonKernels::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \ + int ss,int sU,const FermionField &in, FermionField &out); INSTANTIATE_THEM(WilsonImplF); INSTANTIATE_THEM(WilsonImplD); @@ -850,5 +677,15 @@ INSTANTIATE_THEM(DomainWallVec5dImplF); INSTANTIATE_THEM(DomainWallVec5dImplD); INSTANTIATE_THEM(ZDomainWallVec5dImplF); INSTANTIATE_THEM(ZDomainWallVec5dImplD); +INSTANTIATE_THEM(WilsonImplFH); +INSTANTIATE_THEM(WilsonImplDF); +INSTANTIATE_THEM(ZWilsonImplFH); +INSTANTIATE_THEM(ZWilsonImplDF); +INSTANTIATE_THEM(GparityWilsonImplFH); +INSTANTIATE_THEM(GparityWilsonImplDF); +INSTANTIATE_THEM(DomainWallVec5dImplFH); +INSTANTIATE_THEM(DomainWallVec5dImplDF); +INSTANTIATE_THEM(ZDomainWallVec5dImplFH); +INSTANTIATE_THEM(ZDomainWallVec5dImplDF); }} diff --git 
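The SIMD changes that follow support the new half-precision conversions: Grid_avx.h picks up the __m256i and cast fixes in StoH/HtoS, and Grid_sse4.h gains a software FP16 path (SFW_FP16), adapted from Eigen, for hardware without F16C. A minimal round trip through those helpers might look like this sketch; the helper names are taken from the patch below, but the call site is purely illustrative.

// Assumes the SFW_FP16 helpers from the Grid_sse4.h hunk are in scope.
float     x = 0.3333333f;
Grid_half h = sfw_float_to_half(x);   // narrow to 16-bit storage
float     y = sfw_half_to_float(h);   // widen back to single precision
// y is x rounded to half precision, so x and y agree to roughly 3 decimal digits.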
a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h index 5c16dd71..02348686 100644 --- a/lib/simd/Grid_avx.h +++ b/lib/simd/Grid_avx.h @@ -473,12 +473,12 @@ namespace Optimization { #define USE_FP16 struct PrecisionChange { static inline __m256i StoH (__m256 a,__m256 b) { - __m256 h; + __m256i h; #ifdef USE_FP16 __m128i ha = _mm256_cvtps_ph(a,0); __m128i hb = _mm256_cvtps_ph(b,0); - h = _mm256_castps128_ps256(ha); - h = _mm256_insertf128_ps(h,hb,1); + h =(__m256i) _mm256_castps128_ps256((__m128)ha); + h =(__m256i) _mm256_insertf128_ps((__m256)h,(__m128)hb,1); #else assert(0); #endif @@ -486,8 +486,8 @@ namespace Optimization { } static inline void HtoS (__m256i h,__m256 &sa,__m256 &sb) { #ifdef USE_FP16 - sa = _mm256_cvtph_ps(_mm256_extractf128_ps(h,0)); - sb = _mm256_cvtph_ps(_mm256_extractf128_ps(h,1)); + sa = _mm256_cvtph_ps((__m128i)_mm256_extractf128_ps((__m256)h,0)); + sb = _mm256_cvtph_ps((__m128i)_mm256_extractf128_ps((__m256)h,1)); #else assert(0); #endif diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h index 757c84c9..cbca9118 100644 --- a/lib/simd/Grid_qpx.h +++ b/lib/simd/Grid_qpx.h @@ -33,6 +33,14 @@ #include "Grid_generic_types.h" // Definitions for simulated integer SIMD. namespace Grid { + +#ifdef QPX +#include +#include +#include +#include +#endif + namespace Optimization { typedef struct { diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h index 8a4537c2..2fb2df76 100644 --- a/lib/simd/Grid_sse4.h +++ b/lib/simd/Grid_sse4.h @@ -38,7 +38,6 @@ Author: neo #include - namespace Grid { namespace Optimization { @@ -333,26 +332,110 @@ namespace Optimization { #define _my_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16) #define _my_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16) +#ifdef SFW_FP16 + + struct Grid_half { + Grid_half(){} + Grid_half(uint16_t raw) : x(raw) {} + uint16_t x; + }; + union FP32 { + unsigned int u; + float f; + }; + + // PAB - Lifted and adapted from Eigen, which is GPL V2 + inline float sfw_half_to_float(Grid_half h) { + const FP32 magic = { 113 << 23 }; + const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift + FP32 o; + o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits + unsigned int exp = shifted_exp & o.u; // just the exponent + o.u += (127 - 15) << 23; // exponent adjust + // handle exponent special cases + if (exp == shifted_exp) { // Inf/NaN? + o.u += (128 - 16) << 23; // extra exp adjust + } else if (exp == 0) { // Zero/Denormal? + o.u += 1 << 23; // extra exp adjust + o.f -= magic.f; // renormalize + } + o.u |= (h.x & 0x8000) << 16; // sign bit + return o.f; + } + inline Grid_half sfw_float_to_half(float ff) { + FP32 f; f.f = ff; + const FP32 f32infty = { 255 << 23 }; + const FP32 f16max = { (127 + 16) << 23 }; + const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 }; + unsigned int sign_mask = 0x80000000u; + Grid_half o; + + o.x = static_cast(0x0u); + unsigned int sign = f.u & sign_mask; + f.u ^= sign; + // NOTE all the integer compares in this function can be safely + // compiled into signed compares since all operands are below + // 0x80000000. Important if you want fast straight SSE2 code + // (since there's no unsigned PCMPGTD). + if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set) + o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf + } else { // (De)normalized number or zero + if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero + // use a magic value to align our 10 mantissa bits at the bottom of + // the float. 
as long as FP addition is round-to-nearest-even this + // just works. + f.f += denorm_magic.f; + // and one integer subtract of the bias later, we have our final float! + o.x = static_cast(f.u - denorm_magic.u); + } else { + unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd + + // update exponent, rounding bias part 1 + f.u += ((unsigned int)(15 - 127) << 23) + 0xfff; + // rounding bias part 2 + f.u += mant_odd; + // take the bits! + o.x = static_cast(f.u >> 13); + } + } + o.x |= static_cast(sign >> 16); + return o; + } + static inline __m128i Grid_mm_cvtps_ph(__m128 f,int discard) { + __m128i ret=(__m128i)_mm_setzero_ps(); + float *fp = (float *)&f; + Grid_half *hp = (Grid_half *)&ret; + hp[0] = sfw_float_to_half(fp[0]); + hp[1] = sfw_float_to_half(fp[1]); + hp[2] = sfw_float_to_half(fp[2]); + hp[3] = sfw_float_to_half(fp[3]); + return ret; + } + static inline __m128 Grid_mm_cvtph_ps(__m128i h,int discard) { + __m128 ret=_mm_setzero_ps(); + float *fp = (float *)&ret; + Grid_half *hp = (Grid_half *)&h; + fp[0] = sfw_half_to_float(hp[0]); + fp[1] = sfw_half_to_float(hp[1]); + fp[2] = sfw_half_to_float(hp[2]); + fp[3] = sfw_half_to_float(hp[3]); + return ret; + } +#else +#define Grid_mm_cvtps_ph _mm_cvtps_ph +#define Grid_mm_cvtph_ps _mm_cvtph_ps +#endif struct PrecisionChange { static inline __m128i StoH (__m128 a,__m128 b) { -#ifdef USE_FP16 - __m128i ha = _mm_cvtps_ph(a,0); - __m128i hb = _mm_cvtps_ph(b,0); + __m128i ha = Grid_mm_cvtps_ph(a,0); + __m128i hb = Grid_mm_cvtps_ph(b,0); __m128i h =(__m128i) _mm_shuffle_ps((__m128)ha,(__m128)hb,_MM_SELECT_FOUR_FOUR(1,0,1,0)); -#else - __m128i h = (__m128i)a; - assert(0); -#endif return h; } static inline void HtoS (__m128i h,__m128 &sa,__m128 &sb) { -#ifdef USE_FP16 - sa = _mm_cvtph_ps(h); + sa = Grid_mm_cvtph_ps(h,0); h = (__m128i)_my_alignr_epi32((__m128i)h,(__m128i)h,2); - sb = _mm_cvtph_ps(h); -#else - assert(0); -#endif + sb = Grid_mm_cvtph_ps(h,0); } static inline __m128 DtoS (__m128d a,__m128d b) { __m128 sa = _mm_cvtpd_ps(a); diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index 7087d724..de4a13b5 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -53,12 +53,14 @@ directory #if defined IMCI #include "Grid_imci.h" #endif -#if defined QPX -#include "Grid_qpx.h" -#endif #ifdef NEONv8 #include "Grid_neon.h" #endif +#if defined QPX +#include "Grid_qpx.h" +#endif + +#include "l1p.h" namespace Grid { @@ -74,12 +76,14 @@ struct RealPart > { typedef T type; }; +#include + ////////////////////////////////////// // demote a vector to real type ////////////////////////////////////// // type alias used to simplify the syntax of std::enable_if template using Invoke = typename T::type; -template using EnableIf = Invoke >; +template using EnableIf = Invoke >; template using NotEnableIf = Invoke >; //////////////////////////////////////////////////////// @@ -88,13 +92,15 @@ template struct is_complex : public std::false_type {}; template <> struct is_complex > : public std::true_type {}; template <> struct is_complex > : public std::true_type {}; -template using IfReal = Invoke::value, int> >; -template using IfComplex = Invoke::value, int> >; -template using IfInteger = Invoke::value, int> >; +template using IfReal = Invoke::value, int> >; +template using IfComplex = Invoke::value, int> >; +template using IfInteger = Invoke::value, int> >; +template using IfSame = Invoke::value, int> >; -template using IfNotReal = Invoke::value, int> >; -template using IfNotComplex = 
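The SFW_FP16 path above substitutes a software half-to-single conversion (adapted from Eigen) when the F16C intrinsics are unavailable. A self-contained round-trip of the same bit manipulation, so the round-to-nearest-even and subnormal handling can be checked in isolation (standalone C++, not tied to the Grid headers; type punning via a union mirrors the source):

    #include <cstdint>
    #include <cstdio>

    union FP32 { uint32_t u; float f; };

    // float -> IEEE-754 binary16 bits, round-to-nearest-even.
    static uint16_t float_to_half(float ff) {
      FP32 f; f.f = ff;
      const FP32 f32infty     = { 255u << 23 };
      const FP32 f16max       = { (127u + 16u) << 23 };
      const FP32 denorm_magic = { ((127u - 15u) + (23u - 10u) + 1u) << 23 };
      const uint32_t sign_mask = 0x80000000u;
      uint16_t o = 0;

      uint32_t sign = f.u & sign_mask;
      f.u ^= sign;
      if (f.u >= f16max.u) {                        // Inf or NaN
        o = (f.u > f32infty.u) ? 0x7e00 : 0x7c00;   // NaN -> qNaN, Inf -> Inf
      } else if (f.u < (113u << 23)) {              // subnormal or zero in fp16
        f.f += denorm_magic.f;                      // align the 10 mantissa bits
        o = (uint16_t)(f.u - denorm_magic.u);
      } else {                                      // normal number
        uint32_t mant_odd = (f.u >> 13) & 1;        // round-to-nearest-even bias
        f.u += ((uint32_t)(15 - 127) << 23) + 0xfff;
        f.u += mant_odd;
        o = (uint16_t)(f.u >> 13);
      }
      o |= (uint16_t)(sign >> 16);
      return o;
    }

    // IEEE-754 binary16 bits -> float.
    static float half_to_float(uint16_t h) {
      const FP32 magic = { 113u << 23 };
      const uint32_t shifted_exp = 0x7c00u << 13;   // exponent mask after shift
      FP32 o;
      o.u = (uint32_t)(h & 0x7fff) << 13;           // exponent/mantissa bits
      uint32_t exp = shifted_exp & o.u;
      o.u += (127u - 15u) << 23;                    // exponent adjust
      if (exp == shifted_exp)      o.u += (128u - 16u) << 23;  // Inf/NaN
      else if (exp == 0) { o.u += 1u << 23; o.f -= magic.f; }  // zero/denormal
      o.u |= (uint32_t)(h & 0x8000) << 16;          // sign bit
      return o.f;
    }

    int main() {
      const float samples[] = { 0.0f, 1.0f, -2.5f, 1.0e-4f, 65504.0f, 3.14159f };
      for (float s : samples) {
        uint16_t h = float_to_half(s);
        std::printf("% .6g -> 0x%04x -> % .6g\n", s, h, half_to_float(h));
      }
      return 0;
    }
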
Invoke::value, int> >; -template using IfNotInteger = Invoke::value, int> >; +template using IfNotReal = Invoke::value, int> >; +template using IfNotComplex = Invoke::value, int> >; +template using IfNotInteger = Invoke::value, int> >; +template using IfNotSame = Invoke::value, int> >; //////////////////////////////////////////////////////// // Define the operation templates functors diff --git a/lib/simd/l1p.h b/lib/simd/l1p.h new file mode 100644 index 00000000..8e43fdbd --- /dev/null +++ b/lib/simd/l1p.h @@ -0,0 +1,37 @@ +#pragma once +namespace Grid { +// L1p optimisation +inline void bgq_l1p_optimisation(int mode) +{ +#ifdef QPX +#undef L1P_CFG_PF_USR +#define L1P_CFG_PF_USR (0x3fde8000108ll) /* (64 bit reg, 23 bits wide, user/unpriv) */ + + uint64_t cfg_pf_usr; + if ( mode ) { + cfg_pf_usr = + L1P_CFG_PF_USR_ifetch_depth(0) + | L1P_CFG_PF_USR_ifetch_max_footprint(1) + | L1P_CFG_PF_USR_pf_stream_est_on_dcbt + | L1P_CFG_PF_USR_pf_stream_establish_enable + | L1P_CFG_PF_USR_pf_stream_optimistic + | L1P_CFG_PF_USR_pf_adaptive_throttle(0xF) ; + // if ( sizeof(Float) == sizeof(double) ) { + cfg_pf_usr |= L1P_CFG_PF_USR_dfetch_depth(2)| L1P_CFG_PF_USR_dfetch_max_footprint(3) ; + // } else { + // cfg_pf_usr |= L1P_CFG_PF_USR_dfetch_depth(1)| L1P_CFG_PF_USR_dfetch_max_footprint(2) ; + // } + } else { + cfg_pf_usr = L1P_CFG_PF_USR_dfetch_depth(1) + | L1P_CFG_PF_USR_dfetch_max_footprint(2) + | L1P_CFG_PF_USR_ifetch_depth(0) + | L1P_CFG_PF_USR_ifetch_max_footprint(1) + | L1P_CFG_PF_USR_pf_stream_est_on_dcbt + | L1P_CFG_PF_USR_pf_stream_establish_enable + | L1P_CFG_PF_USR_pf_stream_optimistic + | L1P_CFG_PF_USR_pf_stream_prefetch_enable; + } + *((uint64_t *)L1P_CFG_PF_USR) = cfg_pf_usr; +#endif +} +} diff --git a/lib/stencil/SimpleCompressor.h b/lib/stencil/SimpleCompressor.h new file mode 100644 index 00000000..58c3bcd2 --- /dev/null +++ b/lib/stencil/SimpleCompressor.h @@ -0,0 +1,29 @@ +#ifndef _STENCIL_SIMPLE_COMPRESSOR_H_ +#define _STENCIL_SIMPLE_COMPRESSOR_H_ + +namespace Grid { + +template +class SimpleCompressor { +public: + void Point(int) {}; + inline int CommDatumSize(void) { return sizeof(vobj); } + inline bool DecompressionStep(void) { return false; } + inline void Compress(vobj *buf,int o,const vobj &in) { buf[o]=in; } + inline void Exchange(vobj *mp,vobj *vp0,vobj *vp1,Integer type,Integer o){ + exchange(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type); + } + inline void Decompress(vobj *out,vobj *in, int o){ assert(0); } + inline void CompressExchange(vobj *out0,vobj *out1,const vobj *in, + int j,int k, int m,int type){ + exchange(out0[j],out1[j],in[k],in[m],type); + } + // For cshift. Cshift should drop compressor coupling altogether + // because I had to decouple the code from the Stencil anyway + inline vobj operator() (const vobj &arg) { + return arg; + } +}; + +} +#endif diff --git a/lib/stencil/Stencil.h b/lib/stencil/Stencil.h index 479cd979..bcfbaa98 100644 --- a/lib/stencil/Stencil.h +++ b/lib/stencil/Stencil.h @@ -28,42 +28,24 @@ #ifndef GRID_STENCIL_H #define GRID_STENCIL_H +#include // subdir aggregate #include // subdir aggregate -#define NEW_XYZT_GATHER + ////////////////////////////////////////////////////////////////////////////////////////// // Must not lose sight that goal is to be able to construct really efficient // gather to a point stencil code. CSHIFT is not the best way, so need // additional stencil support. // - // Stencil based code will pre-exchange haloes and use a table lookup for neighbours. 
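The SimpleCompressor introduced above is the identity "compression": CommDatumSize() is just sizeof(vobj) and Compress() copies a site into the face buffer at a table-supplied offset, which is the contract the table-driven gather loops rely on. A toy sketch of that contract with a plain struct standing in for vobj (the names and offset table here are invented; this is not the Grid class itself):

    #include <cstdio>
    #include <utility>
    #include <vector>

    struct vobj { double re, im; };                   // stand-in site object

    struct SimpleCompressorSketch {                   // identity "compression"
      int  CommDatumSize() const { return (int)sizeof(vobj); }
      void Compress(vobj *buf, int o, const vobj &in) const { buf[o] = in; }
    };

    int main() {
      std::vector<vobj> lattice(16);
      for (int i = 0; i < 16; i++) lattice[i] = { double(i), -double(i) };

      // Face table: (offset in comms buffer, offset within the lattice plane).
      std::vector<std::pair<int,int>> table = { {0, 3}, {1, 7}, {2, 11}, {3, 15} };

      SimpleCompressorSketch compress;
      std::vector<vobj> face(table.size());
      for (size_t i = 0; i < table.size(); i++)       // table-driven gather
        compress.Compress(face.data(), table[i].first, lattice[table[i].second]);

      std::printf("packed %zu sites, %d bytes each\n",
                  face.size(), compress.CommDatumSize());
      return 0;
    }
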
+ // Stencil based code will exchange haloes and use a table lookup for neighbours. // This will be done with generality to allow easier efficient implementations. - // Overlap of comms and compute could be semi-automated by tabulating off-node connected, - // and - // - // Lattice could also allocate haloes which get used for stencil code. - // - // Grid could create a neighbour index table for a given stencil. - // - // Could also implement CovariantCshift, to fuse the loops and enhance performance. - // - // - // General stencil computation: - // + // Overlap of comms and compute is enabled by tabulating off-node connected, + // // Generic services // 0) Prebuild neighbour tables // 1) Compute sizes of all haloes/comms buffers; allocate them. - // // 2) Gather all faces, and communicate. // 3) Loop over result sites, giving nbr index/offnode info for each // - // Could take a - // SpinProjectFaces - // start comms - // complete comms - // Reconstruct Umu - // - // Approach. - // ////////////////////////////////////////////////////////////////////////////////////////// namespace Grid { @@ -82,7 +64,7 @@ void Gather_plane_simple_table (std::vector >& table,const La { int num=table.size(); parallel_for(int i=0;i >& table,const L int num=table.size()/2; int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane parallel_for(int j=0;j class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in. public: @@ -132,11 +110,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal typedef typename cobj::vector_type vector_type; typedef typename cobj::scalar_type scalar_type; typedef typename cobj::scalar_object scalar_object; - - ////////////////////////////////////////// - // Comms packet queue for asynch thread - ////////////////////////////////////////// + /////////////////////////////////////////// + // Helper structs + /////////////////////////////////////////// struct Packet { void * send_buf; void * recv_buf; @@ -144,22 +121,134 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal Integer from_rank; Integer bytes; }; - - std::vector Packets; - + struct Merge { + cobj * mpointer; + std::vector rpointers; + std::vector vpointers; + Integer buffer_size; + Integer type; + }; + struct Decompress { + cobj * kernel_p; + cobj * mpi_p; + Integer buffer_size; + }; + + //////////////////////////////////////// + // Basic Grid and stencil info + //////////////////////////////////////// int face_table_computed; std::vector > > face_table ; + - void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){ - Packet p; - p.send_buf = xmit; - p.recv_buf = rcv; - p.to_rank = to; - p.from_rank= from; - p.bytes = bytes; - Packets.push_back(p); + int _checkerboard; + int _npoints; // Move to template param? 
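The Merge records above queue up the work of interleaving the per-lane buffers produced by a SIMD-split gather back into full vector objects at mpointer. A toy picture of that step with two lanes of plain floats (sizes invented; the real code merges cobj SIMD words via exchange/merge):

    #include <cstdio>
    #include <vector>

    int main() {
      const int sites = 4, lanes = 2;

      // Per-lane receive buffers, as a simd-split gather would fill them.
      std::vector<std::vector<float>> lane(lanes, std::vector<float>(sites));
      for (int l = 0; l < lanes; l++)
        for (int s = 0; s < sites; s++) lane[l][s] = 10.0f * l + s;

      // "Merge": interleave lanes so each site owns a contiguous vector word.
      std::vector<float> merged(sites * lanes);
      for (int s = 0; s < sites; s++)
        for (int l = 0; l < lanes; l++) merged[s * lanes + l] = lane[l][s];

      for (int s = 0; s < sites; s++)
        std::printf("site %d : { %g, %g }\n", s, merged[2 * s], merged[2 * s + 1]);
      return 0;
    }
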
+ GridBase * _grid; + + // npoints of these + std::vector _directions; + std::vector _distances; + std::vector _comm_buf_size; + std::vector _permute_type; + + Vector _entries; + std::vector Packets; + std::vector Mergers; + std::vector MergersSHM; + std::vector Decompressions; + std::vector DecompressionsSHM; + + /////////////////////////////////////////////////////////// + // Unified Comms buffers for all directions + /////////////////////////////////////////////////////////// + // Vectors that live on the symmetric heap in case of SHMEM + // These are used; either SHM objects or refs to the above symmetric heap vectors + // depending on comms target + cobj* u_recv_buf_p; + cobj* u_send_buf_p; + std::vector u_simd_send_buf; + std::vector u_simd_recv_buf; + + int u_comm_offset; + int _unified_buffer_size; + cobj *CommBuf(void) { return u_recv_buf_p; } + + ///////////////////////////////////////// + // Timing info; ugly; possibly temporary + ///////////////////////////////////////// + double commtime; + double gathertime; + double gathermtime; + double halogtime; + double mergetime; + double decompresstime; + double comms_bytes; + double splicetime; + double nosplicetime; + double calls; + + //////////////////////////////////////// + // Stencil query + //////////////////////////////////////// + inline int SameNode(int point) { + + int dimension = _directions[point]; + int displacement = _distances[point]; + assert( (displacement==1) || (displacement==-1)); + + int pd = _grid->_processors[dimension]; + int fd = _grid->_fdimensions[dimension]; + int ld = _grid->_ldimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + + int recv_from_rank; + int xmit_to_rank; + + if ( ! 
comm_dim ) return 1; + + int nbr_proc; + if (displacement==1) nbr_proc = 1; + else nbr_proc = pd-1; + + _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); + + void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,u_recv_buf_p); + + if ( shm==NULL ) return 0; + + return 1; + } + inline int GetNodeLocal(int osite,int point) { + return _entries[point+_npoints*osite]._is_local; + } + inline StencilEntry * GetEntry(int &ptype,int point,int osite) { + ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; } + inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) { + uint64_t cbase = (uint64_t)&u_recv_buf_p[0]; + local = _entries[ent]._is_local; + perm = _entries[ent]._permute; + if (perm) ptype = _permute_type[point]; + if (local) { + return base + _entries[ent]._byte_offset; + } else { + return cbase + _entries[ent]._byte_offset; + } + } + + inline uint64_t GetPFInfo(int ent,uint64_t base) { + uint64_t cbase = (uint64_t)&u_recv_buf_p[0]; + int local = _entries[ent]._is_local; + if (local) return base + _entries[ent]._byte_offset; + else return cbase + _entries[ent]._byte_offset; + } + + ////////////////////////////////////////// + // Comms packet queue for asynch thread + ////////////////////////////////////////// void CommunicateBegin(std::vector > &reqs) { reqs.resize(Packets.size()); @@ -174,126 +263,173 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal } commtime+=usecond(); } + void CommunicateComplete(std::vector > &reqs) { commtime-=usecond(); for(int i=0;iStencilSendToRecvFromComplete(reqs[i]); } - _grid->StencilBarrier();// Synch shared memory on a single nodes commtime+=usecond(); - /* - int dump=1; - if(dump){ - for(int i=0;i_ndimension;d++){ - ss<<"."<<_grid->_processor_coor[d]; - } - ss<<"_mu_"< void HaloExchange(const Lattice &source,compressor &compress) + { + std::vector > reqs; + Prepare(); + HaloGather(source,compress); + CommunicateBegin(reqs); + CommunicateComplete(reqs); + CommsMergeSHM(compress); + CommsMerge(compress); + } + + template int HaloGatherDir(const Lattice &source,compressor &compress,int point,int & face_idx) + { + int dimension = _directions[point]; + int displacement = _distances[point]; + + int fd = _grid->_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + + // Map to always positive shift modulo global full dimension. 
+ int shift = (displacement+fd)%fd; + + assert (source.checkerboard== _checkerboard); + + // the permute type + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); + + int same_node = 1; + // Gather phase + int sshift [2]; + if ( comm_dim ) { + sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); + sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); + if ( sshift[0] == sshift[1] ) { + if (splice_dim) { + splicetime-=usecond(); + same_node = same_node && GatherSimd(source,dimension,shift,0x3,compress,face_idx); + splicetime+=usecond(); + } else { + nosplicetime-=usecond(); + same_node = same_node && Gather(source,dimension,shift,0x3,compress,face_idx); + nosplicetime+=usecond(); + } + } else { + if(splice_dim){ + splicetime-=usecond(); + // if checkerboard is unfavourable take two passes + // both with block stride loop iteration + same_node = same_node && GatherSimd(source,dimension,shift,0x1,compress,face_idx); + same_node = same_node && GatherSimd(source,dimension,shift,0x2,compress,face_idx); + splicetime+=usecond(); + } else { + nosplicetime-=usecond(); + same_node = same_node && Gather(source,dimension,shift,0x1,compress,face_idx); + same_node = same_node && Gather(source,dimension,shift,0x2,compress,face_idx); + nosplicetime+=usecond(); + } } } - dump =0; -*/ + return same_node; } - - /////////////////////////////////////////// - // Simd merge queue for asynch comms - /////////////////////////////////////////// - struct Merge { - cobj * mpointer; - std::vector rpointers; - std::vector vpointers; - Integer buffer_size; - Integer packet_id; - Integer exchange; - Integer type; - }; - std::vector Mergers; + template + void HaloGather(const Lattice &source,compressor &compress) + { + _grid->StencilBarrier();// Synch shared memory on a single nodes - void AddMerge(cobj *merge_p,std::vector &rpointers,Integer buffer_size,Integer packet_id) { - Merge m; - m.exchange = 0; - m.mpointer = merge_p; - m.rpointers= rpointers; - m.buffer_size = buffer_size; - m.packet_id = packet_id; - Mergers.push_back(m); + // conformable(source._grid,_grid); + assert(source._grid==_grid); + halogtime-=usecond(); + + u_comm_offset=0; + + // Gather all comms buffers + int face_idx=0; + for(int point = 0 ; point < _npoints; point++) { + compress.Point(point); + HaloGatherDir(source,compress,point,face_idx); + } + face_table_computed=1; + + assert(u_comm_offset==_unified_buffer_size); + halogtime+=usecond(); } - - void AddMergeNew(cobj *merge_p,std::vector &rpointers,Integer buffer_size,Integer packet_id,Integer type) { + + ///////////////////////// + // Implementation + ///////////////////////// + void Prepare(void) + { + Decompressions.resize(0); + DecompressionsSHM.resize(0); + Mergers.resize(0); + MergersSHM.resize(0); + Packets.resize(0); + calls++; + } + void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){ + Packet p; + p.send_buf = xmit; + p.recv_buf = rcv; + p.to_rank = to; + p.from_rank= from; + p.bytes = bytes; + Packets.push_back(p); + } + void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size,std::vector &dv) { + Decompress d; + d.kernel_p = k_p; + d.mpi_p = m_p; + d.buffer_size = buffer_size; + dv.push_back(d); + } + void AddMerge(cobj *merge_p,std::vector &rpointers,Integer buffer_size,Integer type,std::vector &mv) { Merge m; - m.exchange = 1; m.type = type; m.mpointer = merge_p; m.vpointers= rpointers; 
m.buffer_size = buffer_size; - m.packet_id = packet_id; - Mergers.push_back(m); + mv.push_back(m); + } + template void CommsMerge(decompressor decompress) { + CommsMerge(decompress,Mergers,Decompressions); + } + template void CommsMergeSHM(decompressor decompress) { + _grid->StencilBarrier();// Synch shared memory on a single nodes + CommsMerge(decompress,MergersSHM,DecompressionsSHM); } - void CommsMerge(void ) { + template + void CommsMerge(decompressor decompress,std::vector &mm,std::vector &dd) { - for(int i=0;i_ndimension;d++){ - // ss<<"."<<_grid->_processor_coor[d]; - // } - // ss<<"_m_"< _directions; - std::vector _distances; - std::vector _comm_buf_size; - std::vector _permute_type; - - // npoints x Osites() of these - // Flat vector, change layout for cache friendly. - Vector _entries; - void PrecomputeByteOffsets(void){ for(int i=0;i<_entries.size();i++){ if( _entries[i]._is_local ) { @@ -304,113 +440,6 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal } }; - inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; } - inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) { - uint64_t cbase = (uint64_t)&u_recv_buf_p[0]; - local = _entries[ent]._is_local; - perm = _entries[ent]._permute; - if (perm) ptype = _permute_type[point]; - if (local) { - return base + _entries[ent]._byte_offset; - } else { - return cbase + _entries[ent]._byte_offset; - } - } - inline uint64_t GetPFInfo(int ent,uint64_t base) { - uint64_t cbase = (uint64_t)&u_recv_buf_p[0]; - int local = _entries[ent]._is_local; - if (local) return base + _entries[ent]._byte_offset; - else return cbase + _entries[ent]._byte_offset; - } - - /////////////////////////////////////////////////////////// - // Unified Comms buffers for all directions - /////////////////////////////////////////////////////////// - // Vectors that live on the symmetric heap in case of SHMEM - // std::vector > u_simd_send_buf_hide; - // std::vector > u_simd_recv_buf_hide; - // commVector u_send_buf_hide; - // commVector u_recv_buf_hide; - - // These are used; either SHM objects or refs to the above symmetric heap vectors - // depending on comms target - cobj* u_recv_buf_p; - cobj* u_send_buf_p; - std::vector new_simd_send_buf; - std::vector new_simd_recv_buf; - std::vector u_simd_send_buf; - std::vector u_simd_recv_buf; - - int u_comm_offset; - int _unified_buffer_size; - - cobj *CommBuf(void) { return u_recv_buf_p; } - - ///////////////////////////////////////// - // Timing info; ugly; possibly temporary - ///////////////////////////////////////// -#define TIMING_HACK -#ifdef TIMING_HACK - double jointime; - double gathertime; - double commtime; - double halogtime; - double mergetime; - double spintime; - double comms_bytes; - double gathermtime; - double splicetime; - double nosplicetime; - double t_data; - double t_table; - double calls; - - void ZeroCounters(void) { - gathertime = 0.; - jointime = 0.; - commtime = 0.; - halogtime = 0.; - mergetime = 0.; - spintime = 0.; - gathermtime = 0.; - splicetime = 0.; - nosplicetime = 0.; - t_data = 0.0; - t_table= 0.0; - comms_bytes = 0.; - calls = 0.; - }; - - void Report(void) { -#define PRINTIT(A) \ - std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<_Nprocessors; - RealD NN = _grid->NodeCount(); - - _grid->GlobalSum(commtime); commtime/=NP; - if ( calls > 0. 
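With CommunicateBegin/CommunicateComplete now separate calls and the merge work split into an SHM part and an MPI part, a caller can post the sends, do interior work while they are in flight, and only then wait, merge and process the exterior faces. A generic sketch of that phase ordering, using std::async as a stand-in for the asynchronous transfers (this is not the Grid/MPI API, only the overlap pattern):

    #include <cstdio>
    #include <future>
    #include <numeric>
    #include <vector>

    int main() {
      std::vector<double> halo(1024, 1.0);

      // "CommunicateBegin": the halo transfer proceeds on another thread.
      auto comms = std::async(std::launch::async, [&] {
        return std::accumulate(halo.begin(), halo.end(), 0.0);  // stand-in for MPI
      });

      // Interior work overlaps with the outstanding "comms".
      double interior = 0.0;
      for (int i = 0; i < 1000000; i++) interior += 1e-6;

      // "CommunicateComplete" + "CommsMerge": wait, then use the received halo.
      double exterior = comms.get();
      std::printf("interior %.3f  exterior %.1f  total %.1f\n",
                  interior, exterior, interior + exterior);
      return 0;
    }
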
) { - std::cout << GridLogMessage << " Stencil calls "<1.0){ - PRINTIT(comms_bytes); - PRINTIT(commtime); - std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); u_recv_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); -#ifdef NEW_XYZT_GATHER + for(int l=0;l<2;l++){ - new_simd_recv_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); - new_simd_send_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); + u_simd_recv_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); + u_simd_send_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); } -#else - for(int l=0;lShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object)); - u_simd_send_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object)); - } -#endif PrecomputeByteOffsets(); } @@ -611,7 +632,6 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal offnode = (comm_proc!= 0); } - int wraparound=0; if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) { wraparound = 1; @@ -735,106 +755,8 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal } } - template void HaloExchange(const Lattice &source,compressor &compress) - { - std::vector > reqs; - calls++; - Mergers.resize(0); - Packets.resize(0); - HaloGather(source,compress); - this->CommunicateBegin(reqs); - this->CommunicateComplete(reqs); - CommsMerge(); - } - - template void HaloGatherDir(const Lattice &source,compressor &compress,int point,int & face_idx) - { - int dimension = _directions[point]; - int displacement = _distances[point]; - - int fd = _grid->_fdimensions[dimension]; - int rd = _grid->_rdimensions[dimension]; - - - // Map to always positive shift modulo global full dimension. 
- int shift = (displacement+fd)%fd; - - // int checkerboard = _grid->CheckerBoardDestination(source.checkerboard,shift); - assert (source.checkerboard== _checkerboard); - - // the permute type - int simd_layout = _grid->_simd_layout[dimension]; - int comm_dim = _grid->_processors[dimension] >1 ; - int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); - - // Gather phase - int sshift [2]; - if ( comm_dim ) { - sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); - sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); - if ( sshift[0] == sshift[1] ) { - if (splice_dim) { - splicetime-=usecond(); - // GatherSimd(source,dimension,shift,0x3,compress,face_idx); - // std::cout << "GatherSimdNew"< - void HaloGather(const Lattice &source,compressor &compress) - { - _grid->StencilBarrier();// Synch shared memory on a single nodes - - // conformable(source._grid,_grid); - assert(source._grid==_grid); - halogtime-=usecond(); - - u_comm_offset=0; - - // Gather all comms buffers - int face_idx=0; - for(int point = 0 ; point < _npoints; point++) { - compress.Point(point); - HaloGatherDir(source,compress,point,face_idx); - } - face_table_computed=1; - - assert(u_comm_offset==_unified_buffer_size); - halogtime+=usecond(); - } - - template - void Gather(const Lattice &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx) + int Gather(const Lattice &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx) { typedef typename cobj::vector_type vector_type; typedef typename cobj::scalar_type scalar_type; @@ -858,6 +780,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal int cb= (cbmask==0x2)? Odd : Even; int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); + int shm_receive_only = 1; for(int x=0;x>1; - int bytes = words * sizeof(cobj); + int bytes = words * compress.CommDatumSize(); - gathertime-=usecond(); int so = sx*rhs._grid->_ostride[dimension]; // base offset for start of plane if ( !face_table_computed ) { - t_table-=usecond(); face_table.resize(face_idx+1); Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table[face_idx]); - // std::cout << " face table size "<CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); - - // loop over outer coord planes orthog to dim - for(int x=0;x= rd ); - - if ( any_offnode ) { - - for(int i=0;i(rhs,spointers,dimension,sx,cbmask,compress); - gathermtime+=usecond(); - - for(int i=0;i2 - // for(int w=0;w : lane " << i <<" elem "<>(permute_type+1)); - int ic= (i&inner_bit)? 
1:0; - - int my_coor = rd*ic + x; - int nbr_coor = my_coor+sshift; - int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors - int nbr_lcoor= (nbr_coor%ld); - int nbr_ic = (nbr_lcoor)/rd; // inner coord of peer - int nbr_ox = (nbr_lcoor%rd); // outer coord of peer - int nbr_lane = (i&(~inner_bit)); - - if (nbr_ic) nbr_lane|=inner_bit; - assert (sx == nbr_ox); - - auto rp = &u_simd_recv_buf[i ][u_comm_offset]; - auto sp = &u_simd_send_buf[nbr_lane][u_comm_offset]; - - if(nbr_proc){ - - int recv_from_rank; - int xmit_to_rank; - - _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); - - // shm == receive pointer if offnode - // shm == Translate[send pointer] if on node -- my view of his send pointer - scalar_object *shm = (scalar_object *) _grid->ShmBufferTranslate(recv_from_rank,sp); - if (shm==NULL) { - shm = rp; - } - - // if Direct, StencilSendToRecvFrom will suppress copy to a peer on node - // assuming above pointer flip - AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes); - - rpointers[i] = shm; - - } else { - - rpointers[i] = sp; - - } - } - - AddMerge(&u_recv_buf_p[u_comm_offset],rpointers,buffer_size,Packets.size()-1); - - u_comm_offset +=buffer_size; - } - } - } - - - template - void GatherSimdNew(const Lattice &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx) + int GatherSimd(const Lattice &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx) { const int Nsimd = _grid->Nsimd(); @@ -1063,8 +899,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal int reduced_buffer_size = buffer_size; if (cbmask != 0x3) reduced_buffer_size=buffer_size>>1; - int bytes = (reduced_buffer_size*sizeof(cobj))/simd_layout; - assert(bytes*simd_layout == reduced_buffer_size*sizeof(cobj)); + int datum_bytes = compress.CommDatumSize(); + int bytes = (reduced_buffer_size*datum_bytes)/simd_layout; + assert(bytes*simd_layout == reduced_buffer_size*datum_bytes); std::vector rpointers(maxl); std::vector spointers(maxl); @@ -1077,33 +914,26 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); // loop over outer coord planes orthog to dim + int shm_receive_only = 1; for(int x=0;x= rd ); if ( any_offnode ) { - for(int i=0;i > table; - t_table-=usecond(); if ( !face_table_computed ) { face_table.resize(face_idx+1); Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table[face_idx]); - // std::cout << " face table size "<_Nprocessors; + RealD NN = _grid->NodeCount(); + + _grid->GlobalSum(commtime); commtime/=NP; + if ( calls > 0. ) { + std::cout << GridLogMessage << " Stencil calls "<1.0){ + PRINTIT(comms_bytes); + PRINTIT(commtime); + std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"< &o) { + friend std::ostream &operator<<(std::ostream &stream,const iScalar &o) { stream << "S {" << o._internal << "}"; return stream; }; + + }; /////////////////////////////////////////////////////////// // Allows to turn scalar>>> back to double. 
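Report() prints comms_bytes/commtime/1000 as "GB/s per rank"; with commtime accumulated in microseconds that is bytes per microsecond (i.e. MB/s) divided by 1000. A quick check of the arithmetic with invented numbers:

    #include <cstdio>

    int main() {
      double comms_bytes = 2.0e9;              // bytes moved by this rank
      double commtime    = 5.0e5;              // accumulated comms time, microseconds
      double MBps = comms_bytes / commtime;    // bytes per usec == MB/s
      double GBps = MBps / 1000.0;             // figure printed by Report()
      std::printf("%.1f MB/s  =  %.2f GB/s per rank\n", MBps, GBps);
      return 0;
    }
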
diff --git a/lib/tensors/Tensor_traits.h b/lib/tensors/Tensor_traits.h index 80009391..af82de72 100644 --- a/lib/tensors/Tensor_traits.h +++ b/lib/tensors/Tensor_traits.h @@ -142,6 +142,17 @@ namespace Grid { typedef vRealD Realified; enum { TensorLevel = 0 }; }; + template<> class GridTypeMapper { + public: + typedef ComplexF scalar_type; + typedef vComplexH vector_type; + typedef vComplexD vector_typeD; + typedef vComplexH tensor_reduced; + typedef ComplexF scalar_object; + typedef vComplexH Complexified; + typedef vRealH Realified; + enum { TensorLevel = 0 }; + }; template<> class GridTypeMapper { public: typedef ComplexF scalar_type; diff --git a/scripts/grep-global b/scripts/grep-global new file mode 100755 index 00000000..a95a0549 --- /dev/null +++ b/scripts/grep-global @@ -0,0 +1,6 @@ +#!/bin/bash + +export LANG=C +find . -name "*.cc" -exec grep -H $@ {} \; +find . -name "*.h" -exec grep -H $@ {} \; +
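The Tensor_traits addition above registers a GridTypeMapper specialisation whose vector_type is the half-precision SIMD word (vComplexH) while scalar_type remains ComplexF, which is what lets the *FH/*DF Impl variants communicate in half precision while computing in single or double. A toy version of that trait pattern with invented stand-in types (not the Grid trait itself):

    #include <cstdint>
    #include <cstdio>
    #include <type_traits>

    struct half16 { uint16_t bits; };          // stand-in half-precision storage

    template <class vtype> struct TypeMapperSketch;   // primary left undefined

    template <> struct TypeMapperSketch<float> {      // full precision everywhere
      typedef float scalar_type;
      typedef float vector_type;
    };
    template <> struct TypeMapperSketch<half16> {     // store/communicate in half,
      typedef float  scalar_type;                     // present single to arithmetic
      typedef half16 vector_type;
    };

    int main() {
      static_assert(std::is_same<TypeMapperSketch<half16>::scalar_type, float>::value,
                    "half-precision storage still presents single-precision scalars");
      std::printf("sizeof(vector_type): full %zu bytes, half %zu bytes\n",
                  sizeof(TypeMapperSketch<float>::vector_type),
                  sizeof(TypeMapperSketch<half16>::vector_type));
      return 0;
    }
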