From 9e2d29c64429c615c8e366c23b54d1d498f4af7a Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 14 Apr 2017 14:17:14 +0100 Subject: [PATCH 01/31] USE_FP16 macro --- lib/simd/Grid_avx.h | 5 +++-- lib/simd/Grid_avx512.h | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h index 73792f84..61bf0dfe 100644 --- a/lib/simd/Grid_avx.h +++ b/lib/simd/Grid_avx.h @@ -470,13 +470,14 @@ namespace Optimization { return in; }; }; - +#define USE_FP16 struct PrecisionChange { static inline __m256i StoH (__m256 a,__m256 b) { + __m256 h ; #ifdef USE_FP16 __m128i ha = _mm256_cvtps_ph(a,0); __m128i hb = _mm256_cvtps_ph(b,0); - __m256 h = _mm256_castps128_ps256(ha); + h = _mm256_castps128_ps256(ha); h = _mm256_insertf128_ps(h,hb,1); #else assert(0); diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h index dae3c1c7..a15edeb3 100644 --- a/lib/simd/Grid_avx512.h +++ b/lib/simd/Grid_avx512.h @@ -340,13 +340,14 @@ namespace Optimization { }; }; - +#define USE_FP16 struct PrecisionChange { static inline __m512i StoH (__m512 a,__m512 b) { + __m512 h ; #ifdef USE_FP16 __m256i ha = _mm512_cvtps_ph(a,0); __m256i hb = _mm512_cvtps_ph(b,0); - __m512 h = _mm512_castps256_ps512(ha); + h = _mm512_castps256_ps512(ha); h = _mm512_insertf256_ps(h,hb,1); #else assert(0); From 557c3fa109bea1e0be700a4a2b5fc4c7757ab642 Mon Sep 17 00:00:00 2001 From: paboyle Date: Tue, 18 Apr 2017 13:27:38 +0100 Subject: [PATCH 02/31] Pretty change --- lib/tensors/Tensor_class.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/tensors/Tensor_class.h b/lib/tensors/Tensor_class.h index 294d25da..6ae8e090 100644 --- a/lib/tensors/Tensor_class.h +++ b/lib/tensors/Tensor_class.h @@ -164,11 +164,12 @@ class iScalar { return *this; } - friend std::ostream &operator<<(std::ostream &stream, - const iScalar &o) { + friend std::ostream &operator<<(std::ostream &stream,const iScalar &o) { stream << "S {" << o._internal << "}"; return stream; }; + + }; /////////////////////////////////////////////////////////// // Allows to turn scalar>>> back to double. From 3b7de792d5c3628bf9f0c4df64962a76a05cf5ac Mon Sep 17 00:00:00 2001 From: paboyle Date: Tue, 18 Apr 2017 13:28:04 +0100 Subject: [PATCH 03/31] Type comparison in the traits work --- lib/simd/Grid_vector_types.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index 7087d724..1bafe12e 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -91,10 +91,12 @@ template <> struct is_complex > : public std::true_type {}; template using IfReal = Invoke::value, int> >; template using IfComplex = Invoke::value, int> >; template using IfInteger = Invoke::value, int> >; +template using IfSame = Invoke::value, int> >; template using IfNotReal = Invoke::value, int> >; template using IfNotComplex = Invoke::value, int> >; template using IfNotInteger = Invoke::value, int> >; +template using IfNotSame = Invoke::value, int> >; //////////////////////////////////////////////////////// // Define the operation templates functors From 4a340aa5caf5de5e189d46f1c3101b798e839315 Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 20 Apr 2017 09:28:27 +0100 Subject: [PATCH 04/31] Massive compressor rework to support reduced precision comms --- lib/cshift/Cshift_common.h | 16 +- lib/qcd/action/fermion/FermionOperatorImpl.h | 227 +++++---- lib/qcd/action/fermion/WilsonCompressor.h | 470 ++++++++++-------- lib/qcd/action/fermion/WilsonFermion5D.cc | 2 +- lib/simd/Grid_vector_types.h | 20 +- lib/stencil/Stencil.h | 473 ++++++------------- 6 files changed, 591 insertions(+), 617 deletions(-) diff --git a/lib/cshift/Cshift_common.h b/lib/cshift/Cshift_common.h index 48708800..956f6cf1 100644 --- a/lib/cshift/Cshift_common.h +++ b/lib/cshift/Cshift_common.h @@ -34,8 +34,20 @@ template class SimpleCompressor { public: void Point(int) {}; - - vobj operator() (const vobj &arg) { + inline int CommDatumSize(void) { return sizeof(vobj); } + inline bool DecompressionStep(void) { return false; } + inline void Compress(vobj *buf,int o,const vobj &in) { buf[o]=in; } + inline void Exchange(vobj *mp,vobj *vp0,vobj *vp1,Integer type,Integer o){ + exchange(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type); + } + inline void Decompress(vobj *out,vobj *in, int o){ assert(0); } + inline void CompressExchange(vobj *out0,vobj *out1,const vobj *in, + int j,int k, int m,int type){ + exchange(out0[j],out1[j],in[k],in[m],type); + } + // For cshift. Cshift should drop compressor coupling altogether + // because I had to decouple the code from the Stencil anyway + inline vobj operator() (const vobj &arg) { return arg; } }; diff --git a/lib/qcd/action/fermion/FermionOperatorImpl.h b/lib/qcd/action/fermion/FermionOperatorImpl.h index 141d808d..0f318366 100644 --- a/lib/qcd/action/fermion/FermionOperatorImpl.h +++ b/lib/qcd/action/fermion/FermionOperatorImpl.h @@ -35,7 +35,6 @@ directory namespace Grid { namespace QCD { - ////////////////////////////////////////////// // Template parameter class constructs to package // externally control Fermion implementations @@ -89,7 +88,53 @@ namespace QCD { // // } ////////////////////////////////////////////// - + + template struct SamePrecisionMapper { + typedef T HigherPrecVector ; + typedef T LowerPrecVector ; + }; + template struct LowerPrecisionMapper { }; + template <> struct LowerPrecisionMapper { + typedef vRealF HigherPrecVector ; + typedef vRealH LowerPrecVector ; + }; + template <> struct LowerPrecisionMapper { + typedef vRealD HigherPrecVector ; + typedef vRealF LowerPrecVector ; + }; + template <> struct LowerPrecisionMapper { + typedef vComplexF HigherPrecVector ; + typedef vComplexH LowerPrecVector ; + }; + template <> struct LowerPrecisionMapper { + typedef vComplexD HigherPrecVector ; + typedef vComplexF LowerPrecVector ; + }; + + struct CoeffReal { + public: + typedef RealD _Coeff_t; + static const int Nhcs = 2; + template using PrecisionMapper = SamePrecisionMapper; + }; + struct CoeffRealHalfComms { + public: + typedef RealD _Coeff_t; + static const int Nhcs = 1; + template using PrecisionMapper = LowerPrecisionMapper; + }; + struct CoeffComplex { + public: + typedef ComplexD _Coeff_t; + static const int Nhcs = 2; + template using PrecisionMapper = SamePrecisionMapper; + }; + struct CoeffComplexHalfComms { + public: + typedef ComplexD _Coeff_t; + static const int Nhcs = 1; + template using PrecisionMapper = LowerPrecisionMapper; + }; //////////////////////////////////////////////////////////////////////// // Implementation dependent fermion types @@ -114,37 +159,40 @@ namespace QCD { ///////////////////////////////////////////////////////////////////////////// // Single flavour four spinors with colour index ///////////////////////////////////////////////////////////////////////////// - template + template class WilsonImpl : public PeriodicGaugeImpl > { - public: static const int Dimension = Representation::Dimension; + static const bool LsVectorised=false; + static const int Nhcs = Options::Nhcs; + typedef PeriodicGaugeImpl > Gimpl; + INHERIT_GIMPL_TYPES(Gimpl); //Necessary? constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;} - const bool LsVectorised=false; - typedef _Coeff_t Coeff_t; - - INHERIT_GIMPL_TYPES(Gimpl); + typedef typename Options::_Coeff_t Coeff_t; + typedef typename Options::template PrecisionMapper::LowerPrecVector SimdL; template using iImplSpinor = iScalar, Ns> >; template using iImplPropagator = iScalar, Ns> >; template using iImplHalfSpinor = iScalar, Nhs> >; + template using iImplHalfCommSpinor = iScalar, Nhcs> >; template using iImplDoubledGaugeField = iVector >, Nds>; typedef iImplSpinor SiteSpinor; typedef iImplPropagator SitePropagator; typedef iImplHalfSpinor SiteHalfSpinor; + typedef iImplHalfCommSpinor SiteHalfCommSpinor; typedef iImplDoubledGaugeField SiteDoubledGaugeField; typedef Lattice FermionField; typedef Lattice PropagatorField; typedef Lattice DoubledGaugeField; - typedef WilsonCompressor Compressor; + typedef WilsonCompressor Compressor; typedef WilsonImplParams ImplParams; typedef WilsonStencil StencilImpl; @@ -209,31 +257,34 @@ namespace QCD { //////////////////////////////////////////////////////////////////////////////////// // Single flavour four spinors with colour index, 5d redblack //////////////////////////////////////////////////////////////////////////////////// - -template +template class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { public: - - static const int Dimension = Nrepresentation; - const bool LsVectorised=true; - typedef _Coeff_t Coeff_t; + typedef PeriodicGaugeImpl > Gimpl; - INHERIT_GIMPL_TYPES(Gimpl); + + static const int Dimension = Nrepresentation; + static const bool LsVectorised=true; + static const int Nhcs = Options::Nhcs; + + typedef typename Options::_Coeff_t Coeff_t; + typedef typename Options::template PrecisionMapper::LowerPrecVector SimdL; template using iImplSpinor = iScalar, Ns> >; template using iImplPropagator = iScalar, Ns> >; template using iImplHalfSpinor = iScalar, Nhs> >; + template using iImplHalfCommSpinor = iScalar, Nhcs> >; template using iImplDoubledGaugeField = iVector >, Nds>; template using iImplGaugeField = iVector >, Nd>; template using iImplGaugeLink = iScalar > >; - typedef iImplSpinor SiteSpinor; - typedef iImplPropagator SitePropagator; - typedef iImplHalfSpinor SiteHalfSpinor; - typedef Lattice FermionField; - typedef Lattice PropagatorField; - + typedef iImplSpinor SiteSpinor; + typedef iImplPropagator SitePropagator; + typedef iImplHalfSpinor SiteHalfSpinor; + typedef iImplHalfCommSpinor SiteHalfCommSpinor; + typedef Lattice FermionField; + typedef Lattice PropagatorField; ///////////////////////////////////////////////// // Make the doubled gauge field a *scalar* @@ -241,9 +292,9 @@ class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres typedef iImplDoubledGaugeField SiteDoubledGaugeField; // This is a scalar typedef iImplGaugeField SiteScalarGaugeField; // scalar typedef iImplGaugeLink SiteScalarGaugeLink; // scalar - typedef Lattice DoubledGaugeField; + typedef Lattice DoubledGaugeField; - typedef WilsonCompressor Compressor; + typedef WilsonCompressor Compressor; typedef WilsonImplParams ImplParams; typedef WilsonStencil StencilImpl; @@ -311,35 +362,37 @@ class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres //////////////////////////////////////////////////////////////////////////////////////// // Flavour doubled spinors; is Gparity the only? what about C*? //////////////////////////////////////////////////////////////////////////////////////// - -template +template class GparityWilsonImpl : public ConjugateGaugeImpl > { public: static const int Dimension = Nrepresentation; + static const int Nhcs = Options::Nhcs; + static const bool LsVectorised=false; - const bool LsVectorised=false; - - typedef _Coeff_t Coeff_t; typedef ConjugateGaugeImpl< GaugeImplTypes > Gimpl; - INHERIT_GIMPL_TYPES(Gimpl); + + typedef typename Options::_Coeff_t Coeff_t; + typedef typename Options::template PrecisionMapper::LowerPrecVector SimdL; - template using iImplSpinor = iVector, Ns>, Ngp>; - template using iImplPropagator = iVector, Ns>, Ngp >; - template using iImplHalfSpinor = iVector, Nhs>, Ngp>; + template using iImplSpinor = iVector, Ns>, Ngp>; + template using iImplPropagator = iVector, Ns>, Ngp>; + template using iImplHalfSpinor = iVector, Nhs>, Ngp>; + template using iImplHalfCommSpinor = iVector, Nhcs>, Ngp>; template using iImplDoubledGaugeField = iVector >, Nds>, Ngp>; - - typedef iImplSpinor SiteSpinor; - typedef iImplPropagator SitePropagator; - typedef iImplHalfSpinor SiteHalfSpinor; + + typedef iImplSpinor SiteSpinor; + typedef iImplPropagator SitePropagator; + typedef iImplHalfSpinor SiteHalfSpinor; + typedef iImplHalfCommSpinor SiteHalfCommSpinor; typedef iImplDoubledGaugeField SiteDoubledGaugeField; typedef Lattice FermionField; typedef Lattice PropagatorField; typedef Lattice DoubledGaugeField; - typedef WilsonCompressor Compressor; + typedef WilsonCompressor Compressor; typedef WilsonStencil StencilImpl; typedef GparityWilsonImplParams ImplParams; @@ -356,8 +409,8 @@ class GparityWilsonImpl : public ConjugateGaugeImpl - class StaggeredImpl : public PeriodicGaugeImpl > { +///////////////////////////////////////////////////////////////////////////// +// Single flavour one component spinors with colour index +///////////////////////////////////////////////////////////////////////////// +template +class StaggeredImpl : public PeriodicGaugeImpl > { public: typedef RealD _Coeff_t ; static const int Dimension = Representation::Dimension; + static const bool LsVectorised=false; typedef PeriodicGaugeImpl > Gimpl; //Necessary? constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;} - const bool LsVectorised=false; typedef _Coeff_t Coeff_t; INHERIT_GIMPL_TYPES(Gimpl); @@ -641,8 +692,6 @@ class GparityWilsonImpl : public ConjugateGaugeImpl > Gimpl; //Necessary? constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;} - - const bool LsVectorised=true; - typedef _Coeff_t Coeff_t; INHERIT_GIMPL_TYPES(Gimpl); @@ -823,43 +870,61 @@ class GparityWilsonImpl : public ConjugateGaugeImpl WilsonImplR; // Real.. whichever prec +typedef WilsonImpl WilsonImplF; // Float +typedef WilsonImpl WilsonImplD; // Double +typedef WilsonImpl WilsonImplRH; // Real.. whichever prec +typedef WilsonImpl WilsonImplFH; // Float +typedef WilsonImpl WilsonImplDH; // Double - typedef WilsonImpl WilsonImplR; // Real.. whichever prec - typedef WilsonImpl WilsonImplF; // Float - typedef WilsonImpl WilsonImplD; // Double +typedef WilsonImpl ZWilsonImplR; // Real.. whichever prec +typedef WilsonImpl ZWilsonImplF; // Float +typedef WilsonImpl ZWilsonImplD; // Double - typedef WilsonImpl ZWilsonImplR; // Real.. whichever prec - typedef WilsonImpl ZWilsonImplF; // Float - typedef WilsonImpl ZWilsonImplD; // Double +typedef WilsonImpl ZWilsonImplRH; // Real.. whichever prec +typedef WilsonImpl ZWilsonImplFH; // Float +typedef WilsonImpl ZWilsonImplDH; // Double - typedef WilsonImpl WilsonAdjImplR; // Real.. whichever prec - typedef WilsonImpl WilsonAdjImplF; // Float - typedef WilsonImpl WilsonAdjImplD; // Double +typedef WilsonImpl WilsonAdjImplR; // Real.. whichever prec +typedef WilsonImpl WilsonAdjImplF; // Float +typedef WilsonImpl WilsonAdjImplD; // Double - typedef WilsonImpl WilsonTwoIndexSymmetricImplR; // Real.. whichever prec - typedef WilsonImpl WilsonTwoIndexSymmetricImplF; // Float - typedef WilsonImpl WilsonTwoIndexSymmetricImplD; // Double +typedef WilsonImpl WilsonTwoIndexSymmetricImplR; // Real.. whichever prec +typedef WilsonImpl WilsonTwoIndexSymmetricImplF; // Float +typedef WilsonImpl WilsonTwoIndexSymmetricImplD; // Double - typedef DomainWallVec5dImpl DomainWallVec5dImplR; // Real.. whichever prec - typedef DomainWallVec5dImpl DomainWallVec5dImplF; // Float - typedef DomainWallVec5dImpl DomainWallVec5dImplD; // Double +typedef DomainWallVec5dImpl DomainWallVec5dImplR; // Real.. whichever prec +typedef DomainWallVec5dImpl DomainWallVec5dImplF; // Float +typedef DomainWallVec5dImpl DomainWallVec5dImplD; // Double - typedef DomainWallVec5dImpl ZDomainWallVec5dImplR; // Real.. whichever prec - typedef DomainWallVec5dImpl ZDomainWallVec5dImplF; // Float - typedef DomainWallVec5dImpl ZDomainWallVec5dImplD; // Double +typedef DomainWallVec5dImpl DomainWallVec5dImplRH; // Real.. whichever prec +typedef DomainWallVec5dImpl DomainWallVec5dImplFH; // Float +typedef DomainWallVec5dImpl DomainWallVec5dImplDH; // Double - typedef GparityWilsonImpl GparityWilsonImplR; // Real.. whichever prec - typedef GparityWilsonImpl GparityWilsonImplF; // Float - typedef GparityWilsonImpl GparityWilsonImplD; // Double +typedef DomainWallVec5dImpl ZDomainWallVec5dImplR; // Real.. whichever prec +typedef DomainWallVec5dImpl ZDomainWallVec5dImplF; // Float +typedef DomainWallVec5dImpl ZDomainWallVec5dImplD; // Double + +typedef DomainWallVec5dImpl ZDomainWallVec5dImplRH; // Real.. whichever prec +typedef DomainWallVec5dImpl ZDomainWallVec5dImplFH; // Float +typedef DomainWallVec5dImpl ZDomainWallVec5dImplDH; // Double + +typedef GparityWilsonImpl GparityWilsonImplR; // Real.. whichever prec +typedef GparityWilsonImpl GparityWilsonImplF; // Float +typedef GparityWilsonImpl GparityWilsonImplD; // Double + +typedef GparityWilsonImpl GparityWilsonImplRH; // Real.. whichever prec +typedef GparityWilsonImpl GparityWilsonImplFH; // Float +typedef GparityWilsonImpl GparityWilsonImplDH; // Double - typedef StaggeredImpl StaggeredImplR; // Real.. whichever prec - typedef StaggeredImpl StaggeredImplF; // Float - typedef StaggeredImpl StaggeredImplD; // Double +typedef StaggeredImpl StaggeredImplR; // Real.. whichever prec +typedef StaggeredImpl StaggeredImplF; // Float +typedef StaggeredImpl StaggeredImplD; // Double - typedef StaggeredVec5dImpl StaggeredVec5dImplR; // Real.. whichever prec - typedef StaggeredVec5dImpl StaggeredVec5dImplF; // Float - typedef StaggeredVec5dImpl StaggeredVec5dImplD; // Double +typedef StaggeredVec5dImpl StaggeredVec5dImplR; // Real.. whichever prec +typedef StaggeredVec5dImpl StaggeredVec5dImplF; // Float +typedef StaggeredVec5dImpl StaggeredVec5dImplD; // Double }} diff --git a/lib/qcd/action/fermion/WilsonCompressor.h b/lib/qcd/action/fermion/WilsonCompressor.h index c3b4dffb..f1ad390d 100644 --- a/lib/qcd/action/fermion/WilsonCompressor.h +++ b/lib/qcd/action/fermion/WilsonCompressor.h @@ -33,228 +33,292 @@ Author: paboyle namespace Grid { namespace QCD { - template - class WilsonCompressor { - public: - int mu; - int dag; +///////////////////////////////////////////////////////////////////////////////////////////// +// optimised versions supporting half precision too +///////////////////////////////////////////////////////////////////////////////////////////// - WilsonCompressor(int _dag){ - mu=0; - dag=_dag; - assert((dag==0)||(dag==1)); - } - void Point(int p) { - mu=p; - }; +template +class WilsonCompressorTemplate; - inline SiteHalfSpinor operator () (const SiteSpinor &in) { - SiteHalfSpinor ret; - int mudag=mu; - if (!dag) { - mudag=(mu+Nd)%(2*Nd); - } - switch(mudag) { - case Xp: - spProjXp(ret,in); - break; - case Yp: - spProjYp(ret,in); - break; - case Zp: - spProjZp(ret,in); - break; - case Tp: - spProjTp(ret,in); - break; - case Xm: - spProjXm(ret,in); - break; - case Ym: - spProjYm(ret,in); - break; - case Zm: - spProjZm(ret,in); - break; - case Tm: - spProjTm(ret,in); - break; - default: - assert(0); - break; - } - return ret; - } - }; - ///////////////////////// - // optimised versions - ///////////////////////// +template +class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector, + typename std::enable_if::value>::type > +{ + public: + + int mu,dag; - template - class WilsonXpCompressor { - public: - inline SiteHalfSpinor operator () (const SiteSpinor &in) { - SiteHalfSpinor ret; - spProjXp(ret,in); - return ret; - } - }; - template - class WilsonYpCompressor { - public: - inline SiteHalfSpinor operator () (const SiteSpinor &in) { - SiteHalfSpinor ret; - spProjYp(ret,in); - return ret; - } - }; - template - class WilsonZpCompressor { - public: - inline SiteHalfSpinor operator () (const SiteSpinor &in) { - SiteHalfSpinor ret; - spProjZp(ret,in); - return ret; - } - }; - template - class WilsonTpCompressor { - public: - inline SiteHalfSpinor operator () (const SiteSpinor &in) { - SiteHalfSpinor ret; - spProjTp(ret,in); - return ret; - } - }; + void Point(int p) { mu=p; }; - template - class WilsonXmCompressor { - public: - inline SiteHalfSpinor operator () (const SiteSpinor &in) { - SiteHalfSpinor ret; - spProjXm(ret,in); - return ret; - } - }; - template - class WilsonYmCompressor { - public: - inline SiteHalfSpinor operator () (const SiteSpinor &in) { - SiteHalfSpinor ret; - spProjYm(ret,in); - return ret; - } - }; - template - class WilsonZmCompressor { - public: - inline SiteHalfSpinor operator () (const SiteSpinor &in) { - SiteHalfSpinor ret; - spProjZm(ret,in); - return ret; - } - }; - template - class WilsonTmCompressor { - public: - inline SiteHalfSpinor operator () (const SiteSpinor &in) { - SiteHalfSpinor ret; - spProjTm(ret,in); - return ret; - } - }; + WilsonCompressorTemplate(int _dag=0){ + dag = _dag; + } - // Fast comms buffer manipulation which should inline right through (avoid direction - // dependent logic that prevents inlining - template - class WilsonStencil : public CartesianStencil { - public: + typedef _Spinor SiteSpinor; + typedef _Hspinor SiteHalfSpinor; + typedef _HCspinor SiteHalfCommSpinor; + typedef typename SiteHalfCommSpinor::vector_type vComplexLow; + typedef typename SiteHalfSpinor::vector_type vComplexHigh; + constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh); - typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; + inline int CommDatumSize(void) { + return sizeof(SiteHalfCommSpinor); + } - WilsonStencil(GridBase *grid, + /*****************************************************/ + /* Compress includes precision change if mpi data is not same */ + /*****************************************************/ + inline void Compress(SiteHalfSpinor *buf,Integer o,const SiteSpinor &in) { + projector::Proj(buf[o],in,mu,dag); + } + + /*****************************************************/ + /* Exchange includes precision change if mpi data is not same */ + /*****************************************************/ + inline void Exchange(SiteHalfSpinor *mp, + SiteHalfSpinor *vp0, + SiteHalfSpinor *vp1, + Integer type,Integer o){ + exchange(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type); + } + + /*****************************************************/ + /* Have a decompression step if mpi data is not same */ + /*****************************************************/ + inline void Decompress(SiteHalfSpinor *out, + SiteHalfSpinor *in, Integer o) { + assert(0); + } + + /*****************************************************/ + /* Compress Exchange */ + /*****************************************************/ + inline void CompressExchange(SiteHalfSpinor *out0, + SiteHalfSpinor *out1, + const SiteSpinor *in, + Integer j,Integer k, Integer m,Integer type){ + SiteHalfSpinor temp1, temp2,temp3,temp4; + projector::Proj(temp1,in[k],mu,dag); + projector::Proj(temp2,in[m],mu,dag); + exchange(out0[j],out1[j],temp1,temp2,type); + } + + /*****************************************************/ + /* Pass the info to the stencil */ + /*****************************************************/ + inline bool DecompressionStep(void) { return false; } + +}; + +template +class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector, + typename std::enable_if::value>::type > +{ + public: + + int mu,dag; + + void Point(int p) { mu=p; }; + + WilsonCompressorTemplate(int _dag=0){ + dag = _dag; + } + + typedef _Spinor SiteSpinor; + typedef _Hspinor SiteHalfSpinor; + typedef _HCspinor SiteHalfCommSpinor; + typedef typename SiteHalfCommSpinor::vector_type vComplexLow; + typedef typename SiteHalfSpinor::vector_type vComplexHigh; + constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh); + + inline int CommDatumSize(void) { + return sizeof(SiteHalfCommSpinor); + } + + /*****************************************************/ + /* Compress includes precision change if mpi data is not same */ + /*****************************************************/ + inline void Compress(SiteHalfSpinor *buf,Integer o,const SiteSpinor &in) { + SiteHalfSpinor hsp; + SiteHalfCommSpinor *hbuf = (SiteHalfCommSpinor *)buf; + projector::Proj(hsp,in,mu,dag); + precisionChange((vComplexLow *)&hbuf[o],(vComplexHigh *)&hsp,Nw); + } + + /*****************************************************/ + /* Exchange includes precision change if mpi data is not same */ + /*****************************************************/ + inline void Exchange(SiteHalfSpinor *mp, + SiteHalfSpinor *vp0, + SiteHalfSpinor *vp1, + Integer type,Integer o){ + SiteHalfSpinor vt0,vt1; + SiteHalfCommSpinor *vpp0 = (SiteHalfCommSpinor *)vp0; + SiteHalfCommSpinor *vpp1 = (SiteHalfCommSpinor *)vp1; + precisionChange((vComplexHigh *)&vt0,(vComplexLow *)&vpp0[o],Nw); + precisionChange((vComplexHigh *)&vt1,(vComplexLow *)&vpp1[o],Nw); + exchange(mp[2*o],mp[2*o+1],vt0,vt1,type); + } + + /*****************************************************/ + /* Have a decompression step if mpi data is not same */ + /*****************************************************/ + inline void Decompress(SiteHalfSpinor *out, + SiteHalfSpinor *in, Integer o){ + SiteHalfCommSpinor *hin=(SiteHalfCommSpinor *)in; + precisionChange((vComplexHigh *)&out[o],(vComplexLow *)&hin[o],Nw); + } + + /*****************************************************/ + /* Compress Exchange */ + /*****************************************************/ + inline void CompressExchange(SiteHalfSpinor *out0, + SiteHalfSpinor *out1, + const SiteSpinor *in, + Integer j,Integer k, Integer m,Integer type){ + SiteHalfSpinor temp1, temp2,temp3,temp4; + SiteHalfCommSpinor *hout0 = (SiteHalfCommSpinor *)out0; + SiteHalfCommSpinor *hout1 = (SiteHalfCommSpinor *)out1; + projector::Proj(temp1,in[k],mu,dag); + projector::Proj(temp2,in[m],mu,dag); + exchange(temp3,temp4,temp1,temp2,type); + precisionChange((vComplexLow *)&hout0[j],(vComplexHigh *)&temp3,Nw); + precisionChange((vComplexLow *)&hout1[j],(vComplexHigh *)&temp4,Nw); + } + + /*****************************************************/ + /* Pass the info to the stencil */ + /*****************************************************/ + inline bool DecompressionStep(void) { return true; } + +}; + +#define DECLARE_PROJ(Projector,Compressor,spProj) \ + class Projector { \ + public: \ + template \ + static void Proj(hsp &result,const fsp &in,int mu,int dag){ \ + spProj(result,in); \ + } \ + }; \ +template using Compressor = WilsonCompressorTemplate; + +DECLARE_PROJ(WilsonXpProjector,WilsonXpCompressor,spProjXp); +DECLARE_PROJ(WilsonYpProjector,WilsonYpCompressor,spProjYp); +DECLARE_PROJ(WilsonZpProjector,WilsonZpCompressor,spProjZp); +DECLARE_PROJ(WilsonTpProjector,WilsonTpCompressor,spProjTp); +DECLARE_PROJ(WilsonXmProjector,WilsonXmCompressor,spProjXm); +DECLARE_PROJ(WilsonYmProjector,WilsonYmCompressor,spProjYm); +DECLARE_PROJ(WilsonZmProjector,WilsonZmCompressor,spProjZm); +DECLARE_PROJ(WilsonTmProjector,WilsonTmCompressor,spProjTm); + +class WilsonProjector { + public: + template + static void Proj(hsp &result,const fsp &in,int mu,int dag){ + int mudag=dag? mu : (mu+Nd)%(2*Nd); + switch(mudag) { + case Xp: spProjXp(result,in); break; + case Yp: spProjYp(result,in); break; + case Zp: spProjZp(result,in); break; + case Tp: spProjTp(result,in); break; + case Xm: spProjXm(result,in); break; + case Ym: spProjYm(result,in); break; + case Zm: spProjZm(result,in); break; + case Tm: spProjTm(result,in); break; + default: assert(0); break; + } + } +}; +template using WilsonCompressor = WilsonCompressorTemplate; + +// Fast comms buffer manipulation which should inline right through (avoid direction +// dependent logic that prevents inlining +template +class WilsonStencil : public CartesianStencil { +public: + + typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; + + WilsonStencil(GridBase *grid, int npoints, int checkerboard, const std::vector &directions, - const std::vector &distances) : CartesianStencil (grid,npoints,checkerboard,directions,distances) - { }; + const std::vector &distances) + : CartesianStencil (grid,npoints,checkerboard,directions,distances) + { /*Do nothing*/ }; - template < class compressor> - void HaloExchangeOpt(const Lattice &source,compressor &compress) - { - std::vector > reqs; - HaloExchangeOptGather(source,compress); - this->CommunicateBegin(reqs); - this->calls++; - this->CommunicateComplete(reqs); - this->CommsMerge(); - } + template < class compressor> + void HaloExchangeOpt(const Lattice &source,compressor &compress) + { + std::vector > reqs; + this->HaloExchangeOptGather(source,compress); + this->CommunicateBegin(reqs); + this->CommunicateComplete(reqs); + this->CommsMerge(compress); + } + + template + void HaloExchangeOptGather(const Lattice &source,compressor &compress) + { + this->Prepare(); + this->HaloGatherOpt(source,compress); + } - template < class compressor> - void HaloExchangeOptGather(const Lattice &source,compressor &compress) - { - this->calls++; - this->Mergers.resize(0); - this->Packets.resize(0); - this->HaloGatherOpt(source,compress); - } + template + void HaloGatherOpt(const Lattice &source,compressor &compress) + { + // Strategy. Inherit types from Compressor. + // Use types to select the write direction by directon compressor + typedef typename compressor::SiteSpinor SiteSpinor; + typedef typename compressor::SiteHalfSpinor SiteHalfSpinor; + typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor; + this->_grid->StencilBarrier(); - template < class compressor> - void HaloGatherOpt(const Lattice &source,compressor &compress) - { - this->_grid->StencilBarrier(); - // conformable(source._grid,_grid); - assert(source._grid==this->_grid); - this->halogtime-=usecond(); + assert(source._grid==this->_grid); + this->halogtime-=usecond(); + + this->u_comm_offset=0; - this->u_comm_offset=0; - - int dag = compress.dag; - - WilsonXpCompressor XpCompress; - WilsonYpCompressor YpCompress; - WilsonZpCompressor ZpCompress; - WilsonTpCompressor TpCompress; - WilsonXmCompressor XmCompress; - WilsonYmCompressor YmCompress; - WilsonZmCompressor ZmCompress; - WilsonTmCompressor TmCompress; + WilsonXpCompressor XpCompress; + WilsonYpCompressor YpCompress; + WilsonZpCompressor ZpCompress; + WilsonTpCompressor TpCompress; + WilsonXmCompressor XmCompress; + WilsonYmCompressor YmCompress; + WilsonZmCompressor ZmCompress; + WilsonTmCompressor TmCompress; - // Gather all comms buffers - // for(int point = 0 ; point < _npoints; point++) { - // compress.Point(point); - // HaloGatherDir(source,compress,point,face_idx); - // } - int face_idx=0; - if ( dag ) { - // std::cout << " Optimised Dagger compress " <HaloGatherDir(source,XpCompress,Xp,face_idx); - this->HaloGatherDir(source,YpCompress,Yp,face_idx); - this->HaloGatherDir(source,ZpCompress,Zp,face_idx); - this->HaloGatherDir(source,TpCompress,Tp,face_idx); - this->HaloGatherDir(source,XmCompress,Xm,face_idx); - this->HaloGatherDir(source,YmCompress,Ym,face_idx); - this->HaloGatherDir(source,ZmCompress,Zm,face_idx); - this->HaloGatherDir(source,TmCompress,Tm,face_idx); - } else { - this->HaloGatherDir(source,XmCompress,Xp,face_idx); - this->HaloGatherDir(source,YmCompress,Yp,face_idx); - this->HaloGatherDir(source,ZmCompress,Zp,face_idx); - this->HaloGatherDir(source,TmCompress,Tp,face_idx); - this->HaloGatherDir(source,XpCompress,Xm,face_idx); - this->HaloGatherDir(source,YpCompress,Ym,face_idx); - this->HaloGatherDir(source,ZpCompress,Zm,face_idx); - this->HaloGatherDir(source,TpCompress,Tm,face_idx); - } - this->face_table_computed=1; - assert(this->u_comm_offset==this->_unified_buffer_size); - this->halogtime+=usecond(); + int dag = compress.dag; + int face_idx=0; + if ( dag ) { + // std::cout << " Optimised Dagger compress " <HaloGatherDir(source,XpCompress,Xp,face_idx); + this->HaloGatherDir(source,YpCompress,Yp,face_idx); + this->HaloGatherDir(source,ZpCompress,Zp,face_idx); + this->HaloGatherDir(source,TpCompress,Tp,face_idx); + this->HaloGatherDir(source,XmCompress,Xm,face_idx); + this->HaloGatherDir(source,YmCompress,Ym,face_idx); + this->HaloGatherDir(source,ZmCompress,Zm,face_idx); + this->HaloGatherDir(source,TmCompress,Tm,face_idx); + } else { + this->HaloGatherDir(source,XmCompress,Xp,face_idx); + this->HaloGatherDir(source,YmCompress,Yp,face_idx); + this->HaloGatherDir(source,ZmCompress,Zp,face_idx); + this->HaloGatherDir(source,TmCompress,Tp,face_idx); + this->HaloGatherDir(source,XpCompress,Xm,face_idx); + this->HaloGatherDir(source,YpCompress,Ym,face_idx); + this->HaloGatherDir(source,ZpCompress,Zm,face_idx); + this->HaloGatherDir(source,TpCompress,Tm,face_idx); } + this->face_table_computed=1; + assert(this->u_comm_offset==this->_unified_buffer_size); + this->halogtime+=usecond(); + } - }; - + }; }} // namespace close #endif diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 88bc425a..d29142d9 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -439,7 +439,7 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg } DhopFaceTime-=usecond(); - st.CommsMerge(); + st.CommsMerge(compressor); DhopFaceTime+=usecond(); #pragma omp parallel diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index 1bafe12e..ac17d1e0 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -74,12 +74,14 @@ struct RealPart > { typedef T type; }; +#include + ////////////////////////////////////// // demote a vector to real type ////////////////////////////////////// // type alias used to simplify the syntax of std::enable_if template using Invoke = typename T::type; -template using EnableIf = Invoke >; +template using EnableIf = Invoke >; template using NotEnableIf = Invoke >; //////////////////////////////////////////////////////// @@ -88,15 +90,15 @@ template struct is_complex : public std::false_type {}; template <> struct is_complex > : public std::true_type {}; template <> struct is_complex > : public std::true_type {}; -template using IfReal = Invoke::value, int> >; -template using IfComplex = Invoke::value, int> >; -template using IfInteger = Invoke::value, int> >; -template using IfSame = Invoke::value, int> >; +template using IfReal = Invoke::value, int> >; +template using IfComplex = Invoke::value, int> >; +template using IfInteger = Invoke::value, int> >; +template using IfSame = Invoke::value, int> >; -template using IfNotReal = Invoke::value, int> >; -template using IfNotComplex = Invoke::value, int> >; -template using IfNotInteger = Invoke::value, int> >; -template using IfNotSame = Invoke::value, int> >; +template using IfNotReal = Invoke::value, int> >; +template using IfNotComplex = Invoke::value, int> >; +template using IfNotInteger = Invoke::value, int> >; +template using IfNotSame = Invoke::value, int> >; //////////////////////////////////////////////////////// // Define the operation templates functors diff --git a/lib/stencil/Stencil.h b/lib/stencil/Stencil.h index 479cd979..268331fa 100644 --- a/lib/stencil/Stencil.h +++ b/lib/stencil/Stencil.h @@ -29,7 +29,7 @@ #define GRID_STENCIL_H #include // subdir aggregate -#define NEW_XYZT_GATHER + ////////////////////////////////////////////////////////////////////////////////////////// // Must not lose sight that goal is to be able to construct really efficient // gather to a point stencil code. CSHIFT is not the best way, so need @@ -82,7 +82,7 @@ void Gather_plane_simple_table (std::vector >& table,const La { int num=table.size(); parallel_for(int i=0;i >& table,const L int num=table.size()/2; int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane parallel_for(int j=0;j class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in. public: @@ -132,7 +137,61 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal typedef typename cobj::vector_type vector_type; typedef typename cobj::scalar_type scalar_type; typedef typename cobj::scalar_object scalar_object; + + ///////////////////////////////////////// + // Timing info; ugly; possibly temporary + ///////////////////////////////////////// + double commtime; + double gathertime; + double gathermtime; + double halogtime; + double mergetime; + double decompresstime; + double comms_bytes; + double splicetime; + double nosplicetime; + double calls; + void ZeroCounters(void) { + gathertime = 0.; + commtime = 0.; + halogtime = 0.; + mergetime = 0.; + decompresstime = 0.; + gathermtime = 0.; + splicetime = 0.; + nosplicetime = 0.; + comms_bytes = 0.; + calls = 0.; + }; + + void Report(void) { +#define AVERAGE(A) _grid->GlobalSum(A);A/=NP; +#define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<_Nprocessors; + RealD NN = _grid->NodeCount(); + + _grid->GlobalSum(commtime); commtime/=NP; + if ( calls > 0. ) { + std::cout << GridLogMessage << " Stencil calls "<1.0){ + PRINTIT(comms_bytes); + PRINTIT(commtime); + std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<StencilBarrier();// Synch shared memory on a single nodes commtime+=usecond(); - /* - int dump=1; - if(dump){ - for(int i=0;i_ndimension;d++){ - ss<<"."<<_grid->_processor_coor[d]; - } - ss<<"_mu_"< rpointers; std::vector vpointers; Integer buffer_size; - Integer packet_id; - Integer exchange; Integer type; }; std::vector Mergers; - void AddMerge(cobj *merge_p,std::vector &rpointers,Integer buffer_size,Integer packet_id) { - Merge m; - m.exchange = 0; - m.mpointer = merge_p; - m.rpointers= rpointers; - m.buffer_size = buffer_size; - m.packet_id = packet_id; - Mergers.push_back(m); + struct Decompress { + cobj * kernel_p; + cobj * mpi_p; + Integer buffer_size; + }; + + void Prepare(void) + { + Decompressions.resize(0); + Mergers.resize(0); + Packets.resize(0); + calls++; + } + std::vector Decompressions; + + void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size) { + Decompress d; + d.kernel_p = k_p; + d.mpi_p = m_p; + d.buffer_size = buffer_size; + Decompressions.push_back(d); } - void AddMergeNew(cobj *merge_p,std::vector &rpointers,Integer buffer_size,Integer packet_id,Integer type) { + void AddMerge(cobj *merge_p,std::vector &rpointers,Integer buffer_size,Integer type) { Merge m; - m.exchange = 1; m.type = type; m.mpointer = merge_p; m.vpointers= rpointers; m.buffer_size = buffer_size; - m.packet_id = packet_id; Mergers.push_back(m); } - void CommsMerge(void ) { + template + void CommsMerge(decompressor decompress) { + // Also do a precision convert possibly on a receive buffer for(int i=0;i_ndimension;d++){ - // ss<<"."<<_grid->_processor_coor[d]; - // } - // ss<<"_m_"< > u_simd_send_buf_hide; - // std::vector > u_simd_recv_buf_hide; - // commVector u_send_buf_hide; - // commVector u_recv_buf_hide; - // These are used; either SHM objects or refs to the above symmetric heap vectors // depending on comms target cobj* u_recv_buf_p; cobj* u_send_buf_p; - std::vector new_simd_send_buf; - std::vector new_simd_recv_buf; - std::vector u_simd_send_buf; - std::vector u_simd_recv_buf; + std::vector u_simd_send_buf; + std::vector u_simd_recv_buf; int u_comm_offset; int _unified_buffer_size; cobj *CommBuf(void) { return u_recv_buf_p; } - ///////////////////////////////////////// - // Timing info; ugly; possibly temporary - ///////////////////////////////////////// -#define TIMING_HACK -#ifdef TIMING_HACK - double jointime; - double gathertime; - double commtime; - double halogtime; - double mergetime; - double spintime; - double comms_bytes; - double gathermtime; - double splicetime; - double nosplicetime; - double t_data; - double t_table; - double calls; - - void ZeroCounters(void) { - gathertime = 0.; - jointime = 0.; - commtime = 0.; - halogtime = 0.; - mergetime = 0.; - spintime = 0.; - gathermtime = 0.; - splicetime = 0.; - nosplicetime = 0.; - t_data = 0.0; - t_table= 0.0; - comms_bytes = 0.; - calls = 0.; - }; - - void Report(void) { -#define PRINTIT(A) \ - std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<_Nprocessors; - RealD NN = _grid->NodeCount(); - - _grid->GlobalSum(commtime); commtime/=NP; - if ( calls > 0. ) { - std::cout << GridLogMessage << " Stencil calls "<1.0){ - PRINTIT(comms_bytes); - PRINTIT(commtime); - std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); u_recv_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); -#ifdef NEW_XYZT_GATHER + for(int l=0;l<2;l++){ - new_simd_recv_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); - new_simd_send_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); + u_simd_recv_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); + u_simd_send_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); } -#else - for(int l=0;lShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object)); - u_simd_send_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object)); - } -#endif PrecomputeByteOffsets(); } @@ -611,7 +571,6 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal offnode = (comm_proc!= 0); } - int wraparound=0; if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) { wraparound = 1; @@ -738,13 +697,11 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal template void HaloExchange(const Lattice &source,compressor &compress) { std::vector > reqs; - calls++; - Mergers.resize(0); - Packets.resize(0); + Prepare(); HaloGather(source,compress); - this->CommunicateBegin(reqs); - this->CommunicateComplete(reqs); - CommsMerge(); + CommunicateBegin(reqs); + CommunicateComplete(reqs); + CommsMerge(compress); } template void HaloGatherDir(const Lattice &source,compressor &compress,int point,int & face_idx) @@ -775,13 +732,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal if ( sshift[0] == sshift[1] ) { if (splice_dim) { splicetime-=usecond(); - // GatherSimd(source,dimension,shift,0x3,compress,face_idx); - // std::cout << "GatherSimdNew"<>1; - int bytes = words * sizeof(cobj); + int bytes = words * compress.CommDatumSize(); - gathertime-=usecond(); int so = sx*rhs._grid->_ostride[dimension]; // base offset for start of plane if ( !face_table_computed ) { - t_table-=usecond(); face_table.resize(face_idx+1); Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table[face_idx]); - // std::cout << " face table size "<CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); - - // loop over outer coord planes orthog to dim - for(int x=0;x= rd ); - - if ( any_offnode ) { - - for(int i=0;i(rhs,spointers,dimension,sx,cbmask,compress); - gathermtime+=usecond(); - - for(int i=0;i2 - // for(int w=0;w : lane " << i <<" elem "<>(permute_type+1)); - int ic= (i&inner_bit)? 1:0; - - int my_coor = rd*ic + x; - int nbr_coor = my_coor+sshift; - int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors - int nbr_lcoor= (nbr_coor%ld); - int nbr_ic = (nbr_lcoor)/rd; // inner coord of peer - int nbr_ox = (nbr_lcoor%rd); // outer coord of peer - int nbr_lane = (i&(~inner_bit)); - - if (nbr_ic) nbr_lane|=inner_bit; - assert (sx == nbr_ox); - - auto rp = &u_simd_recv_buf[i ][u_comm_offset]; - auto sp = &u_simd_send_buf[nbr_lane][u_comm_offset]; - - if(nbr_proc){ - - int recv_from_rank; - int xmit_to_rank; - - _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); - - // shm == receive pointer if offnode - // shm == Translate[send pointer] if on node -- my view of his send pointer - scalar_object *shm = (scalar_object *) _grid->ShmBufferTranslate(recv_from_rank,sp); - if (shm==NULL) { - shm = rp; - } - - // if Direct, StencilSendToRecvFrom will suppress copy to a peer on node - // assuming above pointer flip - AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes); - - rpointers[i] = shm; - - } else { - - rpointers[i] = sp; - - } - } - - AddMerge(&u_recv_buf_p[u_comm_offset],rpointers,buffer_size,Packets.size()-1); - - u_comm_offset +=buffer_size; - } - } - } - - - template - void GatherSimdNew(const Lattice &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx) - { - const int Nsimd = _grid->Nsimd(); - const int maxl =2;// max layout in a direction int fd = _grid->_fdimensions[dimension]; int rd = _grid->_rdimensions[dimension]; @@ -1085,25 +923,18 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal for(int i=0;i > table; - t_table-=usecond(); if ( !face_table_computed ) { face_table.resize(face_idx+1); Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table[face_idx]); - // std::cout << " face table size "<Barrier(); @@ -206,6 +204,33 @@ int main (int argc, char ** argv) Dw.Report(); } + DomainWallFermionRL DwH(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + if (1) { + FGrid->Barrier(); + DwH.ZeroCounters(); + DwH.Dhop(src,result,0); + double t0=usecond(); + for(int i=0;iBarrier(); + + double volume=Ls; for(int mu=0;mu::MooeeInvDag (const FermionField &psi, FermionField & INSTANTIATE_DPERP(GparityWilsonImplD); INSTANTIATE_DPERP(ZWilsonImplF); INSTANTIATE_DPERP(ZWilsonImplD); + + INSTANTIATE_DPERP(WilsonImplFH); + INSTANTIATE_DPERP(WilsonImplDF); + INSTANTIATE_DPERP(GparityWilsonImplFH); + INSTANTIATE_DPERP(GparityWilsonImplDF); + INSTANTIATE_DPERP(ZWilsonImplFH); + INSTANTIATE_DPERP(ZWilsonImplDF); #endif }} diff --git a/lib/qcd/action/fermion/CayleyFermion5Ddense.cc b/lib/qcd/action/fermion/CayleyFermion5Ddense.cc index 16fb47bb..6b53d540 100644 --- a/lib/qcd/action/fermion/CayleyFermion5Ddense.cc +++ b/lib/qcd/action/fermion/CayleyFermion5Ddense.cc @@ -137,6 +137,20 @@ template void CayleyFermion5D::MooeeInternal(const FermionField &ps template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); + +INSTANTIATE_DPERP(GparityWilsonImplFH); +INSTANTIATE_DPERP(GparityWilsonImplDF); +INSTANTIATE_DPERP(WilsonImplFH); +INSTANTIATE_DPERP(WilsonImplDF); +INSTANTIATE_DPERP(ZWilsonImplFH); +INSTANTIATE_DPERP(ZWilsonImplDF); + +template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); +template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); +template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); +template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); +template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); +template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); #endif }} diff --git a/lib/qcd/action/fermion/CayleyFermion5Dssp.cc b/lib/qcd/action/fermion/CayleyFermion5Dssp.cc index a83b962e..cb9b2957 100644 --- a/lib/qcd/action/fermion/CayleyFermion5Dssp.cc +++ b/lib/qcd/action/fermion/CayleyFermion5Dssp.cc @@ -37,7 +37,6 @@ namespace Grid { namespace QCD { // FIXME -- make a version of these routines with site loop outermost for cache reuse. - // Pminus fowards // Pplus backwards template @@ -152,6 +151,13 @@ void CayleyFermion5D::MooeeInvDag (const FermionField &psi, FermionField & INSTANTIATE_DPERP(GparityWilsonImplD); INSTANTIATE_DPERP(ZWilsonImplF); INSTANTIATE_DPERP(ZWilsonImplD); + + INSTANTIATE_DPERP(WilsonImplFH); + INSTANTIATE_DPERP(WilsonImplDF); + INSTANTIATE_DPERP(GparityWilsonImplFH); + INSTANTIATE_DPERP(GparityWilsonImplDF); + INSTANTIATE_DPERP(ZWilsonImplFH); + INSTANTIATE_DPERP(ZWilsonImplDF); #endif } diff --git a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc index 566624e3..653e6ab3 100644 --- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc +++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc @@ -808,10 +808,21 @@ INSTANTIATE_DPERP(DomainWallVec5dImplF); INSTANTIATE_DPERP(ZDomainWallVec5dImplD); INSTANTIATE_DPERP(ZDomainWallVec5dImplF); +INSTANTIATE_DPERP(DomainWallVec5dImplDF); +INSTANTIATE_DPERP(DomainWallVec5dImplFH); +INSTANTIATE_DPERP(ZDomainWallVec5dImplDF); +INSTANTIATE_DPERP(ZDomainWallVec5dImplFH); + template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); +template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); +template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); +template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); +template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); + + }} diff --git a/lib/qcd/action/fermion/Fermion.h b/lib/qcd/action/fermion/Fermion.h index ac42f177..94fbae6c 100644 --- a/lib/qcd/action/fermion/Fermion.h +++ b/lib/qcd/action/fermion/Fermion.h @@ -89,6 +89,10 @@ typedef WilsonFermion WilsonFermionR; typedef WilsonFermion WilsonFermionF; typedef WilsonFermion WilsonFermionD; +typedef WilsonFermion WilsonFermionRL; +typedef WilsonFermion WilsonFermionFH; +typedef WilsonFermion WilsonFermionDF; + typedef WilsonFermion WilsonAdjFermionR; typedef WilsonFermion WilsonAdjFermionF; typedef WilsonFermion WilsonAdjFermionD; @@ -105,27 +109,50 @@ typedef DomainWallFermion DomainWallFermionR; typedef DomainWallFermion DomainWallFermionF; typedef DomainWallFermion DomainWallFermionD; +typedef DomainWallFermion DomainWallFermionRL; +typedef DomainWallFermion DomainWallFermionFH; +typedef DomainWallFermion DomainWallFermionDF; + typedef MobiusFermion MobiusFermionR; typedef MobiusFermion MobiusFermionF; typedef MobiusFermion MobiusFermionD; +typedef MobiusFermion MobiusFermionRL; +typedef MobiusFermion MobiusFermionFH; +typedef MobiusFermion MobiusFermionDF; + typedef ZMobiusFermion ZMobiusFermionR; typedef ZMobiusFermion ZMobiusFermionF; typedef ZMobiusFermion ZMobiusFermionD; +typedef ZMobiusFermion ZMobiusFermionRL; +typedef ZMobiusFermion ZMobiusFermionFH; +typedef ZMobiusFermion ZMobiusFermionDF; + // Ls vectorised typedef DomainWallFermion DomainWallFermionVec5dR; typedef DomainWallFermion DomainWallFermionVec5dF; typedef DomainWallFermion DomainWallFermionVec5dD; +typedef DomainWallFermion DomainWallFermionVec5dRL; +typedef DomainWallFermion DomainWallFermionVec5dFH; +typedef DomainWallFermion DomainWallFermionVec5dDF; + typedef MobiusFermion MobiusFermionVec5dR; typedef MobiusFermion MobiusFermionVec5dF; typedef MobiusFermion MobiusFermionVec5dD; +typedef MobiusFermion MobiusFermionVec5dRL; +typedef MobiusFermion MobiusFermionVec5dFH; +typedef MobiusFermion MobiusFermionVec5dDF; + typedef ZMobiusFermion ZMobiusFermionVec5dR; typedef ZMobiusFermion ZMobiusFermionVec5dF; typedef ZMobiusFermion ZMobiusFermionVec5dD; +typedef ZMobiusFermion ZMobiusFermionVec5dRL; +typedef ZMobiusFermion ZMobiusFermionVec5dFH; +typedef ZMobiusFermion ZMobiusFermionVec5dDF; typedef ScaledShamirFermion ScaledShamirFermionR; typedef ScaledShamirFermion ScaledShamirFermionF; @@ -166,17 +193,35 @@ typedef OverlapWilsonPartialFractionZolotarevFermion OverlapWilsonP typedef WilsonFermion GparityWilsonFermionR; typedef WilsonFermion GparityWilsonFermionF; typedef WilsonFermion GparityWilsonFermionD; + +typedef WilsonFermion GparityWilsonFermionRL; +typedef WilsonFermion GparityWilsonFermionFH; +typedef WilsonFermion GparityWilsonFermionDF; + typedef DomainWallFermion GparityDomainWallFermionR; typedef DomainWallFermion GparityDomainWallFermionF; typedef DomainWallFermion GparityDomainWallFermionD; +typedef DomainWallFermion GparityDomainWallFermionRL; +typedef DomainWallFermion GparityDomainWallFermionFH; +typedef DomainWallFermion GparityDomainWallFermionDF; + typedef WilsonTMFermion GparityWilsonTMFermionR; typedef WilsonTMFermion GparityWilsonTMFermionF; typedef WilsonTMFermion GparityWilsonTMFermionD; + +typedef WilsonTMFermion GparityWilsonTMFermionRL; +typedef WilsonTMFermion GparityWilsonTMFermionFH; +typedef WilsonTMFermion GparityWilsonTMFermionDF; + typedef MobiusFermion GparityMobiusFermionR; typedef MobiusFermion GparityMobiusFermionF; typedef MobiusFermion GparityMobiusFermionD; +typedef MobiusFermion GparityMobiusFermionRL; +typedef MobiusFermion GparityMobiusFermionFH; +typedef MobiusFermion GparityMobiusFermionDF; + typedef ImprovedStaggeredFermion ImprovedStaggeredFermionR; typedef ImprovedStaggeredFermion ImprovedStaggeredFermionF; typedef ImprovedStaggeredFermion ImprovedStaggeredFermionD; diff --git a/lib/qcd/action/fermion/FermionCore.h b/lib/qcd/action/fermion/FermionCore.h index 74d94d67..17006961 100644 --- a/lib/qcd/action/fermion/FermionCore.h +++ b/lib/qcd/action/fermion/FermionCore.h @@ -55,7 +55,14 @@ Author: Peter Boyle template class A; \ template class A; \ template class A; \ - template class A; + template class A; \ + template class A; \ + template class A; \ + template class A; \ + template class A; \ + template class A; \ + template class A; + #define AdjointFermOpTemplateInstantiate(A) \ template class A; \ @@ -69,7 +76,11 @@ Author: Peter Boyle template class A; \ template class A; \ template class A; \ - template class A; + template class A; \ + template class A; \ + template class A; \ + template class A; \ + template class A; #define FermOpTemplateInstantiate(A) \ FermOp4dVecTemplateInstantiate(A) \ diff --git a/lib/qcd/action/fermion/FermionOperatorImpl.h b/lib/qcd/action/fermion/FermionOperatorImpl.h index 0f318366..f6b47063 100644 --- a/lib/qcd/action/fermion/FermionOperatorImpl.h +++ b/lib/qcd/action/fermion/FermionOperatorImpl.h @@ -874,17 +874,17 @@ typedef WilsonImpl WilsonImplR typedef WilsonImpl WilsonImplF; // Float typedef WilsonImpl WilsonImplD; // Double -typedef WilsonImpl WilsonImplRH; // Real.. whichever prec +typedef WilsonImpl WilsonImplRL; // Real.. whichever prec typedef WilsonImpl WilsonImplFH; // Float -typedef WilsonImpl WilsonImplDH; // Double +typedef WilsonImpl WilsonImplDF; // Double typedef WilsonImpl ZWilsonImplR; // Real.. whichever prec typedef WilsonImpl ZWilsonImplF; // Float typedef WilsonImpl ZWilsonImplD; // Double -typedef WilsonImpl ZWilsonImplRH; // Real.. whichever prec +typedef WilsonImpl ZWilsonImplRL; // Real.. whichever prec typedef WilsonImpl ZWilsonImplFH; // Float -typedef WilsonImpl ZWilsonImplDH; // Double +typedef WilsonImpl ZWilsonImplDF; // Double typedef WilsonImpl WilsonAdjImplR; // Real.. whichever prec typedef WilsonImpl WilsonAdjImplF; // Float @@ -898,25 +898,25 @@ typedef DomainWallVec5dImpl DomainWallVec5dImplR; // Re typedef DomainWallVec5dImpl DomainWallVec5dImplF; // Float typedef DomainWallVec5dImpl DomainWallVec5dImplD; // Double -typedef DomainWallVec5dImpl DomainWallVec5dImplRH; // Real.. whichever prec +typedef DomainWallVec5dImpl DomainWallVec5dImplRL; // Real.. whichever prec typedef DomainWallVec5dImpl DomainWallVec5dImplFH; // Float -typedef DomainWallVec5dImpl DomainWallVec5dImplDH; // Double +typedef DomainWallVec5dImpl DomainWallVec5dImplDF; // Double typedef DomainWallVec5dImpl ZDomainWallVec5dImplR; // Real.. whichever prec typedef DomainWallVec5dImpl ZDomainWallVec5dImplF; // Float typedef DomainWallVec5dImpl ZDomainWallVec5dImplD; // Double -typedef DomainWallVec5dImpl ZDomainWallVec5dImplRH; // Real.. whichever prec +typedef DomainWallVec5dImpl ZDomainWallVec5dImplRL; // Real.. whichever prec typedef DomainWallVec5dImpl ZDomainWallVec5dImplFH; // Float -typedef DomainWallVec5dImpl ZDomainWallVec5dImplDH; // Double +typedef DomainWallVec5dImpl ZDomainWallVec5dImplDF; // Double typedef GparityWilsonImpl GparityWilsonImplR; // Real.. whichever prec typedef GparityWilsonImpl GparityWilsonImplF; // Float typedef GparityWilsonImpl GparityWilsonImplD; // Double -typedef GparityWilsonImpl GparityWilsonImplRH; // Real.. whichever prec +typedef GparityWilsonImpl GparityWilsonImplRL; // Real.. whichever prec typedef GparityWilsonImpl GparityWilsonImplFH; // Float -typedef GparityWilsonImpl GparityWilsonImplDH; // Double +typedef GparityWilsonImpl GparityWilsonImplDF; // Double typedef StaggeredImpl StaggeredImplR; // Real.. whichever prec typedef StaggeredImpl StaggeredImplF; // Float diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index d29142d9..d021b35c 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -679,7 +679,6 @@ void WilsonFermion5D::MomentumSpacePropagatorHw(FermionField &out,const Fe } - FermOpTemplateInstantiate(WilsonFermion5D); GparityFermOpTemplateInstantiate(WilsonFermion5D); diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc index 365be69a..cd5d2430 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc +++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc @@ -112,5 +112,16 @@ INSTANTIATE_ASM(DomainWallVec5dImplD); INSTANTIATE_ASM(ZDomainWallVec5dImplF); INSTANTIATE_ASM(ZDomainWallVec5dImplD); +INSTANTIATE_ASM(WilsonImplFH); +INSTANTIATE_ASM(WilsonImplDF); +INSTANTIATE_ASM(ZWilsonImplFH); +INSTANTIATE_ASM(ZWilsonImplDF); +INSTANTIATE_ASM(GparityWilsonImplFH); +INSTANTIATE_ASM(GparityWilsonImplDF); +INSTANTIATE_ASM(DomainWallVec5dImplFH); +INSTANTIATE_ASM(DomainWallVec5dImplDF); +INSTANTIATE_ASM(ZDomainWallVec5dImplFH); +INSTANTIATE_ASM(ZDomainWallVec5dImplDF); + }} diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/lib/qcd/action/fermion/WilsonKernelsHand.cc index 0a60c107..b7923b9f 100644 --- a/lib/qcd/action/fermion/WilsonKernelsHand.cc +++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc @@ -829,6 +829,36 @@ WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder assert(0); } +template<> void +WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, + SiteHalfSpinor *buf, + int sF,int sU,const FermionField &in, FermionField &out,int internal,int external) +{ + assert(0); +} + +template<> void +WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, + SiteHalfSpinor *buf, + int sF,int sU,const FermionField &in, FermionField &out,int internal,int external) +{ + assert(0); +} + +template<> void +WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int sF,int sU,const FermionField &in, FermionField &out,int internal,int external) +{ + assert(0); +} + +template<> void +WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int sF,int sU,const FermionField &in, FermionField &out,int internal,int external) +{ + assert(0); +} + ////////////// Wilson ; uses this implementation ///////////////////// @@ -850,5 +880,15 @@ INSTANTIATE_THEM(DomainWallVec5dImplF); INSTANTIATE_THEM(DomainWallVec5dImplD); INSTANTIATE_THEM(ZDomainWallVec5dImplF); INSTANTIATE_THEM(ZDomainWallVec5dImplD); +INSTANTIATE_THEM(WilsonImplFH); +INSTANTIATE_THEM(WilsonImplDF); +INSTANTIATE_THEM(ZWilsonImplFH); +INSTANTIATE_THEM(ZWilsonImplDF); +INSTANTIATE_THEM(GparityWilsonImplFH); +INSTANTIATE_THEM(GparityWilsonImplDF); +INSTANTIATE_THEM(DomainWallVec5dImplFH); +INSTANTIATE_THEM(DomainWallVec5dImplDF); +INSTANTIATE_THEM(ZDomainWallVec5dImplFH); +INSTANTIATE_THEM(ZDomainWallVec5dImplDF); }} diff --git a/lib/stencil/Stencil.h b/lib/stencil/Stencil.h index 268331fa..d11afd4b 100644 --- a/lib/stencil/Stencil.h +++ b/lib/stencil/Stencil.h @@ -901,8 +901,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal int reduced_buffer_size = buffer_size; if (cbmask != 0x3) reduced_buffer_size=buffer_size>>1; - int bytes = (reduced_buffer_size*sizeof(cobj))/simd_layout; - assert(bytes*simd_layout == reduced_buffer_size*sizeof(cobj)); + int datum_bytes = compress.CommDatumSize(); + int bytes = (reduced_buffer_size*datum_bytes)/simd_layout; + assert(bytes*simd_layout == reduced_buffer_size*datum_bytes); std::vector rpointers(maxl); std::vector spointers(maxl); diff --git a/lib/tensors/Tensor_traits.h b/lib/tensors/Tensor_traits.h index 80009391..af82de72 100644 --- a/lib/tensors/Tensor_traits.h +++ b/lib/tensors/Tensor_traits.h @@ -142,6 +142,17 @@ namespace Grid { typedef vRealD Realified; enum { TensorLevel = 0 }; }; + template<> class GridTypeMapper { + public: + typedef ComplexF scalar_type; + typedef vComplexH vector_type; + typedef vComplexD vector_typeD; + typedef vComplexH tensor_reduced; + typedef ComplexF scalar_object; + typedef vComplexH Complexified; + typedef vRealH Realified; + enum { TensorLevel = 0 }; + }; template<> class GridTypeMapper { public: typedef ComplexF scalar_type; From d2312e9874a12783dd85800054cb36482d576665 Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 20 Apr 2017 13:16:55 +0100 Subject: [PATCH 06/31] Drop compressor entirely from Cshift to only Stencil. --- lib/stencil/Stencil.h | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/stencil/Stencil.h b/lib/stencil/Stencil.h index d11afd4b..81138fc6 100644 --- a/lib/stencil/Stencil.h +++ b/lib/stencil/Stencil.h @@ -28,6 +28,7 @@ #ifndef GRID_STENCIL_H #define GRID_STENCIL_H +#include // subdir aggregate #include // subdir aggregate ////////////////////////////////////////////////////////////////////////////////////////// From 957a706d0b9a11e5d48633f474ed708bffca9155 Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 20 Apr 2017 13:17:44 +0100 Subject: [PATCH 07/31] Useful script --- scripts/grep-global | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100755 scripts/grep-global diff --git a/scripts/grep-global b/scripts/grep-global new file mode 100755 index 00000000..a95a0549 --- /dev/null +++ b/scripts/grep-global @@ -0,0 +1,6 @@ +#!/bin/bash + +export LANG=C +find . -name "*.cc" -exec grep -H $@ {} \; +find . -name "*.h" -exec grep -H $@ {} \; + From 180c732b4c2db59d5cd13903f1395ef261644805 Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 20 Apr 2017 13:17:55 +0100 Subject: [PATCH 08/31] Move compressors out of Cshift. Slice iterators would help --- lib/cshift/Cshift_common.h | 67 ++++++++------------------------------ lib/cshift/Cshift_mpi.h | 15 +-------- 2 files changed, 14 insertions(+), 68 deletions(-) diff --git a/lib/cshift/Cshift_common.h b/lib/cshift/Cshift_common.h index 956f6cf1..1be672e8 100644 --- a/lib/cshift/Cshift_common.h +++ b/lib/cshift/Cshift_common.h @@ -30,33 +30,11 @@ Author: Peter Boyle namespace Grid { -template -class SimpleCompressor { -public: - void Point(int) {}; - inline int CommDatumSize(void) { return sizeof(vobj); } - inline bool DecompressionStep(void) { return false; } - inline void Compress(vobj *buf,int o,const vobj &in) { buf[o]=in; } - inline void Exchange(vobj *mp,vobj *vp0,vobj *vp1,Integer type,Integer o){ - exchange(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type); - } - inline void Decompress(vobj *out,vobj *in, int o){ assert(0); } - inline void CompressExchange(vobj *out0,vobj *out1,const vobj *in, - int j,int k, int m,int type){ - exchange(out0[j],out1[j],in[k],in[m],type); - } - // For cshift. Cshift should drop compressor coupling altogether - // because I had to decouple the code from the Stencil anyway - inline vobj operator() (const vobj &arg) { - return arg; - } -}; - /////////////////////////////////////////////////////////////////// -// Gather for when there is no need to SIMD split with compression +// Gather for when there is no need to SIMD split /////////////////////////////////////////////////////////////////// -template void -Gather_plane_simple (const Lattice &rhs,commVector &buffer,int dimension,int plane,int cbmask,compressor &compress, int off=0) +template void +Gather_plane_simple (const Lattice &rhs,commVector &buffer,int dimension,int plane,int cbmask, int off=0) { int rd = rhs._grid->_rdimensions[dimension]; @@ -74,7 +52,7 @@ Gather_plane_simple (const Lattice &rhs,commVector &buffer,int dimen for(int b=0;b &rhs,commVector &buffer,int dimen } } parallel_for(int i=0;i void -Gather_plane_extract(const Lattice &rhs,std::vector pointers,int dimension,int plane,int cbmask,compressor &compress) +template void +Gather_plane_extract(const Lattice &rhs,std::vector pointers,int dimension,int plane,int cbmask) { int rd = rhs._grid->_rdimensions[dimension]; @@ -121,8 +98,8 @@ Gather_plane_extract(const Lattice &rhs,std::vector(temp,pointers,offset); + vobj temp =rhs._odata[so+o+b]; + extract(temp,pointers,offset); } } @@ -139,32 +116,14 @@ Gather_plane_extract(const Lattice &rhs,std::vector(temp,pointers,offset); + vobj temp =rhs._odata[so+o+b]; + extract(temp,pointers,offset); } } } } } -////////////////////////////////////////////////////// -// Gather for when there is no need to SIMD split -////////////////////////////////////////////////////// -template void Gather_plane_simple (const Lattice &rhs,commVector &buffer, int dimension,int plane,int cbmask) -{ - SimpleCompressor dontcompress; - Gather_plane_simple (rhs,buffer,dimension,plane,cbmask,dontcompress); -} - -////////////////////////////////////////////////////// -// Gather for when there *is* need to SIMD split -////////////////////////////////////////////////////// -template void Gather_plane_extract(const Lattice &rhs,std::vector pointers,int dimension,int plane,int cbmask) -{ - SimpleCompressor dontcompress; - Gather_plane_extract(rhs,pointers,dimension,plane,cbmask,dontcompress); -} - ////////////////////////////////////////////////////// // Scatter for when there is no need to SIMD split ////////////////////////////////////////////////////// @@ -212,7 +171,7 @@ template void Scatter_plane_simple (Lattice &rhs,commVector void Scatter_plane_merge(Lattice &rhs,std::vector pointers,int dimension,int plane,int cbmask) +template void Scatter_plane_merge(Lattice &rhs,std::vector pointers,int dimension,int plane,int cbmask) { int rd = rhs._grid->_rdimensions[dimension]; diff --git a/lib/cshift/Cshift_mpi.h b/lib/cshift/Cshift_mpi.h index b2a44961..a66b49bf 100644 --- a/lib/cshift/Cshift_mpi.h +++ b/lib/cshift/Cshift_mpi.h @@ -154,13 +154,7 @@ template void Cshift_comms(Lattice &ret,const Lattice &r recv_from_rank, bytes); grid->Barrier(); - /* - for(int i=0;i void Cshift_comms_simd(Lattice &ret,const LatticeBarrier(); rpointers[i] = &recv_buf_extract[i][0]; } else { From e1a2319d016d6ec415067a16c512bf0404ed7c6c Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 20 Apr 2017 13:18:15 +0100 Subject: [PATCH 09/31] Simple compressor moved out of cshift into stencil --- lib/stencil/SimpleCompressor.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 lib/stencil/SimpleCompressor.h diff --git a/lib/stencil/SimpleCompressor.h b/lib/stencil/SimpleCompressor.h new file mode 100644 index 00000000..58c3bcd2 --- /dev/null +++ b/lib/stencil/SimpleCompressor.h @@ -0,0 +1,29 @@ +#ifndef _STENCIL_SIMPLE_COMPRESSOR_H_ +#define _STENCIL_SIMPLE_COMPRESSOR_H_ + +namespace Grid { + +template +class SimpleCompressor { +public: + void Point(int) {}; + inline int CommDatumSize(void) { return sizeof(vobj); } + inline bool DecompressionStep(void) { return false; } + inline void Compress(vobj *buf,int o,const vobj &in) { buf[o]=in; } + inline void Exchange(vobj *mp,vobj *vp0,vobj *vp1,Integer type,Integer o){ + exchange(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type); + } + inline void Decompress(vobj *out,vobj *in, int o){ assert(0); } + inline void CompressExchange(vobj *out0,vobj *out1,const vobj *in, + int j,int k, int m,int type){ + exchange(out0[j],out1[j],in[k],in[m],type); + } + // For cshift. Cshift should drop compressor coupling altogether + // because I had to decouple the code from the Stencil anyway + inline vobj operator() (const vobj &arg) { + return arg; + } +}; + +} +#endif From 3844bcf80065f878c35c900acea2d78bec3bc511 Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 20 Apr 2017 15:30:52 +0100 Subject: [PATCH 10/31] If no f16c instructions supported must use software half precision conversion. This will also become useful on BG/Q, so will move out from SSE4 into a general area. Lifted the Eigen half precision from web. Looks sensible, but not extensively regressed against the intrinsics implementation yet. --- configure.ac | 26 ++++++---- lib/simd/Grid_sse4.h | 111 +++++++++++++++++++++++++++++++++++++------ 2 files changed, 113 insertions(+), 24 deletions(-) diff --git a/configure.ac b/configure.ac index f18beb32..81d38ae9 100644 --- a/configure.ac +++ b/configure.ac @@ -84,16 +84,15 @@ case ${ac_LAPACK} in esac ############### FP16 conversions -AC_ARG_ENABLE([fp16], - [AC_HELP_STRING([--enable-fp16=yes|no], [enable fp16 comms])], - [ac_FP16=${enable_fp16}], [ac_FP16=no]) -case ${ac_FP16} in - no) - ;; +AC_ARG_ENABLE([sfw-fp16], + [AC_HELP_STRING([--enable-sfw-fp16=yes|no], [enable software fp16 comms])], + [ac_SFW_FP16=${enable_sfw_fp16}], [ac_SFW_FP16=yes]) +case ${ac_SFW_FP16} in yes) - AC_DEFINE([USE_FP16],[1],[conversion to fp16]);; + AC_DEFINE([SFW_FP16],[1],[software conversion to fp16]);; + no);; *) - ;; + AC_MSG_ERROR(["SFW FP16 option not supported ${ac_SFW_FP16}"]);; esac ############### MKL @@ -189,7 +188,14 @@ case ${ax_cv_cxx_compiler_vendor} in case ${ac_SIMD} in SSE4) AC_DEFINE([SSE4],[1],[SSE4 intrinsics]) - SIMD_FLAGS='-msse4.2';; + case ${ac_SFW_FP16} in + yes) + SIMD_FLAGS='-msse4.2';; + no) + SIMD_FLAGS='-msse4.2 -mf16c';; + *) + AC_MSG_ERROR(["SFW_FP16 must be either yes or no value ${ac_SFW_FP16} "]);; + esac;; AVX) AC_DEFINE([AVX1],[1],[AVX intrinsics]) SIMD_FLAGS='-mavx -mf16c';; @@ -423,7 +429,6 @@ AC_OUTPUT echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Summary of configuration for $PACKAGE v$VERSION ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ----- PLATFORM ---------------------------------------- architecture (build) : $build_cpu os (build) : $build_os @@ -436,6 +441,7 @@ SIMD : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG} Threading : ${ac_openmp} Communications type : ${comms_type} Default precision : ${ac_PRECISION} +Software FP16 conversion : ${ac_SFW_FP16} RNG choice : ${ac_RNG} GMP : `if test "x$have_gmp" = xtrue; then echo yes; else echo no; fi` LAPACK : ${ac_LAPACK} diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h index 8a4537c2..59ad996a 100644 --- a/lib/simd/Grid_sse4.h +++ b/lib/simd/Grid_sse4.h @@ -38,7 +38,6 @@ Author: neo #include - namespace Grid { namespace Optimization { @@ -333,26 +332,110 @@ namespace Optimization { #define _my_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16) #define _my_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16) +#ifdef SFW_FP16 + + struct Grid_half { + Grid_half(){} + Grid_half(uint16_t raw) : x(raw) {} + uint16_t x; + }; + union FP32 { + unsigned int u; + float f; + }; + + // PAB - Lifted and adapted from Eigen, which is GPL V2 + inline float sfw_half_to_float(Grid_half h) { + const FP32 magic = { 113 << 23 }; + const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift + FP32 o; + o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits + unsigned int exp = shifted_exp & o.u; // just the exponent + o.u += (127 - 15) << 23; // exponent adjust + // handle exponent special cases + if (exp == shifted_exp) { // Inf/NaN? + o.u += (128 - 16) << 23; // extra exp adjust + } else if (exp == 0) { // Zero/Denormal? + o.u += 1 << 23; // extra exp adjust + o.f -= magic.f; // renormalize + } + o.u |= (h.x & 0x8000) << 16; // sign bit + return o.f; + } + inline Grid_half sfw_float_to_half(float ff) { + FP32 f; f.f = ff; + const FP32 f32infty = { 255 << 23 }; + const FP32 f16max = { (127 + 16) << 23 }; + const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 }; + unsigned int sign_mask = 0x80000000u; + Grid_half o; + + o.x = static_cast(0x0u); + unsigned int sign = f.u & sign_mask; + f.u ^= sign; + // NOTE all the integer compares in this function can be safely + // compiled into signed compares since all operands are below + // 0x80000000. Important if you want fast straight SSE2 code + // (since there's no unsigned PCMPGTD). + if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set) + o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf + } else { // (De)normalized number or zero + if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero + // use a magic value to align our 10 mantissa bits at the bottom of + // the float. as long as FP addition is round-to-nearest-even this + // just works. + f.f += denorm_magic.f; + // and one integer subtract of the bias later, we have our final float! + o.x = static_cast(f.u - denorm_magic.u); + } else { + unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd + + // update exponent, rounding bias part 1 + f.u += ((unsigned int)(15 - 127) << 23) + 0xfff; + // rounding bias part 2 + f.u += mant_odd; + // take the bits! + o.x = static_cast(f.u >> 13); + } + } + o.x |= static_cast(sign >> 16); + return o; + } + static inline __m128i Grid_mm_cvtps_ph(__m128 f,int discard) { + __m128i ret=(__m128i)_mm_setzero_ps(); + float *fp = (float *)&f; + Grid_half *hp = (Grid_half *)&ret; + hp[0] = sfw_float_to_half(fp[0]); + hp[1] = sfw_float_to_half(fp[1]); + hp[2] = sfw_float_to_half(fp[2]); + hp[3] = sfw_float_to_half(fp[3]); + return ret; + } + static inline __m128i Grid_mm_cvtph_ps(__m128i h,int discard) { + __m128 ret=_mm_setzero_ps(); + float *fp = (float *)&ret; + Grid_half *hp = (Grid_half *)&h; + fp[0] = sfw_half_to_float(hp[0]); + fp[1] = sfw_half_to_float(hp[1]); + fp[2] = sfw_half_to_float(hp[2]); + fp[3] = sfw_half_to_float(hp[3]); + return ret; + } +#else +#define Grid_mm_cvtps_ph _mm_cvtps_ph +#define Grid_mm_cvtph_ps _mm_cvtph_ps +#endif struct PrecisionChange { static inline __m128i StoH (__m128 a,__m128 b) { -#ifdef USE_FP16 - __m128i ha = _mm_cvtps_ph(a,0); - __m128i hb = _mm_cvtps_ph(b,0); + __m128i ha = Grid_mm_cvtps_ph(a,0); + __m128i hb = Grid_mm_cvtps_ph(b,0); __m128i h =(__m128i) _mm_shuffle_ps((__m128)ha,(__m128)hb,_MM_SELECT_FOUR_FOUR(1,0,1,0)); -#else - __m128i h = (__m128i)a; - assert(0); -#endif return h; } static inline void HtoS (__m128i h,__m128 &sa,__m128 &sb) { -#ifdef USE_FP16 - sa = _mm_cvtph_ps(h); + sa = Grid_mm_cvtph_ps(h,0); h = (__m128i)_my_alignr_epi32((__m128i)h,(__m128i)h,2); - sb = _mm_cvtph_ps(h); -#else - assert(0); -#endif + sb = Grid_mm_cvtph_ps(h,0); } static inline __m128 DtoS (__m128d a,__m128d b) { __m128 sa = _mm_cvtpd_ps(a); From b9bbe5d188ca785c4fee6f8d3a3f7370b6db7aba Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 22 Apr 2017 11:33:09 +0100 Subject: [PATCH 11/31] L1p config bg/q --- lib/simd/l1p.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 lib/simd/l1p.h diff --git a/lib/simd/l1p.h b/lib/simd/l1p.h new file mode 100644 index 00000000..8e43fdbd --- /dev/null +++ b/lib/simd/l1p.h @@ -0,0 +1,37 @@ +#pragma once +namespace Grid { +// L1p optimisation +inline void bgq_l1p_optimisation(int mode) +{ +#ifdef QPX +#undef L1P_CFG_PF_USR +#define L1P_CFG_PF_USR (0x3fde8000108ll) /* (64 bit reg, 23 bits wide, user/unpriv) */ + + uint64_t cfg_pf_usr; + if ( mode ) { + cfg_pf_usr = + L1P_CFG_PF_USR_ifetch_depth(0) + | L1P_CFG_PF_USR_ifetch_max_footprint(1) + | L1P_CFG_PF_USR_pf_stream_est_on_dcbt + | L1P_CFG_PF_USR_pf_stream_establish_enable + | L1P_CFG_PF_USR_pf_stream_optimistic + | L1P_CFG_PF_USR_pf_adaptive_throttle(0xF) ; + // if ( sizeof(Float) == sizeof(double) ) { + cfg_pf_usr |= L1P_CFG_PF_USR_dfetch_depth(2)| L1P_CFG_PF_USR_dfetch_max_footprint(3) ; + // } else { + // cfg_pf_usr |= L1P_CFG_PF_USR_dfetch_depth(1)| L1P_CFG_PF_USR_dfetch_max_footprint(2) ; + // } + } else { + cfg_pf_usr = L1P_CFG_PF_USR_dfetch_depth(1) + | L1P_CFG_PF_USR_dfetch_max_footprint(2) + | L1P_CFG_PF_USR_ifetch_depth(0) + | L1P_CFG_PF_USR_ifetch_max_footprint(1) + | L1P_CFG_PF_USR_pf_stream_est_on_dcbt + | L1P_CFG_PF_USR_pf_stream_establish_enable + | L1P_CFG_PF_USR_pf_stream_optimistic + | L1P_CFG_PF_USR_pf_stream_prefetch_enable; + } + *((uint64_t *)L1P_CFG_PF_USR) = cfg_pf_usr; +#endif +} +} From 736bf3c8664f83b613a8160734eb1fc58e3c0400 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 22 Apr 2017 11:33:50 +0100 Subject: [PATCH 12/31] Major rework of stencil. Half precision and MPI3 now working. --- lib/qcd/action/fermion/WilsonCompressor.h | 42 +- lib/qcd/action/fermion/WilsonFermion5D.cc | 52 +- lib/qcd/action/fermion/WilsonKernels.cc | 680 ++++++------------- lib/qcd/action/fermion/WilsonKernels.h | 69 +- lib/qcd/action/fermion/WilsonKernelsHand.cc | 618 ++++------------- lib/simd/Grid_qpx.h | 8 + lib/simd/Grid_sse4.h | 2 +- lib/simd/Grid_vector_types.h | 8 +- lib/stencil/Stencil.h | 692 ++++++++++---------- 9 files changed, 787 insertions(+), 1384 deletions(-) diff --git a/lib/qcd/action/fermion/WilsonCompressor.h b/lib/qcd/action/fermion/WilsonCompressor.h index f1ad390d..59fcc1f8 100644 --- a/lib/qcd/action/fermion/WilsonCompressor.h +++ b/lib/qcd/action/fermion/WilsonCompressor.h @@ -241,13 +241,18 @@ public: typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; + std::vector same_node; + WilsonStencil(GridBase *grid, int npoints, int checkerboard, const std::vector &directions, const std::vector &distances) - : CartesianStencil (grid,npoints,checkerboard,directions,distances) - { /*Do nothing*/ }; + : CartesianStencil (grid,npoints,checkerboard,directions,distances) , + same_node(npoints) + { + assert(npoints==8);// or 10 if do naive DWF 5d red black ? + }; template < class compressor> void HaloExchangeOpt(const Lattice &source,compressor &compress) @@ -257,6 +262,7 @@ public: this->CommunicateBegin(reqs); this->CommunicateComplete(reqs); this->CommsMerge(compress); + this->CommsMergeSHM(compress); } template @@ -295,23 +301,23 @@ public: int face_idx=0; if ( dag ) { // std::cout << " Optimised Dagger compress " <HaloGatherDir(source,XpCompress,Xp,face_idx); - this->HaloGatherDir(source,YpCompress,Yp,face_idx); - this->HaloGatherDir(source,ZpCompress,Zp,face_idx); - this->HaloGatherDir(source,TpCompress,Tp,face_idx); - this->HaloGatherDir(source,XmCompress,Xm,face_idx); - this->HaloGatherDir(source,YmCompress,Ym,face_idx); - this->HaloGatherDir(source,ZmCompress,Zm,face_idx); - this->HaloGatherDir(source,TmCompress,Tm,face_idx); + same_node[Xp]=this->HaloGatherDir(source,XpCompress,Xp,face_idx); + same_node[Yp]=this->HaloGatherDir(source,YpCompress,Yp,face_idx); + same_node[Zp]=this->HaloGatherDir(source,ZpCompress,Zp,face_idx); + same_node[Tp]=this->HaloGatherDir(source,TpCompress,Tp,face_idx); + same_node[Xm]=this->HaloGatherDir(source,XmCompress,Xm,face_idx); + same_node[Ym]=this->HaloGatherDir(source,YmCompress,Ym,face_idx); + same_node[Zm]=this->HaloGatherDir(source,ZmCompress,Zm,face_idx); + same_node[Tm]=this->HaloGatherDir(source,TmCompress,Tm,face_idx); } else { - this->HaloGatherDir(source,XmCompress,Xp,face_idx); - this->HaloGatherDir(source,YmCompress,Yp,face_idx); - this->HaloGatherDir(source,ZmCompress,Zp,face_idx); - this->HaloGatherDir(source,TmCompress,Tp,face_idx); - this->HaloGatherDir(source,XpCompress,Xm,face_idx); - this->HaloGatherDir(source,YpCompress,Ym,face_idx); - this->HaloGatherDir(source,ZpCompress,Zm,face_idx); - this->HaloGatherDir(source,TpCompress,Tm,face_idx); + same_node[Xp]=this->HaloGatherDir(source,XmCompress,Xp,face_idx); + same_node[Yp]=this->HaloGatherDir(source,YmCompress,Yp,face_idx); + same_node[Zp]=this->HaloGatherDir(source,ZmCompress,Zp,face_idx); + same_node[Tp]=this->HaloGatherDir(source,TmCompress,Tp,face_idx); + same_node[Xm]=this->HaloGatherDir(source,XpCompress,Xm,face_idx); + same_node[Ym]=this->HaloGatherDir(source,YpCompress,Ym,face_idx); + same_node[Zm]=this->HaloGatherDir(source,ZpCompress,Zm,face_idx); + same_node[Tm]=this->HaloGatherDir(source,TpCompress,Tm,face_idx); } this->face_table_computed=1; assert(this->u_comm_offset==this->_unified_buffer_size); diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index d021b35c..9d325ed5 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -118,48 +118,6 @@ WilsonFermion5D::WilsonFermion5D(GaugeField &_Umu, // Allocate the required comms buffer ImportGauge(_Umu); } - /* -template -WilsonFermion5D::WilsonFermion5D(int simd,GaugeField &_Umu, - GridCartesian &FiveDimGrid, - GridRedBlackCartesian &FiveDimRedBlackGrid, - GridCartesian &FourDimGrid, - RealD _M5,const ImplParams &p) : -{ - int nsimd = Simd::Nsimd(); - - // some assertions - assert(FiveDimGrid._ndimension==5); - assert(FiveDimRedBlackGrid._ndimension==5); - assert(FiveDimRedBlackGrid._checker_dim==0); // Checkerboard the s-direction - assert(FourDimGrid._ndimension==4); - - // Dimension zero of the five-d is the Ls direction - Ls=FiveDimGrid._fdimensions[0]; - assert(FiveDimGrid._processors[0] ==1); - assert(FiveDimGrid._simd_layout[0] ==nsimd); - - assert(FiveDimRedBlackGrid._fdimensions[0]==Ls); - assert(FiveDimRedBlackGrid._processors[0] ==1); - assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd); - - // Other dimensions must match the decomposition of the four-D fields - for(int d=0;d<4;d++){ - assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]); - assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]); - - assert(FourDimGrid._simd_layout[d]=1); - assert(FiveDimRedBlackGrid._simd_layout[d+1]==1); - - assert(FiveDimGrid._fdimensions[d+1] ==FourDimGrid._fdimensions[d]); - assert(FiveDimGrid._processors[d+1] ==FourDimGrid._processors[d]); - assert(FiveDimGrid._simd_layout[d+1] ==FourDimGrid._simd_layout[d]); - } - - { - } -} - */ template void WilsonFermion5D::Report(void) @@ -415,6 +373,10 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg DhopFaceTime+=usecond(); std::vector > reqs; + // Rely on async comms; start comms before merge of local data + st.CommunicateBegin(reqs); + st.CommsMergeSHM(compressor); + #pragma omp parallel { int nthreads = omp_get_num_threads(); @@ -426,7 +388,6 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg if ( me == 0 ) { DhopCommTime-=usecond(); - st.CommunicateBegin(reqs); st.CommunicateComplete(reqs); DhopCommTime+=usecond(); } else { @@ -442,10 +403,13 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg st.CommsMerge(compressor); DhopFaceTime+=usecond(); + // Load imbalance alert. Should use dynamic schedule OMP for loop + // Perhaps create a list of only those sites with face work, and + // load balance process the list. #pragma omp parallel { int nthreads = omp_get_num_threads(); - int me = omp_get_thread_num(); + int me = omp_get_thread_num(); int myoff, mywork; GridThread::GetWork(len,me,mywork,myoff,nthreads); diff --git a/lib/qcd/action/fermion/WilsonKernels.cc b/lib/qcd/action/fermion/WilsonKernels.cc index 6e72e089..6ed350cf 100644 --- a/lib/qcd/action/fermion/WilsonKernels.cc +++ b/lib/qcd/action/fermion/WilsonKernels.cc @@ -36,62 +36,78 @@ namespace QCD { int WilsonKernelsStatic::Opt = WilsonKernelsStatic::OptGeneric; int WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsAndCompute; -#ifdef QPX -#include -#include -#include -#include -#endif - -void bgq_l1p_optimisation(int mode) -{ -#ifdef QPX -#undef L1P_CFG_PF_USR -#define L1P_CFG_PF_USR (0x3fde8000108ll) /* (64 bit reg, 23 bits wide, user/unpriv) */ - - uint64_t cfg_pf_usr; - if ( mode ) { - cfg_pf_usr = - L1P_CFG_PF_USR_ifetch_depth(0) - | L1P_CFG_PF_USR_ifetch_max_footprint(1) - | L1P_CFG_PF_USR_pf_stream_est_on_dcbt - | L1P_CFG_PF_USR_pf_stream_establish_enable - | L1P_CFG_PF_USR_pf_stream_optimistic - | L1P_CFG_PF_USR_pf_adaptive_throttle(0xF) ; - // if ( sizeof(Float) == sizeof(double) ) { - cfg_pf_usr |= L1P_CFG_PF_USR_dfetch_depth(2)| L1P_CFG_PF_USR_dfetch_max_footprint(3) ; - // } else { - // cfg_pf_usr |= L1P_CFG_PF_USR_dfetch_depth(1)| L1P_CFG_PF_USR_dfetch_max_footprint(2) ; - // } - } else { - cfg_pf_usr = L1P_CFG_PF_USR_dfetch_depth(1) - | L1P_CFG_PF_USR_dfetch_max_footprint(2) - | L1P_CFG_PF_USR_ifetch_depth(0) - | L1P_CFG_PF_USR_ifetch_max_footprint(1) - | L1P_CFG_PF_USR_pf_stream_est_on_dcbt - | L1P_CFG_PF_USR_pf_stream_establish_enable - | L1P_CFG_PF_USR_pf_stream_optimistic - | L1P_CFG_PF_USR_pf_stream_prefetch_enable; - } - *((uint64_t *)L1P_CFG_PF_USR) = cfg_pf_usr; - -#endif - -} - - template WilsonKernels::WilsonKernels(const ImplParams &p) : Base(p){}; //////////////////////////////////////////// // Generic implementation; move to different file? //////////////////////////////////////////// + +#define GENERIC_STENCIL_LEG(Dir,spProj,Recon) \ + SE = st.GetEntry(ptype, Dir, sF); \ + if (SE->_is_local) { \ + chi_p = χ \ + if (SE->_permute) { \ + spProj(tmp, in._odata[SE->_offset]); \ + permute(chi, tmp, ptype); \ + } else { \ + spProj(chi, in._odata[SE->_offset]); \ + } \ + } else { \ + chi_p = &buf[SE->_offset]; \ + } \ + Impl::multLink(Uchi, U._odata[sU], *chi_p, Dir, SE, st); \ + Recon(result, Uchi); + +#define GENERIC_STENCIL_LEG_INT(Dir,spProj,Recon) \ + SE = st.GetEntry(ptype, Dir, sF); \ + if (SE->_is_local) { \ + chi_p = χ \ + if (SE->_permute) { \ + spProj(tmp, in._odata[SE->_offset]); \ + permute(chi, tmp, ptype); \ + } else { \ + spProj(chi, in._odata[SE->_offset]); \ + } \ + } else if ( st.same_node[Dir] ) { \ + chi_p = &buf[SE->_offset]; \ + } \ + if (SE->_is_local || st.same_node[Dir] ) { \ + Impl::multLink(Uchi, U._odata[sU], *chi_p, Dir, SE, st); \ + Recon(result, Uchi); \ + } +#define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon) \ + SE = st.GetEntry(ptype, Dir, sF); \ + if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \ + chi_p = &buf[SE->_offset]; \ + Impl::multLink(Uchi, U._odata[sU], *chi_p, Dir, SE, st); \ + Recon(result, Uchi); \ + nmu++; \ + } + +#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon) \ + if (gamma == Dir) { \ + if (SE->_is_local && SE->_permute) { \ + spProj(tmp, in._odata[SE->_offset]); \ + permute(chi, tmp, ptype); \ + } else if (SE->_is_local) { \ + spProj(chi, in._odata[SE->_offset]); \ + } else { \ + chi = buf[SE->_offset]; \ + } \ + Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); \ + Recon(result, Uchi); \ + } + + //////////////////////////////////////////////////////////////////// + // All legs kernels ; comms then compute + //////////////////////////////////////////////////////////////////// template void WilsonKernels::GenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - SiteHalfSpinor *buf, int sF, - int sU, const FermionField &in, FermionField &out, - int interior,int exterior) { + SiteHalfSpinor *buf, int sF, + int sU, const FermionField &in, FermionField &out) +{ SiteHalfSpinor tmp; SiteHalfSpinor chi; SiteHalfSpinor *chi_p; @@ -100,174 +116,22 @@ void WilsonKernels::GenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, StencilEntry *SE; int ptype; - /////////////////////////// - // Xp - /////////////////////////// - SE = st.GetEntry(ptype, Xp, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjXp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjXp(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Xp, SE, st); - spReconXp(result, Uchi); - - /////////////////////////// - // Yp - /////////////////////////// - SE = st.GetEntry(ptype, Yp, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjYp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjYp(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Yp, SE, st); - accumReconYp(result, Uchi); - - /////////////////////////// - // Zp - /////////////////////////// - SE = st.GetEntry(ptype, Zp, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjZp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjZp(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Zp, SE, st); - accumReconZp(result, Uchi); - - /////////////////////////// - // Tp - /////////////////////////// - SE = st.GetEntry(ptype, Tp, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjTp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjTp(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Tp, SE, st); - accumReconTp(result, Uchi); - - /////////////////////////// - // Xm - /////////////////////////// - SE = st.GetEntry(ptype, Xm, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjXm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjXm(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Xm, SE, st); - accumReconXm(result, Uchi); - - /////////////////////////// - // Ym - /////////////////////////// - SE = st.GetEntry(ptype, Ym, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjYm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjYm(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Ym, SE, st); - accumReconYm(result, Uchi); - - /////////////////////////// - // Zm - /////////////////////////// - SE = st.GetEntry(ptype, Zm, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjZm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjZm(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Zm, SE, st); - accumReconZm(result, Uchi); - - /////////////////////////// - // Tm - /////////////////////////// - SE = st.GetEntry(ptype, Tm, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjTm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjTm(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Tm, SE, st); - accumReconTm(result, Uchi); - + GENERIC_STENCIL_LEG(Xp,spProjXp,spReconXp); + GENERIC_STENCIL_LEG(Yp,spProjYp,accumReconYp); + GENERIC_STENCIL_LEG(Zp,spProjZp,accumReconZp); + GENERIC_STENCIL_LEG(Tp,spProjTp,accumReconTp); + GENERIC_STENCIL_LEG(Xm,spProjXm,accumReconXm); + GENERIC_STENCIL_LEG(Ym,spProjYm,accumReconYm); + GENERIC_STENCIL_LEG(Zm,spProjZm,accumReconZm); + GENERIC_STENCIL_LEG(Tm,spProjTm,accumReconTm); vstream(out._odata[sF], result); }; -// Need controls to do interior, exterior, or both template void WilsonKernels::GenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - SiteHalfSpinor *buf, int sF, - int sU, const FermionField &in, FermionField &out,int interior,int exterior) { + SiteHalfSpinor *buf, int sF, + int sU, const FermionField &in, FermionField &out) +{ SiteHalfSpinor tmp; SiteHalfSpinor chi; SiteHalfSpinor *chi_p; @@ -276,168 +140,123 @@ void WilsonKernels::GenericDhopSite(StencilImpl &st, LebesgueOrder &lo, Do StencilEntry *SE; int ptype; - /////////////////////////// - // Xp - /////////////////////////// - SE = st.GetEntry(ptype, Xm, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjXp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjXp(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Xm, SE, st); - spReconXp(result, Uchi); - - /////////////////////////// - // Yp - /////////////////////////// - SE = st.GetEntry(ptype, Ym, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjYp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjYp(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Ym, SE, st); - accumReconYp(result, Uchi); - - /////////////////////////// - // Zp - /////////////////////////// - SE = st.GetEntry(ptype, Zm, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjZp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjZp(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Zm, SE, st); - accumReconZp(result, Uchi); - - /////////////////////////// - // Tp - /////////////////////////// - SE = st.GetEntry(ptype, Tm, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjTp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjTp(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Tm, SE, st); - accumReconTp(result, Uchi); - - /////////////////////////// - // Xm - /////////////////////////// - SE = st.GetEntry(ptype, Xp, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjXm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjXm(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Xp, SE, st); - accumReconXm(result, Uchi); - - /////////////////////////// - // Ym - /////////////////////////// - SE = st.GetEntry(ptype, Yp, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjYm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjYm(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Yp, SE, st); - accumReconYm(result, Uchi); - - /////////////////////////// - // Zm - /////////////////////////// - SE = st.GetEntry(ptype, Zp, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjZm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjZm(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Zp, SE, st); - accumReconZm(result, Uchi); - - /////////////////////////// - // Tm - /////////////////////////// - SE = st.GetEntry(ptype, Tp, sF); - - if (SE->_is_local) { - chi_p = χ - if (SE->_permute) { - spProjTm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else { - spProjTm(chi, in._odata[SE->_offset]); - } - } else { - chi_p = &buf[SE->_offset]; - } - - Impl::multLink(Uchi, U._odata[sU], *chi_p, Tp, SE, st); - accumReconTm(result, Uchi); - + GENERIC_STENCIL_LEG(Xm,spProjXp,spReconXp); + GENERIC_STENCIL_LEG(Ym,spProjYp,accumReconYp); + GENERIC_STENCIL_LEG(Zm,spProjZp,accumReconZp); + GENERIC_STENCIL_LEG(Tm,spProjTp,accumReconTp); + GENERIC_STENCIL_LEG(Xp,spProjXm,accumReconXm); + GENERIC_STENCIL_LEG(Yp,spProjYm,accumReconYm); + GENERIC_STENCIL_LEG(Zp,spProjZm,accumReconZm); + GENERIC_STENCIL_LEG(Tp,spProjTm,accumReconTm); vstream(out._odata[sF], result); }; + //////////////////////////////////////////////////////////////////// + // Interior kernels + //////////////////////////////////////////////////////////////////// +template +void WilsonKernels::GenericDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + SiteHalfSpinor *buf, int sF, + int sU, const FermionField &in, FermionField &out) +{ + SiteHalfSpinor tmp; + SiteHalfSpinor chi; + SiteHalfSpinor *chi_p; + SiteHalfSpinor Uchi; + SiteSpinor result; + StencilEntry *SE; + int ptype; + + result=zero; + GENERIC_STENCIL_LEG_INT(Xp,spProjXp,accumReconXp); + GENERIC_STENCIL_LEG_INT(Yp,spProjYp,accumReconYp); + GENERIC_STENCIL_LEG_INT(Zp,spProjZp,accumReconZp); + GENERIC_STENCIL_LEG_INT(Tp,spProjTp,accumReconTp); + GENERIC_STENCIL_LEG_INT(Xm,spProjXm,accumReconXm); + GENERIC_STENCIL_LEG_INT(Ym,spProjYm,accumReconYm); + GENERIC_STENCIL_LEG_INT(Zm,spProjZm,accumReconZm); + GENERIC_STENCIL_LEG_INT(Tm,spProjTm,accumReconTm); + vstream(out._odata[sF], result); +}; + +template +void WilsonKernels::GenericDhopSiteInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + SiteHalfSpinor *buf, int sF, + int sU, const FermionField &in, FermionField &out) +{ + SiteHalfSpinor tmp; + SiteHalfSpinor chi; + SiteHalfSpinor *chi_p; + SiteHalfSpinor Uchi; + SiteSpinor result; + StencilEntry *SE; + int ptype; + result=zero; + GENERIC_STENCIL_LEG_INT(Xm,spProjXp,accumReconXp); + GENERIC_STENCIL_LEG_INT(Ym,spProjYp,accumReconYp); + GENERIC_STENCIL_LEG_INT(Zm,spProjZp,accumReconZp); + GENERIC_STENCIL_LEG_INT(Tm,spProjTp,accumReconTp); + GENERIC_STENCIL_LEG_INT(Xp,spProjXm,accumReconXm); + GENERIC_STENCIL_LEG_INT(Yp,spProjYm,accumReconYm); + GENERIC_STENCIL_LEG_INT(Zp,spProjZm,accumReconZm); + GENERIC_STENCIL_LEG_INT(Tp,spProjTm,accumReconTm); + vstream(out._odata[sF], result); +}; +//////////////////////////////////////////////////////////////////// +// Exterior kernels +//////////////////////////////////////////////////////////////////// +template +void WilsonKernels::GenericDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + SiteHalfSpinor *buf, int sF, + int sU, const FermionField &in, FermionField &out) +{ + SiteHalfSpinor tmp; + SiteHalfSpinor chi; + SiteHalfSpinor *chi_p; + SiteHalfSpinor Uchi; + SiteSpinor result; + StencilEntry *SE; + int ptype; + int nmu=0; + result=zero; + GENERIC_STENCIL_LEG_EXT(Xp,spProjXp,accumReconXp); + GENERIC_STENCIL_LEG_EXT(Yp,spProjYp,accumReconYp); + GENERIC_STENCIL_LEG_EXT(Zp,spProjZp,accumReconZp); + GENERIC_STENCIL_LEG_EXT(Tp,spProjTp,accumReconTp); + GENERIC_STENCIL_LEG_EXT(Xm,spProjXm,accumReconXm); + GENERIC_STENCIL_LEG_EXT(Ym,spProjYm,accumReconYm); + GENERIC_STENCIL_LEG_EXT(Zm,spProjZm,accumReconZm); + GENERIC_STENCIL_LEG_EXT(Tm,spProjTm,accumReconTm); + if ( nmu ) { + out._odata[sF] = out._odata[sF] + result; + } +}; + +template +void WilsonKernels::GenericDhopSiteExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + SiteHalfSpinor *buf, int sF, + int sU, const FermionField &in, FermionField &out) +{ + SiteHalfSpinor tmp; + SiteHalfSpinor chi; + SiteHalfSpinor *chi_p; + SiteHalfSpinor Uchi; + SiteSpinor result; + StencilEntry *SE; + int ptype; + int nmu=0; + result=zero; + GENERIC_STENCIL_LEG_EXT(Xm,spProjXp,accumReconXp); + GENERIC_STENCIL_LEG_EXT(Ym,spProjYp,accumReconYp); + GENERIC_STENCIL_LEG_EXT(Zm,spProjZp,accumReconZp); + GENERIC_STENCIL_LEG_EXT(Tm,spProjTp,accumReconTp); + GENERIC_STENCIL_LEG_EXT(Xp,spProjXm,accumReconXm); + GENERIC_STENCIL_LEG_EXT(Yp,spProjYm,accumReconYm); + GENERIC_STENCIL_LEG_EXT(Zp,spProjZm,accumReconZm); + GENERIC_STENCIL_LEG_EXT(Tp,spProjTm,accumReconTm); + if ( nmu ) { + out._odata[sF] = out._odata[sF] + result; + } +}; template void WilsonKernels::DhopDir( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int sF, @@ -451,119 +270,14 @@ void WilsonKernels::DhopDir( StencilImpl &st, DoubledGaugeField &U,SiteHal int ptype; SE = st.GetEntry(ptype, dir, sF); - - // Xp - if (gamma == Xp) { - if (SE->_is_local && SE->_permute) { - spProjXp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else if (SE->_is_local) { - spProjXp(chi, in._odata[SE->_offset]); - } else { - chi = buf[SE->_offset]; - } - Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); - spReconXp(result, Uchi); - } - - // Yp - if (gamma == Yp) { - if (SE->_is_local && SE->_permute) { - spProjYp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else if (SE->_is_local) { - spProjYp(chi, in._odata[SE->_offset]); - } else { - chi = buf[SE->_offset]; - } - Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); - spReconYp(result, Uchi); - } - - // Zp - if (gamma == Zp) { - if (SE->_is_local && SE->_permute) { - spProjZp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else if (SE->_is_local) { - spProjZp(chi, in._odata[SE->_offset]); - } else { - chi = buf[SE->_offset]; - } - Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); - spReconZp(result, Uchi); - } - - // Tp - if (gamma == Tp) { - if (SE->_is_local && SE->_permute) { - spProjTp(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else if (SE->_is_local) { - spProjTp(chi, in._odata[SE->_offset]); - } else { - chi = buf[SE->_offset]; - } - Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); - spReconTp(result, Uchi); - } - - // Xm - if (gamma == Xm) { - if (SE->_is_local && SE->_permute) { - spProjXm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else if (SE->_is_local) { - spProjXm(chi, in._odata[SE->_offset]); - } else { - chi = buf[SE->_offset]; - } - Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); - spReconXm(result, Uchi); - } - - // Ym - if (gamma == Ym) { - if (SE->_is_local && SE->_permute) { - spProjYm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else if (SE->_is_local) { - spProjYm(chi, in._odata[SE->_offset]); - } else { - chi = buf[SE->_offset]; - } - Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); - spReconYm(result, Uchi); - } - - // Zm - if (gamma == Zm) { - if (SE->_is_local && SE->_permute) { - spProjZm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else if (SE->_is_local) { - spProjZm(chi, in._odata[SE->_offset]); - } else { - chi = buf[SE->_offset]; - } - Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); - spReconZm(result, Uchi); - } - - // Tm - if (gamma == Tm) { - if (SE->_is_local && SE->_permute) { - spProjTm(tmp, in._odata[SE->_offset]); - permute(chi, tmp, ptype); - } else if (SE->_is_local) { - spProjTm(chi, in._odata[SE->_offset]); - } else { - chi = buf[SE->_offset]; - } - Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); - spReconTm(result, Uchi); - } - + GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp); + GENERIC_DHOPDIR_LEG(Yp,spProjYp,spReconYp); + GENERIC_DHOPDIR_LEG(Zp,spProjZp,spReconZp); + GENERIC_DHOPDIR_LEG(Tp,spProjTp,spReconTp); + GENERIC_DHOPDIR_LEG(Xm,spProjXm,spReconXm); + GENERIC_DHOPDIR_LEG(Ym,spProjYm,spReconYm); + GENERIC_DHOPDIR_LEG(Zm,spProjZm,spReconZm); + GENERIC_DHOPDIR_LEG(Tm,spProjTm,spReconTm); vstream(out._odata[sF], result); } diff --git a/lib/qcd/action/fermion/WilsonKernels.h b/lib/qcd/action/fermion/WilsonKernels.h index 20ee87f2..e0aa08b0 100644 --- a/lib/qcd/action/fermion/WilsonKernels.h +++ b/lib/qcd/action/fermion/WilsonKernels.h @@ -34,8 +34,6 @@ directory namespace Grid { namespace QCD { -void bgq_l1p_optimisation(int mode); - //////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Helper routines that implement Wilson stencil for a single site. // Common to both the WilsonFermion and WilsonFermion5D @@ -44,9 +42,8 @@ class WilsonKernelsStatic { public: enum { OptGeneric, OptHandUnroll, OptInlineAsm }; enum { CommsAndCompute, CommsThenCompute }; - // S-direction is INNERMOST and takes no part in the parity. - static int Opt; // these are a temporary hack - static int Comms; // these are a temporary hack + static int Opt; + static int Comms; }; template class WilsonKernels : public FermionOperator , public WilsonKernelsStatic { @@ -75,7 +72,7 @@ public: case OptHandUnroll: for (int site = 0; site < Ns; site++) { for (int s = 0; s < Ls; s++) { - if( exterior) WilsonKernels::HandDhopSite(st,lo,U,buf,sF,sU,in,out,interior,exterior); + if( exterior) WilsonKernels::HandDhopSite(st,lo,U,buf,sF,sU,in,out); sF++; } sU++; @@ -84,7 +81,10 @@ public: case OptGeneric: for (int site = 0; site < Ns; site++) { for (int s = 0; s < Ls; s++) { - if( exterior) WilsonKernels::GenericDhopSite(st,lo,U,buf,sF,sU,in,out,interior,exterior); + if(interior&&exterior) WilsonKernels::GenericDhopSite(st,lo,U,buf,sF,sU,in,out); + else if (interior) WilsonKernels::GenericDhopSiteInt(st,lo,U,buf,sF,sU,in,out); + else if (exterior) WilsonKernels::GenericDhopSiteExt(st,lo,U,buf,sF,sU,in,out); + else assert(0); sF++; } sU++; @@ -99,11 +99,14 @@ public: template typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1 ) { + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1 ) { // no kernel choice for (int site = 0; site < Ns; site++) { for (int s = 0; s < Ls; s++) { - if( exterior) WilsonKernels::GenericDhopSite(st, lo, U, buf, sF, sU, in, out,interior,exterior); + if(interior&&exterior) WilsonKernels::GenericDhopSite(st,lo,U,buf,sF,sU,in,out); + else if (interior) WilsonKernels::GenericDhopSiteInt(st,lo,U,buf,sF,sU,in,out); + else if (exterior) WilsonKernels::GenericDhopSiteExt(st,lo,U,buf,sF,sU,in,out); + else assert(0); sF++; } sU++; @@ -113,8 +116,8 @@ public: template typename std::enable_if::type DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) { - + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) +{ bgq_l1p_optimisation(1); switch(Opt) { #if defined(AVX512) || defined (QPX) @@ -128,7 +131,7 @@ public: case OptHandUnroll: for (int site = 0; site < Ns; site++) { for (int s = 0; s < Ls; s++) { - if( exterior) WilsonKernels::HandDhopSiteDag(st,lo,U,buf,sF,sU,in,out,interior,exterior); + if( exterior) WilsonKernels::HandDhopSiteDag(st,lo,U,buf,sF,sU,in,out); sF++; } sU++; @@ -137,7 +140,10 @@ public: case OptGeneric: for (int site = 0; site < Ns; site++) { for (int s = 0; s < Ls; s++) { - if( exterior) WilsonKernels::GenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out,interior,exterior); + if(interior&&exterior) WilsonKernels::GenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out); + else if (interior) WilsonKernels::GenericDhopSiteDagInt(st,lo,U,buf,sF,sU,in,out); + else if (exterior) WilsonKernels::GenericDhopSiteDagExt(st,lo,U,buf,sF,sU,in,out); + else assert(0); sF++; } sU++; @@ -156,7 +162,10 @@ public: for (int site = 0; site < Ns; site++) { for (int s = 0; s < Ls; s++) { - if( exterior) WilsonKernels::GenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out,interior,exterior); + if(interior&&exterior) WilsonKernels::GenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out); + else if (interior) WilsonKernels::GenericDhopSiteDagInt(st,lo,U,buf,sF,sU,in,out); + else if (exterior) WilsonKernels::GenericDhopSiteDagExt(st,lo,U,buf,sF,sU,in,out); + else assert(0); sF++; } sU++; @@ -169,35 +178,47 @@ public: private: // Specialised variants void GenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionField &in, FermionField &out,int interior,int exterior); + int sF, int sU, const FermionField &in, FermionField &out); void GenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionField &in, FermionField &out,int interior,int exterior); + int sF, int sU, const FermionField &in, FermionField &out); + + void GenericDhopSiteInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); + + void GenericDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); + + void GenericDhopSiteExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); + + void GenericDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); void AsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); + int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); void AsmDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out); + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out); void AsmDhopSiteInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); + int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); void AsmDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out); + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out); void AsmDhopSiteExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); + int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); void AsmDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out); + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out); void HandDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionField &in, FermionField &out,int interior,int exterior); + int sF, int sU, const FermionField &in, FermionField &out); void HandDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionField &in, FermionField &out,int interior,int exterior); + int sF, int sU, const FermionField &in, FermionField &out); public: diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/lib/qcd/action/fermion/WilsonKernelsHand.cc index b7923b9f..f12d94c7 100644 --- a/lib/qcd/action/fermion/WilsonKernelsHand.cc +++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc @@ -307,55 +307,106 @@ Author: paboyle result_31-= UChi_11; \ result_32-= UChi_12; -namespace Grid { -namespace QCD { +#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \ + SE=st.GetEntry(ptype,DIR,ss); \ + offset = SE->_offset; \ + local = SE->_is_local; \ + perm = SE->_permute; \ + if ( local ) { \ + LOAD_CHIMU; \ + PROJ; \ + if ( perm) { \ + PERMUTE_DIR(PERM); \ + } \ + } else { \ + LOAD_CHI; \ + } \ + { \ + MULT_2SPIN(DIR); \ + } \ + RECON; + +#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON) \ + SE=st.GetEntry(ptype,DIR,ss); \ + offset = SE->_offset; \ + local = SE->_is_local; \ + perm = SE->_permute; \ + if ( local ) { \ + LOAD_CHIMU; \ + PROJ; \ + if ( perm) { \ + PERMUTE_DIR(PERM); \ + } \ + } else { \ + if ( st.same_node[DIR] ) { \ + LOAD_CHI; \ + } \ + } \ + if (local || st.same_node[DIR] ) { \ + MULT_2SPIN(DIR); \ + RECON; \ + } + +#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON) \ + SE=st.GetEntry(ptype,Dir,ss); \ + offset = SE->_offset; \ + local = SE->_is_local; \ + perm = SE->_permute; \ + if((!SE->_is_local)&&(!st.same_node[Dir]) ) { \ + LOAD_CHI; \ + MULT_2SPIN(DIR); \ + RECON; \ + } + +#define HAND_RESULT(ss) \ + { \ + SiteSpinor & ref (out._odata[ss]); \ + vstream(ref()(0)(0),result_00); \ + vstream(ref()(0)(1),result_01); \ + vstream(ref()(0)(2),result_02); \ + vstream(ref()(1)(0),result_10); \ + vstream(ref()(1)(1),result_11); \ + vstream(ref()(1)(2),result_12); \ + vstream(ref()(2)(0),result_20); \ + vstream(ref()(2)(1),result_21); \ + vstream(ref()(2)(2),result_22); \ + vstream(ref()(3)(0),result_30); \ + vstream(ref()(3)(1),result_31); \ + vstream(ref()(3)(2),result_32); \ + } -template void -WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, - int ss,int sU,const FermionField &in, FermionField &out,int interior,int exterior) -{ - typedef typename Simd::scalar_type S; - typedef typename Simd::vector_type V; - - REGISTER Simd result_00; // 12 regs on knc - REGISTER Simd result_01; - REGISTER Simd result_02; - - REGISTER Simd result_10; - REGISTER Simd result_11; - REGISTER Simd result_12; - - REGISTER Simd result_20; - REGISTER Simd result_21; - REGISTER Simd result_22; - - REGISTER Simd result_30; - REGISTER Simd result_31; - REGISTER Simd result_32; // 20 left - - REGISTER Simd Chi_00; // two spinor; 6 regs - REGISTER Simd Chi_01; - REGISTER Simd Chi_02; - - REGISTER Simd Chi_10; - REGISTER Simd Chi_11; - REGISTER Simd Chi_12; // 14 left - - REGISTER Simd UChi_00; // two spinor; 6 regs - REGISTER Simd UChi_01; - REGISTER Simd UChi_02; - - REGISTER Simd UChi_10; - REGISTER Simd UChi_11; - REGISTER Simd UChi_12; // 8 left - - REGISTER Simd U_00; // two rows of U matrix - REGISTER Simd U_10; - REGISTER Simd U_20; - REGISTER Simd U_01; - REGISTER Simd U_11; - REGISTER Simd U_21; // 2 reg left. +#define HAND_DECLARATIONS(a) \ + Simd result_00; \ + Simd result_01; \ + Simd result_02; \ + Simd result_10; \ + Simd result_11; \ + Simd result_12; \ + Simd result_20; \ + Simd result_21; \ + Simd result_22; \ + Simd result_30; \ + Simd result_31; \ + Simd result_32; \ + Simd Chi_00; \ + Simd Chi_01; \ + Simd Chi_02; \ + Simd Chi_10; \ + Simd Chi_11; \ + Simd Chi_12; \ + Simd UChi_00; \ + Simd UChi_01; \ + Simd UChi_02; \ + Simd UChi_10; \ + Simd UChi_11; \ + Simd UChi_12; \ + Simd U_00; \ + Simd U_10; \ + Simd U_20; \ + Simd U_01; \ + Simd U_11; \ + Simd U_21; #define Chimu_00 Chi_00 #define Chimu_01 Chi_01 @@ -370,430 +421,54 @@ WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGauge #define Chimu_31 UChi_11 #define Chimu_32 UChi_12 +namespace Grid { +namespace QCD { + +template void +WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionField &in, FermionField &out) +{ +// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + HAND_DECLARATIONS(ignore); int offset,local,perm, ptype; StencilEntry *SE; - // Xp - SE=st.GetEntry(ptype,Xp,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - XM_PROJ; - if ( perm) { - PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Xp); - } - XM_RECON; - - // Yp - SE=st.GetEntry(ptype,Yp,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - YM_PROJ; - if ( perm) { - PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Yp); - } - YM_RECON_ACCUM; - - - // Zp - SE=st.GetEntry(ptype,Zp,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - ZM_PROJ; - if ( perm) { - PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Zp); - } - ZM_RECON_ACCUM; - - // Tp - SE=st.GetEntry(ptype,Tp,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - TM_PROJ; - if ( perm) { - PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Tp); - } - TM_RECON_ACCUM; - - // Xm - SE=st.GetEntry(ptype,Xm,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - XP_PROJ; - if ( perm) { - PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Xm); - } - XP_RECON_ACCUM; - - - // Ym - SE=st.GetEntry(ptype,Ym,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - YP_PROJ; - if ( perm) { - PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Ym); - } - YP_RECON_ACCUM; - - // Zm - SE=st.GetEntry(ptype,Zm,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - ZP_PROJ; - if ( perm) { - PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Zm); - } - ZP_RECON_ACCUM; - - // Tm - SE=st.GetEntry(ptype,Tm,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - TP_PROJ; - if ( perm) { - PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Tm); - } - TP_RECON_ACCUM; - - { - SiteSpinor & ref (out._odata[ss]); - vstream(ref()(0)(0),result_00); - vstream(ref()(0)(1),result_01); - vstream(ref()(0)(2),result_02); - vstream(ref()(1)(0),result_10); - vstream(ref()(1)(1),result_11); - vstream(ref()(1)(2),result_12); - vstream(ref()(2)(0),result_20); - vstream(ref()(2)(1),result_21); - vstream(ref()(2)(2),result_22); - vstream(ref()(3)(0),result_30); - vstream(ref()(3)(1),result_31); - vstream(ref()(3)(2),result_32); - } + HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON); + HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM); + HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); + HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM); + HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM); + HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM); + HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM); + HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM); + HAND_RESULT(ss); } template void WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, - int ss,int sU,const FermionField &in, FermionField &out,int interior,int exterior) + int ss,int sU,const FermionField &in, FermionField &out) { - // std::cout << "Hand op Dhop "<_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - XP_PROJ; - if ( perm) { - PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - - { - MULT_2SPIN(Xp); - } - XP_RECON; - - // Yp - SE=st.GetEntry(ptype,Yp,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - YP_PROJ; - if ( perm) { - PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Yp); - } - YP_RECON_ACCUM; - - - // Zp - SE=st.GetEntry(ptype,Zp,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - ZP_PROJ; - if ( perm) { - PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Zp); - } - ZP_RECON_ACCUM; - - // Tp - SE=st.GetEntry(ptype,Tp,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - TP_PROJ; - if ( perm) { - PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Tp); - } - TP_RECON_ACCUM; - - // Xm - SE=st.GetEntry(ptype,Xm,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - XM_PROJ; - if ( perm) { - PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Xm); - } - XM_RECON_ACCUM; - - // Ym - SE=st.GetEntry(ptype,Ym,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - YM_PROJ; - if ( perm) { - PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Ym); - } - YM_RECON_ACCUM; - - // Zm - SE=st.GetEntry(ptype,Zm,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - ZM_PROJ; - if ( perm) { - PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Zm); - } - ZM_RECON_ACCUM; - - // Tm - SE=st.GetEntry(ptype,Tm,ss); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHIMU; - TM_PROJ; - if ( perm) { - PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI; - } - { - MULT_2SPIN(Tm); - } - TM_RECON_ACCUM; - - { - SiteSpinor & ref (out._odata[ss]); - vstream(ref()(0)(0),result_00); - vstream(ref()(0)(1),result_01); - vstream(ref()(0)(2),result_02); - vstream(ref()(1)(0),result_10); - vstream(ref()(1)(1),result_11); - vstream(ref()(1)(2),result_12); - vstream(ref()(2)(0),result_20); - vstream(ref()(2)(1),result_21); - vstream(ref()(2)(2),result_22); - vstream(ref()(3)(0),result_30); - vstream(ref()(3)(1),result_31); - vstream(ref()(3)(2),result_32); - } + HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON); + HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM); + HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); + HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM); + HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM); + HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM); + HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM); + HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM); + HAND_RESULT(ss); } //////////////////////////////////////////////// @@ -801,74 +476,71 @@ void WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,Doub //////////////////////////////////////////////// template<> void WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - SiteHalfSpinor *buf, - int sF,int sU,const FermionField &in, FermionField &out,int internal,int external) + SiteHalfSpinor *buf, + int sF,int sU,const FermionField &in, FermionField &out) { assert(0); } template<> void WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - SiteHalfSpinor *buf, - int sF,int sU,const FermionField &in, FermionField &out,int internal,int external) + SiteHalfSpinor *buf, + int sF,int sU,const FermionField &in, FermionField &out) { assert(0); } template<> void WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, - int sF,int sU,const FermionField &in, FermionField &out,int internal,int external) + int sF,int sU,const FermionField &in, FermionField &out) { assert(0); } template<> void WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, - int sF,int sU,const FermionField &in, FermionField &out,int internal,int external) + int sF,int sU,const FermionField &in, FermionField &out) { assert(0); } template<> void WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - SiteHalfSpinor *buf, - int sF,int sU,const FermionField &in, FermionField &out,int internal,int external) + SiteHalfSpinor *buf, + int sF,int sU,const FermionField &in, FermionField &out) { assert(0); } template<> void WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - SiteHalfSpinor *buf, - int sF,int sU,const FermionField &in, FermionField &out,int internal,int external) + SiteHalfSpinor *buf, + int sF,int sU,const FermionField &in, FermionField &out) { assert(0); } template<> void WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, - int sF,int sU,const FermionField &in, FermionField &out,int internal,int external) + int sF,int sU,const FermionField &in, FermionField &out) { assert(0); } template<> void WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, - int sF,int sU,const FermionField &in, FermionField &out,int internal,int external) + int sF,int sU,const FermionField &in, FermionField &out) { assert(0); } - - ////////////// Wilson ; uses this implementation ///////////////////// -// Need Nc=3 though // #define INSTANTIATE_THEM(A) \ template void WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\ - int ss,int sU,const FermionField &in, FermionField &out,int interior,int exterior); \ -template void WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\ - int ss,int sU,const FermionField &in, FermionField &out,int interior,int exterior); + int ss,int sU,const FermionField &in, FermionField &out); \ +template void WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \ + int ss,int sU,const FermionField &in, FermionField &out); INSTANTIATE_THEM(WilsonImplF); INSTANTIATE_THEM(WilsonImplD); diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h index 757c84c9..cbca9118 100644 --- a/lib/simd/Grid_qpx.h +++ b/lib/simd/Grid_qpx.h @@ -33,6 +33,14 @@ #include "Grid_generic_types.h" // Definitions for simulated integer SIMD. namespace Grid { + +#ifdef QPX +#include +#include +#include +#include +#endif + namespace Optimization { typedef struct { diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h index 59ad996a..2fb2df76 100644 --- a/lib/simd/Grid_sse4.h +++ b/lib/simd/Grid_sse4.h @@ -411,7 +411,7 @@ namespace Optimization { hp[3] = sfw_float_to_half(fp[3]); return ret; } - static inline __m128i Grid_mm_cvtph_ps(__m128i h,int discard) { + static inline __m128 Grid_mm_cvtph_ps(__m128i h,int discard) { __m128 ret=_mm_setzero_ps(); float *fp = (float *)&ret; Grid_half *hp = (Grid_half *)&h; diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index ac17d1e0..de4a13b5 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -53,12 +53,14 @@ directory #if defined IMCI #include "Grid_imci.h" #endif -#if defined QPX -#include "Grid_qpx.h" -#endif #ifdef NEONv8 #include "Grid_neon.h" #endif +#if defined QPX +#include "Grid_qpx.h" +#endif + +#include "l1p.h" namespace Grid { diff --git a/lib/stencil/Stencil.h b/lib/stencil/Stencil.h index 81138fc6..32d1255a 100644 --- a/lib/stencil/Stencil.h +++ b/lib/stencil/Stencil.h @@ -36,35 +36,16 @@ // gather to a point stencil code. CSHIFT is not the best way, so need // additional stencil support. // - // Stencil based code will pre-exchange haloes and use a table lookup for neighbours. + // Stencil based code will exchange haloes and use a table lookup for neighbours. // This will be done with generality to allow easier efficient implementations. - // Overlap of comms and compute could be semi-automated by tabulating off-node connected, - // and - // - // Lattice could also allocate haloes which get used for stencil code. - // - // Grid could create a neighbour index table for a given stencil. - // - // Could also implement CovariantCshift, to fuse the loops and enhance performance. - // - // - // General stencil computation: - // + // Overlap of comms and compute is enabled by tabulating off-node connected, + // // Generic services // 0) Prebuild neighbour tables // 1) Compute sizes of all haloes/comms buffers; allocate them. - // // 2) Gather all faces, and communicate. // 3) Loop over result sites, giving nbr index/offnode info for each // - // Could take a - // SpinProjectFaces - // start comms - // complete comms - // Reconstruct Umu - // - // Approach. - // ////////////////////////////////////////////////////////////////////////////////////////// namespace Grid { @@ -108,27 +89,17 @@ void Gather_plane_exchange_table(std::vector >& table,const L } } -struct StencilEntry { - uint64_t _offset; - uint64_t _byte_offset; - uint16_t _is_local; - uint16_t _permute; - uint32_t _around_the_world; //256 bits, 32 bytes, 1/2 cacheline -}; + struct StencilEntry { + uint64_t _offset; + uint64_t _byte_offset; + uint16_t _is_local; + uint16_t _permute; + uint32_t _around_the_world; //256 bits, 32 bytes, 1/2 cacheline + }; -////////////////////////////////////////////////////////////////////////////////// -//Lattice object type, compressed object type. -// -//These only need to be known outside of halo exchange subset of methods -//because byte offsets are precomputed -// -//It might help/be cleaner to add a "mobj" for the mpi transferred object to this list -//for the on the wire representation. -// -//This would clean up the "casts" in the WilsonCompressor.h file -// -//Other compressors (SimpleCompressor) retains mobj == cobj so no issue -////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////// +// The Stencil Class itself +//////////////////////////////////////// template class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in. public: @@ -139,6 +110,77 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal typedef typename cobj::scalar_type scalar_type; typedef typename cobj::scalar_object scalar_object; + /////////////////////////////////////////// + // Helper structs + /////////////////////////////////////////// + struct Packet { + void * send_buf; + void * recv_buf; + Integer to_rank; + Integer from_rank; + Integer bytes; + }; + struct Merge { + cobj * mpointer; + std::vector rpointers; + std::vector vpointers; + Integer buffer_size; + Integer type; + }; + struct Decompress { + cobj * kernel_p; + cobj * mpi_p; + Integer buffer_size; + }; + + //////////////////////////////////////// + // Basic Grid and stencil info + //////////////////////////////////////// + int face_table_computed; + std::vector > > face_table ; + + + int _checkerboard; + int _npoints; // Move to template param? + GridBase * _grid; + + // npoints of these + std::vector _directions; + std::vector _distances; + std::vector _comm_buf_size; + std::vector _permute_type; + + Vector _entries; + std::vector Packets; + std::vector Mergers; + std::vector MergersSHM; + std::vector Decompressions; + std::vector DecompressionsSHM; + + /////////////////////////////////////////////////////////// + // Unified Comms buffers for all directions + /////////////////////////////////////////////////////////// + // Vectors that live on the symmetric heap in case of SHMEM + // These are used; either SHM objects or refs to the above symmetric heap vectors + // depending on comms target + cobj* u_recv_buf_p; + cobj* u_send_buf_p; + std::vector u_simd_send_buf; + std::vector u_simd_recv_buf; + + int u_comm_offset; + int _unified_buffer_size; +/* + std::vector compute2sites; + const std::vector &Compute2Sites(void) { return compute2sites; } + void CalculateCompute2Sites(void) { + for(int i=0;i< _entries.size();i++){ + + } + } +*/ + cobj *CommBuf(void) { return u_recv_buf_p; } + ///////////////////////////////////////// // Timing info; ugly; possibly temporary ///////////////////////////////////////// @@ -152,193 +194,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal double splicetime; double nosplicetime; double calls; - - void ZeroCounters(void) { - gathertime = 0.; - commtime = 0.; - halogtime = 0.; - mergetime = 0.; - decompresstime = 0.; - gathermtime = 0.; - splicetime = 0.; - nosplicetime = 0.; - comms_bytes = 0.; - calls = 0.; - }; - - void Report(void) { -#define AVERAGE(A) _grid->GlobalSum(A);A/=NP; -#define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<_Nprocessors; - RealD NN = _grid->NodeCount(); - - _grid->GlobalSum(commtime); commtime/=NP; - if ( calls > 0. ) { - std::cout << GridLogMessage << " Stencil calls "<1.0){ - PRINTIT(comms_bytes); - PRINTIT(commtime); - std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"< Packets; - - int face_table_computed; - std::vector > > face_table ; - - void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){ - Packet p; - p.send_buf = xmit; - p.recv_buf = rcv; - p.to_rank = to; - p.from_rank= from; - p.bytes = bytes; - Packets.push_back(p); - } - - void CommunicateBegin(std::vector > &reqs) - { - reqs.resize(Packets.size()); - commtime-=usecond(); - for(int i=0;iStencilSendToRecvFromBegin(reqs[i], - Packets[i].send_buf, - Packets[i].to_rank, - Packets[i].recv_buf, - Packets[i].from_rank, - Packets[i].bytes); - } - commtime+=usecond(); - } - void CommunicateComplete(std::vector > &reqs) - { - commtime-=usecond(); - for(int i=0;iStencilSendToRecvFromComplete(reqs[i]); - } - _grid->StencilBarrier();// Synch shared memory on a single nodes - commtime+=usecond(); - } - - /////////////////////////////////////////// - // Simd merge queue for asynch comms - /////////////////////////////////////////// - struct Merge { - cobj * mpointer; - std::vector rpointers; - std::vector vpointers; - Integer buffer_size; - Integer type; - }; - - std::vector Mergers; - - struct Decompress { - cobj * kernel_p; - cobj * mpi_p; - Integer buffer_size; - }; - - void Prepare(void) - { - Decompressions.resize(0); - Mergers.resize(0); - Packets.resize(0); - calls++; - } - std::vector Decompressions; - - void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size) { - Decompress d; - d.kernel_p = k_p; - d.mpi_p = m_p; - d.buffer_size = buffer_size; - Decompressions.push_back(d); - } - - void AddMerge(cobj *merge_p,std::vector &rpointers,Integer buffer_size,Integer type) { - Merge m; - m.type = type; - m.mpointer = merge_p; - m.vpointers= rpointers; - m.buffer_size = buffer_size; - Mergers.push_back(m); - } - - template - void CommsMerge(decompressor decompress) { - - // Also do a precision convert possibly on a receive buffer - for(int i=0;i _directions; - std::vector _distances; - std::vector _comm_buf_size; - std::vector _permute_type; - - // npoints x Osites() of these - // Flat vector, change layout for cache friendly. - Vector _entries; - - void PrecomputeByteOffsets(void){ - for(int i=0;i<_entries.size();i++){ - if( _entries[i]._is_local ) { - _entries[i]._byte_offset = _entries[i]._offset*sizeof(vobj); - } else { - _entries[i]._byte_offset = _entries[i]._offset*sizeof(cobj); - } - } - }; inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; @@ -362,23 +221,196 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal if (local) return base + _entries[ent]._byte_offset; else return cbase + _entries[ent]._byte_offset; } - - /////////////////////////////////////////////////////////// - // Unified Comms buffers for all directions - /////////////////////////////////////////////////////////// - // Vectors that live on the symmetric heap in case of SHMEM - // These are used; either SHM objects or refs to the above symmetric heap vectors - // depending on comms target - cobj* u_recv_buf_p; - cobj* u_send_buf_p; - std::vector u_simd_send_buf; - std::vector u_simd_recv_buf; - int u_comm_offset; - int _unified_buffer_size; - - cobj *CommBuf(void) { return u_recv_buf_p; } + ////////////////////////////////////////// + // Comms packet queue for asynch thread + ////////////////////////////////////////// + void CommunicateBegin(std::vector > &reqs) + { + reqs.resize(Packets.size()); + commtime-=usecond(); + for(int i=0;iStencilSendToRecvFromBegin(reqs[i], + Packets[i].send_buf, + Packets[i].to_rank, + Packets[i].recv_buf, + Packets[i].from_rank, + Packets[i].bytes); + } + commtime+=usecond(); + } + void CommunicateComplete(std::vector > &reqs) + { + commtime-=usecond(); + for(int i=0;iStencilSendToRecvFromComplete(reqs[i]); + } + _grid->StencilBarrier();// Synch shared memory on a single nodes + commtime+=usecond(); + } + + template void HaloExchange(const Lattice &source,compressor &compress) + { + std::vector > reqs; + Prepare(); + HaloGather(source,compress); + CommunicateBegin(reqs); + CommunicateComplete(reqs); + CommsMergeSHM(compress); + CommsMerge(compress); + } + + template int HaloGatherDir(const Lattice &source,compressor &compress,int point,int & face_idx) + { + int dimension = _directions[point]; + int displacement = _distances[point]; + + int fd = _grid->_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + + // Map to always positive shift modulo global full dimension. + int shift = (displacement+fd)%fd; + + assert (source.checkerboard== _checkerboard); + + // the permute type + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); + + int same_node = 1; + // Gather phase + int sshift [2]; + if ( comm_dim ) { + sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); + sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); + if ( sshift[0] == sshift[1] ) { + if (splice_dim) { + splicetime-=usecond(); + same_node = same_node && GatherSimd(source,dimension,shift,0x3,compress,face_idx); + splicetime+=usecond(); + } else { + nosplicetime-=usecond(); + same_node = same_node && Gather(source,dimension,shift,0x3,compress,face_idx); + nosplicetime+=usecond(); + } + } else { + if(splice_dim){ + splicetime-=usecond(); + // if checkerboard is unfavourable take two passes + // both with block stride loop iteration + same_node = same_node && GatherSimd(source,dimension,shift,0x1,compress,face_idx); + same_node = same_node && GatherSimd(source,dimension,shift,0x2,compress,face_idx); + splicetime+=usecond(); + } else { + nosplicetime-=usecond(); + same_node = same_node && Gather(source,dimension,shift,0x1,compress,face_idx); + same_node = same_node && Gather(source,dimension,shift,0x2,compress,face_idx); + nosplicetime+=usecond(); + } + } + } + return same_node; + } + + template + void HaloGather(const Lattice &source,compressor &compress) + { + _grid->StencilBarrier();// Synch shared memory on a single nodes + + // conformable(source._grid,_grid); + assert(source._grid==_grid); + halogtime-=usecond(); + + u_comm_offset=0; + + // Gather all comms buffers + int face_idx=0; + for(int point = 0 ; point < _npoints; point++) { + compress.Point(point); + HaloGatherDir(source,compress,point,face_idx); + } + face_table_computed=1; + + assert(u_comm_offset==_unified_buffer_size); + halogtime+=usecond(); + } + + ///////////////////////// + // Implementation + ///////////////////////// + void Prepare(void) + { + Decompressions.resize(0); + DecompressionsSHM.resize(0); + Mergers.resize(0); + MergersSHM.resize(0); + Packets.resize(0); + calls++; + } + void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){ + Packet p; + p.send_buf = xmit; + p.recv_buf = rcv; + p.to_rank = to; + p.from_rank= from; + p.bytes = bytes; + Packets.push_back(p); + } + void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size,std::vector &dv) { + Decompress d; + d.kernel_p = k_p; + d.mpi_p = m_p; + d.buffer_size = buffer_size; + dv.push_back(d); + } + void AddMerge(cobj *merge_p,std::vector &rpointers,Integer buffer_size,Integer type,std::vector &mv) { + Merge m; + m.type = type; + m.mpointer = merge_p; + m.vpointers= rpointers; + m.buffer_size = buffer_size; + mv.push_back(m); + } + template void CommsMerge(decompressor decompress) { CommsMerge(decompress,Mergers,Decompressions); } + template void CommsMergeSHM(decompressor decompress) { CommsMerge(decompress,MergersSHM,DecompressionsSHM);} + + template + void CommsMerge(decompressor decompress,std::vector &mm,std::vector &dd) { + + for(int i=0;i void HaloExchange(const Lattice &source,compressor &compress) - { - std::vector > reqs; - Prepare(); - HaloGather(source,compress); - CommunicateBegin(reqs); - CommunicateComplete(reqs); - CommsMerge(compress); - } - - template void HaloGatherDir(const Lattice &source,compressor &compress,int point,int & face_idx) - { - int dimension = _directions[point]; - int displacement = _distances[point]; - - int fd = _grid->_fdimensions[dimension]; - int rd = _grid->_rdimensions[dimension]; - - - // Map to always positive shift modulo global full dimension. - int shift = (displacement+fd)%fd; - - // int checkerboard = _grid->CheckerBoardDestination(source.checkerboard,shift); - assert (source.checkerboard== _checkerboard); - - // the permute type - int simd_layout = _grid->_simd_layout[dimension]; - int comm_dim = _grid->_processors[dimension] >1 ; - int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); - - // Gather phase - int sshift [2]; - if ( comm_dim ) { - sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); - sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); - if ( sshift[0] == sshift[1] ) { - if (splice_dim) { - splicetime-=usecond(); - GatherSimd(source,dimension,shift,0x3,compress,face_idx); - splicetime+=usecond(); - } else { - nosplicetime-=usecond(); - Gather(source,dimension,shift,0x3,compress,face_idx); - nosplicetime+=usecond(); - } - } else { - if(splice_dim){ - splicetime-=usecond(); - GatherSimd(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes - GatherSimd(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration - splicetime+=usecond(); - } else { - nosplicetime-=usecond(); - Gather(source,dimension,shift,0x1,compress,face_idx); - Gather(source,dimension,shift,0x2,compress,face_idx); - nosplicetime+=usecond(); - } - } - } - } - template - void HaloGather(const Lattice &source,compressor &compress) - { - _grid->StencilBarrier();// Synch shared memory on a single nodes - - // conformable(source._grid,_grid); - assert(source._grid==_grid); - halogtime-=usecond(); - - u_comm_offset=0; - - // Gather all comms buffers - int face_idx=0; - for(int point = 0 ; point < _npoints; point++) { - compress.Point(point); - HaloGatherDir(source,compress,point,face_idx); - } - face_table_computed=1; - - assert(u_comm_offset==_unified_buffer_size); - halogtime+=usecond(); - } - - template - void Gather(const Lattice &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx) + int Gather(const Lattice &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx) { typedef typename cobj::vector_type vector_type; typedef typename cobj::scalar_type scalar_type; @@ -804,6 +752,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal int cb= (cbmask==0x2)? Odd : Even; int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); + int shm_receive_only = 1; for(int x=0;xShmBufferTranslate(xmit_to_rank,u_recv_buf_p); + cobj *send_buf; + cobj *recv_buf; + if ( compress.DecompressionStep() ) { + recv_buf=u_simd_recv_buf[0]; + } else { + recv_buf=u_recv_buf_p; + } + + send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,recv_buf); if ( send_buf==NULL ) { send_buf = u_send_buf_p; + shm_receive_only = 0; } + + // Find out if we get the direct copy. + void *success = (void *) _grid->ShmBufferTranslate(recv_from_rank,u_send_buf_p); + if (success==NULL) { + // we found a packet that comes from MPI and contributes to this leg of stencil + shm_receive_only = 0; + } - cobj *recv_buf; gathertime-=usecond(); assert(send_buf!=NULL); Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so); face_idx++; gathertime+=usecond(); if ( compress.DecompressionStep() ) { - - recv_buf = &u_simd_recv_buf[0][u_comm_offset]; - AddDecompress(&u_recv_buf_p[u_comm_offset], - &recv_buf[u_comm_offset], - words); + if ( shm_receive_only ) { // Early decompress before MPI is finished is possible + AddDecompress(&u_recv_buf_p[u_comm_offset], + &recv_buf[u_comm_offset], + words,DecompressionsSHM); + } else { // Decompress after MPI is finished + AddDecompress(&u_recv_buf_p[u_comm_offset], + &recv_buf[u_comm_offset], + words,Decompressions); + } AddPacket((void *)&send_buf[u_comm_offset], (void *)&recv_buf[u_comm_offset], @@ -868,10 +836,11 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal u_comm_offset+=words; } } + return shm_receive_only; } template - void GatherSimd(const Lattice &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx) + int GatherSimd(const Lattice &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx) { const int Nsimd = _grid->Nsimd(); @@ -917,12 +886,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); // loop over outer coord planes orthog to dim + int shm_receive_only = 1; for(int x=0;x= rd ); if ( any_offnode ) { - for(int i=0;iShmBufferTranslate(recv_from_rank,sp); if (shm==NULL) { shm = rp; + shm_receive_only = 0; // we found a packet that comes from MPI and contributes to this + // leg of stencil } - // if Direct, StencilSendToRecvFrom will suppress copy to a peer on node // assuming above pointer flip + rpointers[i] = shm; + AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes); - rpointers[i] = shm; } else { @@ -983,12 +954,57 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal } } - AddMerge(&u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type); + if ( shm_receive_only ) { + AddMerge(&u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,MergersSHM); + } else { + AddMerge(&u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,Mergers); + } u_comm_offset +=buffer_size; } } + return shm_receive_only; } + + void ZeroCounters(void) { + gathertime = 0.; + commtime = 0.; + halogtime = 0.; + mergetime = 0.; + decompresstime = 0.; + gathermtime = 0.; + splicetime = 0.; + nosplicetime = 0.; + comms_bytes = 0.; + calls = 0.; + }; + + void Report(void) { +#define AVERAGE(A) _grid->GlobalSum(A);A/=NP; +#define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<_Nprocessors; + RealD NN = _grid->NodeCount(); + + _grid->GlobalSum(commtime); commtime/=NP; + if ( calls > 0. ) { + std::cout << GridLogMessage << " Stencil calls "<1.0){ + PRINTIT(comms_bytes); + PRINTIT(commtime); + std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"< Date: Sat, 22 Apr 2017 08:11:51 -0400 Subject: [PATCH 13/31] Fixing the KNL compile --- .../action/fermion/WilsonKernelsAsmAvx512.h | 223 ++++++++++++++++++ 1 file changed, 223 insertions(+) diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmAvx512.h b/lib/qcd/action/fermion/WilsonKernelsAsmAvx512.h index 1839e9bc..948c16a2 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsmAvx512.h +++ b/lib/qcd/action/fermion/WilsonKernelsAsmAvx512.h @@ -71,6 +71,16 @@ WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,Doub int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +template<> void +WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR @@ -84,6 +94,16 @@ WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,D int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR @@ -97,6 +117,16 @@ template<> void WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include + +template<> void +WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include ///////////////////////////////////////////////////////////////// // XYZT vectorised, dag Kernel, single @@ -115,6 +145,16 @@ WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,D int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR @@ -128,6 +168,16 @@ WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & l int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR @@ -141,6 +191,16 @@ WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & l int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef MAYBEPERM #undef MULT_2SPIN #define MAYBEPERM(A,B) @@ -162,6 +222,15 @@ WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR @@ -174,6 +243,15 @@ WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrd int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR @@ -189,6 +267,16 @@ WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrd int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +template<> void +WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + ///////////////////////////////////////////////////////////////// // Ls vectorised, dag Kernel, single ///////////////////////////////////////////////////////////////// @@ -205,6 +293,15 @@ WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrd int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR @@ -217,6 +314,15 @@ WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,Lebesgue int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR @@ -229,6 +335,15 @@ WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,Lebesgue int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef COMPLEX_SIGNS #undef MAYBEPERM #undef MULT_2SPIN @@ -269,6 +384,15 @@ WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,Doub int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR @@ -281,6 +405,15 @@ WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,D int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR @@ -293,6 +426,15 @@ WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,D int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + ///////////////////////////////////////////////////////////////// // XYZT vectorised, dag Kernel, single ///////////////////////////////////////////////////////////////// @@ -309,6 +451,15 @@ WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,D int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR @@ -321,6 +472,15 @@ WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & l int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR @@ -333,6 +493,15 @@ WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & l int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef MAYBEPERM #undef MULT_2SPIN #define MAYBEPERM(A,B) @@ -354,6 +523,15 @@ WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR @@ -366,6 +544,15 @@ WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrd int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR @@ -380,6 +567,15 @@ WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrd int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + ///////////////////////////////////////////////////////////////// // Ls vectorised, dag Kernel, single ///////////////////////////////////////////////////////////////// @@ -396,6 +592,15 @@ WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrd int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR @@ -408,6 +613,15 @@ WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,Lebesgue int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR @@ -420,6 +634,15 @@ WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,Lebesgue int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef COMPLEX_SIGNS #undef MAYBEPERM #undef MULT_2SPIN From 1d1b2254976674c30fe5bea7491386ca47348bd8 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 22 Apr 2017 09:05:28 -0400 Subject: [PATCH 14/31] Hand unrolled Nc=3 kernels support split phase compute (on-node, off-node). --- benchmarks/Benchmark_dwf.cc | 3 +- lib/qcd/action/fermion/WilsonKernels.h | 12 ++ lib/qcd/action/fermion/WilsonKernelsHand.cc | 115 ++++++++++++++++++++ 3 files changed, 129 insertions(+), 1 deletion(-) diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index 54e648f0..838e3a82 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -166,11 +166,12 @@ int main (int argc, char ** argv) std::cout << GridLogMessage<< "*****************************************************************" <Barrier(); Dw.ZeroCounters(); Dw.Dhop(src,result,0); + std::cout< LOAD_CHI; \ MULT_2SPIN(DIR); \ RECON; \ + nmu++; \ } #define HAND_RESULT(ss) \ @@ -375,6 +376,23 @@ Author: paboyle vstream(ref()(3)(2),result_32); \ } +#define HAND_RESULT_EXT(ss) \ + if (nmu){ \ + SiteSpinor & ref (out._odata[ss]); \ + ref()(0)(0)+=result_00; \ + ref()(0)(1)+=result_01; \ + ref()(0)(2)+=result_02; \ + ref()(1)(0)+=result_10; \ + ref()(1)(1)+=result_11; \ + ref()(1)(2)+=result_12; \ + ref()(2)(0)+=result_20; \ + ref()(2)(1)+=result_21; \ + ref()(2)(2)+=result_22; \ + ref()(3)(0)+=result_30; \ + ref()(3)(1)+=result_31; \ + ref()(3)(2)+=result_32; \ + } + #define HAND_DECLARATIONS(a) \ Simd result_00; \ @@ -471,6 +489,103 @@ void WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,Doub HAND_RESULT(ss); } +template void +WilsonKernels::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionField &in, FermionField &out) +{ +// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + HAND_DECLARATIONS(ignore); + + int offset,local,perm, ptype; + StencilEntry *SE; + + result=zero; + HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM); + HAND_RESULT(ss); +} + +template +void WilsonKernels::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionField &in, FermionField &out) +{ + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + HAND_DECLARATIONS(ignore); + + StencilEntry *SE; + int offset,local,perm, ptype; + result=zero; + HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM); + HAND_RESULT(ss); +} + +template void +WilsonKernels::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionField &in, FermionField &out) +{ +// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + HAND_DECLARATIONS(ignore); + + int offset,local,perm, ptype; + StencilEntry *SE; + int nmu=0; + result=zero; + HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM); + HAND_RESULT_EXT(ss); +} + +template +void WilsonKernels::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionField &in, FermionField &out) +{ + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + HAND_DECLARATIONS(ignore); + + StencilEntry *SE; + int offset,local,perm, ptype; + int nmu=0; + result=zero; + HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM); + HAND_RESULT_EXT(ss); +} + //////////////////////////////////////////////// // Specialise Gparity to simple implementation //////////////////////////////////////////////// From f301be94ce9e0c08a12482b69dc131fa82fd2a0d Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 22 Apr 2017 17:42:31 +0100 Subject: [PATCH 15/31] Fixed --- lib/qcd/action/fermion/WilsonKernelsHand.cc | 174 +++++++++++--------- 1 file changed, 92 insertions(+), 82 deletions(-) diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/lib/qcd/action/fermion/WilsonKernelsHand.cc index 4e2db8c1..96b8ab0a 100644 --- a/lib/qcd/action/fermion/WilsonKernelsHand.cc +++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc @@ -31,7 +31,7 @@ Author: paboyle #define REGISTER #define LOAD_CHIMU \ - const SiteSpinor & ref (in._odata[offset]); \ + {const SiteSpinor & ref (in._odata[offset]); \ Chimu_00=ref()(0)(0);\ Chimu_01=ref()(0)(1);\ Chimu_02=ref()(0)(2);\ @@ -43,20 +43,20 @@ Author: paboyle Chimu_22=ref()(2)(2);\ Chimu_30=ref()(3)(0);\ Chimu_31=ref()(3)(1);\ - Chimu_32=ref()(3)(2); + Chimu_32=ref()(3)(2);} #define LOAD_CHI\ - const SiteHalfSpinor &ref(buf[offset]); \ + {const SiteHalfSpinor &ref(buf[offset]); \ Chi_00 = ref()(0)(0);\ Chi_01 = ref()(0)(1);\ Chi_02 = ref()(0)(2);\ Chi_10 = ref()(1)(0);\ Chi_11 = ref()(1)(1);\ - Chi_12 = ref()(1)(2); + Chi_12 = ref()(1)(2);} // To splat or not to splat depends on the implementation #define MULT_2SPIN(A)\ - auto & ref(U._odata[sU](A)); \ + {auto & ref(U._odata[sU](A)); \ Impl::loadLinkElement(U_00,ref()(0,0)); \ Impl::loadLinkElement(U_10,ref()(1,0)); \ Impl::loadLinkElement(U_20,ref()(2,0)); \ @@ -83,7 +83,7 @@ Author: paboyle UChi_01+= U_10*Chi_02;\ UChi_11+= U_10*Chi_12;\ UChi_02+= U_20*Chi_02;\ - UChi_12+= U_20*Chi_12; + UChi_12+= U_20*Chi_12;} #define PERMUTE_DIR(dir) \ @@ -321,9 +321,7 @@ Author: paboyle } else { \ LOAD_CHI; \ } \ - { \ - MULT_2SPIN(DIR); \ - } \ + MULT_2SPIN(DIR); \ RECON; #define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON) \ @@ -337,10 +335,8 @@ Author: paboyle if ( perm) { \ PERMUTE_DIR(PERM); \ } \ - } else { \ - if ( st.same_node[DIR] ) { \ - LOAD_CHI; \ - } \ + } else if ( st.same_node[DIR] ) { \ + LOAD_CHI; \ } \ if (local || st.same_node[DIR] ) { \ MULT_2SPIN(DIR); \ @@ -348,11 +344,9 @@ Author: paboyle } #define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON) \ - SE=st.GetEntry(ptype,Dir,ss); \ + SE=st.GetEntry(ptype,DIR,ss); \ offset = SE->_offset; \ - local = SE->_is_local; \ - perm = SE->_permute; \ - if((!SE->_is_local)&&(!st.same_node[Dir]) ) { \ + if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \ LOAD_CHI; \ MULT_2SPIN(DIR); \ RECON; \ @@ -424,7 +418,21 @@ Author: paboyle Simd U_20; \ Simd U_01; \ Simd U_11; \ - Simd U_21; + Simd U_21; + +#define ZERO_RESULT \ + result_00=zero; \ + result_01=zero; \ + result_02=zero; \ + result_10=zero; \ + result_11=zero; \ + result_12=zero; \ + result_20=zero; \ + result_21=zero; \ + result_22=zero; \ + result_30=zero; \ + result_31=zero; \ + result_32=zero; #define Chimu_00 Chi_00 #define Chimu_01 Chi_01 @@ -501,8 +509,7 @@ WilsonKernels::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGa int offset,local,perm, ptype; StencilEntry *SE; - - result=zero; + ZERO_RESULT; HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM); HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM); HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); @@ -525,7 +532,7 @@ void WilsonKernels::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,D StencilEntry *SE; int offset,local,perm, ptype; - result=zero; + ZERO_RESULT; HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM); HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM); HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); @@ -550,7 +557,7 @@ WilsonKernels::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGa int offset,local,perm, ptype; StencilEntry *SE; int nmu=0; - result=zero; + ZERO_RESULT; HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM); HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM); HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); @@ -574,7 +581,7 @@ void WilsonKernels::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D StencilEntry *SE; int offset,local,perm, ptype; int nmu=0; - result=zero; + ZERO_RESULT; HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM); HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM); HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); @@ -589,65 +596,60 @@ void WilsonKernels::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D //////////////////////////////////////////////// // Specialise Gparity to simple implementation //////////////////////////////////////////////// -template<> void -WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - SiteHalfSpinor *buf, - int sF,int sU,const FermionField &in, FermionField &out) -{ - assert(0); -} +#define HAND_SPECIALISE_EMPTY(IMPL) \ + template<> void \ + WilsonKernels::HandDhopSite(StencilImpl &st, \ + LebesgueOrder &lo, \ + DoubledGaugeField &U, \ + SiteHalfSpinor *buf, \ + int sF,int sU, \ + const FermionField &in, \ + FermionField &out){ assert(0); } \ + template<> void \ + WilsonKernels::HandDhopSiteDag(StencilImpl &st, \ + LebesgueOrder &lo, \ + DoubledGaugeField &U, \ + SiteHalfSpinor *buf, \ + int sF,int sU, \ + const FermionField &in, \ + FermionField &out){ assert(0); } \ + template<> void \ + WilsonKernels::HandDhopSiteInt(StencilImpl &st, \ + LebesgueOrder &lo, \ + DoubledGaugeField &U, \ + SiteHalfSpinor *buf, \ + int sF,int sU, \ + const FermionField &in, \ + FermionField &out){ assert(0); } \ + template<> void \ + WilsonKernels::HandDhopSiteExt(StencilImpl &st, \ + LebesgueOrder &lo, \ + DoubledGaugeField &U, \ + SiteHalfSpinor *buf, \ + int sF,int sU, \ + const FermionField &in, \ + FermionField &out){ assert(0); } \ + template<> void \ + WilsonKernels::HandDhopSiteDagInt(StencilImpl &st, \ + LebesgueOrder &lo, \ + DoubledGaugeField &U, \ + SiteHalfSpinor *buf, \ + int sF,int sU, \ + const FermionField &in, \ + FermionField &out){ assert(0); } \ + template<> void \ + WilsonKernels::HandDhopSiteDagExt(StencilImpl &st, \ + LebesgueOrder &lo, \ + DoubledGaugeField &U, \ + SiteHalfSpinor *buf, \ + int sF,int sU, \ + const FermionField &in, \ + FermionField &out){ assert(0); } \ -template<> void -WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - SiteHalfSpinor *buf, - int sF,int sU,const FermionField &in, FermionField &out) -{ - assert(0); -} - -template<> void -WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, - int sF,int sU,const FermionField &in, FermionField &out) -{ - assert(0); -} - -template<> void -WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, - int sF,int sU,const FermionField &in, FermionField &out) -{ - assert(0); -} - -template<> void -WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - SiteHalfSpinor *buf, - int sF,int sU,const FermionField &in, FermionField &out) -{ - assert(0); -} - -template<> void -WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - SiteHalfSpinor *buf, - int sF,int sU,const FermionField &in, FermionField &out) -{ - assert(0); -} - -template<> void -WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, - int sF,int sU,const FermionField &in, FermionField &out) -{ - assert(0); -} - -template<> void -WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, - int sF,int sU,const FermionField &in, FermionField &out) -{ - assert(0); -} + HAND_SPECIALISE_EMPTY(GparityWilsonImplF); + HAND_SPECIALISE_EMPTY(GparityWilsonImplD); + HAND_SPECIALISE_EMPTY(GparityWilsonImplFH); + HAND_SPECIALISE_EMPTY(GparityWilsonImplDF); ////////////// Wilson ; uses this implementation ///////////////////// @@ -655,7 +657,15 @@ WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrde template void WilsonKernels::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\ int ss,int sU,const FermionField &in, FermionField &out); \ template void WilsonKernels::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \ - int ss,int sU,const FermionField &in, FermionField &out); + int ss,int sU,const FermionField &in, FermionField &out);\ +template void WilsonKernels::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\ + int ss,int sU,const FermionField &in, FermionField &out); \ +template void WilsonKernels::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \ + int ss,int sU,const FermionField &in, FermionField &out); \ +template void WilsonKernels::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\ + int ss,int sU,const FermionField &in, FermionField &out); \ +template void WilsonKernels::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \ + int ss,int sU,const FermionField &in, FermionField &out); INSTANTIATE_THEM(WilsonImplF); INSTANTIATE_THEM(WilsonImplD); From abba44a8372b3f8cde172cea71730f395f357fb6 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 22 Apr 2017 17:45:17 +0100 Subject: [PATCH 16/31] Hand unrolled for overlapped comms --- lib/qcd/action/fermion/WilsonKernels.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/lib/qcd/action/fermion/WilsonKernels.h b/lib/qcd/action/fermion/WilsonKernels.h index 83d2c059..72f10b7f 100644 --- a/lib/qcd/action/fermion/WilsonKernels.h +++ b/lib/qcd/action/fermion/WilsonKernels.h @@ -72,7 +72,9 @@ public: case OptHandUnroll: for (int site = 0; site < Ns; site++) { for (int s = 0; s < Ls; s++) { - if( exterior) WilsonKernels::HandDhopSite(st,lo,U,buf,sF,sU,in,out); + if(interior&&exterior) WilsonKernels::HandDhopSite(st,lo,U,buf,sF,sU,in,out); + else if (interior) WilsonKernels::HandDhopSiteInt(st,lo,U,buf,sF,sU,in,out); + else if (exterior) WilsonKernels::HandDhopSiteExt(st,lo,U,buf,sF,sU,in,out); sF++; } sU++; @@ -131,7 +133,10 @@ public: case OptHandUnroll: for (int site = 0; site < Ns; site++) { for (int s = 0; s < Ls; s++) { - if( exterior) WilsonKernels::HandDhopSiteDag(st,lo,U,buf,sF,sU,in,out); + if(interior&&exterior) WilsonKernels::HandDhopSiteDag(st,lo,U,buf,sF,sU,in,out); + else if (interior) WilsonKernels::HandDhopSiteDagInt(st,lo,U,buf,sF,sU,in,out); + else if (exterior) WilsonKernels::HandDhopSiteDagExt(st,lo,U,buf,sF,sU,in,out); + else assert(0); sF++; } sU++; From b722889234a02200bbafde2a73e8d43b5c9a5282 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 22 Apr 2017 19:27:41 +0100 Subject: [PATCH 17/31] Try a better load balancing loop --- lib/qcd/action/fermion/WilsonFermion5D.cc | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 9d325ed5..daddb605 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -406,6 +406,7 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg // Load imbalance alert. Should use dynamic schedule OMP for loop // Perhaps create a list of only those sites with face work, and // load balance process the list. +#if 1 #pragma omp parallel { int nthreads = omp_get_num_threads(); @@ -421,9 +422,28 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg else Kernels::DhopSite (st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,0,1); if ( me==0 ) DhopComputeTime2+=usecond(); }// end parallel region +#else +DhopComputeTime2-=usecond(); + if (dag == DaggerYes) { + parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) { + int sU = ss; + int sF = LLs * sU; + Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1); + } + } else { + parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) { + int sU = ss; + int sF = LLs * sU; + Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1); + } + } +DhopComputeTime2+=usecond(); +#endif + #else assert(0); #endif + } template void WilsonFermion5D::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo, From 3703b718aa3a4196ff682f26315079d2ccc1fe19 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 22 Apr 2017 19:28:37 +0100 Subject: [PATCH 18/31] Mark up a table if a given site only receives from itself; including MPI3 splitting info. --- lib/stencil/Stencil.h | 43 ++++++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/lib/stencil/Stencil.h b/lib/stencil/Stencil.h index 32d1255a..19ad90e2 100644 --- a/lib/stencil/Stencil.h +++ b/lib/stencil/Stencil.h @@ -94,7 +94,8 @@ void Gather_plane_exchange_table(std::vector >& table,const L uint64_t _byte_offset; uint16_t _is_local; uint16_t _permute; - uint32_t _around_the_world; //256 bits, 32 bytes, 1/2 cacheline + uint16_t _around_the_world; //256 bits, 32 bytes, 1/2 cacheline + uint16_t _node_local; }; //////////////////////////////////////// @@ -170,15 +171,6 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal int u_comm_offset; int _unified_buffer_size; -/* - std::vector compute2sites; - const std::vector &Compute2Sites(void) { return compute2sites; } - void CalculateCompute2Sites(void) { - for(int i=0;i< _entries.size();i++){ - - } - } -*/ cobj *CommBuf(void) { return u_recv_buf_p; } ///////////////////////////////////////// @@ -198,7 +190,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal //////////////////////////////////////// // Stencil query //////////////////////////////////////// - + inline int GetNodeLocal(int osite) { + return _entries[_npoints*osite]._node_local; + } inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; } @@ -246,7 +240,6 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal for(int i=0;iStencilSendToRecvFromComplete(reqs[i]); } - _grid->StencilBarrier();// Synch shared memory on a single nodes commtime+=usecond(); } @@ -373,8 +366,13 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal m.buffer_size = buffer_size; mv.push_back(m); } - template void CommsMerge(decompressor decompress) { CommsMerge(decompress,Mergers,Decompressions); } - template void CommsMergeSHM(decompressor decompress) { CommsMerge(decompress,MergersSHM,DecompressionsSHM);} + template void CommsMerge(decompressor decompress) { + CommsMerge(decompress,Mergers,Decompressions); + } + template void CommsMergeSHM(decompressor decompress) { + _grid->StencilBarrier();// Synch shared memory on a single nodes + CommsMerge(decompress,MergersSHM,DecompressionsSHM); + } template void CommsMerge(decompressor decompress,std::vector &mm,std::vector &dd) { @@ -410,6 +408,18 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal _entries[i]._byte_offset = _entries[i]._offset*sizeof(cobj); } } + int osites = _grid->oSites(); + for(int o=0;oShmBufferTranslate(recv_from_rank,sp); if (shm==NULL) { shm = rp; - shm_receive_only = 0; // we found a packet that comes from MPI and contributes to this + // we found a packet that comes from MPI and contributes to this shift. + // same_node is only used in the WilsonStencil, and gets set for this point in the stencil. + // Kernel will add the exterior_terms except if same_node. + shm_receive_only = 0; // leg of stencil } // if Direct, StencilSendToRecvFrom will suppress copy to a peer on node From ac58565d0a91a6fc3d63e12d7797f6218a473cea Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 22 Apr 2017 19:31:04 +0100 Subject: [PATCH 19/31] Dangerous rewrite of the assembly. If I make a mistake the debug will be painful. --- lib/qcd/action/fermion/WilsonKernelsAsmBody.h | 213 ++++++------------ 1 file changed, 70 insertions(+), 143 deletions(-) diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h index 34aba472..effbec1f 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h +++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h @@ -39,24 +39,26 @@ //////////////////////////////////////////////////////////////////////////////// #ifdef INTERIOR_AND_EXTERIOR -#define ZERO_NMU(A) -#define INTERIOR_BLOCK_XP(a,b,PERMUTE_DIR,PROJMEM,RECON) INTERIOR_BLOCK(a,b,PERMUTE_DIR,PROJMEM,RECON) -#define EXTERIOR_BLOCK_XP(a,b,RECON) EXTERIOR_BLOCK(a,b,RECON) +#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + basep = st.GetPFInfo(nent,plocal); nent++; \ + if ( local ) { \ + LOAD64(%r10,isigns); \ + PROJ(base); \ + MAYBEPERM(PERMUTE_DIR,perm); \ + } else { \ + LOAD_CHI(base); \ + } \ + base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ + PREFETCH_CHIMU(base); \ + MULT_2SPIN_DIR_PF(Dir,basep); \ + LOAD64(%r10,isigns); \ + RECON; \ -#define INTERIOR_BLOCK(a,b,PERMUTE_DIR,PROJMEM,RECON) \ - LOAD64(%r10,isigns); \ - PROJMEM(base); \ - MAYBEPERM(PERMUTE_DIR,perm); - -#define EXTERIOR_BLOCK(a,b,RECON) \ - LOAD_CHI(base); - -#define COMMON_BLOCK(a,b,RECON) \ - base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; \ - PREFETCH_CHIMU(base); \ - MULT_2SPIN_DIR_PF(a,basep); \ - LOAD64(%r10,isigns); \ - RECON; +#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ + PF_GAUGE(Xp); \ + PREFETCH1_CHIMU(base); \ + ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) #define RESULT(base,basep) SAVE_RESULT(base,basep); @@ -67,31 +69,33 @@ //////////////////////////////////////////////////////////////////////////////// #ifdef INTERIOR -#define COMMON_BLOCK(a,b,RECON) -#define ZERO_NMU(A) +#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + basep = st.GetPFInfo(nent,plocal); nent++; \ + if ( local ) { \ + LOAD64(%r10,isigns); \ + PROJ(base); \ + MAYBEPERM(PERMUTE_DIR,perm); \ + } else if ( st.same_dir[Dir] ) { \ + LOAD_CHI(base); \ + } \ + base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ + if ( local || st.same_node[Dir] ) { \ + PREFETCH_CHIMU(base); \ + MULT_2SPIN_DIR_PF(Dir,basep); \ + LOAD64(%r10,isigns); \ + RECON; \ + } -// No accumulate for DIR0 -#define EXTERIOR_BLOCK_XP(a,b,RECON) \ - ZERO_PSI; \ - base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; - -#define EXTERIOR_BLOCK(a,b,RECON) \ - base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; - -#define INTERIOR_BLOCK_XP(a,b,PERMUTE_DIR,PROJMEM,RECON) INTERIOR_BLOCK(a,b,PERMUTE_DIR,PROJMEM,RECON) - -#define INTERIOR_BLOCK(a,b,PERMUTE_DIR,PROJMEM,RECON) \ - LOAD64(%r10,isigns); \ - PROJMEM(base); \ - MAYBEPERM(PERMUTE_DIR,perm); \ - base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; \ - PREFETCH_CHIMU(base); \ - MULT_2SPIN_DIR_PF(a,basep); \ - LOAD64(%r10,isigns); \ - RECON; +#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ + PF_GAUGE(Xp); \ + PREFETCH1_CHIMU(base); \ + ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) else { ZERO_PSI; } #define RESULT(base,basep) SAVE_RESULT(base,basep); +#define ZERO_NMU(A) + #endif //////////////////////////////////////////////////////////////////////////////// @@ -99,30 +103,24 @@ //////////////////////////////////////////////////////////////////////////////// #ifdef EXTERIOR + +#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ + if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \ + LOAD_CHI(base); \ + MULT_2SPIN_DIR_PF(Dir,base); \ + LOAD64(%r10,isigns); \ + RECON; \ + nmu++; \ + } + +#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) + #define ZERO_NMU(A) nmu=0; - -#define INTERIOR_BLOCK_XP(a,b,PERMUTE_DIR,PROJMEM,RECON) \ - ZERO_PSI; base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; - -#define EXTERIOR_BLOCK_XP(a,b,RECON) EXTERIOR_BLOCK(a,b,RECON) - -#define INTERIOR_BLOCK(a,b,PERMUTE_DIR,PROJMEM,RECON) \ - base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; - -#define EXTERIOR_BLOCK(a,b,RECON) \ - nmu++; \ - LOAD_CHI(base); \ - MULT_2SPIN_DIR_PF(a,base); \ - base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; \ - LOAD64(%r10,isigns); \ - RECON; - -#define COMMON_BLOCK(a,b,RECON) - + #define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);} #endif - { int nmu; int local,perm, ptype; @@ -147,91 +145,20 @@ int nent=ssn*8; ZERO_NMU(0); - base = st.GetInfo(ptype,local,perm,Xp,ent,plocal); ent++; -#ifndef EXTERIOR - PF_GAUGE(Xp); - PREFETCH1_CHIMU(base); -#endif - //////////////////////////////// - // Xp - //////////////////////////////// - basep = st.GetPFInfo(nent,plocal); nent++; - if ( local ) { - INTERIOR_BLOCK_XP(Xp,Yp,PERMUTE_DIR3,DIR0_PROJMEM,DIR0_RECON); - } else { - EXTERIOR_BLOCK_XP(Xp,Yp,DIR0_RECON); - } - COMMON_BLOCK(Xp,Yp,DIR0_RECON); - //////////////////////////////// - // Yp - //////////////////////////////// - basep = st.GetPFInfo(nent,plocal); nent++; - if ( local ) { - INTERIOR_BLOCK(Yp,Zp,PERMUTE_DIR2,DIR1_PROJMEM,DIR1_RECON); - } else { - EXTERIOR_BLOCK(Yp,Zp,DIR1_RECON); - } - COMMON_BLOCK(Yp,Zp,DIR1_RECON); - //////////////////////////////// - // Zp - //////////////////////////////// - basep = st.GetPFInfo(nent,plocal); nent++; - if ( local ) { - INTERIOR_BLOCK(Zp,Tp,PERMUTE_DIR1,DIR2_PROJMEM,DIR2_RECON); - } else { - EXTERIOR_BLOCK(Zp,Tp,DIR2_RECON); - } - COMMON_BLOCK(Zp,Tp,DIR2_RECON); - //////////////////////////////// - // Tp - //////////////////////////////// - basep = st.GetPFInfo(nent,plocal); nent++; - if ( local ) { - INTERIOR_BLOCK(Tp,Xm,PERMUTE_DIR0,DIR3_PROJMEM,DIR3_RECON); - } else { - EXTERIOR_BLOCK(Tp,Xm,DIR3_RECON); - } - COMMON_BLOCK(Tp,Xm,DIR3_RECON); - //////////////////////////////// - // Xm - //////////////////////////////// - // basep= st.GetPFInfo(nent,plocal); nent++; - if ( local ) { - INTERIOR_BLOCK(Xm,Ym,PERMUTE_DIR3,DIR4_PROJMEM,DIR4_RECON); - } else { - EXTERIOR_BLOCK(Xm,Ym,DIR4_RECON); - } - COMMON_BLOCK(Xm,Ym,DIR4_RECON); - //////////////////////////////// - // Ym - //////////////////////////////// - basep= st.GetPFInfo(nent,plocal); nent++; - if ( local ) { - INTERIOR_BLOCK(Ym,Zm,PERMUTE_DIR2,DIR5_PROJMEM,DIR5_RECON); - } else { - EXTERIOR_BLOCK(Ym,Zm,DIR5_RECON); - } - COMMON_BLOCK(Ym,Zm,DIR5_RECON); - //////////////////////////////// - // Zm - //////////////////////////////// - basep= st.GetPFInfo(nent,plocal); nent++; - if ( local ) { - INTERIOR_BLOCK(Zm,Tm,PERMUTE_DIR1,DIR6_PROJMEM,DIR6_RECON); - } else { - EXTERIOR_BLOCK(Zm,Tm,DIR6_RECON); - } - COMMON_BLOCK(Zm,Tm,DIR6_RECON); - //////////////////////////////// - // Tm - //////////////////////////////// - basep= st.GetPFInfo(nent,plocal); nent++; - if ( local ) { - INTERIOR_BLOCK(Tm,Xp,PERMUTE_DIR0,DIR7_PROJMEM,DIR7_RECON); - } else { - EXTERIOR_BLOCK(Tm,Xp,DIR7_RECON); - } - COMMON_BLOCK(Tm,Xp,DIR7_RECON); + + ASM_LEG_XP(Xp,Yp,PERMUTE_DIR3,DIR0_PROJMEM,DIR0_RECON); + ASM_LEG(Yp,Zp,PERMUTE_DIR2,DIR1_PROJMEM,DIR1_RECON); + ASM_LEG(Zp,Tp,PERMUTE_DIR1,DIR2_PROJMEM,DIR2_RECON); + ASM_LEG(Tp,Xm,PERMUTE_DIR0,DIR3_PROJMEM,DIR3_RECON); + + ASM_LEG(Xm,Ym,PERMUTE_DIR3,DIR4_PROJMEM,DIR4_RECON); + ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJMEM,DIR5_RECON); + ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJMEM,DIR6_RECON); + ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJMEM,DIR7_RECON); + +#ifdef EXTERIOR + if( nmu == 0 ) break; +#endif base = (uint64_t) &out._odata[ss]; basep= st.GetPFInfo(nent,plocal); nent++; From c429ace748f719b2fe2297ae2ffcc90fa974fe14 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 22 Apr 2017 20:28:42 +0100 Subject: [PATCH 20/31] Cleaner OpenMP use --- benchmarks/Benchmark_dwf.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index 838e3a82..7009980c 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -166,7 +166,7 @@ int main (int argc, char ** argv) std::cout << GridLogMessage<< "*****************************************************************" <Barrier(); Dw.ZeroCounters(); From 4dd3763294761a03b69cf6c40cb2424c58d775b5 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 22 Apr 2017 20:35:20 +0100 Subject: [PATCH 21/31] Use OMP as much as possible --- lib/qcd/action/fermion/WilsonFermion5D.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index daddb605..79c467a5 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -406,7 +406,7 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg // Load imbalance alert. Should use dynamic schedule OMP for loop // Perhaps create a list of only those sites with face work, and // load balance process the list. -#if 1 +#if 0 #pragma omp parallel { int nthreads = omp_get_num_threads(); From 5812eb8a8c9c3c130809eee3cd886e64fad5547f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 22 Apr 2017 18:50:25 -0400 Subject: [PATCH 22/31] Partially fixed. But the comms-overlap does not work yet. --- lib/qcd/action/fermion/WilsonKernelsAsmBody.h | 46 ++++++++----------- 1 file changed, 18 insertions(+), 28 deletions(-) diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h index effbec1f..655aa255 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h +++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h @@ -55,7 +55,7 @@ RECON; \ #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ - base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ PF_GAUGE(Xp); \ PREFETCH1_CHIMU(base); \ ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) @@ -75,38 +75,34 @@ LOAD64(%r10,isigns); \ PROJ(base); \ MAYBEPERM(PERMUTE_DIR,perm); \ - } else if ( st.same_dir[Dir] ) { \ + } else if ( st.same_node[Dir] ) { \ LOAD_CHI(base); \ } \ base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ + PREFETCH_CHIMU(base); \ if ( local || st.same_node[Dir] ) { \ - PREFETCH_CHIMU(base); \ MULT_2SPIN_DIR_PF(Dir,basep); \ LOAD64(%r10,isigns); \ RECON; \ } #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ - base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ PF_GAUGE(Xp); \ PREFETCH1_CHIMU(base); \ - ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) else { ZERO_PSI; } + ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) else { ZERO_PSI; } #define RESULT(base,basep) SAVE_RESULT(base,basep); -#define ZERO_NMU(A) - #endif - //////////////////////////////////////////////////////////////////////////////// // Post comms kernel //////////////////////////////////////////////////////////////////////////////// #ifdef EXTERIOR - #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ - base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ - if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + if((!local)&&(!st.same_node[Dir]) ) { \ LOAD_CHI(base); \ MULT_2SPIN_DIR_PF(Dir,base); \ LOAD64(%r10,isigns); \ @@ -114,11 +110,11 @@ nmu++; \ } -#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) +#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + nmu=0; \ + ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) else { ZERO_PSI; } -#define ZERO_NMU(A) nmu=0; - -#define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);} +#define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);} #endif { @@ -144,9 +140,7 @@ int ent=ss*8;// 2*Ndim int nent=ssn*8; - ZERO_NMU(0); - - ASM_LEG_XP(Xp,Yp,PERMUTE_DIR3,DIR0_PROJMEM,DIR0_RECON); + ASM_LEG_XP(Xp,Yp,PERMUTE_DIR3,DIR0_PROJMEM,DIR0_RECON); ASM_LEG(Yp,Zp,PERMUTE_DIR2,DIR1_PROJMEM,DIR1_RECON); ASM_LEG(Zp,Tp,PERMUTE_DIR1,DIR2_PROJMEM,DIR2_RECON); ASM_LEG(Tp,Xm,PERMUTE_DIR0,DIR3_PROJMEM,DIR3_RECON); @@ -156,10 +150,10 @@ ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJMEM,DIR6_RECON); ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJMEM,DIR7_RECON); -#ifdef EXTERIOR - if( nmu == 0 ) break; -#endif - +#ifndef EXTERIOR + //Early out if no work + //if (nmu==0) break; +#endif base = (uint64_t) &out._odata[ss]; basep= st.GetPFInfo(nent,plocal); nent++; RESULT(base,basep); @@ -185,10 +179,6 @@ #undef DIR5_RECON #undef DIR6_RECON #undef DIR7_RECON -#undef EXTERIOR_BLOCK -#undef INTERIOR_BLOCK -#undef EXTERIOR_BLOCK_XP -#undef INTERIOR_BLOCK_XP -#undef COMMON_BLOCK -#undef ZERO_NMU +#undef ASM_LEG +#undef ASM_LEG_XP #undef RESULT From e3d0e31525541a7e2dea9212c2a1b4d85806fb80 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sun, 23 Apr 2017 19:29:27 -0400 Subject: [PATCH 23/31] Debugged assemply split phase with interior suppression --- lib/qcd/action/fermion/WilsonFermion5D.cc | 8 +++++--- lib/qcd/action/fermion/WilsonKernels.cc | 4 ++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index daddb605..cacc7a9f 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -406,7 +406,7 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg // Load imbalance alert. Should use dynamic schedule OMP for loop // Perhaps create a list of only those sites with face work, and // load balance process the list. -#if 1 +#if 0 #pragma omp parallel { int nthreads = omp_get_num_threads(); @@ -425,13 +425,15 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg #else DhopComputeTime2-=usecond(); if (dag == DaggerYes) { - parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) { +#pragma omp parallel for schedule(static,4) + for (int ss = 0; ss < U._grid->oSites(); ss++) { int sU = ss; int sF = LLs * sU; Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1); } } else { - parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) { +#pragma omp parallel for schedule(static,1) + for (int ss = 0; ss < U._grid->oSites(); ss++) { int sU = ss; int sF = LLs * sU; Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1); diff --git a/lib/qcd/action/fermion/WilsonKernels.cc b/lib/qcd/action/fermion/WilsonKernels.cc index 6ed350cf..03c066b0 100644 --- a/lib/qcd/action/fermion/WilsonKernels.cc +++ b/lib/qcd/action/fermion/WilsonKernels.cc @@ -33,8 +33,8 @@ directory namespace Grid { namespace QCD { - int WilsonKernelsStatic::Opt = WilsonKernelsStatic::OptGeneric; - int WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsAndCompute; +int WilsonKernelsStatic::Opt = WilsonKernelsStatic::OptGeneric; +int WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsAndCompute; template WilsonKernels::WilsonKernels(const ImplParams &p) : Base(p){}; From 3accb1ef8973bf2bbc0678b01b2405b5ed26b761 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sun, 23 Apr 2017 19:30:19 -0400 Subject: [PATCH 24/31] Debugged assemply split phase with interior suppression --- lib/qcd/action/fermion/WilsonKernelsAsmBody.h | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h index 655aa255..d424c12b 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h +++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h @@ -75,22 +75,20 @@ LOAD64(%r10,isigns); \ PROJ(base); \ MAYBEPERM(PERMUTE_DIR,perm); \ - } else if ( st.same_node[Dir] ) { \ - LOAD_CHI(base); \ - } \ - base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ - PREFETCH_CHIMU(base); \ + }else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \ if ( local || st.same_node[Dir] ) { \ MULT_2SPIN_DIR_PF(Dir,basep); \ LOAD64(%r10,isigns); \ RECON; \ - } + } \ + base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ + PREFETCH_CHIMU(base); \ #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ PF_GAUGE(Xp); \ - PREFETCH1_CHIMU(base); \ - ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) else { ZERO_PSI; } + PREFETCH1_CHIMU(base);{ ZERO_PSI; } \ + ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) #define RESULT(base,basep) SAVE_RESULT(base,basep); @@ -100,8 +98,10 @@ //////////////////////////////////////////////////////////////////////////////// #ifdef EXTERIOR + #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + /*if((!local) ) {*/ \ if((!local)&&(!st.same_node[Dir]) ) { \ LOAD_CHI(base); \ MULT_2SPIN_DIR_PF(Dir,base); \ @@ -111,8 +111,8 @@ } #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ - nmu=0; \ - ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) else { ZERO_PSI; } + nmu=0;{ ZERO_PSI; } \ + ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) #define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);} @@ -151,8 +151,7 @@ ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJMEM,DIR7_RECON); #ifndef EXTERIOR - //Early out if no work - //if (nmu==0) break; + if (nmu==0) break; //Early out if no work #endif base = (uint64_t) &out._odata[ss]; basep= st.GetPFInfo(nent,plocal); nent++; From 5b55867a7a495e33e0880d89167c5d1915dc4a02 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 24 Apr 2017 05:36:11 -0400 Subject: [PATCH 25/31] Slightly cheaper Ext assembly --- lib/qcd/action/fermion/WilsonKernelsAsmBody.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h index d424c12b..c9989cc2 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h +++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h @@ -101,7 +101,6 @@ #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ - /*if((!local) ) {*/ \ if((!local)&&(!st.same_node[Dir]) ) { \ LOAD_CHI(base); \ MULT_2SPIN_DIR_PF(Dir,base); \ @@ -111,8 +110,15 @@ } #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ - nmu=0;{ ZERO_PSI; } \ - ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) + nmu=0; \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + if((!local)&&(!st.same_node[Dir]) ) { \ + LOAD_CHI(base); \ + MULT_2SPIN_DIR_PF(Dir,base); \ + LOAD64(%r10,isigns); \ + RECON; \ + nmu++; \ + } else { ZERO_PSI; }; #define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);} From 56277a11c8b75717c0eb01782d74bd022ad7aab3 Mon Sep 17 00:00:00 2001 From: paboyle Date: Mon, 24 Apr 2017 17:06:15 +0100 Subject: [PATCH 26/31] Build a list of whats on the surface --- TODO | 7 ++--- lib/qcd/action/fermion/WilsonCompressor.h | 27 +++++++++++++++-- lib/qcd/action/fermion/WilsonFermion5D.cc | 36 +++++++++++++++++++++++ lib/simd/Grid_avx.h | 10 +++---- lib/stencil/Stencil.h | 34 +++++++++++++++++++-- 5 files changed, 101 insertions(+), 13 deletions(-) diff --git a/TODO b/TODO index a2b7bf03..672879cd 100644 --- a/TODO +++ b/TODO @@ -2,21 +2,20 @@ TODO: --------------- Peter's work list: -1)- Half-precision comms <-- started -- SIMD is prepared 2)- Precision conversion and sort out localConvert <-- - 3)- Remove DenseVector, DenseMatrix; Use Eigen instead. <-- started 4)- Binary I/O speed up & x-strips - -- Profile CG, BlockCG, etc... Flop count/rate -- PARTIAL, time but no flop/s yet -- Physical propagator interface -- Conserved currents -- GaugeFix into central location - -- Multigrid Wilson and DWF, compare to other Multigrid implementations -- HDCR resume Recent DONE +-- Cut down the exterior overhead <-- DONE +-- Interior legs from SHM comms <-- DONE +-- Half-precision comms <-- DONE -- Merge high precision reduction into develop -- multiRHS DWF; benchmark on Cori/BNL for comms elimination -- slice* linalg routines for multiRHS, BlockCG diff --git a/lib/qcd/action/fermion/WilsonCompressor.h b/lib/qcd/action/fermion/WilsonCompressor.h index 59fcc1f8..2351d4cd 100644 --- a/lib/qcd/action/fermion/WilsonCompressor.h +++ b/lib/qcd/action/fermion/WilsonCompressor.h @@ -242,6 +242,7 @@ public: typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; std::vector same_node; + std::vector surface_list; WilsonStencil(GridBase *grid, int npoints, @@ -249,11 +250,33 @@ public: const std::vector &directions, const std::vector &distances) : CartesianStencil (grid,npoints,checkerboard,directions,distances) , - same_node(npoints) + same_node(npoints) { - assert(npoints==8);// or 10 if do naive DWF 5d red black ? + surface_list.resize(0); }; + void BuildSurfaceList(int Ls,int vol4){ + + // find same node for SHM + // Here we know the distance is 1 for WilsonStencil + for(int point=0;point_npoints;point++){ + same_node[point] = this->SameNode(point); + std::cout << " dir " <_npoints;point++){ + if( (!this->GetNodeLocal(site*Ls,point)) && (!same_node[point]) ){ + local = 0; + } + } + if(local == 0) { + surface_list.push_back(site); + } + } + } + template < class compressor> void HaloExchangeOpt(const Lattice &source,compressor &compress) { diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index cacc7a9f..11ab1ce2 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -117,6 +117,19 @@ WilsonFermion5D::WilsonFermion5D(GaugeField &_Umu, // Allocate the required comms buffer ImportGauge(_Umu); + + // Build lists of exterior only nodes + int LLs = FourDimGrid._rdimensions[0]; + int vol4; + vol4=FourDimGrid.oSites(); + Stencil.BuildSurfaceList(LLs,vol4); + vol4=FourDimRedBlackGrid.oSites(); + StencilEven.BuildSurfaceList(LLs,vol4); + StencilOdd.BuildSurfaceList(LLs,vol4); + + std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size() + <<" " << StencilEven.surface_list.size()< @@ -406,6 +419,8 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg // Load imbalance alert. Should use dynamic schedule OMP for loop // Perhaps create a list of only those sites with face work, and // load balance process the list. +#if 1 + #if 0 #pragma omp parallel { @@ -422,6 +437,27 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg else Kernels::DhopSite (st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,0,1); if ( me==0 ) DhopComputeTime2+=usecond(); }// end parallel region +#else + DhopComputeTime2-=usecond(); + if (dag == DaggerYes) { +#pragma omp parallel for schedule(static,1) + for (int ss = 0; ss < st.surface_list.size(); ss++) { + int sU = st.surface_list[ss]; + int sF = LLs * sU; + Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1); + } + } else { +#pragma omp parallel for schedule(static,1) + for (int ss = 0; ss < st.surface_list.size(); ss++) { + int sU = st.surface_list[ss]; + int sF = LLs * sU; + Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1); + } + } + DhopComputeTime2+=usecond(); +#endif + + #else DhopComputeTime2-=usecond(); if (dag == DaggerYes) { diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h index 5c16dd71..02348686 100644 --- a/lib/simd/Grid_avx.h +++ b/lib/simd/Grid_avx.h @@ -473,12 +473,12 @@ namespace Optimization { #define USE_FP16 struct PrecisionChange { static inline __m256i StoH (__m256 a,__m256 b) { - __m256 h; + __m256i h; #ifdef USE_FP16 __m128i ha = _mm256_cvtps_ph(a,0); __m128i hb = _mm256_cvtps_ph(b,0); - h = _mm256_castps128_ps256(ha); - h = _mm256_insertf128_ps(h,hb,1); + h =(__m256i) _mm256_castps128_ps256((__m128)ha); + h =(__m256i) _mm256_insertf128_ps((__m256)h,(__m128)hb,1); #else assert(0); #endif @@ -486,8 +486,8 @@ namespace Optimization { } static inline void HtoS (__m256i h,__m256 &sa,__m256 &sb) { #ifdef USE_FP16 - sa = _mm256_cvtph_ps(_mm256_extractf128_ps(h,0)); - sb = _mm256_cvtph_ps(_mm256_extractf128_ps(h,1)); + sa = _mm256_cvtph_ps((__m128i)_mm256_extractf128_ps((__m256)h,0)); + sb = _mm256_cvtph_ps((__m128i)_mm256_extractf128_ps((__m256)h,1)); #else assert(0); #endif diff --git a/lib/stencil/Stencil.h b/lib/stencil/Stencil.h index 19ad90e2..44b766de 100644 --- a/lib/stencil/Stencil.h +++ b/lib/stencil/Stencil.h @@ -190,8 +190,38 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal //////////////////////////////////////// // Stencil query //////////////////////////////////////// - inline int GetNodeLocal(int osite) { - return _entries[_npoints*osite]._node_local; + inline int SameNode(int point) { + + int dimension = _directions[point]; + int displacement = _distances[point]; + assert( (displacement==1) || (displacement==-1)); + + int pd = _grid->_processors[dimension]; + int fd = _grid->_fdimensions[dimension]; + int ld = _grid->_ldimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + + int recv_from_rank; + int xmit_to_rank; + + if ( ! comm_dim ) return 1; + + int nbr_proc; + if (displacement==1) nbr_proc = 1; + else nbr_proc = pd-1; + + _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); + + void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,u_recv_buf_p); + + if ( shm==NULL ) return 0; + + return 1; + } + inline int GetNodeLocal(int osite,int point) { + return _entries[point+_npoints*osite]._node_local; } inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; From ab66bac4e6d7a6c2d4091faf3d61a3f59f1d88bd Mon Sep 17 00:00:00 2001 From: paboyle Date: Tue, 25 Apr 2017 08:50:26 +0100 Subject: [PATCH 27/31] Think I'm getting on top of the reduced cost exterior precomputed list of links --- benchmarks/Benchmark_dwf.cc | 2 +- lib/qcd/action/fermion/WilsonCompressor.h | 34 +++++++++++------------ lib/qcd/action/fermion/WilsonFermion5D.cc | 4 +-- lib/stencil/Stencil.h | 16 ++--------- 4 files changed, 22 insertions(+), 34 deletions(-) diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index 7009980c..838e3a82 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -166,7 +166,7 @@ int main (int argc, char ** argv) std::cout << GridLogMessage<< "*****************************************************************" <Barrier(); Dw.ZeroCounters(); diff --git a/lib/qcd/action/fermion/WilsonCompressor.h b/lib/qcd/action/fermion/WilsonCompressor.h index 2351d4cd..96cbe1ec 100644 --- a/lib/qcd/action/fermion/WilsonCompressor.h +++ b/lib/qcd/action/fermion/WilsonCompressor.h @@ -261,7 +261,7 @@ public: // Here we know the distance is 1 for WilsonStencil for(int point=0;point_npoints;point++){ same_node[point] = this->SameNode(point); - std::cout << " dir " <HaloGatherDir(source,XpCompress,Xp,face_idx); - same_node[Yp]=this->HaloGatherDir(source,YpCompress,Yp,face_idx); - same_node[Zp]=this->HaloGatherDir(source,ZpCompress,Zp,face_idx); - same_node[Tp]=this->HaloGatherDir(source,TpCompress,Tp,face_idx); - same_node[Xm]=this->HaloGatherDir(source,XmCompress,Xm,face_idx); - same_node[Ym]=this->HaloGatherDir(source,YmCompress,Ym,face_idx); - same_node[Zm]=this->HaloGatherDir(source,ZmCompress,Zm,face_idx); - same_node[Tm]=this->HaloGatherDir(source,TmCompress,Tm,face_idx); + assert(same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx)); + assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx)); + assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx)); + assert(same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx)); + assert(same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx)); + assert(same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx)); + assert(same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx)); + assert(same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx)); } else { - same_node[Xp]=this->HaloGatherDir(source,XmCompress,Xp,face_idx); - same_node[Yp]=this->HaloGatherDir(source,YmCompress,Yp,face_idx); - same_node[Zp]=this->HaloGatherDir(source,ZmCompress,Zp,face_idx); - same_node[Tp]=this->HaloGatherDir(source,TmCompress,Tp,face_idx); - same_node[Xm]=this->HaloGatherDir(source,XpCompress,Xm,face_idx); - same_node[Ym]=this->HaloGatherDir(source,YpCompress,Ym,face_idx); - same_node[Zm]=this->HaloGatherDir(source,ZpCompress,Zm,face_idx); - same_node[Tm]=this->HaloGatherDir(source,TpCompress,Tm,face_idx); + assert(same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx)); + assert(same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx)); + assert(same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx)); + assert(same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx)); + assert(same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx)); + assert(same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx)); + assert(same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx)); + assert(same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx)); } this->face_table_computed=1; assert(this->u_comm_offset==this->_unified_buffer_size); diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 11ab1ce2..13d26086 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -119,7 +119,7 @@ WilsonFermion5D::WilsonFermion5D(GaugeField &_Umu, ImportGauge(_Umu); // Build lists of exterior only nodes - int LLs = FourDimGrid._rdimensions[0]; + int LLs = FiveDimGrid._rdimensions[0]; int vol4; vol4=FourDimGrid.oSites(); Stencil.BuildSurfaceList(LLs,vol4); @@ -127,7 +127,7 @@ WilsonFermion5D::WilsonFermion5D(GaugeField &_Umu, StencilEven.BuildSurfaceList(LLs,vol4); StencilOdd.BuildSurfaceList(LLs,vol4); - std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size() + std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size() <<" " << StencilEven.surface_list.size()< >& table,const L uint16_t _is_local; uint16_t _permute; uint16_t _around_the_world; //256 bits, 32 bytes, 1/2 cacheline - uint16_t _node_local; + uint16_t _pad; }; //////////////////////////////////////// @@ -221,7 +221,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal return 1; } inline int GetNodeLocal(int osite,int point) { - return _entries[point+_npoints*osite]._node_local; + return _entries[point+_npoints*osite]._is_local; } inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; @@ -438,18 +438,6 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal _entries[i]._byte_offset = _entries[i]._offset*sizeof(cobj); } } - int osites = _grid->oSites(); - for(int o=0;o Date: Wed, 26 Apr 2017 02:34:25 -0400 Subject: [PATCH 28/31] Pretty code --- lib/qcd/action/fermion/WilsonKernels.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/qcd/action/fermion/WilsonKernels.h b/lib/qcd/action/fermion/WilsonKernels.h index 72f10b7f..2cf52660 100644 --- a/lib/qcd/action/fermion/WilsonKernels.h +++ b/lib/qcd/action/fermion/WilsonKernels.h @@ -63,7 +63,7 @@ public: switch(Opt) { #if defined(AVX512) || defined (QPX) case OptInlineAsm: - if(interior&&exterior) WilsonKernels::AsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out); + if(interior&&exterior) WilsonKernels::AsmDhopSite (st,lo,U,buf,sF,sU,Ls,Ns,in,out); else if (interior) WilsonKernels::AsmDhopSiteInt(st,lo,U,buf,sF,sU,Ls,Ns,in,out); else if (exterior) WilsonKernels::AsmDhopSiteExt(st,lo,U,buf,sF,sU,Ls,Ns,in,out); else assert(0); @@ -124,7 +124,7 @@ public: switch(Opt) { #if defined(AVX512) || defined (QPX) case OptInlineAsm: - if(interior&&exterior) WilsonKernels::AsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out); + if(interior&&exterior) WilsonKernels::AsmDhopSiteDag (st,lo,U,buf,sF,sU,Ls,Ns,in,out); else if (interior) WilsonKernels::AsmDhopSiteDagInt(st,lo,U,buf,sF,sU,Ls,Ns,in,out); else if (exterior) WilsonKernels::AsmDhopSiteDagExt(st,lo,U,buf,sF,sU,Ls,Ns,in,out); else assert(0); From fd1eb7de13f56b7407f90994a7e00d5b12961da2 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 26 Apr 2017 02:34:52 -0400 Subject: [PATCH 29/31] Clean implementation of the exterior faces listing only those points on the boudary --- lib/qcd/action/fermion/WilsonFermion5D.cc | 57 ++++------------------- 1 file changed, 10 insertions(+), 47 deletions(-) diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 13d26086..fb20108b 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -367,6 +367,7 @@ void WilsonFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, DhopTotalTime+=usecond(); } + template void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo, DoubledGaugeField & U, @@ -380,7 +381,7 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg int LLs = in._grid->_rdimensions[0]; int len = U._grid->oSites(); - + DhopFaceTime-=usecond(); st.HaloExchangeOptGather(in,compressor); DhopFaceTime+=usecond(); @@ -390,6 +391,7 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg st.CommunicateBegin(reqs); st.CommsMergeSHM(compressor); + // Perhaps use omp task and region #pragma omp parallel { int nthreads = omp_get_num_threads(); @@ -419,70 +421,31 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg // Load imbalance alert. Should use dynamic schedule OMP for loop // Perhaps create a list of only those sites with face work, and // load balance process the list. -#if 1 - -#if 0 -#pragma omp parallel - { - int nthreads = omp_get_num_threads(); - int me = omp_get_thread_num(); - int myoff, mywork; - - GridThread::GetWork(len,me,mywork,myoff,nthreads); - int sF = LLs * myoff; - - // Exterior links in stencil - if ( me==0 ) DhopComputeTime2-=usecond(); - if (dag == DaggerYes) Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,0,1); - else Kernels::DhopSite (st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,0,1); - if ( me==0 ) DhopComputeTime2+=usecond(); - }// end parallel region -#else DhopComputeTime2-=usecond(); if (dag == DaggerYes) { -#pragma omp parallel for schedule(static,1) - for (int ss = 0; ss < st.surface_list.size(); ss++) { + int sz=st.surface_list.size(); + parallel_for (int ss = 0; ss < sz; ss++) { int sU = st.surface_list[ss]; int sF = LLs * sU; Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1); } } else { -#pragma omp parallel for schedule(static,1) - for (int ss = 0; ss < st.surface_list.size(); ss++) { + int sz=st.surface_list.size(); + parallel_for (int ss = 0; ss < sz; ss++) { int sU = st.surface_list[ss]; int sF = LLs * sU; Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1); } } DhopComputeTime2+=usecond(); -#endif - - -#else -DhopComputeTime2-=usecond(); - if (dag == DaggerYes) { -#pragma omp parallel for schedule(static,4) - for (int ss = 0; ss < U._grid->oSites(); ss++) { - int sU = ss; - int sF = LLs * sU; - Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1); - } - } else { -#pragma omp parallel for schedule(static,1) - for (int ss = 0; ss < U._grid->oSites(); ss++) { - int sU = ss; - int sF = LLs * sU; - Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1); - } - } -DhopComputeTime2+=usecond(); -#endif - #else assert(0); #endif } + + + template void WilsonFermion5D::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo, DoubledGaugeField & U, From f8797e1e3e07fb5c828658246376167f58dd2ccb Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 26 Apr 2017 03:14:02 -0400 Subject: [PATCH 30/31] bug fix. works now and great face performance --- lib/qcd/action/fermion/WilsonKernelsAsmBody.h | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h index c9989cc2..db8651ab 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h +++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h @@ -87,7 +87,8 @@ #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ PF_GAUGE(Xp); \ - PREFETCH1_CHIMU(base);{ ZERO_PSI; } \ + PREFETCH1_CHIMU(base); \ + { ZERO_PSI; } \ ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) #define RESULT(base,basep) SAVE_RESULT(base,basep); @@ -111,6 +112,7 @@ #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ nmu=0; \ + { ZERO_PSI;} \ base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ if((!local)&&(!st.same_node[Dir]) ) { \ LOAD_CHI(base); \ @@ -118,7 +120,7 @@ LOAD64(%r10,isigns); \ RECON; \ nmu++; \ - } else { ZERO_PSI; }; + } #define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);} @@ -134,11 +136,15 @@ MASK_REGS; int nmax=U._grid->oSites(); for(int site=0;site=nmax) ssn=0; int sUn=lo.Reorder(ssn); -#ifndef EXTERIOR LOCK_GAUGE(0); +#else + int sU =ssU; + int ssn=ssU+1; if(ssn>=nmax) ssn=0; + int sUn=ssn; #endif for(int s=0;s Date: Wed, 26 Apr 2017 08:43:20 +0100 Subject: [PATCH 31/31] longer nloop --- benchmarks/Benchmark_dwf.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index 838e3a82..7009980c 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -166,7 +166,7 @@ int main (int argc, char ** argv) std::cout << GridLogMessage<< "*****************************************************************" <Barrier(); Dw.ZeroCounters();