From a32ac287bb1eccc88088de51d5fc483f5f8e2f72 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 26 May 2015 19:54:03 +0100 Subject: [PATCH 1/4] Hand unrolled version of dslash in a separate class. Useful to compare; raises Intel compiler from 9GFlop/s to 17.5 Gflops. on ivybridge core. Raises Clang form 14.5 to 17.5 --- benchmarks/Grid_wilson.cc | 14 +- lib/Grid.h | 1 - lib/Grid_init.cc | 3 + lib/Grid_simd.h | 100 +++++-- lib/Makefile.am | 2 + lib/lattice/Grid_lattice_base.h | 12 +- lib/lattice/Grid_lattice_conformable.h | 7 +- lib/qcd/Grid_qcd_wilson_dop.cc | 343 ++----------------------- lib/qcd/Grid_qcd_wilson_dop.h | 45 +++- 9 files changed, 157 insertions(+), 370 deletions(-) diff --git a/benchmarks/Grid_wilson.cc b/benchmarks/Grid_wilson.cc index 32255b3e..3b0d04bc 100644 --- a/benchmarks/Grid_wilson.cc +++ b/benchmarks/Grid_wilson.cc @@ -31,11 +31,9 @@ int main (int argc, char ** argv) std::cout << "Grid is setup to use "< seeds({1,2,3,4}); - GridParallelRNG pRNG(&Grid); - // std::vector seeds({1,2,3,4}); - // pRNG.SeedFixedIntegers(seeds); - pRNG.SeedRandomDevice(); + pRNG.SeedFixedIntegers(seeds); + // pRNG.SeedRandomDevice(); LatticeFermion src (&Grid); random(pRNG,src); LatticeFermion result(&Grid); result=zero; @@ -55,8 +53,10 @@ int main (int argc, char ** argv) Complex cone(1.0,0.0); for(int nn=0;nn(Umu,U[nn],nn); } @@ -85,7 +85,7 @@ int main (int argc, char ** argv) WilsonMatrix Dw(Umu,Grid,RBGrid,mass); std::cout << "Calling Dw"< +inline void Gpermute0(vsimd &y,const vsimd &b) { + union { + fvec f; + decltype(vsimd::v) v; + } conv; + conv.v = b.v; +#ifdef SSE4 + conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); +#endif +#if defined(AVX1)||defined(AVX2) + conv.f = _mm256_permute2f128_ps(conv.f,conv.f,0x01); +#endif +#ifdef AVX512 + conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); +#endif + y.v=conv.v; +}; +template +inline void Gpermute1(vsimd &y,const vsimd &b) { + union { + fvec f; + decltype(vsimd::v) v; + } conv; + conv.v = b.v; +#ifdef SSE4 + conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); +#endif +#if defined(AVX1)||defined(AVX2) + conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); +#endif +#ifdef AVX512 + conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); +#endif + y.v=conv.v; +}; +template +inline void Gpermute2(vsimd &y,const vsimd &b) { + union { + fvec f; + decltype(vsimd::v) v; + } conv; + conv.v = b.v; +#ifdef SSE4 +#endif +#if defined(AVX1)||defined(AVX2) + conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); +#endif +#ifdef AVX512 + conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_BADC); +#endif + y.v=conv.v; + +}; +template +inline void Gpermute3(vsimd &y,const vsimd &b) { + union { + fvec f; + decltype(vsimd::v) v; + } conv; + conv.v = b.v; +#ifdef AVX512 + conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_CDAB); +#endif + y.v=conv.v; + +}; + template inline void Gpermute(vsimd &y,const vsimd &b,int perm){ union { @@ -170,36 +238,12 @@ inline void Gpermute(vsimd &y,const vsimd &b,int perm){ } conv; conv.v = b.v; switch (perm){ -#if defined(AVX1)||defined(AVX2) - // 8x32 bits=>3 permutes - case 2: - conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); - break; - case 1: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break; - case 0: conv.f = _mm256_permute2f128_ps(conv.f,conv.f,0x01); break; -#endif -#ifdef SSE4 - case 1: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break; - case 0: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2));break; -#endif -#ifdef AVX512 - // 16 floats=> permutes - // Permute 0 every abcd efgh ijkl mnop -> badc fehg jilk nmpo - // Permute 1 every abcd efgh ijkl mnop -> cdab ghef jkij opmn - // Permute 2 every abcd efgh ijkl mnop -> efgh abcd mnop ijkl - // Permute 3 every abcd efgh ijkl mnop -> ijkl mnop abcd efgh - case 3: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_CDAB); break; - case 2: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_BADC); break; - case 1: conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break; - case 0: conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break; -#endif -#ifdef QPX -#error not implemented -#endif + case 3: Gpermute3(y,b); break; + case 2: Gpermute2(y,b); break; + case 1: Gpermute1(y,b); break; + case 0: Gpermute0(y,b); break; default: assert(0); break; } - y.v=conv.v; - }; }; diff --git a/lib/Makefile.am b/lib/Makefile.am index 82459763..6bb5e187 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -18,6 +18,8 @@ libGrid_a_SOURCES = \ Grid_init.cc \ stencil/Grid_stencil_common.cc \ qcd/Grid_qcd_dirac.cc \ + qcd/Grid_qcd_dhop.cc \ + qcd/Grid_qcd_dhop_hand.cc \ qcd/Grid_qcd_wilson_dop.cc \ algorithms/approx/Zolotarev.cc \ algorithms/approx/Remez.cc \ diff --git a/lib/lattice/Grid_lattice_base.h b/lib/lattice/Grid_lattice_base.h index 1d3b1efb..4a6d3180 100644 --- a/lib/lattice/Grid_lattice_base.h +++ b/lib/lattice/Grid_lattice_base.h @@ -47,6 +47,11 @@ class LatticeTrinaryExpression :public std::pair >, publ LatticeTrinaryExpression(const std::pair > &arg): std::pair >(arg) {}; }; +void inline conformable(GridBase *lhs,GridBase *rhs) +{ + assert(lhs == rhs); +} + template class Lattice : public LatticeBase { @@ -60,7 +65,8 @@ public: typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; typedef vobj vector_object; - + + //////////////////////////////////////////////////////////////////////////////// // Expression Template closure support //////////////////////////////////////////////////////////////////////////////// @@ -276,17 +282,15 @@ PARALLEL_FOR_LOOP } -#include +#include #define GRID_LATTICE_EXPRESSION_TEMPLATES #ifdef GRID_LATTICE_EXPRESSION_TEMPLATES #include #else #include #endif - #include - #include #include #include diff --git a/lib/lattice/Grid_lattice_conformable.h b/lib/lattice/Grid_lattice_conformable.h index faa8c7a7..a77e57af 100644 --- a/lib/lattice/Grid_lattice_conformable.h +++ b/lib/lattice/Grid_lattice_conformable.h @@ -3,16 +3,11 @@ namespace Grid { - template - void conformable(const Lattice &lhs,const Lattice &rhs) + template void conformable(const Lattice &lhs,const Lattice &rhs) { assert(lhs._grid == rhs._grid); assert(lhs.checkerboard == rhs.checkerboard); } - void inline conformable(const GridBase *lhs,GridBase *rhs) - { - assert(lhs == rhs); - } } #endif diff --git a/lib/qcd/Grid_qcd_wilson_dop.cc b/lib/qcd/Grid_qcd_wilson_dop.cc index 318e18df..9a3f5f6a 100644 --- a/lib/qcd/Grid_qcd_wilson_dop.cc +++ b/lib/qcd/Grid_qcd_wilson_dop.cc @@ -1,4 +1,3 @@ - #include namespace Grid { @@ -7,15 +6,7 @@ namespace QCD { const std::vector WilsonMatrix::directions ({0,1,2,3, 0, 1, 2, 3}); const std::vector WilsonMatrix::displacements({1,1,1,1,-1,-1,-1,-1}); - // Should be in header? -const int WilsonMatrix::Xp = 0; -const int WilsonMatrix::Yp = 1; -const int WilsonMatrix::Zp = 2; -const int WilsonMatrix::Tp = 3; -const int WilsonMatrix::Xm = 4; -const int WilsonMatrix::Ym = 5; -const int WilsonMatrix::Zm = 6; -const int WilsonMatrix::Tm = 7; + int WilsonMatrix::HandOptDslash; class WilsonCompressor { public: @@ -39,28 +30,28 @@ const int WilsonMatrix::Tm = 7; mudag=(mu+Nd)%(2*Nd); } switch(mudag) { - case WilsonMatrix::Xp: + case Xp: spProjXp(ret,in); break; - case WilsonMatrix::Yp: + case Yp: spProjYp(ret,in); break; - case WilsonMatrix::Zp: + case Zp: spProjZp(ret,in); break; - case WilsonMatrix::Tp: + case Tp: spProjTp(ret,in); break; - case WilsonMatrix::Xm: + case Xm: spProjXm(ret,in); break; - case WilsonMatrix::Ym: + case Ym: spProjYm(ret,in); break; - case WilsonMatrix::Zm: + case Zm: spProjZm(ret,in); break; - case WilsonMatrix::Tm: + case Tm: spProjTm(ret,in); break; default: @@ -157,316 +148,36 @@ void WilsonMatrix::MooeeInvDag(const LatticeFermion &in, LatticeFermion &out) MooeeInv(in,out); } -void WilsonMatrix::DhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U, - std::vector > &buf, - int ss,const LatticeFermion &in, LatticeFermion &out) -{ - vHalfSpinColourVector tmp; - vHalfSpinColourVector chi; - vSpinColourVector result; - vHalfSpinColourVector Uchi; - int offset,local,perm, ptype; - - //#define VERBOSE( A) if ( ss<10 ) { std::cout << "site " < > &buf, - int ss,const LatticeFermion &in, LatticeFermion &out) -{ - vHalfSpinColourVector tmp; - vHalfSpinColourVector chi; - vSpinColourVector result; - vHalfSpinColourVector Uchi; - int offset,local,perm, ptype; - - // Xp - offset = st._offsets [Xm][ss]; - local = st._is_local[Xm][ss]; - perm = st._permute[Xm][ss]; - - ptype = st._permute_type[Xm]; - if ( local && perm ) { - spProjXp(tmp,in._odata[offset]); - permute(chi,tmp,ptype); - } else if ( local ) { - spProjXp(chi,in._odata[offset]); - } else { - chi=buf[offset]; - } - mult(&Uchi(),&U._odata[ss](Xm),&chi()); - spReconXp(result,Uchi); - - // Yp - offset = st._offsets [Ym][ss]; - local = st._is_local[Ym][ss]; - perm = st._permute[Ym][ss]; - ptype = st._permute_type[Ym]; - if ( local && perm ) { - spProjYp(tmp,in._odata[offset]); - permute(chi,tmp,ptype); - } else if ( local ) { - spProjYp(chi,in._odata[offset]); - } else { - chi=buf[offset]; - } - mult(&Uchi(),&U._odata[ss](Ym),&chi()); - accumReconYp(result,Uchi); - - // Zp - offset = st._offsets [Zm][ss]; - local = st._is_local[Zm][ss]; - perm = st._permute[Zm][ss]; - ptype = st._permute_type[Zm]; - if ( local && perm ) { - spProjZp(tmp,in._odata[offset]); - permute(chi,tmp,ptype); - } else if ( local ) { - spProjZp(chi,in._odata[offset]); - } else { - chi=buf[offset]; - } - mult(&Uchi(),&U._odata[ss](Zm),&chi()); - accumReconZp(result,Uchi); - - // Tp - offset = st._offsets [Tm][ss]; - local = st._is_local[Tm][ss]; - perm = st._permute[Tm][ss]; - ptype = st._permute_type[Tm]; - if ( local && perm ) { - spProjTp(tmp,in._odata[offset]); - permute(chi,tmp,ptype); - } else if ( local ) { - spProjTp(chi,in._odata[offset]); - } else { - chi=buf[offset]; - } - mult(&Uchi(),&U._odata[ss](Tm),&chi()); - accumReconTp(result,Uchi); - - // Xm - offset = st._offsets [Xp][ss]; - local = st._is_local[Xp][ss]; - perm = st._permute[Xp][ss]; - ptype = st._permute_type[Xp]; - - if ( local && perm ) - { - spProjXm(tmp,in._odata[offset]); - permute(chi,tmp,ptype); - } else if ( local ) { - spProjXm(chi,in._odata[offset]); - } else { - chi=buf[offset]; - } - mult(&Uchi(),&U._odata[ss](Xp),&chi()); - accumReconXm(result,Uchi); - - // Ym - offset = st._offsets [Yp][ss]; - local = st._is_local[Yp][ss]; - perm = st._permute[Yp][ss]; - ptype = st._permute_type[Yp]; - - if ( local && perm ) { - spProjYm(tmp,in._odata[offset]); - permute(chi,tmp,ptype); - } else if ( local ) { - spProjYm(chi,in._odata[offset]); - } else { - chi=buf[offset]; - } - mult(&Uchi(),&U._odata[ss](Yp),&chi()); - accumReconYm(result,Uchi); - - // Zm - offset = st._offsets [Zp][ss]; - local = st._is_local[Zp][ss]; - perm = st._permute[Zp][ss]; - ptype = st._permute_type[Zp]; - if ( local && perm ) { - spProjZm(tmp,in._odata[offset]); - permute(chi,tmp,ptype); - } else if ( local ) { - spProjZm(chi,in._odata[offset]); - } else { - chi=buf[offset]; - } - mult(&Uchi(),&U._odata[ss](Zp),&chi()); - accumReconZm(result,Uchi); - - // Tm - offset = st._offsets [Tp][ss]; - local = st._is_local[Tp][ss]; - perm = st._permute[Tp][ss]; - ptype = st._permute_type[Tp]; - if ( local && perm ) { - spProjTm(tmp,in._odata[offset]); - permute(chi,tmp,ptype); - } else if ( local ) { - spProjTm(chi,in._odata[offset]); - } else { - chi=buf[offset]; - } - mult(&Uchi(),&U._odata[ss](Tp),&chi()); - accumReconTm(result,Uchi); - - vstream(out._odata[ss],result); -} - void WilsonMatrix::DhopInternal(CartesianStencil & st,LatticeDoubledGaugeField & U, const LatticeFermion &in, LatticeFermion &out,int dag) { assert((dag==DaggerNo) ||(dag==DaggerYes)); WilsonCompressor compressor(dag); - st.HaloExchange(in,comm_buf,compressor); if ( dag == DaggerYes ) { + if( HandOptDslash ) { PARALLEL_FOR_LOOP - for(int sss=0;sssoSites();sss++){ - DhopSiteDag(st,U,comm_buf,sss,in,out); + for(int sss=0;sssoSites();sss++){ + DiracOptHand::DhopSiteDag(st,U,comm_buf,sss,in,out); + } + } else { +PARALLEL_FOR_LOOP + for(int sss=0;sssoSites();sss++){ + DiracOpt::DhopSiteDag(st,U,comm_buf,sss,in,out); + } } } else { + if( HandOptDslash ) { PARALLEL_FOR_LOOP - for(int sss=0;sssoSites();sss++){ - DhopSite(st,U,comm_buf,sss,in,out); + for(int sss=0;sssoSites();sss++){ + DiracOptHand::DhopSite(st,U,comm_buf,sss,in,out); + } + } else { +PARALLEL_FOR_LOOP + for(int sss=0;sssoSites();sss++){ + DiracOpt::DhopSite(st,U,comm_buf,sss,in,out); + } } } } diff --git a/lib/qcd/Grid_qcd_wilson_dop.h b/lib/qcd/Grid_qcd_wilson_dop.h index 96b29cd0..87418603 100644 --- a/lib/qcd/Grid_qcd_wilson_dop.h +++ b/lib/qcd/Grid_qcd_wilson_dop.h @@ -6,10 +6,22 @@ namespace Grid { namespace QCD { + // Should be in header? + const int Xp = 0; + const int Yp = 1; + const int Zp = 2; + const int Tp = 3; + const int Xm = 4; + const int Ym = 5; + const int Zm = 6; + const int Tm = 7; + class WilsonMatrix : public CheckerBoardedSparseMatrixBase { //NB r=1; public: + static int HandOptDslash; + double mass; // GridBase * grid; // Inherited // GridBase * cbgrid; @@ -56,14 +68,6 @@ namespace Grid { void DhopEO(const LatticeFermion &in, LatticeFermion &out,int dag); void DhopInternal(CartesianStencil & st,LatticeDoubledGaugeField &U, const LatticeFermion &in, LatticeFermion &out,int dag); - // These ones will need to be package intelligently. WilsonType base class - // for use by DWF etc.. - void DhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U, - std::vector > &buf, - int ss,const LatticeFermion &in, LatticeFermion &out); - void DhopSiteDag(CartesianStencil &st,LatticeDoubledGaugeField &U, - std::vector > &buf, - int ss,const LatticeFermion &in, LatticeFermion &out); typedef iScalar > matrix; @@ -71,6 +75,31 @@ namespace Grid { }; + class DiracOpt { + public: + // These ones will need to be package intelligently. WilsonType base class + // for use by DWF etc.. + static void DhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U, + std::vector > &buf, + int ss,const LatticeFermion &in, LatticeFermion &out); + static void DhopSiteDag(CartesianStencil &st,LatticeDoubledGaugeField &U, + std::vector > &buf, + int ss,const LatticeFermion &in, LatticeFermion &out); + + }; + class DiracOptHand { + public: + // These ones will need to be package intelligently. WilsonType base class + // for use by DWF etc.. + static void DhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U, + std::vector > &buf, + int ss,const LatticeFermion &in, LatticeFermion &out); + static void DhopSiteDag(CartesianStencil &st,LatticeDoubledGaugeField &U, + std::vector > &buf, + int ss,const LatticeFermion &in, LatticeFermion &out); + + }; + } } #endif From 5e72e4c0d9690bcf36951d4e29b14b134e80b2d8 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 26 May 2015 19:55:18 +0100 Subject: [PATCH 2/4] Strip out the dslash kernel implementation --- lib/qcd/Grid_qcd_dhop.cc | 309 ++++++++++++++ lib/qcd/Grid_qcd_dhop_hand.cc | 769 ++++++++++++++++++++++++++++++++++ 2 files changed, 1078 insertions(+) create mode 100644 lib/qcd/Grid_qcd_dhop.cc create mode 100644 lib/qcd/Grid_qcd_dhop_hand.cc diff --git a/lib/qcd/Grid_qcd_dhop.cc b/lib/qcd/Grid_qcd_dhop.cc new file mode 100644 index 00000000..1e5dcd16 --- /dev/null +++ b/lib/qcd/Grid_qcd_dhop.cc @@ -0,0 +1,309 @@ +#include + +namespace Grid { +namespace QCD { + +void DiracOpt::DhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U, + std::vector > &buf, + int ss,const LatticeFermion &in, LatticeFermion &out) +{ + vHalfSpinColourVector tmp; + vHalfSpinColourVector chi; + vSpinColourVector result; + vHalfSpinColourVector Uchi; + int offset,local,perm, ptype; + + //#define VERBOSE( A) if ( ss<10 ) { std::cout << "site " < > &buf, + int ss,const LatticeFermion &in, LatticeFermion &out) +{ + vHalfSpinColourVector tmp; + vHalfSpinColourVector chi; + vSpinColourVector result; + vHalfSpinColourVector Uchi; + int offset,local,perm, ptype; + + // Xp + offset = st._offsets [Xm][ss]; + local = st._is_local[Xm][ss]; + perm = st._permute[Xm][ss]; + + ptype = st._permute_type[Xm]; + if ( local && perm ) { + spProjXp(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { + spProjXp(chi,in._odata[offset]); + } else { + chi=buf[offset]; + } + mult(&Uchi(),&U._odata[ss](Xm),&chi()); + spReconXp(result,Uchi); + + // Yp + offset = st._offsets [Ym][ss]; + local = st._is_local[Ym][ss]; + perm = st._permute[Ym][ss]; + ptype = st._permute_type[Ym]; + if ( local && perm ) { + spProjYp(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { + spProjYp(chi,in._odata[offset]); + } else { + chi=buf[offset]; + } + mult(&Uchi(),&U._odata[ss](Ym),&chi()); + accumReconYp(result,Uchi); + + // Zp + offset = st._offsets [Zm][ss]; + local = st._is_local[Zm][ss]; + perm = st._permute[Zm][ss]; + ptype = st._permute_type[Zm]; + if ( local && perm ) { + spProjZp(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { + spProjZp(chi,in._odata[offset]); + } else { + chi=buf[offset]; + } + mult(&Uchi(),&U._odata[ss](Zm),&chi()); + accumReconZp(result,Uchi); + + // Tp + offset = st._offsets [Tm][ss]; + local = st._is_local[Tm][ss]; + perm = st._permute[Tm][ss]; + ptype = st._permute_type[Tm]; + if ( local && perm ) { + spProjTp(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { + spProjTp(chi,in._odata[offset]); + } else { + chi=buf[offset]; + } + mult(&Uchi(),&U._odata[ss](Tm),&chi()); + accumReconTp(result,Uchi); + + // Xm + offset = st._offsets [Xp][ss]; + local = st._is_local[Xp][ss]; + perm = st._permute[Xp][ss]; + ptype = st._permute_type[Xp]; + + if ( local && perm ) + { + spProjXm(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { + spProjXm(chi,in._odata[offset]); + } else { + chi=buf[offset]; + } + mult(&Uchi(),&U._odata[ss](Xp),&chi()); + accumReconXm(result,Uchi); + + // Ym + offset = st._offsets [Yp][ss]; + local = st._is_local[Yp][ss]; + perm = st._permute[Yp][ss]; + ptype = st._permute_type[Yp]; + + if ( local && perm ) { + spProjYm(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { + spProjYm(chi,in._odata[offset]); + } else { + chi=buf[offset]; + } + mult(&Uchi(),&U._odata[ss](Yp),&chi()); + accumReconYm(result,Uchi); + + // Zm + offset = st._offsets [Zp][ss]; + local = st._is_local[Zp][ss]; + perm = st._permute[Zp][ss]; + ptype = st._permute_type[Zp]; + if ( local && perm ) { + spProjZm(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { + spProjZm(chi,in._odata[offset]); + } else { + chi=buf[offset]; + } + mult(&Uchi(),&U._odata[ss](Zp),&chi()); + accumReconZm(result,Uchi); + + // Tm + offset = st._offsets [Tp][ss]; + local = st._is_local[Tp][ss]; + perm = st._permute[Tp][ss]; + ptype = st._permute_type[Tp]; + if ( local && perm ) { + spProjTm(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { + spProjTm(chi,in._odata[offset]); + } else { + chi=buf[offset]; + } + mult(&Uchi(),&U._odata[ss](Tp),&chi()); + accumReconTm(result,Uchi); + + vstream(out._odata[ss],result); +} +}} diff --git a/lib/qcd/Grid_qcd_dhop_hand.cc b/lib/qcd/Grid_qcd_dhop_hand.cc new file mode 100644 index 00000000..f8d464fb --- /dev/null +++ b/lib/qcd/Grid_qcd_dhop_hand.cc @@ -0,0 +1,769 @@ +#include + +#define REGISTER + +#define LOAD_CHIMU \ + const vSpinColourVector & ref (in._odata[offset]); \ + Chimu_00=ref()(0)(0);\ + Chimu_01=ref()(0)(1);\ + Chimu_02=ref()(0)(2);\ + Chimu_10=ref()(1)(0);\ + Chimu_11=ref()(1)(1);\ + Chimu_12=ref()(1)(2);\ + Chimu_20=ref()(2)(0);\ + Chimu_21=ref()(2)(1);\ + Chimu_22=ref()(2)(2);\ + Chimu_30=ref()(3)(0);\ + Chimu_31=ref()(3)(1);\ + Chimu_32=ref()(3)(2); + +#define LOAD_CHI\ + const vHalfSpinColourVector &ref(buf[offset]); \ + Chi_00 = ref()(0)(0);\ + Chi_01 = ref()(0)(1);\ + Chi_02 = ref()(0)(2);\ + Chi_10 = ref()(1)(0);\ + Chi_11 = ref()(1)(1);\ + Chi_12 = ref()(1)(2); + +#define MULT_2SPIN(A)\ + auto & ref(U._odata[ss](A)); \ + U_00 = ref()(0,0);\ + U_10 = ref()(1,0);\ + U_20 = ref()(2,0);\ + U_01 = ref()(0,1);\ + U_11 = ref()(1,1); \ + U_21 = ref()(2,1);\ + UChi_00 = U_00*Chi_00;\ + UChi_10 = U_00*Chi_10;\ + UChi_01 = U_10*Chi_00;\ + UChi_11 = U_10*Chi_10;\ + UChi_02 = U_20*Chi_00;\ + UChi_12 = U_20*Chi_10;\ + UChi_00+= U_01*Chi_01;\ + UChi_10+= U_01*Chi_11;\ + UChi_01+= U_11*Chi_01;\ + UChi_11+= U_11*Chi_11;\ + UChi_02+= U_21*Chi_01;\ + UChi_12+= U_21*Chi_11;\ + U_00 = ref()(0,2);\ + U_10 = ref()(1,2);\ + U_20 = ref()(2,2);\ + UChi_00+= U_00*Chi_02;\ + UChi_10+= U_00*Chi_12;\ + UChi_01+= U_10*Chi_02;\ + UChi_11+= U_10*Chi_12;\ + UChi_02+= U_20*Chi_02;\ + UChi_12+= U_20*Chi_12; + +#define PERMUTE\ + permute(Chi_00,Chi_00,ptype);\ + permute(Chi_01,Chi_01,ptype);\ + permute(Chi_02,Chi_02,ptype);\ + permute(Chi_10,Chi_10,ptype);\ + permute(Chi_11,Chi_11,ptype);\ + permute(Chi_12,Chi_12,ptype); + +// hspin(0)=fspin(0)+timesI(fspin(3)); +// hspin(1)=fspin(1)+timesI(fspin(2)); +#define XP_PROJ \ + Chi_00 = Chimu_00+timesI(Chimu_30);\ + Chi_01 = Chimu_01+timesI(Chimu_31);\ + Chi_02 = Chimu_02+timesI(Chimu_32);\ + Chi_10 = Chimu_10+timesI(Chimu_20);\ + Chi_11 = Chimu_11+timesI(Chimu_21);\ + Chi_12 = Chimu_12+timesI(Chimu_22); + +#define YP_PROJ \ + Chi_00 = Chimu_00-Chimu_30;\ + Chi_01 = Chimu_01-Chimu_31;\ + Chi_02 = Chimu_02-Chimu_32;\ + Chi_10 = Chimu_10+Chimu_20;\ + Chi_11 = Chimu_11+Chimu_21;\ + Chi_12 = Chimu_12+Chimu_22; + +#define ZP_PROJ \ + Chi_00 = Chimu_00+timesI(Chimu_20); \ + Chi_01 = Chimu_01+timesI(Chimu_21); \ + Chi_02 = Chimu_02+timesI(Chimu_22); \ + Chi_10 = Chimu_10-timesI(Chimu_30); \ + Chi_11 = Chimu_11-timesI(Chimu_31); \ + Chi_12 = Chimu_12-timesI(Chimu_32); + +#define TP_PROJ \ + Chi_00 = Chimu_00+Chimu_20; \ + Chi_01 = Chimu_01+Chimu_21; \ + Chi_02 = Chimu_02+Chimu_22; \ + Chi_10 = Chimu_10+Chimu_30; \ + Chi_11 = Chimu_11+Chimu_31; \ + Chi_12 = Chimu_12+Chimu_32; + + +// hspin(0)=fspin(0)-timesI(fspin(3)); +// hspin(1)=fspin(1)-timesI(fspin(2)); +#define XM_PROJ \ + Chi_00 = Chimu_00-timesI(Chimu_30);\ + Chi_01 = Chimu_01-timesI(Chimu_31);\ + Chi_02 = Chimu_02-timesI(Chimu_32);\ + Chi_10 = Chimu_10-timesI(Chimu_20);\ + Chi_11 = Chimu_11-timesI(Chimu_21);\ + Chi_12 = Chimu_12-timesI(Chimu_22); + +#define YM_PROJ \ + Chi_00 = Chimu_00+Chimu_30;\ + Chi_01 = Chimu_01+Chimu_31;\ + Chi_02 = Chimu_02+Chimu_32;\ + Chi_10 = Chimu_10-Chimu_20;\ + Chi_11 = Chimu_11-Chimu_21;\ + Chi_12 = Chimu_12-Chimu_22; + +#define ZM_PROJ \ + Chi_00 = Chimu_00-timesI(Chimu_20); \ + Chi_01 = Chimu_01-timesI(Chimu_21); \ + Chi_02 = Chimu_02-timesI(Chimu_22); \ + Chi_10 = Chimu_10+timesI(Chimu_30); \ + Chi_11 = Chimu_11+timesI(Chimu_31); \ + Chi_12 = Chimu_12+timesI(Chimu_32); + +#define TM_PROJ \ + Chi_00 = Chimu_00-Chimu_20; \ + Chi_01 = Chimu_01-Chimu_21; \ + Chi_02 = Chimu_02-Chimu_22; \ + Chi_10 = Chimu_10-Chimu_30; \ + Chi_11 = Chimu_11-Chimu_31; \ + Chi_12 = Chimu_12-Chimu_32; + +// fspin(0)=hspin(0); +// fspin(1)=hspin(1); +// fspin(2)=timesMinusI(hspin(1)); +// fspin(3)=timesMinusI(hspin(0)); +#define XP_RECON\ + result_00 = UChi_00;\ + result_01 = UChi_01;\ + result_02 = UChi_02;\ + result_10 = UChi_10;\ + result_11 = UChi_11;\ + result_12 = UChi_12;\ + result_20 = timesMinusI(UChi_10);\ + result_21 = timesMinusI(UChi_11);\ + result_22 = timesMinusI(UChi_12);\ + result_30 = timesMinusI(UChi_00);\ + result_31 = timesMinusI(UChi_01);\ + result_32 = timesMinusI(UChi_02); + +#define XP_RECON_ACCUM\ + result_00+=UChi_00;\ + result_01+=UChi_01;\ + result_02+=UChi_02;\ + result_10+=UChi_10;\ + result_11+=UChi_11;\ + result_12+=UChi_12;\ + result_20-=timesI(UChi_10);\ + result_21-=timesI(UChi_11);\ + result_22-=timesI(UChi_12);\ + result_30-=timesI(UChi_00);\ + result_31-=timesI(UChi_01);\ + result_32-=timesI(UChi_02); + +#define XM_RECON\ + result_00 = UChi_00;\ + result_01 = UChi_01;\ + result_02 = UChi_02;\ + result_10 = UChi_10;\ + result_11 = UChi_11;\ + result_12 = UChi_12;\ + result_20 = timesI(UChi_10);\ + result_21 = timesI(UChi_11);\ + result_22 = timesI(UChi_12);\ + result_30 = timesI(UChi_00);\ + result_31 = timesI(UChi_01);\ + result_32 = timesI(UChi_02); + +#define XM_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20+= timesI(UChi_10);\ + result_21+= timesI(UChi_11);\ + result_22+= timesI(UChi_12);\ + result_30+= timesI(UChi_00);\ + result_31+= timesI(UChi_01);\ + result_32+= timesI(UChi_02); + +#define YP_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20+= UChi_10;\ + result_21+= UChi_11;\ + result_22+= UChi_12;\ + result_30-= UChi_00;\ + result_31-= UChi_01;\ + result_32-= UChi_02; + +#define YM_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20-= UChi_10;\ + result_21-= UChi_11;\ + result_22-= UChi_12;\ + result_30+= UChi_00;\ + result_31+= UChi_01;\ + result_32+= UChi_02; + +#define ZP_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20-= timesI(UChi_00); \ + result_21-= timesI(UChi_01); \ + result_22-= timesI(UChi_02); \ + result_30+= timesI(UChi_10); \ + result_31+= timesI(UChi_11); \ + result_32+= timesI(UChi_12); + +#define ZM_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20+= timesI(UChi_00); \ + result_21+= timesI(UChi_01); \ + result_22+= timesI(UChi_02); \ + result_30-= timesI(UChi_10); \ + result_31-= timesI(UChi_11); \ + result_32-= timesI(UChi_12); + +#define TP_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20+= UChi_00; \ + result_21+= UChi_01; \ + result_22+= UChi_02; \ + result_30+= UChi_10; \ + result_31+= UChi_11; \ + result_32+= UChi_12; + +#define TM_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20-= UChi_00; \ + result_21-= UChi_01; \ + result_22-= UChi_02; \ + result_30-= UChi_10; \ + result_31-= UChi_11; \ + result_32-= UChi_12; + +namespace Grid { +namespace QCD { + +void DiracOptHand::DhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U, + std::vector > &buf, + int ss,const LatticeFermion &in, LatticeFermion &out) +{ + REGISTER vComplex result_00; // 12 regs on knc + REGISTER vComplex result_01; + REGISTER vComplex result_02; + + REGISTER vComplex result_10; + REGISTER vComplex result_11; + REGISTER vComplex result_12; + + REGISTER vComplex result_20; + REGISTER vComplex result_21; + REGISTER vComplex result_22; + + REGISTER vComplex result_30; + REGISTER vComplex result_31; + REGISTER vComplex result_32; // 20 left + + REGISTER vComplex Chi_00; // two spinor; 6 regs + REGISTER vComplex Chi_01; + REGISTER vComplex Chi_02; + + REGISTER vComplex Chi_10; + REGISTER vComplex Chi_11; + REGISTER vComplex Chi_12; // 14 left + + REGISTER vComplex UChi_00; // two spinor; 6 regs + REGISTER vComplex UChi_01; + REGISTER vComplex UChi_02; + + REGISTER vComplex UChi_10; + REGISTER vComplex UChi_11; + REGISTER vComplex UChi_12; // 8 left + + REGISTER vComplex U_00; // two rows of U matrix + REGISTER vComplex U_10; + REGISTER vComplex U_20; + REGISTER vComplex U_01; + REGISTER vComplex U_11; + REGISTER vComplex U_21; // 2 reg left. + +#define Chimu_00 Chi_00 +#define Chimu_01 Chi_01 +#define Chimu_02 Chi_02 +#define Chimu_10 Chi_10 +#define Chimu_11 Chi_11 +#define Chimu_12 Chi_12 +#define Chimu_20 UChi_00 +#define Chimu_21 UChi_01 +#define Chimu_22 UChi_02 +#define Chimu_30 UChi_10 +#define Chimu_31 UChi_11 +#define Chimu_32 UChi_12 + + + int offset,local,perm, ptype; + + // Xp + offset = st._offsets [Xp][ss]; + local = st._is_local[Xp][ss]; + perm = st._permute[Xp][ss]; + ptype = st._permute_type[Xp]; + + if ( local ) { + LOAD_CHIMU; + XP_PROJ; + if ( perm) { + PERMUTE; + } + } else { + LOAD_CHI; + } + + { + MULT_2SPIN(Xp); + } + XP_RECON; + // std::cout << "XP_RECON"< > &buf, + int ss,const LatticeFermion &in, LatticeFermion &out) +{ + REGISTER vComplex result_00; // 12 regs on knc + REGISTER vComplex result_01; + REGISTER vComplex result_02; + + REGISTER vComplex result_10; + REGISTER vComplex result_11; + REGISTER vComplex result_12; + + REGISTER vComplex result_20; + REGISTER vComplex result_21; + REGISTER vComplex result_22; + + REGISTER vComplex result_30; + REGISTER vComplex result_31; + REGISTER vComplex result_32; // 20 left + + REGISTER vComplex Chi_00; // two spinor; 6 regs + REGISTER vComplex Chi_01; + REGISTER vComplex Chi_02; + + REGISTER vComplex Chi_10; + REGISTER vComplex Chi_11; + REGISTER vComplex Chi_12; // 14 left + + REGISTER vComplex UChi_00; // two spinor; 6 regs + REGISTER vComplex UChi_01; + REGISTER vComplex UChi_02; + + REGISTER vComplex UChi_10; + REGISTER vComplex UChi_11; + REGISTER vComplex UChi_12; // 8 left + + REGISTER vComplex U_00; // two rows of U matrix + REGISTER vComplex U_10; + REGISTER vComplex U_20; + REGISTER vComplex U_01; + REGISTER vComplex U_11; + REGISTER vComplex U_21; // 2 reg left. + +#define Chimu_00 Chi_00 +#define Chimu_01 Chi_01 +#define Chimu_02 Chi_02 +#define Chimu_10 Chi_10 +#define Chimu_11 Chi_11 +#define Chimu_12 Chi_12 +#define Chimu_20 UChi_00 +#define Chimu_21 UChi_01 +#define Chimu_22 UChi_02 +#define Chimu_30 UChi_10 +#define Chimu_31 UChi_11 +#define Chimu_32 UChi_12 + + + int offset,local,perm, ptype; + + // Xp + offset = st._offsets [Xp][ss]; + local = st._is_local[Xp][ss]; + perm = st._permute[Xp][ss]; + ptype = st._permute_type[Xp]; + + if ( local ) { + LOAD_CHIMU; + XM_PROJ; + if ( perm) { + PERMUTE; + } + } else { + LOAD_CHI; + } + { + MULT_2SPIN(Xp); + } + XM_RECON; + + // Yp + offset = st._offsets [Yp][ss]; + local = st._is_local[Yp][ss]; + perm = st._permute[Yp][ss]; + ptype = st._permute_type[Yp]; + + if ( local ) { + LOAD_CHIMU; + YM_PROJ; + if ( perm) { + PERMUTE; + } + } else { + LOAD_CHI; + } + { + MULT_2SPIN(Yp); + } + YM_RECON_ACCUM; + + + // Zp + offset = st._offsets [Zp][ss]; + local = st._is_local[Zp][ss]; + perm = st._permute[Zp][ss]; + ptype = st._permute_type[Zp]; + + if ( local ) { + LOAD_CHIMU; + ZM_PROJ; + if ( perm) { + PERMUTE; + } + } else { + LOAD_CHI; + } + { + MULT_2SPIN(Zp); + } + ZM_RECON_ACCUM; + + // Tp + offset = st._offsets [Tp][ss]; + local = st._is_local[Tp][ss]; + perm = st._permute[Tp][ss]; + ptype = st._permute_type[Tp]; + + if ( local ) { + LOAD_CHIMU; + TM_PROJ; + if ( perm) { + PERMUTE; + } + } else { + LOAD_CHI; + } + { + MULT_2SPIN(Tp); + } + TM_RECON_ACCUM; + + // Xm + offset = st._offsets [Xm][ss]; + local = st._is_local[Xm][ss]; + perm = st._permute[Xm][ss]; + ptype = st._permute_type[Xm]; + + if ( local ) { + LOAD_CHIMU; + XP_PROJ; + if ( perm) { + PERMUTE; + } + } else { + LOAD_CHI; + } + { + MULT_2SPIN(Xm); + } + XP_RECON_ACCUM; + + + // Ym + offset = st._offsets [Ym][ss]; + local = st._is_local[Ym][ss]; + perm = st._permute[Ym][ss]; + ptype = st._permute_type[Ym]; + + if ( local ) { + LOAD_CHIMU; + YP_PROJ; + if ( perm) { + PERMUTE; + } + } else { + LOAD_CHI; + } + { + MULT_2SPIN(Ym); + } + YP_RECON_ACCUM; + + // Zm + offset = st._offsets [Zm][ss]; + local = st._is_local[Zm][ss]; + perm = st._permute[Zm][ss]; + ptype = st._permute_type[Zm]; + + if ( local ) { + LOAD_CHIMU; + ZP_PROJ; + if ( perm) { + PERMUTE; + } + } else { + LOAD_CHI; + } + { + MULT_2SPIN(Zm); + } + ZP_RECON_ACCUM; + + // Tm + offset = st._offsets [Tm][ss]; + local = st._is_local[Tm][ss]; + perm = st._permute[Tm][ss]; + ptype = st._permute_type[Tm]; + + if ( local ) { + LOAD_CHIMU; + TP_PROJ; + if ( perm) { + PERMUTE; + } + } else { + LOAD_CHI; + } + { + MULT_2SPIN(Tm); + } + TP_RECON_ACCUM; + + { + vSpinColourVector & ref (out._odata[ss]); + vstream(ref()(0)(0),result_00); + vstream(ref()(0)(1),result_01); + vstream(ref()(0)(2),result_02); + vstream(ref()(1)(0),result_10); + vstream(ref()(1)(1),result_11); + vstream(ref()(1)(2),result_12); + vstream(ref()(2)(0),result_20); + vstream(ref()(2)(1),result_21); + vstream(ref()(2)(2),result_22); + vstream(ref()(3)(0),result_30); + vstream(ref()(3)(1),result_31); + vstream(ref()(3)(2),result_32); + } +} +}} From 6d2e056187f620e90919478e50125ed22da590b8 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 26 May 2015 22:20:09 +0100 Subject: [PATCH 3/4] Simd revert to Guido's commit. I edited concurrently and things went bad. --- lib/Grid_simd.h | 134 +----------------------------------------------- 1 file changed, 1 insertion(+), 133 deletions(-) diff --git a/lib/Grid_simd.h b/lib/Grid_simd.h index 504e1c17..cccc82e0 100644 --- a/lib/Grid_simd.h +++ b/lib/Grid_simd.h @@ -94,139 +94,7 @@ namespace Grid { template<> inline void zeroit(ComplexD &arg){ arg=0; }; template<> inline void zeroit(RealF &arg){ arg=0; }; template<> inline void zeroit(RealD &arg){ arg=0; }; - -#if defined (SSE4) - typedef __m128 fvec; - typedef __m128d dvec; - typedef __m128 cvec; - typedef __m128d zvec; - typedef __m128i ivec; -#endif -#if defined (AVX1) || defined (AVX2) - typedef __m256 fvec; - typedef __m256d dvec; - typedef __m256 cvec; - typedef __m256d zvec; - typedef __m256i ivec; -#endif -#if defined (AVX512) - typedef __m512 fvec; - typedef __m512d dvec; - typedef __m512 cvec; - typedef __m512d zvec; - typedef __m512i ivec; -#endif -#if defined (QPX) - typedef float fvec __attribute__ ((vector_size (16))); // QPX has same SIMD width irrespective of precision - typedef float cvec __attribute__ ((vector_size (16))); - - typedef vector4double dvec; - typedef vector4double zvec; -#endif -#if defined (AVX1) || defined (AVX2) || defined (AVX512) - inline void v_prefetch0(int size, const char *ptr){ - for(int i=0;i BA DC FE HG -// Permute 1 every ABCDEFGH -> CD AB GH EF -// Permute 2 every ABCDEFGH -> EFGH ABCD -// Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single) -// Permute 4 possible on half precision @512bit vectors. -////////////////////////////////////////////////////////// -template -inline void Gpermute0(vsimd &y,const vsimd &b) { - union { - fvec f; - decltype(vsimd::v) v; - } conv; - conv.v = b.v; -#ifdef SSE4 - conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); -#endif -#if defined(AVX1)||defined(AVX2) - conv.f = _mm256_permute2f128_ps(conv.f,conv.f,0x01); -#endif -#ifdef AVX512 - conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); -#endif - y.v=conv.v; -}; -template -inline void Gpermute1(vsimd &y,const vsimd &b) { - union { - fvec f; - decltype(vsimd::v) v; - } conv; - conv.v = b.v; -#ifdef SSE4 - conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); -#endif -#if defined(AVX1)||defined(AVX2) - conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); -#endif -#ifdef AVX512 - conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); -#endif - y.v=conv.v; -}; -template -inline void Gpermute2(vsimd &y,const vsimd &b) { - union { - fvec f; - decltype(vsimd::v) v; - } conv; - conv.v = b.v; -#ifdef SSE4 -#endif -#if defined(AVX1)||defined(AVX2) - conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); -#endif -#ifdef AVX512 - conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_BADC); -#endif - y.v=conv.v; - -}; -template -inline void Gpermute3(vsimd &y,const vsimd &b) { - union { - fvec f; - decltype(vsimd::v) v; - } conv; - conv.v = b.v; -#ifdef AVX512 - conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_CDAB); -#endif - y.v=conv.v; - -}; - -template -inline void Gpermute(vsimd &y,const vsimd &b,int perm){ - union { - fvec f; - decltype(vsimd::v) v; - } conv; - conv.v = b.v; - switch (perm){ - case 3: Gpermute3(y,b); break; - case 2: Gpermute2(y,b); break; - case 1: Gpermute1(y,b); break; - case 0: Gpermute0(y,b); break; - default: assert(0); break; - } - }; - + }; #include From b6a28f1de786216b9b6fd026e1d7b4f0145ddb77 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 26 May 2015 22:20:40 +0100 Subject: [PATCH 4/4] Auto gen files should never have been committed, but making everyone run aclocal, automake, autoconf is a pain in the ass. --- Makefile.in | 44 +++++++++++++++++++++++-------------- aclocal.m4 | 61 +++++++++++++++++++++++++++------------------------- config.guess | 2 +- config.sub | 2 +- configure | 13 +++++------ 5 files changed, 69 insertions(+), 53 deletions(-) diff --git a/Makefile.in b/Makefile.in index d473c2df..a6508b45 100644 --- a/Makefile.in +++ b/Makefile.in @@ -1,7 +1,7 @@ -# Makefile.in generated by automake 1.14.1 from Makefile.am. +# Makefile.in generated by automake 1.15 from Makefile.am. # @configure_input@ -# Copyright (C) 1994-2013 Free Software Foundation, Inc. +# Copyright (C) 1994-2014 Free Software Foundation, Inc. # This Makefile.in is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -14,7 +14,17 @@ @SET_MAKE@ VPATH = @srcdir@ -am__is_gnu_make = test -n '$(MAKEFILE_LIST)' && test -n '$(MAKELEVEL)' +am__is_gnu_make = { \ + if test -z '$(MAKELEVEL)'; then \ + false; \ + elif test -n '$(MAKE_HOST)'; then \ + true; \ + elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \ + true; \ + else \ + false; \ + fi; \ +} am__make_running_with_option = \ case $${target_option-} in \ ?) ;; \ @@ -79,15 +89,13 @@ build_triplet = @build@ host_triplet = @host@ target_triplet = @target@ subdir = . -DIST_COMMON = INSTALL NEWS README AUTHORS ChangeLog \ - $(srcdir)/Makefile.in $(srcdir)/Makefile.am \ - $(top_srcdir)/configure $(am__configure_deps) COPYING TODO \ - compile config.guess config.sub depcomp install-sh missing ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 am__aclocal_m4_deps = $(top_srcdir)/m4/ax_cxx_compile_stdcxx_11.m4 \ $(top_srcdir)/configure.ac am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ $(ACLOCAL_M4) +DIST_COMMON = $(srcdir)/Makefile.am $(top_srcdir)/configure \ + $(am__configure_deps) $(am__DIST_COMMON) am__CONFIG_DISTCLEAN_FILES = config.status config.cache config.log \ configure.lineno config.status.lineno mkinstalldirs = $(install_sh) -d @@ -150,6 +158,9 @@ ETAGS = etags CTAGS = ctags CSCOPE = cscope DIST_SUBDIRS = $(SUBDIRS) +am__DIST_COMMON = $(srcdir)/Makefile.in AUTHORS COPYING ChangeLog \ + INSTALL NEWS README TODO compile config.guess config.sub \ + depcomp install-sh missing DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) distdir = $(PACKAGE)-$(VERSION) top_distdir = $(distdir) @@ -316,7 +327,6 @@ $(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu Makefile'; \ $(am__cd) $(top_srcdir) && \ $(AUTOMAKE) --gnu Makefile -.PRECIOUS: Makefile Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status @case '$?' in \ *config.status*) \ @@ -523,15 +533,15 @@ dist-xz: distdir $(am__post_remove_distdir) dist-tarZ: distdir - @echo WARNING: "Support for shar distribution archives is" \ - "deprecated." >&2 + @echo WARNING: "Support for distribution archives compressed with" \ + "legacy program 'compress' is deprecated." >&2 @echo WARNING: "It will be removed altogether in Automake 2.0" >&2 tardir=$(distdir) && $(am__tar) | compress -c >$(distdir).tar.Z $(am__post_remove_distdir) dist-shar: distdir - @echo WARNING: "Support for distribution archives compressed with" \ - "legacy program 'compress' is deprecated." >&2 + @echo WARNING: "Support for shar distribution archives is" \ + "deprecated." >&2 @echo WARNING: "It will be removed altogether in Automake 2.0" >&2 shar $(distdir) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).shar.gz $(am__post_remove_distdir) @@ -567,17 +577,17 @@ distcheck: dist esac chmod -R a-w $(distdir) chmod u+w $(distdir) - mkdir $(distdir)/_build $(distdir)/_inst + mkdir $(distdir)/_build $(distdir)/_build/sub $(distdir)/_inst chmod a-w $(distdir) test -d $(distdir)/_build || exit 0; \ dc_install_base=`$(am__cd) $(distdir)/_inst && pwd | sed -e 's,^[^:\\/]:[\\/],/,'` \ && dc_destdir="$${TMPDIR-/tmp}/am-dc-$$$$/" \ && am__cwd=`pwd` \ - && $(am__cd) $(distdir)/_build \ - && ../configure \ + && $(am__cd) $(distdir)/_build/sub \ + && ../../configure \ $(AM_DISTCHECK_CONFIGURE_FLAGS) \ $(DISTCHECK_CONFIGURE_FLAGS) \ - --srcdir=.. --prefix="$$dc_install_base" \ + --srcdir=../.. --prefix="$$dc_install_base" \ && $(MAKE) $(AM_MAKEFLAGS) \ && $(MAKE) $(AM_MAKEFLAGS) dvi \ && $(MAKE) $(AM_MAKEFLAGS) check \ @@ -751,6 +761,8 @@ uninstall-am: maintainer-clean-generic mostlyclean mostlyclean-generic pdf \ pdf-am ps ps-am tags tags-am uninstall uninstall-am +.PRECIOUS: Makefile + # Tell versions [3.59,3.63) of GNU make to not export all variables. # Otherwise a system limit (for SysV at least) may be exceeded. diff --git a/aclocal.m4 b/aclocal.m4 index a3d1bc9c..a358f21e 100644 --- a/aclocal.m4 +++ b/aclocal.m4 @@ -1,6 +1,6 @@ -# generated automatically by aclocal 1.14.1 -*- Autoconf -*- +# generated automatically by aclocal 1.15 -*- Autoconf -*- -# Copyright (C) 1996-2013 Free Software Foundation, Inc. +# Copyright (C) 1996-2014 Free Software Foundation, Inc. # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -20,7 +20,7 @@ You have another version of autoconf. It may work, but is not guaranteed to. If you have problems, you may need to regenerate the build system entirely. To do so, use the procedure documented by the package, typically 'autoreconf'.])]) -# Copyright (C) 2002-2013 Free Software Foundation, Inc. +# Copyright (C) 2002-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -32,10 +32,10 @@ To do so, use the procedure documented by the package, typically 'autoreconf'.]) # generated from the m4 files accompanying Automake X.Y. # (This private macro should not be called outside this file.) AC_DEFUN([AM_AUTOMAKE_VERSION], -[am__api_version='1.14' +[am__api_version='1.15' dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to dnl require some minimum version. Point them to the right macro. -m4_if([$1], [1.14.1], [], +m4_if([$1], [1.15], [], [AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl ]) @@ -51,14 +51,14 @@ m4_define([_AM_AUTOCONF_VERSION], []) # Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced. # This function is AC_REQUIREd by AM_INIT_AUTOMAKE. AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION], -[AM_AUTOMAKE_VERSION([1.14.1])dnl +[AM_AUTOMAKE_VERSION([1.15])dnl m4_ifndef([AC_AUTOCONF_VERSION], [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl _AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))]) # AM_AUX_DIR_EXPAND -*- Autoconf -*- -# Copyright (C) 2001-2013 Free Software Foundation, Inc. +# Copyright (C) 2001-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -103,15 +103,14 @@ _AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))]) # configured tree to be moved without reconfiguration. AC_DEFUN([AM_AUX_DIR_EXPAND], -[dnl Rely on autoconf to set up CDPATH properly. -AC_PREREQ([2.50])dnl -# expand $ac_aux_dir to an absolute path -am_aux_dir=`cd $ac_aux_dir && pwd` +[AC_REQUIRE([AC_CONFIG_AUX_DIR_DEFAULT])dnl +# Expand $ac_aux_dir to an absolute path. +am_aux_dir=`cd "$ac_aux_dir" && pwd` ]) # AM_CONDITIONAL -*- Autoconf -*- -# Copyright (C) 1997-2013 Free Software Foundation, Inc. +# Copyright (C) 1997-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -142,7 +141,7 @@ AC_CONFIG_COMMANDS_PRE( Usually this means the macro was only invoked conditionally.]]) fi])]) -# Copyright (C) 1999-2013 Free Software Foundation, Inc. +# Copyright (C) 1999-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -333,7 +332,7 @@ _AM_SUBST_NOTMAKE([am__nodep])dnl # Generate code to set up dependency tracking. -*- Autoconf -*- -# Copyright (C) 1999-2013 Free Software Foundation, Inc. +# Copyright (C) 1999-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -409,7 +408,7 @@ AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS], # Do all the work for Automake. -*- Autoconf -*- -# Copyright (C) 1996-2013 Free Software Foundation, Inc. +# Copyright (C) 1996-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -499,8 +498,8 @@ AC_REQUIRE([AC_PROG_MKDIR_P])dnl # # AC_SUBST([mkdir_p], ['$(MKDIR_P)']) -# We need awk for the "check" target. The system "awk" is bad on -# some platforms. +# We need awk for the "check" target (and possibly the TAP driver). The +# system "awk" is bad on some platforms. AC_REQUIRE([AC_PROG_AWK])dnl AC_REQUIRE([AC_PROG_MAKE_SET])dnl AC_REQUIRE([AM_SET_LEADING_DOT])dnl @@ -573,7 +572,11 @@ to "yes", and re-run configure. END AC_MSG_ERROR([Your 'rm' program is bad, sorry.]) fi -fi]) +fi +dnl The trailing newline in this macro's definition is deliberate, for +dnl backward compatibility and to allow trailing 'dnl'-style comments +dnl after the AM_INIT_AUTOMAKE invocation. See automake bug#16841. +]) dnl Hook into '_AC_COMPILER_EXEEXT' early to learn its expansion. Do not dnl add the conditional right here, as _AC_COMPILER_EXEEXT may be further @@ -602,7 +605,7 @@ for _am_header in $config_headers :; do done echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count]) -# Copyright (C) 2001-2013 Free Software Foundation, Inc. +# Copyright (C) 2001-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -613,7 +616,7 @@ echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_co # Define $install_sh. AC_DEFUN([AM_PROG_INSTALL_SH], [AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl -if test x"${install_sh}" != xset; then +if test x"${install_sh+set}" != xset; then case $am_aux_dir in *\ * | *\ *) install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;; @@ -623,7 +626,7 @@ if test x"${install_sh}" != xset; then fi AC_SUBST([install_sh])]) -# Copyright (C) 2003-2013 Free Software Foundation, Inc. +# Copyright (C) 2003-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -644,7 +647,7 @@ AC_SUBST([am__leading_dot])]) # Check to see how 'make' treats includes. -*- Autoconf -*- -# Copyright (C) 2001-2013 Free Software Foundation, Inc. +# Copyright (C) 2001-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -694,7 +697,7 @@ rm -f confinc confmf # Fake the existence of programs that GNU maintainers use. -*- Autoconf -*- -# Copyright (C) 1997-2013 Free Software Foundation, Inc. +# Copyright (C) 1997-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -733,7 +736,7 @@ fi # Helper functions for option handling. -*- Autoconf -*- -# Copyright (C) 2001-2013 Free Software Foundation, Inc. +# Copyright (C) 2001-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -764,7 +767,7 @@ AC_DEFUN([_AM_IF_OPTION], # Check to make sure that the build environment is sane. -*- Autoconf -*- -# Copyright (C) 1996-2013 Free Software Foundation, Inc. +# Copyright (C) 1996-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -845,7 +848,7 @@ AC_CONFIG_COMMANDS_PRE( rm -f conftest.file ]) -# Copyright (C) 2009-2013 Free Software Foundation, Inc. +# Copyright (C) 2009-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -905,7 +908,7 @@ AC_SUBST([AM_BACKSLASH])dnl _AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl ]) -# Copyright (C) 2001-2013 Free Software Foundation, Inc. +# Copyright (C) 2001-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -933,7 +936,7 @@ fi INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s" AC_SUBST([INSTALL_STRIP_PROGRAM])]) -# Copyright (C) 2006-2013 Free Software Foundation, Inc. +# Copyright (C) 2006-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -952,7 +955,7 @@ AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)]) # Check how to create a tarball. -*- Autoconf -*- -# Copyright (C) 2004-2013 Free Software Foundation, Inc. +# Copyright (C) 2004-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, diff --git a/config.guess b/config.guess index 5f6aa02d..a12faba2 120000 --- a/config.guess +++ b/config.guess @@ -1 +1 @@ -/usr/share/automake-1.14/config.guess \ No newline at end of file +/opt/local/share/automake-1.15/config.guess \ No newline at end of file diff --git a/config.sub b/config.sub index 0abfe18c..e3c9b5ca 120000 --- a/config.sub +++ b/config.sub @@ -1 +1 @@ -/usr/share/automake-1.14/config.sub \ No newline at end of file +/opt/local/share/automake-1.15/config.sub \ No newline at end of file diff --git a/configure b/configure index b7bd49f0..6681b765 100755 --- a/configure +++ b/configure @@ -2467,7 +2467,7 @@ test -n "$target_alias" && NONENONEs,x,x, && program_prefix=${target_alias}- -am__api_version='1.14' +am__api_version='1.15' # Find a good install program. We prefer a C program (faster), # so one script is as good as another. But avoid the broken or @@ -2639,8 +2639,8 @@ test "$program_suffix" != NONE && ac_script='s/[\\$]/&&/g;s/;s,x,x,$//' program_transform_name=`$as_echo "$program_transform_name" | sed "$ac_script"` -# expand $ac_aux_dir to an absolute path -am_aux_dir=`cd $ac_aux_dir && pwd` +# Expand $ac_aux_dir to an absolute path. +am_aux_dir=`cd "$ac_aux_dir" && pwd` if test x"${MISSING+set}" != xset; then case $am_aux_dir in @@ -2659,7 +2659,7 @@ else $as_echo "$as_me: WARNING: 'missing' script is too old or missing" >&2;} fi -if test x"${install_sh}" != xset; then +if test x"${install_sh+set}" != xset; then case $am_aux_dir in *\ * | *\ *) install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;; @@ -2987,8 +2987,8 @@ MAKEINFO=${MAKEINFO-"${am_missing_run}makeinfo"} # mkdir_p='$(MKDIR_P)' -# We need awk for the "check" target. The system "awk" is bad on -# some platforms. +# We need awk for the "check" target (and possibly the TAP driver). The +# system "awk" is bad on some platforms. # Always define AMTAR for backward compatibility. Yes, it's still used # in the wild :-( We should find a proper way to deprecate it ... AMTAR='$${TAR-tar}' @@ -3047,6 +3047,7 @@ END fi + ac_config_headers="$ac_config_headers lib/Grid_config.h" # Check whether --enable-silent-rules was given.