diff --git a/lib/Algorithms.h b/lib/Algorithms.h
index a77e50bc..02d72cb1 100644
--- a/lib/Algorithms.h
+++ b/lib/Algorithms.h
@@ -18,8 +18,8 @@
 #include
 
 // Lanczos support
-#include
-#include
+//#include
+//#include
 
 #include
diff --git a/lib/Init.cc b/lib/Init.cc
index 6457b4b2..a6f016b1 100644
--- a/lib/Init.cc
+++ b/lib/Init.cc
@@ -178,6 +178,7 @@ void Grid_init(int *argc,char ***argv)
   }
   if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-opt") ){
     QCD::WilsonFermionStatic::HandOptDslash=1;
+    QCD::WilsonFermion5DStatic::HandOptDslash=1;
   }
   if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
     LebesgueOrder::UseLebesgueOrder=1;
diff --git a/lib/Simd.h b/lib/Simd.h
index 7da0d7ad..42d5e74c 100644
--- a/lib/Simd.h
+++ b/lib/Simd.h
@@ -13,6 +13,11 @@
 
 typedef uint32_t Integer;
 
+#define _MM_SELECT_FOUR_FOUR(A,B,C,D)          ((A<<6)|(B<<4)|(C<<2)|(D))
+#define _MM_SELECT_EIGHT_TWO(A,B,C,D,E,F,G,H)  ((A<<7)|(B<<6)|(C<<5)|(D<<4)|(E<<3)|(F<<2)|(G<<1)|(H))
+#define _MM_SELECT_FOUR_TWO(A,B,C,D)           _MM_SELECT_EIGHT_TWO(0,0,0,0,A,B,C,D)
+#define _MM_SELECT_TWO_TWO(A,B)                _MM_SELECT_FOUR_TWO(0,0,A,B)
+
 namespace Grid {
 
   typedef float RealF;
diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/lib/qcd/action/fermion/WilsonKernelsHand.cc
index b923fe4f..fee1b0fb 100644
--- a/lib/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc
@@ -56,13 +56,13 @@
     UChi_02+= U_20*Chi_02;\
     UChi_12+= U_20*Chi_12;
 
-#define PERMUTE\
-  permute(Chi_00,Chi_00,ptype);\
-  permute(Chi_01,Chi_01,ptype);\
-  permute(Chi_02,Chi_02,ptype);\
-  permute(Chi_10,Chi_10,ptype);\
-  permute(Chi_11,Chi_11,ptype);\
-  permute(Chi_12,Chi_12,ptype);
+#define PERMUTE_DIR(dir)      \
+  permute##dir(Chi_00,Chi_00);\
+  permute##dir(Chi_01,Chi_01);\
+  permute##dir(Chi_02,Chi_02);\
+  permute##dir(Chi_10,Chi_10);\
+  permute##dir(Chi_11,Chi_11);\
+  permute##dir(Chi_12,Chi_12);
 
 //      hspin(0)=fspin(0)+timesI(fspin(3));
 //      hspin(1)=fspin(1)+timesI(fspin(2));
@@ -286,6 +286,10 @@ void WilsonKernels::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
                                          std::vector > &buf,
                                          int ss,int sU,const FermionField &in, FermionField &out)
 {
+  //  std::cout << "Hand op Dhop "<
@@ ... @@ void WilsonKernels::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
       LOAD_CHIMU;
       XP_PROJ;
       if ( perm) {
-        PERMUTE;
+        PERMUTE_DIR(3); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
       }
     } else {
       LOAD_CHI;
@@ -373,7 +377,7 @@ void WilsonKernels::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
       LOAD_CHIMU;
       YP_PROJ;
       if ( perm) {
-        PERMUTE;
+        PERMUTE_DIR(2); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
       }
     } else {
       LOAD_CHI;
@@ -394,7 +398,7 @@ void WilsonKernels::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
       LOAD_CHIMU;
      ZP_PROJ;
       if ( perm) {
-        PERMUTE;
+        PERMUTE_DIR(1); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
       }
     } else {
       LOAD_CHI;
@@ -414,7 +418,7 @@ void WilsonKernels::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
       LOAD_CHIMU;
       TP_PROJ;
       if ( perm) {
-        PERMUTE;
+        PERMUTE_DIR(0); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
       }
     } else {
       LOAD_CHI;
@@ -434,7 +438,7 @@ void WilsonKernels::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
       LOAD_CHIMU;
       XM_PROJ;
       if ( perm) {
-        PERMUTE;
+        PERMUTE_DIR(3); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
       }
     } else {
       LOAD_CHI;
@@ -454,7 +458,7 @@ void WilsonKernels::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
       LOAD_CHIMU;
       YM_PROJ;
       if ( perm) {
-        PERMUTE;
+        PERMUTE_DIR(2); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
       }
     } else {
       LOAD_CHI;
@@ -474,7 +478,7 @@ void WilsonKernels::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
       LOAD_CHIMU;
       ZM_PROJ;
       if ( perm) {
-        PERMUTE;
+        PERMUTE_DIR(1); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
       }
     } else {
       LOAD_CHI;
@@ -494,7 +498,7 @@ void WilsonKernels::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
       LOAD_CHIMU;
       TM_PROJ;
       if ( perm) {
-        PERMUTE;
+        PERMUTE_DIR(0); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
       }
     } else {
       LOAD_CHI;
@@ -526,6 +530,9 @@ void WilsonKernels::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
                                          std::vector > &buf,
                                          int ss,int sU,const FermionField &in, FermionField &out)
 {
+  typedef typename Simd::scalar_type S;
+  typedef typename Simd::vector_type V;
+
   REGISTER Simd result_00; // 12 regs on knc
   REGISTER Simd result_01;
   REGISTER Simd result_02;
@@ -592,7 +599,7 @@ void WilsonKernels::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
       LOAD_CHIMU;
       XM_PROJ;
       if ( perm) {
-        PERMUTE;
+        PERMUTE_DIR(3); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
       }
     } else {
       LOAD_CHI;
@@ -612,7 +619,7 @@ void WilsonKernels::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
       LOAD_CHIMU;
       YM_PROJ;
       if ( perm) {
-        PERMUTE;
+        PERMUTE_DIR(2); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
       }
     } else {
       LOAD_CHI;
@@ -633,7 +640,7 @@ void WilsonKernels::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
       LOAD_CHIMU;
       ZM_PROJ;
       if ( perm) {
-        PERMUTE;
+        PERMUTE_DIR(1); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
       }
     } else {
       LOAD_CHI;
@@ -653,7 +660,7 @@ void WilsonKernels::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
       LOAD_CHIMU;
       TM_PROJ;
       if ( perm) {
-        PERMUTE;
+        PERMUTE_DIR(0); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
       }
     } else {
       LOAD_CHI;
@@ -673,7 +680,7 @@ void WilsonKernels::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
       LOAD_CHIMU;
       XP_PROJ;
       if ( perm) {
-        PERMUTE;
+        PERMUTE_DIR(3); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
       }
     } else {
       LOAD_CHI;
@@ -694,7 +701,7 @@ void WilsonKernels::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
       LOAD_CHIMU;
       YP_PROJ;
       if ( perm) {
-        PERMUTE;
+        PERMUTE_DIR(2); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
       }
     } else {
       LOAD_CHI;
@@ -714,7 +721,7 @@ void WilsonKernels::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
       LOAD_CHIMU;
       ZP_PROJ;
       if ( perm) {
-        PERMUTE;
+        PERMUTE_DIR(1); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
       }
     } else {
       LOAD_CHI;
@@ -734,7 +741,7 @@ void WilsonKernels::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledG
       LOAD_CHIMU;
       TP_PROJ;
       if ( perm) {
-        PERMUTE;
+        PERMUTE_DIR(0); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
       }
     } else {
       LOAD_CHI;
diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h
index a00e29eb..217ea947 100644
--- a/lib/simd/Grid_avx.h
+++ b/lib/simd/Grid_avx.h
@@ -183,11 +183,11 @@ namespace Optimization {
     // Complex float
     inline __m256 operator()(__m256 a, __m256 b){
       __m256 ymm0,ymm1,ymm2;
-      ymm0 = _mm256_shuffle_ps(a,a,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar,
+      ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
       ymm0 = _mm256_mul_ps(ymm0,b);                       // ymm0 <- ar bi, ar br
       // FIXME AVX2 could MAC
-      ymm1 = _mm256_shuffle_ps(b,b,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi
-      ymm2 = _mm256_shuffle_ps(a,a,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai
+      ymm1 = _mm256_shuffle_ps(b,b,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // ymm1 <- br,bi
+      ymm2 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ymm2 <- ai,ai
       ymm1 = _mm256_mul_ps(ymm1,ymm2);                    // ymm1 <- br ai, ai bi
       return _mm256_addsub_ps(ymm0,ymm1);
     }
@@ -270,7 +270,7 @@
     //Complex single
     inline __m256 operator()(__m256 in, __m256 ret){
       __m256 tmp =_mm256_addsub_ps(_mm256_setzero_ps(),in); // r,-i
-      return _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1)); //-i,r
+      return _mm256_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //-i,r
     }
     //Complex double
     inline __m256d operator()(__m256d in, __m256d ret){
@@ -282,7 +282,7 @@
   struct TimesI{
     //Complex single
     inline __m256 operator()(__m256 in, __m256 ret){
-      __m256 tmp =_mm256_shuffle_ps(in,in,_MM_SHUFFLE(2,3,0,1)); // i,r
+      __m256 tmp =_mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // i,r
       return _mm256_addsub_ps(_mm256_setzero_ps(),tmp);          // i,-r
     }
     //Complex double
@@ -296,27 +296,44 @@ namespace Optimization {
   //////////////////////////////////////////////
   // Some Template specialization
   //////////////////////////////////////////////
-  template < typename vtype >
-    void permute(vtype &a,vtype b, int perm) {
-      uconv conv;
-      conv.v = b;
-      switch (perm){
-        // 8x32 bits=>3 permutes
-      case 2: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
-      case 1: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
-      case 0: conv.f = _mm256_permute2f128_ps(conv.f,conv.f,0x01); break;
-      default: assert(0); break;
-      }
-      a = conv.v;
-    }
+  struct Permute{
+
+    static inline __m256 Permute0(__m256 in){
+      return _mm256_permute2f128_ps(in,in,0x01);
+    };
+    static inline __m256 Permute1(__m256 in){
+      return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
+    };
+    static inline __m256 Permute2(__m256 in){
+      return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
+    };
+    static inline __m256 Permute3(__m256 in){
+      return in;
+    };
+
+    static inline __m256d Permute0(__m256d in){
+      return _mm256_permute2f128_pd(in,in,0x01);
+    };
+    static inline __m256d Permute1(__m256d in){
+      return _mm256_shuffle_pd(in,in,0x5);
+    };
+    static inline __m256d Permute2(__m256d in){
+      return in;
+    };
+    static inline __m256d Permute3(__m256d in){
+      return in;
+    };
+
+  };
 
   //Complex float Reduce
   template<>
   inline Grid::ComplexF Reduce::operator()(__m256 in){
     __m256 v1,v2;
-    Optimization::permute(v1,in,0); // avx 256; quad complex single
-    v1 = _mm256_add_ps(v1,in);
-    Optimization::permute(v2,v1,1);
+    v1=Optimization::Permute::Permute0(in); // avx 256; quad complex single
+    v1= _mm256_add_ps(v1,in);
+    v2=Optimization::Permute::Permute1(v1);
     v1 = _mm256_add_ps(v1,v2);
     u256f conv; conv.v = v1;
     return Grid::ComplexF(conv.f[0],conv.f[1]);
@@ -326,11 +343,11 @@
   template<>
   inline Grid::RealF Reduce::operator()(__m256 in){
     __m256 v1,v2;
-    Optimization::permute(v1,in,0); // avx 256; octo-double
+    v1 = Optimization::Permute::Permute0(in); // avx 256; octo-float
     v1 = _mm256_add_ps(v1,in);
-    Optimization::permute(v2,v1,1);
+    v2 = Optimization::Permute::Permute1(v1);
     v1 = _mm256_add_ps(v1,v2);
-    Optimization::permute(v2,v1,2);
+    v2 = Optimization::Permute::Permute2(v1);
     v1 = _mm256_add_ps(v1,v2);
     u256f conv; conv.v=v1;
     return conv.f[0];
@@ -341,7 +358,7 @@
   template<>
   inline Grid::ComplexD Reduce::operator()(__m256d in){
     __m256d v1;
-    Optimization::permute(v1,in,0); // sse 128; paired complex single
+    v1 = Optimization::Permute::Permute0(in); // avx 256; pair of complex double
     v1 = _mm256_add_pd(v1,in);
     u256d conv; conv.v = v1;
     return Grid::ComplexD(conv.f[0],conv.f[1]);
@@ -351,9 +368,9 @@
   template<>
   inline Grid::RealD Reduce::operator()(__m256d in){
     __m256d v1,v2;
-    Optimization::permute(v1,in,0); // avx 256; quad double
+    v1 = Optimization::Permute::Permute0(in); // avx 256; quad double
     v1 = _mm256_add_pd(v1,in);
-    Optimization::permute(v2,v1,1);
+    v2 = Optimization::Permute::Permute1(v1);
     v1 = _mm256_add_pd(v1,v2);
     u256d conv; conv.v = v1;
     return conv.f[0];
@@ -387,13 +404,6 @@ namespace Grid {
     _mm_prefetch(ptr,_MM_HINT_T0);
   }
-
-
-  template < typename VectorSIMD >
-    inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
-    Optimization::permute(y.v,b.v,perm);
-  };
-
   // Function name aliases
   typedef Optimization::Vsplat   VsplatSIMD;
   typedef Optimization::Vstore   VstoreSIMD;
diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h
index 7c1800db..7da71899 100644
--- a/lib/simd/Grid_avx512.h
+++ b/lib/simd/Grid_avx512.h
@@ -174,7 +174,7 @@ namespace Optimization {
     inline __m512d operator()(__m512d a, __m512d b){
       __m512d a_real = _mm512_shuffle_pd( a, a, 0x00 );
       __m512d a_imag = _mm512_shuffle_pd( a, a, 0xFF );
-      a_imag = _mm512_mul_pd( a_imag, _mm512_permute_pd( b, 0x55 ) );
+      a_imag = _mm512_mul_pd( a_imag, _mm512_permute_pd( b, 0x55 ) );
       return _mm512_fmaddsub_pd( a_real, b, a_imag );
     }
   };
@@ -211,26 +211,24 @@
     //Complex single
     inline __m512 operator()(__m512 in, __m512 ret){
       __m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
-      return _mm512_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2));
+      return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(1,0,3,2)); // 0x4E??
     }
     //Complex double
     inline __m512d operator()(__m512d in, __m512d ret){
       __m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
-      return _mm512_shuffle_pd(tmp,tmp,_MM_SHUFFLE(1,0,3,2));
-    }
-
-
+      return _mm512_shuffle_pd(tmp,tmp,0x55);
+    }
   };
 
   struct TimesI{
     //Complex single
     inline __m512 operator()(__m512 in, __m512 ret){
-      __m512 tmp = _mm512_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2));
+      __m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
       return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);
     }
     //Complex double
     inline __m512d operator()(__m512d in, __m512d ret){
-      __m512d tmp = _mm512_shuffle_pd(tmp,tmp,_MM_SHUFFLE(1,0,3,2));
+      __m512d tmp = _mm512_shuffle_pd(in,in,0x55);
       return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp);
     }
 
@@ -239,6 +237,36 @@
 
+  // Gpermute utilities consider coalescing into 1 Gpermute
+  struct Permute{
+
+    static inline __m512 Permute0(__m512 in){
+      return _mm512_shuffle_f32x4(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
+    };
+    static inline __m512 Permute1(__m512 in){
+      return _mm512_shuffle_f32x4(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
+    };
+    static inline __m512 Permute2(__m512 in){
+      return _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
+    };
+    static inline __m512 Permute3(__m512 in){
+      return _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
+    };
+
+    static inline __m512d Permute0(__m512d in){
+      return _mm512_shuffle_f64x2(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
+    };
+    static inline __m512d Permute1(__m512d in){
+      return _mm512_shuffle_f64x2(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
+    };
+    static inline __m512d Permute2(__m512d in){
+      return _mm512_shuffle_pd(in,in,0x55);
+    };
+    static inline __m512d Permute3(__m512d in){
+      return in;
+    };
+
+  };
 
   //////////////////////////////////////////////
@@ -298,25 +326,6 @@
   }
-
-
-  // Gpermute utilities consider coalescing into 1 Gpermute
-  template < typename VectorSIMD >
-    inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
-    union {
-      __m512 f;
-      decltype(VectorSIMD::v) v;
-    } conv;
-    conv.v = b.v;
-    switch(perm){
-    case 3 : conv.f = _mm512_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
-    case 2 : conv.f = _mm512_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
-    case 1 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
-    case 0 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
-    default: assert(0); break;
-    }
-    y.v=conv.v;
-  };
 
   // Function name aliases
   typedef Optimization::Vsplat   VsplatSIMD;
diff --git a/lib/simd/Grid_imci.h b/lib/simd/Grid_imci.h
index c89e8830..63765a47 100644
--- a/lib/simd/Grid_imci.h
+++ b/lib/simd/Grid_imci.h
@@ -255,7 +255,36 @@ namespace Optimization {
   };
 
-
+  struct Permute{
+
+    static inline __m512 Permute0(__m512 in){
+      return _mm512_permute4f128_ps(in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(1,0,3,2));
+    };
+    static inline __m512 Permute1(__m512 in){
+      return _mm512_permute4f128_ps(in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(2,3,0,1));
+    };
+    static inline __m512 Permute2(__m512 in){
+      return _mm512_swizzle_ps(in,_MM_SWIZ_REG_BADC);
+    };
+    static inline __m512 Permute3(__m512 in){
+      return _mm512_swizzle_ps(in,_MM_SWIZ_REG_CDAB);
+    };
+
+    static inline __m512d Permute0(__m512d in){// Hack no intrinsic for 256 swaps of __m512d
+      return (__m512d)_mm512_permute4f128_ps((__m512)in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(1,0,3,2));
+    };
+    static inline __m512d Permute1(__m512d in){
+      return _mm512_swizzle_pd(in,_MM_SWIZ_REG_BADC);
+    };
+    static inline __m512d Permute2(__m512d in){
+      return _mm512_swizzle_pd(in,_MM_SWIZ_REG_CDAB);
+    };
+    static inline __m512d Permute3(__m512d in){
+      return in;
+    };
+
+  };
+
   //////////////////////////////////////////////
@@ -315,25 +344,6 @@
   }
-
-
-  // Gpermute utilities consider coalescing into 1 Gpermute
-  template < typename VectorSIMD >
-    inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
-    union {
-      __m512 f;
-      decltype(VectorSIMD::v) v;
-    } conv;
-    conv.v = b.v;
-    switch(perm){
-    case 3: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_CDAB); break;
-    case 2: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_BADC); break;
-    case 1 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
-    case 0 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
-    default: assert(0); break;
-    }
-    y.v=conv.v;
-  };
 
   // Function name aliases
   typedef Optimization::Vsplat   VsplatSIMD;
diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h
index 62516201..ecac68c4 100644
--- a/lib/simd/Grid_sse4.h
+++ b/lib/simd/Grid_sse4.h
@@ -151,10 +151,10 @@ namespace Optimization {
     // Complex float
     inline __m128 operator()(__m128 a, __m128 b){
       __m128 ymm0,ymm1,ymm2;
-      ymm0 = _mm_shuffle_ps(a,a,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar,
+      ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
       ymm0 = _mm_mul_ps(ymm0,b);                       // ymm0 <- ar bi, ar br
-      ymm1 = _mm_shuffle_ps(b,b,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi
-      ymm2 = _mm_shuffle_ps(a,a,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai
+      ymm1 = _mm_shuffle_ps(b,b,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // ymm1 <- br,bi
+      ymm2 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ymm2 <- ai,ai
       ymm1 = _mm_mul_ps(ymm1,ymm2);                    // ymm1 <- br ai, ai bi
       return _mm_addsub_ps(ymm0,ymm1);
     }
@@ -201,7 +201,7 @@
     //Complex single
     inline __m128 operator()(__m128 in, __m128 ret){
       __m128 tmp =_mm_addsub_ps(_mm_setzero_ps(),in); // r,-i
-      return _mm_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));
+      return _mm_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1));
     }
     //Complex double
     inline __m128d operator()(__m128d in, __m128d ret){
@@ -215,7 +215,7 @@
   struct TimesI{
     //Complex single
     inline __m128 operator()(__m128 in, __m128 ret){
-      __m128 tmp =_mm_shuffle_ps(in,in,_MM_SHUFFLE(2,3,0,1));
+      __m128 tmp =_mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
       return _mm_addsub_ps(_mm_setzero_ps(),tmp); // r,-i
     }
     //Complex double
@@ -225,27 +225,45 @@
     }
   };
 
+  struct Permute{
+
+    static inline __m128 Permute0(__m128 in){
+      return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
+    };
+    static inline __m128 Permute1(__m128 in){
+      return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
+    };
+    static inline __m128 Permute2(__m128 in){
+      return in;
+    };
+    static inline __m128 Permute3(__m128 in){
+      return in;
+    };
+
+    static inline __m128d Permute0(__m128d in){
+      return _mm_shuffle_pd(in,in,0x1);
+    };
+    static inline __m128d Permute1(__m128d in){
+      return in;
+    };
+    static inline __m128d Permute2(__m128d in){
+      return in;
+    };
+    static inline __m128d Permute3(__m128d in){
+      return in;
+    };
+
+  };
+
   //////////////////////////////////////////////
   // Some Template specialization
-  template < typename vtype >
-    void permute(vtype &a, vtype b, int perm) {
-      uconv conv;
-      conv.v = b;
-      switch(perm){
-      case 3: break; //empty for SSE4
-      case 2: break; //empty for SSE4
-      case 1: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
-      case 0: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
-      default: assert(0); break;
-      }
-      a=conv.v;
-    };
+
   //Complex float Reduce
   template<>
   inline Grid::ComplexF Reduce::operator()(__m128 in){
     __m128 v1; // two complex
-    Optimization::permute(v1,in,0);
+    v1= Optimization::Permute::Permute0(in);
     v1= _mm_add_ps(v1,in);
     u128f conv; conv.v=v1;
     return Grid::ComplexF(conv.f[0],conv.f[1]);
@@ -254,9 +272,9 @@
   template<>
   inline Grid::RealF Reduce::operator()(__m128 in){
     __m128 v1,v2; // quad single
-    Optimization::permute(v1,in,0);
+    v1= Optimization::Permute::Permute0(in);
     v1= _mm_add_ps(v1,in);
-    Optimization::permute(v2,v1,1);
+    v2= Optimization::Permute::Permute1(v1);
     v1 = _mm_add_ps(v1,v2);
     u128f conv; conv.v=v1;
     return conv.f[0];
@@ -274,7 +292,7 @@
   template<>
   inline Grid::RealD Reduce::operator()(__m128d in){
     __m128d v1;
-    Optimization::permute(v1,in,0); // avx 256; quad double
+    v1 = Optimization::Permute::Permute0(in);
     v1 = _mm_add_pd(v1,in);
     u128d conv; conv.v = v1;
     return conv.f[0];
@@ -302,14 +320,6 @@ namespace Grid {
   inline void prefetch_HINT_T0(const char *ptr){
     _mm_prefetch(ptr,_MM_HINT_T0);
   }
-
-
-  // Gpermute function
-  template < typename VectorSIMD >
-    inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
-    Optimization::permute(y.v,b.v,perm);
-  }
-
   // Function name aliases
   typedef Optimization::Vsplat   VsplatSIMD;
diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h
index 16308f53..6d8d79b8 100644
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -251,15 +251,30 @@ namespace Grid {
     // all subtypes; may not be a good assumption, but could
     // add the vector width as a template param for BG/Q for example
     ////////////////////////////////////////////////////////////////////
+    friend inline void permute0(Grid_simd &y,Grid_simd b){
+      y.v = Optimization::Permute::Permute0(b.v);
+    }
+    friend inline void permute1(Grid_simd &y,Grid_simd b){
+      y.v = Optimization::Permute::Permute1(b.v);
+    }
+    friend inline void permute2(Grid_simd &y,Grid_simd b){
+      y.v = Optimization::Permute::Permute2(b.v);
+    }
+    friend inline void permute3(Grid_simd &y,Grid_simd b){
+      y.v = Optimization::Permute::Permute3(b.v);
+    }
     friend inline void permute(Grid_simd &y,Grid_simd b,int perm) {
-      Gpermute(y,b,perm);
+      if      (perm==3) permute3(y,b);
+      else if (perm==2) permute2(y,b);
+      else if (perm==1) permute1(y,b);
+      else if (perm==0) permute0(y,b);
     }
+
   };// end of Grid_simd class definition
-
   ///////////////////////
   // Splat
   ///////////////////////
diff --git a/lib/tensors/Tensor_class.h b/lib/tensors/Tensor_class.h
index 4f741f1d..bed4b11b 100644
--- a/lib/tensors/Tensor_class.h
+++ b/lib/tensors/Tensor_class.h
@@ -177,6 +177,7 @@ public:
       permute(out._internal[i],in._internal[i],permutetype);
     }
   }
+
   // Unary negation
   friend strong_inline iVector operator -(const iVector &r) {
     iVector ret;
@@ -290,12 +291,15 @@ public:
       vstream(out._internal[i][j],in._internal[i][j]);
     }}
   }
+
   friend strong_inline void permute(iMatrix &out,const iMatrix &in,int permutetype){
     for(int i=0;i
 operator -(const iMatrix &r) {
     iMatrix ret;
diff --git a/scripts/configure-commands b/scripts/configure-commands
index 0dbd5438..07506d27 100755
--- a/scripts/configure-commands
+++ b/scripts/configure-commands
@@ -35,10 +35,10 @@ icpc-avx-openmp-mpi)
 CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi
 ;;
 icpc-avx-openmp)
-CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LIBS="-fopenmp -lgmp -lmpfr" --enable-comms=mpi
+CXX=icpc ../../configure --enable-precision=single --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LIBS="-fopenmp -lgmp -lmpfr" --enable-comms=mpi
 ;;
 icpc-avx2)
- CXX=icpc ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
+ CXX=icpc ../../configure --enable-simd=AVX2 CXXFLAGS="-march=core-avx2 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
 ;;
 icpc-avx512)
 CXX=icpc ../../configure --enable-simd=AVX512 CXXFLAGS="-xCOMMON-AVX512 -O3 -std=c++11" --host=none LIBS="-lgmp -lmpfr" --enable-comms=none
@@ -50,7 +50,7 @@ icpc-mic-avx512)
 CXX=icpc ../../configure --host=none --enable-simd=IMCI CXXFLAGS="-xCOMMON_AVX512 -O3 -std=c++11" LDFLAGS=-xCOMMON_AVX512 LIBS="-lgmp -lmpfr" --enable-comms=none
 ;;
 clang-sse)
-CXX=clang++ ../../configure --enable-simd=SSE4 CXXFLAGS="-msse4 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
+CXX=clang++ ../../configure --enable-precision=single --enable-simd=SSE4 CXXFLAGS="-msse4 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
 ;;
 clang-avx)
 CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
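
Note (editorial, not part of the patch): the change replaces the run-time switch in Gpermute/permute(y,b,perm) with four compile-time entry points Optimization::Permute::Permute0..Permute3, reached from the hand-unrolled Wilson kernels through token pasting in PERMUTE_DIR(dir), and replaces _MM_SHUFFLE with the locally defined _MM_SELECT_FOUR_FOUR selector macro. The sketch below is a minimal, scalar stand-in for that dispatch pattern; the toyvec type, the permuteN helpers taking explicit operands, and main() are invented for illustration and do not appear in Grid.

// Illustrative sketch only: mimics the new dispatch with a plain array
// instead of a __m128/__m256/__m512 register.
#include <array>
#include <cstdio>

#define _MM_SELECT_FOUR_FOUR(A,B,C,D) ((A<<6)|(B<<4)|(C<<2)|(D))

using toyvec = std::array<float,4>;  // stand-in for a 4-wide SIMD register

// Stand-ins for Optimization::Permute::Permute0..3: element exchanges of
// decreasing stride; Permute2/3 are identities, as in the SSE4 case.
static inline toyvec Permute0(toyvec in){ return {in[2],in[3],in[0],in[1]}; }
static inline toyvec Permute1(toyvec in){ return {in[1],in[0],in[3],in[2]}; }
static inline toyvec Permute2(toyvec in){ return in; }
static inline toyvec Permute3(toyvec in){ return in; }

// Same shape as the new Grid_simd friends: permuteN(y,b) writes permuted b into y.
static inline void permute0(toyvec &y, toyvec b){ y = Permute0(b); }
static inline void permute1(toyvec &y, toyvec b){ y = Permute1(b); }
static inline void permute2(toyvec &y, toyvec b){ y = Permute2(b); }
static inline void permute3(toyvec &y, toyvec b){ y = Permute3(b); }

// Token-pasting dispatch as in the new PERMUTE_DIR(dir) kernel macro; the
// direction index (T==0, Z==1, Y==2, X==3) is a literal, so no run-time
// switch survives in the kernel. (The real macro takes no operands and
// acts on the fixed Chi_xx registers.)
#define PERMUTE_DIR(dir,y,b) permute##dir(y,b)

int main(){
  toyvec chi = {0.f,1.f,2.f,3.f}, out;
  PERMUTE_DIR(0,out,chi);   // expands to permute0(out,chi): prints 2 3 0 1
  std::printf("%g %g %g %g\n", out[0],out[1],out[2],out[3]);
  // Selector packing: _MM_SELECT_FOUR_FOUR(2,3,0,1) == 0xB1, the immediate
  // used for the re/im pair swap in the complex-single TimesI paths.
  std::printf("0x%X\n", _MM_SELECT_FOUR_FOUR(2,3,0,1));
  return 0;
}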