diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h
index 36360102..e2729187 100644
--- a/lib/simd/Grid_avx.h
+++ b/lib/simd/Grid_avx.h
@@ -213,6 +213,29 @@ namespace Optimization {
     }
   };
 
+  struct MultRealPart{
+    inline __m256 operator()(__m256 a, __m256 b){
+      __m256 ymm0;
+      ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
+      return _mm256_mul_ps(ymm0,b);                                // ymm0 <- ar bi, ar br
+    }
+    inline __m256d operator()(__m256d a, __m256d b){
+      __m256d ymm0;
+      ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
+      return _mm256_mul_pd(ymm0,b);      // ymm0 <- ar bi, ar br
+    }
+  };
+  struct MaddRealPart{
+    inline __m256 operator()(__m256 a, __m256 b, __m256 c){
+      __m256 ymm0 = _mm256_moveldup_ps(a); // ymm0 <- ar ar,
+      return _mm256_add_ps(_mm256_mul_ps( ymm0, b),c);
+    }
+    inline __m256d operator()(__m256d a, __m256d b, __m256d c){
+      __m256d ymm0 = _mm256_shuffle_pd( a, a, 0x0 );
+      return _mm256_add_pd(_mm256_mul_pd( ymm0, b),c);
+    }
+  };
+
   struct MultComplex{
     // Complex float
     inline __m256 operator()(__m256 a, __m256 b){
@@ -627,7 +650,9 @@ namespace Optimization {
   typedef Optimization::Sub SubSIMD;
   typedef Optimization::Div DivSIMD;
   typedef Optimization::Mult MultSIMD;
-  typedef Optimization::MultComplex MultComplexSIMD;
+  typedef Optimization::MultComplex  MultComplexSIMD;
+  typedef Optimization::MultRealPart MultRealPartSIMD;
+  typedef Optimization::MaddRealPart MaddRealPartSIMD;
   typedef Optimization::Conj ConjSIMD;
   typedef Optimization::TimesMinusI TimesMinusISIMD;
   typedef Optimization::TimesI TimesISIMD;
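For reference, a minimal scalar sketch (not from the patch) of the per-lane semantics the new functors implement, assuming Grid's layout of one complex number per even/odd lane pair; the function names below are illustrative only:

#include <complex>

// MultRealPart: multiply b by the real part of a, i.e. (ar*br, ar*bi) per lane pair,
// which is what the "ymm0 <- ar ar" shuffle followed by the multiply produces above.
inline std::complex<float> mult_real_part(std::complex<float> a, std::complex<float> b) {
  return std::complex<float>(a.real() * b.real(), a.real() * b.imag());
}
// MaddRealPart: the same product accumulated onto c (fused into an FMA where available).
inline std::complex<float> madd_real_part(std::complex<float> a, std::complex<float> b,
                                          std::complex<float> c) {
  return mult_real_part(a, b) + c;
}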
diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h
index d6531d57..ebf99e16 100644
--- a/lib/simd/Grid_avx512.h
+++ b/lib/simd/Grid_avx512.h
@@ -189,6 +189,29 @@ namespace Optimization {
   // 2mul,4 mac +add+sub = 8 flop type insns
   // 3shuf + 2 (+shuf) = 5/6 simd perm and 1/2 the load.
 
+  struct MultRealPart{
+    inline __m512 operator()(__m512 a, __m512 b){
+      __m512 ymm0;
+      ymm0 = _mm512_moveldup_ps(a); // ymm0 <- ar ar,
+      return _mm512_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
+    }
+    inline __m512d operator()(__m512d a, __m512d b){
+      __m512d ymm0;
+      ymm0 = _mm512_shuffle_pd(a,a,0x00); // ymm0 <- ar ar, ar,ar b'00,00
+      return _mm512_mul_pd(ymm0,b);       // ymm0 <- ar bi, ar br
+    }
+  };
+  struct MaddRealPart{
+    inline __m512 operator()(__m512 a, __m512 b, __m512 c){
+      __m512 ymm0 = _mm512_moveldup_ps(a); // ymm0 <- ar ar,
+      return _mm512_fmadd_ps( ymm0, b, c);
+    }
+    inline __m512d operator()(__m512d a, __m512d b, __m512d c){
+      __m512d ymm0 = _mm512_shuffle_pd( a, a, 0x00 );
+      return _mm512_fmadd_pd( ymm0, b, c);
+    }
+  };
+
   struct MultComplex{
     // Complex float
     inline __m512 operator()(__m512 a, __m512 b){
@@ -501,6 +524,8 @@ namespace Optimization {
   typedef Optimization::Mult MultSIMD;
   typedef Optimization::Div DivSIMD;
   typedef Optimization::MultComplex MultComplexSIMD;
+  typedef Optimization::MultRealPart MultRealPartSIMD;
+  typedef Optimization::MaddRealPart MaddRealPartSIMD;
   typedef Optimization::Conj ConjSIMD;
   typedef Optimization::TimesMinusI TimesMinusISIMD;
   typedef Optimization::TimesI TimesISIMD;
diff --git a/lib/simd/Grid_generic.h b/lib/simd/Grid_generic.h
index 62c78afb..91e9cda2 100644
--- a/lib/simd/Grid_generic.h
+++ b/lib/simd/Grid_generic.h
@@ -224,6 +224,21 @@ namespace Optimization {
 #define cmul(a, b, c, i)\
   c[i]   = a[i]*b[i]   - a[i+1]*b[i+1];\
   c[i+1] = a[i]*b[i+1] + a[i+1]*b[i];
+
+  struct MultRealPart{
+    template <typename T>
+    inline vec<T> operator()(vec<T> a, vec<T> b){
+      vec<T> out;
+
+      VECTOR_FOR(i, W<T>::c, 1)
+      {
+        out.v[2*i]   = a.v[2*i]*b.v[2*i];
+        out.v[2*i+1] = a.v[2*i]*b.v[2*i+1];
+      }
+      return out;
+    };
+  };
+
   struct MultComplex{
     // Complex
@@ -456,6 +471,7 @@ namespace Optimization {
   typedef Optimization::Div DivSIMD;
   typedef Optimization::Mult MultSIMD;
   typedef Optimization::MultComplex MultComplexSIMD;
+  typedef Optimization::MultRealPart MultRealPartSIMD;
   typedef Optimization::Conj ConjSIMD;
   typedef Optimization::TimesMinusI TimesMinusISIMD;
   typedef Optimization::TimesI TimesISIMD;
diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h
index bc86291d..99a9ea68 100644
--- a/lib/simd/Grid_qpx.h
+++ b/lib/simd/Grid_qpx.h
@@ -220,6 +220,14 @@ namespace Optimization {
     }
   };
 
+  struct MultRealPart{
+    // Complex double
+    inline vector4double operator()(vector4double a, vector4double b){
+      // return vec_xmul(b, a);
+      return vec_xmul(a, b);
+    }
+    FLOAT_WRAP_2(operator(), inline)
+  };
   struct MultComplex{
     // Complex double
     inline vector4double operator()(vector4double a, vector4double b){
@@ -430,6 +438,7 @@ typedef Optimization::Sub SubSIMD;
 typedef Optimization::Mult MultSIMD;
 typedef Optimization::Div DivSIMD;
 typedef Optimization::MultComplex MultComplexSIMD;
+typedef Optimization::MultRealPart MultRealPartSIMD;
 typedef Optimization::Conj ConjSIMD;
 typedef Optimization::TimesMinusI TimesMinusISIMD;
 typedef Optimization::TimesI TimesISIMD;
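The generic loop above (and what the patch expects vec_xmul to produce on QPX) can be checked against a plain even/odd-indexed array, with even lanes holding real parts and odd lanes imaginary parts. A small standalone check with made-up values, not part of the patch:

#include <cassert>

void check_mult_real_part_layout() {
  float a[4] = {2.f, 3.f, 5.f, 7.f};   // two complex numbers: (2+3i, 5+7i)
  float b[4] = {1.f, 4.f, 2.f, 6.f};   // (1+4i, 2+6i)
  float out[4];
  for (int i = 0; i < 2; ++i) {        // one iteration per complex pair, as in VECTOR_FOR(i, W<T>::c, 1)
    out[2*i]   = a[2*i] * b[2*i];      // ar*br
    out[2*i+1] = a[2*i] * b[2*i+1];    // ar*bi
  }
  assert(out[0] == 2.f && out[1] == 8.f && out[2] == 10.f && out[3] == 30.f);
}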
diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h
index 560eda11..abd688ab 100644
--- a/lib/simd/Grid_sse4.h
+++ b/lib/simd/Grid_sse4.h
@@ -177,6 +177,29 @@ namespace Optimization {
     }
   };
 
+  struct MultRealPart{
+    inline __m128 operator()(__m128 a, __m128 b){
+      __m128 ymm0;
+      ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
+      return _mm_mul_ps(ymm0,b);                                // ymm0 <- ar bi, ar br
+    }
+    inline __m128d operator()(__m128d a, __m128d b){
+      __m128d ymm0;
+      ymm0 = _mm_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
+      return _mm_mul_pd(ymm0,b);      // ymm0 <- ar bi, ar br
+    }
+  };
+  struct MaddRealPart{
+    inline __m128 operator()(__m128 a, __m128 b, __m128 c){
+      __m128 ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
+      return _mm_add_ps(_mm_mul_ps( ymm0, b),c);
+    }
+    inline __m128d operator()(__m128d a, __m128d b, __m128d c){
+      __m128d ymm0 = _mm_shuffle_pd( a, a, 0x0 );
+      return _mm_add_pd(_mm_mul_pd( ymm0, b),c);
+    }
+  };
+
   struct MultComplex{
     // Complex float
     inline __m128 operator()(__m128 a, __m128 b){
@@ -415,6 +438,8 @@ namespace Optimization {
   typedef Optimization::Div DivSIMD;
   typedef Optimization::Mult MultSIMD;
   typedef Optimization::MultComplex MultComplexSIMD;
+  typedef Optimization::MultRealPart MultRealPartSIMD;
+  typedef Optimization::MaddRealPart MaddRealPartSIMD;
   typedef Optimization::Conj ConjSIMD;
   typedef Optimization::TimesMinusI TimesMinusISIMD;
   typedef Optimization::TimesI TimesISIMD;
diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h
index 42f28b34..8a6ab2e7 100644
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -101,6 +101,11 @@ template <typename T> using IfNotInteger = Invoke<std::enable_if<!std::is_integral<T>::value, int> >;
+template <class Out, class Input1, class Input2, class Input3, class Operation>
+Out trinary(Input1 src_1, Input2 src_2, Input3 src_3, Operation op) {
+  return op(src_1, src_2, src_3);
+}
+
 template <class Out, class Input1, class Input2, class Operation>
 Out binary(Input1 src_1, Input2 src_2, Operation op) {
   return op(src_1, src_2);
@@ -178,6 +183,7 @@ class Grid_simd {
                           const Grid_simd *__restrict__ r) {
     *y = (*l) * (*r);
   }
+
   friend inline void sub(Grid_simd *__restrict__ y,
                          const Grid_simd *__restrict__ l,
                          const Grid_simd *__restrict__ r) {
@@ -188,7 +194,6 @@ class Grid_simd {
                          const Grid_simd *__restrict__ r) {
     *y = (*l) + (*r);
   }
-
   friend inline void mac(Grid_simd *__restrict__ y,
                          const Scalar_type *__restrict__ a,
                          const Grid_simd *__restrict__ x) {
@@ -260,7 +265,7 @@ class Grid_simd {
   }
   ////////////////////////////
-  // opreator scalar * simd
+  // operator scalar * simd
   ////////////////////////////
   friend inline Grid_simd operator*(const Scalar_type &a, Grid_simd b) {
     Grid_simd va;
@@ -433,6 +438,11 @@ inline void vbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
   S* typepun =(S*) &src;
   vsplat(ret,typepun[lane]);
 }
+template <class S, class V, IfComplex<S> =0>
+inline void rbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
+  S* typepun =(S*) &src;
+  ret.v = unary<V>(real(typepun[lane]), VsplatSIMD());
+}
 
 ///////////////////////
 // Splat
@@ -449,6 +459,10 @@ template <class S, class V>
 inline void vsplat(Grid_simd<S, V> &ret, EnableIf<is_complex<S>, S> c) {
   vsplat(ret, real(c), imag(c));
 }
+template <class S, class V>
+inline void rsplat(Grid_simd<S, V> &ret, EnableIf<is_complex<S>, S> c) {
+  vsplat(ret, real(c), real(c));
+}
 
 // if real fill with a, if complex fill with a in the real part (first function
 // above)
@@ -550,6 +564,21 @@ inline Grid_simd<S, V> operator-(Grid_simd<S, V> a, Grid_simd<S, V> b) {
   return ret;
 };
 
+// Distinguish between complex types and others
+template <class S, class V, IfComplex<S> = 0>
+inline Grid_simd<S, V> real_mult(Grid_simd<S, V> a, Grid_simd<S, V> b) {
+  Grid_simd<S, V> ret;
+  ret.v = binary<V>(a.v, b.v, MultRealPartSIMD());
+  return ret;
+};
+template <class S, class V, IfComplex<S> = 0>
+inline Grid_simd<S, V> real_madd(Grid_simd<S, V> a, Grid_simd<S, V> b, Grid_simd<S, V> c) {
+  Grid_simd<S, V> ret;
+  ret.v = trinary<V>(a.v, b.v, c.v, MaddRealPartSIMD());
+  return ret;
+};
+
+
 // Distinguish between complex types and others
 template <class S, class V, IfComplex<S> = 0>
 inline Grid_simd<S, V> operator*(Grid_simd<S, V> a, Grid_simd<S, V> b) {
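A sketch of how the new entry points might be called from higher-level Grid code (illustrative only; it assumes Grid's top-level header and vComplexF SIMD type, the function and variable names are made up, and real_madd needs a target that defines MaddRealPartSIMD, i.e. SSE4/AVX/AVX-512 in this patch):

#include <Grid/Grid.h>

using namespace Grid;

void axpy_real_part_example(vComplexF z, vComplexF w, vComplexF &acc) {
  vComplexF zr_w = real_mult(z, w);  // per lane pair: (Re(z)*Re(w), Re(z)*Im(w))
  acc = real_madd(z, w, acc);        // acc += Re(z)*w, fused where MaddRealPart maps to an FMA
}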
"vmovshdup " #SRC ", " #DEST ";\n" -#define VBCASTRDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+0)(" #A ")," #DEST ";\n" -#define VBCASTIDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+8)(" #A ")," #DEST ";\n" -#define VBCASTRDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +0)(" #PTR "), " #DEST ";\n" -#define VBCASTIDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +4)(" #PTR "), " #DEST ";\n" +#define VBCASTRDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+0)(" #A ")," #DEST ";\n" +#define VBCASTIDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+8)(" #A ")," #DEST ";\n" +#define VBCASTRDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +0)(" #PTR "), " #DEST ";\n" +#define VBCASTIDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +4)(" #PTR "), " #DEST ";\n" +#define VBCASTCDUPf(OFF,A,DEST) "vbroadcastsd (" #OFF "*64 )(" #A ")," #DEST ";\n" +#define VBCASTZDUPf(OFF,A,DEST) "vbroadcastf32x4 (" #OFF "*64 )(" #A ")," #DEST ";\n" +#define VBCASTCDUP(OFF,A,DEST) VBCASTCDUPf(OFF,A,DEST) +#define VBCASTZDUP(OFF,A,DEST) VBCASTZDUPf(OFF,A,DEST) #define VMADDSUBf(A,B,accum) "vfmaddsub231ps " #A "," #B "," #accum ";\n" #define VMADDSUBd(A,B,accum) "vfmaddsub231pd " #A "," #B "," #accum ";\n" @@ -106,11 +110,15 @@ Author: paboyle #define VMADDSUBMEMd(O,P,B,accum) "vfmaddsub231pd " #O"*64("#P "),"#B "," #accum ";\n" +#define VMADDRDUPf(O,P,B,accum) "vfmadd231ps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n" +#define VMADDIDUPf(O,P,B,accum) "vfmadd231ps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n" #define VMADDSUBRDUPf(O,P,B,accum) "vfmaddsub231ps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n" #define VMADDSUBIDUPf(O,P,B,accum) "vfmaddsub231ps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n" #define VMULRDUPf(O,P,B,accum) "vmulps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n" #define VMULIDUPf(O,P,B,accum) "vmulps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n" +#define VMADDRDUPd(O,P,B,accum) "vfmadd231pd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n" +#define VMADDIDUPd(O,P,B,accum) "vfmadd231pd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n" #define VMADDSUBRDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n" #define VMADDSUBIDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n" #define VMULRDUPd(O,P,B,accum) "vmulpd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n" diff --git a/lib/simd/Intel512common.h b/lib/simd/Intel512common.h index cfa20c26..e69e541c 100644 --- a/lib/simd/Intel512common.h +++ b/lib/simd/Intel512common.h @@ -87,7 +87,8 @@ Author: paboyle VACCTIMESMINUSI1d(A,ACC,tmp) \ VACCTIMESMINUSI2d(A,ACC,tmp) -#define LOAD64i(A,ptr) __asm__ ( "movq %0, %" #A : : "r"(ptr) : #A ); +#define LOAD64a(A,ptr) "movq %0, %" #A : : "r"(ptr) : #A +#define LOAD64i(A,ptr) __asm__ ( LOAD64a(A,ptr)); #define LOAD64(A,ptr) LOAD64i(A,ptr) #define VMOVf(A,DEST) "vmovaps " #A ", " #DEST ";\n" @@ -108,8 +109,8 @@ Author: paboyle //"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n" // "clevict0 "#O"*64("#A");\n" -#define VLOADf(OFF,PTR,DEST) "vmovaps " #OFF "*64(" #PTR "), " #DEST ";\n" -#define VLOADd(OFF,PTR,DEST) "vmovapd " #OFF "*64(" #PTR "), " #DEST ";\n" +#define VLOADf(OFF,PTR,DEST) "vmovups " #OFF "*64(" #PTR "), " #DEST ";\n" +#define VLOADd(OFF,PTR,DEST) "vmovupd " #OFF "*64(" #PTR "), " #DEST ";\n" #define VADDf(A,B,DEST) "vaddps " #A "," #B "," #DEST ";\n" #define VADDd(A,B,DEST) "vaddpd " #A "," #B "," #DEST ";\n" @@ -143,8 +144,8 @@ Author: paboyle #define VSTOREf(OFF,PTR,SRC) "vmovntps " #SRC "," #OFF "*64(" #PTR ")" ";\n" #define VSTOREd(OFF,PTR,SRC) "vmovntpd " 
#SRC "," #OFF "*64(" #PTR ")" ";\n" #else -#define VSTOREf(OFF,PTR,SRC) "vmovaps " #SRC "," #OFF "*64(" #PTR ")" ";\n" -#define VSTOREd(OFF,PTR,SRC) "vmovapd " #SRC "," #OFF "*64(" #PTR ")" ";\n" +#define VSTOREf(OFF,PTR,SRC) "vmovups " #SRC "," #OFF "*64(" #PTR ")" ";\n" +#define VSTOREd(OFF,PTR,SRC) "vmovupd " #SRC "," #OFF "*64(" #PTR ")" ";\n" #endif // Swaps Re/Im ; could unify this with IMCI diff --git a/lib/simd/Intel512double.h b/lib/simd/Intel512double.h index 224c593d..632b5639 100644 --- a/lib/simd/Intel512double.h +++ b/lib/simd/Intel512double.h @@ -144,10 +144,12 @@ Author: paboyle #define VMADDSUBMEM(O,P,B,accum) VMADDSUBMEMd(O,P,B,accum) #define VMADDMEM(O,P,B,accum) VMADDMEMd(O,P,B,accum) #define VMULMEM(O,P,B,accum) VMULMEMd(O,P,B,accum) +#undef VMADDRDUP #undef VMADDSUBRDUP #undef VMADDSUBIDUP #undef VMULRDUP #undef VMULIDUP +#define VMADDRDUP(O,P,B,accum) VMADDRDUPd(O,P,B,accum) #define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPd(O,P,B,accum) #define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPd(O,P,B,accum) #define VMULRDUP(O,P,B,accum) VMULRDUPd(O,P,B,accum) diff --git a/lib/simd/Intel512single.h b/lib/simd/Intel512single.h index 3fa47668..ed135651 100644 --- a/lib/simd/Intel512single.h +++ b/lib/simd/Intel512single.h @@ -144,10 +144,12 @@ Author: paboyle #define VMADDMEM(O,P,B,accum) VMADDMEMf(O,P,B,accum) #define VMULMEM(O,P,B,accum) VMULMEMf(O,P,B,accum) +#undef VMADDRDUP #undef VMADDSUBRDUP #undef VMADDSUBIDUP #undef VMULRDUP #undef VMULIDUP +#define VMADDRDUP(O,P,B,accum) VMADDRDUPf(O,P,B,accum) #define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPf(O,P,B,accum) #define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPf(O,P,B,accum) #define VMULRDUP(O,P,B,accum) VMULRDUPf(O,P,B,accum)