mirror of https://github.com/paboyle/Grid.git synced 2025-06-17 07:17:06 +01:00

SIMD improvements: mac and madd use in complex arithmetic for avx, sse

This commit is contained in:
Peter Boyle
2015-10-09 00:38:52 +02:00
parent af89c40462
commit 814c79f38d
7 changed files with 100 additions and 3 deletions


@@ -132,7 +132,7 @@ namespace Optimization {
}
//Integer
inline __m256i operator()(__m256i a, __m256i b){
-#if defined (AVX1)
+#if defined (AVX1) || defined (AVXFMA4)
__m128i a0,a1;
__m128i b0,b1;
a0 = _mm256_extractf128_si256(a,0);
@@ -161,7 +161,7 @@ namespace Optimization {
}
//Integer
inline __m256i operator()(__m256i a, __m256i b){
-#if defined (AVX1)
+#if defined (AVX1) || defined (AVXFMA4)
__m128i a0,a1;
__m128i b0,b1;
a0 = _mm256_extractf128_si256(a,0);
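
Both integer hunks above use the same fallback: AVX1 and FMA4-only parts have no 256-bit integer arithmetic, so each __m256i is split into two __m128i halves, processed with SSE integer instructions, and recombined. The excerpt cuts off before the operation applied to the halves, so the sketch below assumes a 32-bit add for illustration; the helper name is mine, not Grid's.

#include <immintrin.h>

// 256-bit integer add built from two 128-bit SSE2 adds,
// for targets where _mm256_add_epi32 (AVX2) is unavailable.
inline __m256i add_epi32_avx1(__m256i a, __m256i b){
  __m128i a0 = _mm256_extractf128_si256(a,0); // low half of a
  __m128i a1 = _mm256_extractf128_si256(a,1); // high half of a
  __m128i b0 = _mm256_extractf128_si256(b,0);
  __m128i b1 = _mm256_extractf128_si256(b,1);
  a0 = _mm_add_epi32(a0,b0);                  // SSE2 add, low half
  a1 = _mm_add_epi32(a1,b1);                  // SSE2 add, high half
  __m256i r = _mm256_castsi128_si256(a0);     // place low half
  return _mm256_insertf128_si256(r,a1,1);     // insert high half
}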
@@ -182,6 +182,7 @@ namespace Optimization {
struct MultComplex{
// Complex float
inline __m256 operator()(__m256 a, __m256 b){
#if defined (AVX1)
__m256 ymm0,ymm1,ymm2;
ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
ymm0 = _mm256_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
@@ -190,6 +191,19 @@ namespace Optimization {
ymm2 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ymm2 <- ai,ai
ymm1 = _mm256_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi
return _mm256_addsub_ps(ymm0,ymm1);
#endif
#if defined (AVXFMA4)
__m256 a_real = _mm256_moveldup_ps( a ); // Ar Ar
__m256 a_imag = _mm256_movehdup_ps( a ); // Ai Ai
a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
return _mm256_maddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
#endif
#if defined (AVX2)
__m256 a_real = _mm256_moveldup_ps( a ); // Ar Ar
__m256 a_imag = _mm256_movehdup_ps( a ); // Ai Ai
a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
return _mm256_fmaddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
#endif
}
// Complex double
inline __m256d operator()(__m256d a, __m256d b){
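
The shuffle/mul/addsub sequence in the float path implements (ar + i ai)(br + i bi) = (ar br - ai bi) + i (ar bi + ai br) on interleaved r,i data: _mm256_addsub_ps subtracts in the even (real) lanes and adds in the odd (imaginary) lanes. A standalone check against std::complex, assuming the same interleaved layout; _MM_SHUFFLE has the same bit layout as Grid's _MM_SELECT_FOUR_FOUR, and the test harness is mine, not part of the commit (compile with -mavx):

#include <immintrin.h>
#include <complex>
#include <cstdio>

int main(){
  // Four complex floats, interleaved r0,i0,r1,i1,...
  __m256 a = _mm256_setr_ps(1,2, 3,4, 5,6, 7,8);
  __m256 b = _mm256_setr_ps(2,1, 1,3, 0,2, 4,4);

  __m256 ymm0 = _mm256_shuffle_ps(a,a,_MM_SHUFFLE(2,2,0,0)); // ar,ar
  ymm0 = _mm256_mul_ps(ymm0,b);                              // ar br, ar bi
  __m256 ymm1 = _mm256_shuffle_ps(b,b,_MM_SHUFFLE(2,3,0,1)); // bi,br
  __m256 ymm2 = _mm256_shuffle_ps(a,a,_MM_SHUFFLE(3,3,1,1)); // ai,ai
  ymm1 = _mm256_mul_ps(ymm1,ymm2);                           // ai bi, ai br
  __m256 r = _mm256_addsub_ps(ymm0,ymm1); // even lanes: sub, odd lanes: add

  float out[8], x[8], y[8];
  _mm256_storeu_ps(out,r); _mm256_storeu_ps(x,a); _mm256_storeu_ps(y,b);
  for(int k=0;k<8;k+=2){
    std::complex<float> ref = std::complex<float>(x[k],x[k+1])
                            * std::complex<float>(y[k],y[k+1]);
    std::printf("simd (%g,%g)  ref (%g,%g)\n", out[k],out[k+1], ref.real(),ref.imag());
  }
  return 0;
}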
@@ -215,6 +229,7 @@ namespace Optimization {
IF IMM0[3] = 0
THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI; // 0x5 r<->i ; 0xC unchanged
*/
#if defined (AVX1)
__m256d ymm0,ymm1,ymm2;
ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
ymm0 = _mm256_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br
@@ -222,10 +237,48 @@ namespace Optimization {
ymm2 = _mm256_shuffle_pd(a,a,0xF); // ymm2 <- ai,ai b'11,11
ymm1 = _mm256_mul_pd(ymm1,ymm2); // ymm1 <- br ai, ai bi
return _mm256_addsub_pd(ymm0,ymm1);
#endif
#if defined (AVXFMA4)
__m256d a_real = _mm256_movedup_pd( a ); // Ar Ar (duplicate even elements; no _mm256_moveldup_pd exists)
__m256d a_imag = _mm256_permute_pd( a, 0xF ); // Ai Ai (duplicate odd elements; no _mm256_movehdup_pd exists)
a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
return _mm256_maddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
#endif
#if defined (AVX2)
__m256d a_real = _mm256_movedup_pd( a ); // Ar Ar (duplicate even elements)
__m256d a_imag = _mm256_permute_pd( a, 0xF ); // Ai Ai (duplicate odd elements)
a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
return _mm256_fmaddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
#endif
}
};
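
The fused variants work because _mm256_maddsub_pd (FMA4) and _mm256_fmaddsub_pd (FMA3) compute a*b-c in even lanes and a*b+c in odd lanes, so feeding them a_real, b, and the pre-swapped product a_imag collapses the addsub path's separate multiply and addsub into one instruction, with no intermediate rounding. A worked double-precision example of the identity; this is a sketch assuming AVX2 plus FMA3 (compile with -mavx2 -mfma), not Grid code:

#include <immintrin.h>
#include <cstdio>

int main(){
  // a = (1+2i, 3+4i), b = (5+6i, 7+8i), interleaved r,i
  __m256d a = _mm256_setr_pd(1,2, 3,4);
  __m256d b = _mm256_setr_pd(5,6, 7,8);

  __m256d a_real = _mm256_movedup_pd(a);      // Ar Ar : 1,1,3,3
  __m256d a_imag = _mm256_permute_pd(a,0xF);  // Ai Ai : 2,2,4,4
  a_imag = _mm256_mul_pd(a_imag, _mm256_permute_pd(b,0x5));  // Ai Bi, Ai Br
  __m256d r = _mm256_fmaddsub_pd(a_real,b,a_imag); // ArBr-AiBi, ArBi+AiBr

  double out[4]; _mm256_storeu_pd(out,r);
  std::printf("%g%+gi  %g%+gi\n", out[0],out[1], out[2],out[3]);
  // Expected: (1+2i)(5+6i) = -7+16i ; (3+4i)(7+8i) = -11+52i
  return 0;
}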
struct Mult{
inline void mac(__m256 &a, __m256 b, __m256 c){
#if defined (AVX1)
a= _mm256_add_ps(_mm256_mul_ps(b,c),a);
#endif
#if defined (AVXFMA4)
a= _mm256_macc_ps(b,c,a);
#endif
#if defined (AVX2)
a= _mm256_fmadd_ps( b, c, a);
#endif
}
inline void mac(__m256d &a, __m256d b, __m256d c){
#if defined (AVX1)
a= _mm256_add_pd(_mm256_mul_pd(b,c),a);
#endif
#if defined (AVXFMA4)
a= _mm256_macc_pd(b,c,a);
#endif
#if defined (AVX2)
a= _mm256_fmadd_pd( b, c, a);
#endif
}
// Real float
inline __m256 operator()(__m256 a, __m256 b){
return _mm256_mul_ps(a,b);
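
The mac overloads above implement a += b*c: a plain mul+add on AVX1, _mm256_macc_* on FMA4, and _mm256_fmadd_* on AVX2. The fused forms round once instead of twice, so results can differ from the AVX1 path in the last ulp. A small usage sketch accumulating a dot product; mac_ps is a hypothetical free function mirroring the member above, selecting on the compiler's __FMA__ macro (compile with -mavx or -mavx2 -mfma):

#include <immintrin.h>
#include <cstdio>

// a += b*c, fused when the compiler advertises FMA3.
inline void mac_ps(__m256 &a, __m256 b, __m256 c){
#if defined(__FMA__)
  a = _mm256_fmadd_ps(b,c,a);                // one rounding
#else
  a = _mm256_add_ps(_mm256_mul_ps(b,c),a);   // two roundings
#endif
}

int main(){
  float x[16], y[16];
  for(int i=0;i<16;i++){ x[i] = 0.5f*i; y[i] = 2.0f - i; }

  __m256 acc = _mm256_setzero_ps();
  for(int i=0;i<16;i+=8)
    mac_ps(acc, _mm256_loadu_ps(x+i), _mm256_loadu_ps(y+i));

  float s[8]; _mm256_storeu_ps(s,acc);
  float dot = 0; for(int k=0;k<8;k++) dot += s[k];
  std::printf("dot = %g\n", dot);
  return 0;
}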