Mirror of https://github.com/paboyle/Grid.git, synced 2025-06-12 20:27:06 +01:00
Merge branch 'master' of github.com:paboyle/Grid
Conflicts:
	lib/simd/Grid_avx512.h
	lib/simd/Grid_imci.h
@@ -8,6 +9,9 @@
 //----------------------------------------------------------------------
 
 #include <immintrin.h>
+#ifdef AVXFMA4
+#include <x86intrin.h>
+#endif
 // _mm256_set_m128i(hi,lo); // not defined in all versions of immintrin.h
 #ifndef _mm256_set_m128i
 #define _mm256_set_m128i(hi,lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo),(hi),1)
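The fallback macro builds a 256-bit integer vector by casting the low 128-bit half up and inserting the high half. A minimal standalone sketch of its behaviour (my test, not part of the commit; assumes an AVX-capable compiler):

#include <immintrin.h>
#include <cstdio>

int main(void) {
  __m128i lo = _mm_set1_epi32(1);          // low  half: 1,1,1,1
  __m128i hi = _mm_set1_epi32(2);          // high half: 2,2,2,2
  __m256i v  = _mm256_set_m128i(hi, lo);   // macro kicks in if the header lacks it
  int out[8];
  _mm256_storeu_si256((__m256i*)out, v);
  std::printf("%d %d %d %d %d %d %d %d\n", out[0], out[1], out[2], out[3],
              out[4], out[5], out[6], out[7]); // prints: 1 1 1 1 2 2 2 2
  return 0;
}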
@@ -132,7 +135,7 @@ namespace Optimization {
 }
 //Integer
 inline __m256i operator()(__m256i a, __m256i b){
-#if defined (AVX1)
+#if defined (AVX1) || defined (AVXFMA4)
 __m128i a0,a1;
 __m128i b0,b1;
 a0 = _mm256_extractf128_si256(a,0);
@@ -146,7 +149,6 @@ namespace Optimization {
 #if defined (AVX2)
 return _mm256_add_epi32(a,b);
 #endif
 }
 };
 
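The two hunks above truncate the AVX1/FMA4 integer path between the half extraction and the AVX2 branch. Since AVX1 has no 256-bit integer add, the full pattern splits each operand, adds the halves with SSE2, and rejoins them via _mm256_set_m128i; a hedged reconstruction, not the commit's exact body:

static inline __m256i add_epi32_avx1(__m256i a, __m256i b) {
  __m128i a0 = _mm256_extractf128_si256(a, 0); // low half of a
  __m128i a1 = _mm256_extractf128_si256(a, 1); // high half of a
  __m128i b0 = _mm256_extractf128_si256(b, 0);
  __m128i b1 = _mm256_extractf128_si256(b, 1);
  a0 = _mm_add_epi32(a0, b0);                  // SSE2 add, low lanes
  a1 = _mm_add_epi32(a1, b1);                  // SSE2 add, high lanes
  return _mm256_set_m128i(a1, a0);             // recombine as (hi, lo)
}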
@@ -161,7 +163,7 @@ namespace Optimization {
 }
 //Integer
 inline __m256i operator()(__m256i a, __m256i b){
-#if defined (AVX1)
+#if defined (AVX1) || defined (AVXFMA4)
 __m128i a0,a1;
 __m128i b0,b1;
 a0 = _mm256_extractf128_si256(a,0);
@@ -182,6 +184,7 @@ namespace Optimization {
 struct MultComplex{
 // Complex float
 inline __m256 operator()(__m256 a, __m256 b){
 #if defined (AVX1)
 __m256 ymm0,ymm1,ymm2;
 ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
 ymm0 = _mm256_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
@@ -190,6 +193,20 @@ namespace Optimization {
 ymm2 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ymm2 <- ai,ai
 ymm1 = _mm256_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi
 return _mm256_addsub_ps(ymm0,ymm1);
 #endif
+#if defined (AVXFMA4)
+__m256 a_real = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ar ar,
+__m256 a_imag = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ai ai
+__m256 tmp = _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1));
+a_imag = _mm256_mul_ps( a_imag,tmp ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
+return _mm256_maddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
+#endif
+#if defined (AVX2)
+__m256 a_real = _mm256_moveldup_ps( a ); // Ar Ar
+__m256 a_imag = _mm256_movehdup_ps( a ); // Ai Ai
+a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
+return _mm256_fmaddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
+#endif
 }
 // Complex double
 inline __m256d operator()(__m256d a, __m256d b){
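All three float paths compute the same identity: duplicate the real part of a and multiply into b, duplicate the imaginary part and multiply into a lane-swapped b, then combine with an addsub (subtract in even lanes, add in odd). A scalar reference for one complex pair (checking aid only, not from the commit):

#include <complex>

inline std::complex<float> mult_ref(std::complex<float> a, std::complex<float> b) {
  float re = a.real()*b.real() - a.imag()*b.imag(); // even lane: ArBr - AiBi
  float im = a.real()*b.imag() + a.imag()*b.real(); // odd  lane: ArBi + AiBr
  return std::complex<float>(re, im);
}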
@@ -215,6 +232,7 @@ namespace Optimization {
 IF IMM0[3] = 0
 THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI; // 0x5 r<->i ; 0xC unchanged
 */
 #if defined (AVX1)
 __m256d ymm0,ymm1,ymm2;
 ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
 ymm0 = _mm256_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br
@@ -222,10 +240,71 @@ namespace Optimization {
 ymm2 = _mm256_shuffle_pd(a,a,0xF); // ymm2 <- ai,ai b'11,11
 ymm1 = _mm256_mul_pd(ymm1,ymm2); // ymm1 <- br ai, ai bi
 return _mm256_addsub_pd(ymm0,ymm1);
 #endif
+#if defined (AVXFMA4)
+__m256d a_real = _mm256_shuffle_pd(a,a,0x0);//arar
+__m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai
+a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
+return _mm256_maddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
+#endif
+#if defined (AVX2)
+__m256d a_real = _mm256_movedup_pd( a ); // Ar Ar (vmovddup duplicates even lanes; there is no _mm256_moveldup_pd)
+__m256d a_imag = _mm256_permute_pd( a, 0xF ); // Ai Ai (duplicate odd lanes; there is no _mm256_movehdup_pd)
+a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
+return _mm256_fmaddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
+#endif
 }
 
 };
 
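_mm256_fmaddsub_pd(a,b,c) yields a*b-c in even lanes and a*b+c in odd lanes with a single rounding, which is exactly the combine step above. A standalone sketch checking the double path against std::complex (my harness; assumes AVX2+FMA3 at compile time, e.g. -mavx2 -mfma):

#include <immintrin.h>
#include <complex>
#include <cstdio>

int main(void) {
  std::complex<double> a(1.0, 2.0), b(3.0, 4.0);
  __m256d va = _mm256_set_pd(a.imag(), a.real(), a.imag(), a.real()); // args high..low
  __m256d vb = _mm256_set_pd(b.imag(), b.real(), b.imag(), b.real());
  __m256d ar = _mm256_movedup_pd(va);                 // Ar Ar
  __m256d ai = _mm256_permute_pd(va, 0xF);            // Ai Ai
  ai = _mm256_mul_pd(ai, _mm256_permute_pd(vb, 0x5)); // Ai*Bi, Ai*Br
  __m256d r = _mm256_fmaddsub_pd(ar, vb, ai);         // ArBr-AiBi, ArBi+AiBr
  double out[4];
  _mm256_storeu_pd(out, r);
  std::complex<double> ref = a * b;                   // (-5, 10)
  std::printf("simd=(%g,%g) ref=(%g,%g)\n", out[0], out[1], ref.real(), ref.imag());
  return 0;
}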
+#if 0
+struct ComplexDot {
+inline void Prep(__m256 ari,__m256 &air) {
+cdotRIperm(ari,air);
+}
+inline void Mul(__m256 ari,__m256 air,__m256 b,__m256 &riir,__m256 &iirr) {
+riir=air*b;
+iirr=ari*b; // real/imag-interleaved a times b
+}
+inline void Madd(__m256 ari,__m256 air,__m256 b,__m256 &riir,__m256 &iirr) {
+mac(riir,air,b);
+mac(iirr,ari,b);
+}
+inline void End(__m256 ari,__m256 &air) {
+// cdotRI
+}
+};
+#endif
 
 struct Mult{
 
+inline void mac(__m256 &a, __m256 b, __m256 c){
+#if defined (AVX1)
+a= _mm256_add_ps(_mm256_mul_ps(b,c),a);
+#endif
+#if defined (AVXFMA4)
+a= _mm256_macc_ps(b,c,a);
+#endif
+#if defined (AVX2)
+a= _mm256_fmadd_ps( b, c, a);
+#endif
+}
+
+inline void mac(__m256d &a, __m256d b, __m256d c){
+#if defined (AVX1)
+a= _mm256_add_pd(_mm256_mul_pd(b,c),a);
+#endif
+#if defined (AVXFMA4)
+a= _mm256_macc_pd(b,c,a);
+#endif
+#if defined (AVX2)
+a= _mm256_fmadd_pd( b, c, a);
+#endif
+}
+
 // Real float
 inline __m256 operator()(__m256 a, __m256 b){
 return _mm256_mul_ps(a,b);
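mac() accumulates a += b*c; the AVX2 branch is a true fused multiply-add with one rounding, so it can differ in the last ulp from the AVX1 mul-then-add. A usage sketch with a hypothetical dot4() helper (assumes n is a multiple of 4 and that one of the AVX macros is defined so Mult::mac compiles):

#include <immintrin.h>
#include <cstddef>

static inline double dot4(const double *x, const double *y, size_t n) {
  Optimization::Mult m;
  __m256d acc = _mm256_setzero_pd();
  for (size_t i = 0; i < n; i += 4) {
    m.mac(acc, _mm256_loadu_pd(x + i), _mm256_loadu_pd(y + i)); // acc += x*y
  }
  double t[4];
  _mm256_storeu_pd(t, acc);
  return t[0] + t[1] + t[2] + t[3]; // horizontal reduction
}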
@@ -157,6 +157,12 @@ namespace Optimization {
 };
 
 struct Mult{
+inline float mac(float a, float b,float c){
+return 0;
+}
+inline double mac(double a, double b,double c){
+return 0;
+}
 // Real float
 inline float operator()(float a, float b){
 return 0;
@@ -171,6 +171,12 @@ namespace Optimization {
 
 struct Mult{
 // Real float
+inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){
+return vaddq_f32(vmulq_f32(b,c),a);
+}
+inline float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c){
+return vaddq_f64(vmulq_f64(b,c),a);
+}
 inline float32x4_t operator()(float32x4_t a, float32x4_t b){
 return vmulq_f32(a,b);
 }
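The NEON mac above issues a separate vmulq/vaddq pair. NEON also provides a dedicated multiply-accumulate, and AArch64 a fused one; a hedged alternative (standard arm_neon.h intrinsics, but the substitution is mine, not the commit's):

#include <arm_neon.h>

inline float32x4_t mac_mla(float32x4_t a, float32x4_t b, float32x4_t c) {
  return vmlaq_f32(a, b, c);   // a + b*c, multiply-accumulate
}
#ifdef __aarch64__
inline float64x2_t mac_fma(float64x2_t a, float64x2_t b, float64x2_t c) {
  return vfmaq_f64(a, b, c);   // fused a + b*c, single rounding
}
#endif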
@@ -171,6 +171,15 @@ namespace Optimization {
 };
 
 struct Mult{
+
+inline void mac(__m128 &a, __m128 b, __m128 c){
+a= _mm_add_ps(_mm_mul_ps(b,c),a);
+}
+
+inline void mac(__m128d &a, __m128d b, __m128d c){
+a= _mm_add_pd(_mm_mul_pd(b,c),a);
+}
+
 // Real float
 inline __m128 operator()(__m128 a, __m128 b){
 return _mm_mul_ps(a,b);
@@ -13,7 +13,7 @@
 #ifdef SSE4
 #include "Grid_sse4.h"
 #endif
-#if defined (AVX1)|| defined (AVX2)
+#if defined (AVX1)|| defined (AVX2) || defined (AVXFMA4)
 #include "Grid_avx.h"
 #endif
 #if defined AVX512
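With this dispatch the one Grid_avx.h serves AVX1, AVX2, and AMD's 4-operand FMA4 parts (Bulldozer family). A hedged build sketch; AVXFMA4 is the diff's macro, while the flags are standard GCC/Clang options not taken from the commit:

// FMA4 path (e.g. AMD Bulldozer/Piledriver):
//   g++ -O3 -mavx -mfma4 -DAVXFMA4 ...
// AVX2+FMA3 path (e.g. Intel Haswell and later):
//   g++ -O3 -mavx2 -mfma -DAVX2 ...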
@@ -133,7 +133,11 @@ namespace Grid {
 ///////////////////////////////////////////////
 // mac, mult, sub, add, adj
 ///////////////////////////////////////////////
 
+// FIXME -- alias this to an inline MAC struct.
+friend inline void mac (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ a,const Grid_simd *__restrict__ x){ *y = (*a)*(*x)+(*y); };
+
 friend inline void mult(Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) * (*r); }
 friend inline void sub (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) - (*r); }
 friend inline void add (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) + (*r); }
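These friend functions are found by argument-dependent lookup on the Grid_simd operands. A minimal usage sketch (vRealD is assumed to be one of Grid's Grid_simd typedefs; the helper name is mine):

inline void axpy_step(Grid::vRealD &y, const Grid::vRealD &a, const Grid::vRealD &x) {
  mac(&y, &a, &x);  // y = a*x + y, a candidate for a single vector FMA
}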