1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-16 23:07:05 +01:00

Updating to modify the non-inlining permute routines, and hopefully get better register use and enhanced performance.
This commit is contained in:
Peter Boyle
2015-09-25 08:55:04 -07:00
parent 5ef42add2d
commit 64d64d1ab6
11 changed files with 212 additions and 141 deletions

View File

@ -174,7 +174,7 @@ namespace Optimization {
// Complex multiply of packed double-precision values, a * b.
// Elements are treated as interleaved (re, im) pairs: even element = real
// part, odd element = imaginary part.
inline __m512d operator()(__m512d a, __m512d b){
  __m512d a_real = _mm512_shuffle_pd( a, a, 0x00 );  // (a.re, a.re) per pair
  __m512d a_imag = _mm512_shuffle_pd( a, a, 0xFF );  // (a.im, a.im) per pair
  // Scale by b with re/im swapped -- exactly once; applying this step twice
  // would fold a second factor of b into the imaginary term.
  a_imag = _mm512_mul_pd( a_imag, _mm512_permute_pd( b, 0x55 ) );  // (a.im*b.im, a.im*b.re)
  // fmaddsub subtracts in even elements and adds in odd elements:
  // (a.re*b.re - a.im*b.im, a.re*b.im + a.im*b.re)
  return _mm512_fmaddsub_pd( a_real, b, a_imag );
}
};
@ -211,26 +211,24 @@ namespace Optimization {
//Complex single
// Negates the odd (imaginary) elements, then swaps the two floats of every
// (re, im) pair: (re, im) -> (re, -im) -> (-im, re).
// NOTE(review): that result is +i * z. The name of the enclosing functor is
// not visible in this hunk -- confirm the sign convention matches it.
// `ret` is unused here.
inline __m512 operator()(__m512 in, __m512 ret){
  __m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
  // Selector (2,3,0,1) exchanges adjacent floats inside each 128-bit lane,
  // which is the single-precision counterpart of the 0x55 swap used by the
  // double overload. (1,0,3,2), i.e. 0x4E, would instead exchange the two
  // complex numbers of a lane. Assumes _MM_SELECT_FOUR_FOUR packs its
  // arguments like _MM_SHUFFLE -- TODO confirm.
  return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1));
}
//Complex double
// Negates the odd (imaginary) elements, then swaps the two doubles of every
// (re, im) pair: (re, im) -> (re, -im) -> (-im, re).
// NOTE(review): that result is +i * z. The name of the enclosing functor is
// not visible in this hunk -- confirm the sign convention matches it.
// `ret` is unused here.
inline __m512d operator()(__m512d in, __m512d ret){
  __m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
  // 0x55 picks the odd element for each even slot and the even element for
  // each odd slot, i.e. a swap within every 128-bit pair.
  return _mm512_shuffle_pd(tmp,tmp,0x55);
}
};
struct TimesI{
//Complex single
// Multiply every packed complex number by +i: (re, im) -> (-im, re).
// `ret` is unused here.
inline __m512 operator()(__m512 in, __m512 ret){
  // Swap re/im of each pair. The shuffle must read `in`; reading `tmp` in its
  // own initialiser would use an indeterminate value. Selector (2,3,0,1)
  // exchanges adjacent floats (assumes _MM_SELECT_FOUR_FOUR packs its
  // arguments like _MM_SHUFFLE -- TODO confirm).
  __m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // (im, re)
  // Negate the even (real) slots only: mask 0x5555 selects 0 - tmp there and
  // passes tmp through elsewhere, giving (-im, re).
  return _mm512_mask_sub_ps(tmp,0x5555,_mm512_setzero_ps(),tmp);
}
//Complex double
// Multiply every packed complex number by +i: (re, im) -> (-im, re).
// `ret` is unused here.
inline __m512d operator()(__m512d in, __m512d ret){
  // Swap re/im of each pair. The shuffle must read `in`; reading `tmp` in its
  // own initialiser would use an indeterminate value.
  __m512d tmp = _mm512_shuffle_pd(in,in,0x55); // (im, re)
  // Negate the even (real) slots only: mask 0x55 selects 0 - tmp there and
  // passes tmp through elsewhere, giving (-im, re).
  return _mm512_mask_sub_pd(tmp,0x55,_mm512_setzero_pd(),tmp);
}
@ -239,6 +237,36 @@ namespace Optimization {
// Gpermute utilities; consider coalescing into 1 Gpermute.
//
// Fixed element permutations of a 512-bit vector, overloaded on float and
// double vectors. The descriptions below assume _MM_SELECT_FOUR_FOUR packs
// its four arguments like _MM_SHUFFLE -- TODO confirm against its definition.
struct Permute{

  // ---- single precision ----

  // Rearranges whole 128-bit lanes with selector (1,0,3,2).
  static inline __m512 Permute0(__m512 in){
    return _mm512_shuffle_f32x4(in, in, _MM_SELECT_FOUR_FOUR(1,0,3,2));
  }

  // Rearranges whole 128-bit lanes with selector (2,3,0,1).
  static inline __m512 Permute1(__m512 in){
    return _mm512_shuffle_f32x4(in, in, _MM_SELECT_FOUR_FOUR(2,3,0,1));
  }

  // Rearranges floats inside each 128-bit lane with selector (1,0,3,2).
  static inline __m512 Permute2(__m512 in){
    return _mm512_shuffle_ps(in, in, _MM_SELECT_FOUR_FOUR(1,0,3,2));
  }

  // Rearranges floats inside each 128-bit lane with selector (2,3,0,1).
  static inline __m512 Permute3(__m512 in){
    return _mm512_shuffle_ps(in, in, _MM_SELECT_FOUR_FOUR(2,3,0,1));
  }

  // ---- double precision ----

  // Rearranges whole 128-bit lanes with selector (1,0,3,2).
  static inline __m512d Permute0(__m512d in){
    return _mm512_shuffle_f64x2(in, in, _MM_SELECT_FOUR_FOUR(1,0,3,2));
  }

  // Rearranges whole 128-bit lanes with selector (2,3,0,1).
  static inline __m512d Permute1(__m512d in){
    return _mm512_shuffle_f64x2(in, in, _MM_SELECT_FOUR_FOUR(2,3,0,1));
  }

  // Swaps the two doubles inside each 128-bit lane.
  static inline __m512d Permute2(__m512d in){
    return _mm512_shuffle_pd(in, in, 0x55);
  }

  // Returns the input unchanged.
  static inline __m512d Permute3(__m512d in){
    return in;
  }
};
//////////////////////////////////////////////
@ -298,25 +326,6 @@ namespace Grid {
}
// Gpermute utilities consider coalescing into 1 Gpermute
//
// Stores into y.v a permuted copy of b.v. `perm` picks one of four fixed
// shuffles: 0 and 1 use _mm512_permute4f128_ps, 2 and 3 use
// _mm512_shuffle_ps. Any other value hits assert(0).
template < typename VectorSIMD >
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
  // Reinterpret the vector member as packed floats so the float shuffles
  // below can be applied whatever type `v` has.
  union {
    __m512 f;
    decltype(VectorSIMD::v) v;
  } pun;
  pun.v = b.v;

  if ( perm == 0 ) {
    pun.f = _mm512_permute4f128_ps(pun.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2));
  } else if ( perm == 1 ) {
    pun.f = _mm512_permute4f128_ps(pun.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1));
  } else if ( perm == 2 ) {
    pun.f = _mm512_shuffle_ps(pun.f,pun.f,_MM_SHUFFLE(1,0,3,2));
  } else if ( perm == 3 ) {
    pun.f = _mm512_shuffle_ps(pun.f,pun.f,_MM_SHUFFLE(2,3,0,1));
  } else {
    assert(0);
  }

  y.v = pun.v;
}
// Function name aliases
typedef Optimization::Vsplat VsplatSIMD;