1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-10 03:17:07 +01:00

Updating to modify non-inlining permute routines and hopefully get better reg use and

enhance performance.
This commit is contained in:
Peter Boyle
2015-09-25 08:55:04 -07:00
parent 5ef42add2d
commit 64d64d1ab6
11 changed files with 212 additions and 141 deletions

View File

@ -183,11 +183,11 @@ namespace Optimization {
// Complex float
inline __m256 operator()(__m256 a, __m256 b){
__m256 ymm0,ymm1,ymm2;
ymm0 = _mm256_shuffle_ps(a,a,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar,
ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
ymm0 = _mm256_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
// FIXME AVX2 could MAC
ymm1 = _mm256_shuffle_ps(b,b,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi
ymm2 = _mm256_shuffle_ps(a,a,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai
ymm1 = _mm256_shuffle_ps(b,b,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // ymm1 <- br,bi
ymm2 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ymm2 <- ai,ai
ymm1 = _mm256_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi
return _mm256_addsub_ps(ymm0,ymm1);
}
@ -270,7 +270,7 @@ namespace Optimization {
//Complex single
inline __m256 operator()(__m256 in, __m256 ret){
__m256 tmp =_mm256_addsub_ps(_mm256_setzero_ps(),in); // r,-i
return _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1)); //-i,r
return _mm256_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //-i,r
}
//Complex double
inline __m256d operator()(__m256d in, __m256d ret){
@ -282,7 +282,7 @@ namespace Optimization {
struct TimesI{
//Complex single
inline __m256 operator()(__m256 in, __m256 ret){
__m256 tmp =_mm256_shuffle_ps(in,in,_MM_SHUFFLE(2,3,0,1)); // i,r
__m256 tmp =_mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // i,r
return _mm256_addsub_ps(_mm256_setzero_ps(),tmp); // i,-r
}
//Complex double
@ -296,27 +296,44 @@ namespace Optimization {
// Some Template specialization
//////////////////////////////////////////////
template < typename vtype >
void permute(vtype &a,vtype b, int perm) {
uconv<vtype> conv;
conv.v = b;
switch (perm){
// 8x32 bits=>3 permutes
case 2: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
case 1: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
case 0: conv.f = _mm256_permute2f128_ps(conv.f,conv.f,0x01); break;
default: assert(0); break;
}
a = conv.v;
}
struct Permute{
static inline __m256 Permute0(__m256 in){
return _mm256_permute2f128_ps(in,in,0x01);
};
static inline __m256 Permute1(__m256 in){
return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
};
static inline __m256 Permute2(__m256 in){
return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
};
static inline __m256 Permute3(__m256 in){
return in;
};
static inline __m256d Permute0(__m256d in){
return _mm256_permute2f128_pd(in,in,0x01);
};
static inline __m256d Permute1(__m256d in){
return _mm256_shuffle_pd(in,in,0x5);
};
static inline __m256d Permute2(__m256d in){
return in;
};
static inline __m256d Permute3(__m256d in){
return in;
};
};
//Complex float Reduce
template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, __m256>::operator()(__m256 in){
__m256 v1,v2;
Optimization::permute(v1,in,0); // avx 256; quad complex single
v1 = _mm256_add_ps(v1,in);
Optimization::permute(v2,v1,1);
v1=Optimization::Permute::Permute0(in); // avx 256; quad complex single
v1= _mm256_add_ps(v1,in);
v2=Optimization::Permute::Permute1(v1);
v1 = _mm256_add_ps(v1,v2);
u256f conv; conv.v = v1;
return Grid::ComplexF(conv.f[0],conv.f[1]);
@ -326,11 +343,11 @@ namespace Optimization {
template<>
inline Grid::RealF Reduce<Grid::RealF, __m256>::operator()(__m256 in){
__m256 v1,v2;
Optimization::permute(v1,in,0); // avx 256; octo-double
v1 = Optimization::Permute::Permute0(in); // avx 256; octo-double
v1 = _mm256_add_ps(v1,in);
Optimization::permute(v2,v1,1);
v2 = Optimization::Permute::Permute1(v1);
v1 = _mm256_add_ps(v1,v2);
Optimization::permute(v2,v1,2);
v2 = Optimization::Permute::Permute2(v1);
v1 = _mm256_add_ps(v1,v2);
u256f conv; conv.v=v1;
return conv.f[0];
@ -341,7 +358,7 @@ namespace Optimization {
template<>
inline Grid::ComplexD Reduce<Grid::ComplexD, __m256d>::operator()(__m256d in){
__m256d v1;
Optimization::permute(v1,in,0); // sse 128; paired complex single
v1 = Optimization::Permute::Permute0(in); // sse 128; paired complex single
v1 = _mm256_add_pd(v1,in);
u256d conv; conv.v = v1;
return Grid::ComplexD(conv.f[0],conv.f[1]);
@ -351,9 +368,9 @@ namespace Optimization {
template<>
inline Grid::RealD Reduce<Grid::RealD, __m256d>::operator()(__m256d in){
__m256d v1,v2;
Optimization::permute(v1,in,0); // avx 256; quad double
v1 = Optimization::Permute::Permute0(in); // avx 256; quad double
v1 = _mm256_add_pd(v1,in);
Optimization::permute(v2,v1,1);
v2 = Optimization::Permute::Permute1(v1);
v1 = _mm256_add_pd(v1,v2);
u256d conv; conv.v = v1;
return conv.f[0];
@ -387,13 +404,6 @@ namespace Grid {
_mm_prefetch(ptr,_MM_HINT_T0);
}
template < typename VectorSIMD >
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
Optimization::permute(y.v,b.v,perm);
};
// Function name aliases
typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD;

View File

@ -174,7 +174,7 @@ namespace Optimization {
inline __m512d operator()(__m512d a, __m512d b){
__m512d a_real = _mm512_shuffle_pd( a, a, 0x00 );
__m512d a_imag = _mm512_shuffle_pd( a, a, 0xFF );
a_imag = _mm512_mul_pd( a_imag, _mm512_permute_pd( b, 0x55 ) );
a_imag = _mm512_mul_pd( a_imag, _mm512_permute_pd( b, 0x55 ) );
return _mm512_fmaddsub_pd( a_real, b, a_imag );
}
};
@ -211,26 +211,24 @@ namespace Optimization {
//Complex single
inline __m512 operator()(__m512 in, __m512 ret){
__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
return _mm512_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2));
return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(1,0,3,2)); // 0x4E??
}
//Complex double
inline __m512d operator()(__m512d in, __m512d ret){
__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
return _mm512_shuffle_pd(tmp,tmp,_MM_SHUFFLE(1,0,3,2));
}
return _mm512_shuffle_pd(tmp,tmp,0x55);
}
};
struct TimesI{
//Complex single
inline __m512 operator()(__m512 in, __m512 ret){
__m512 tmp = _mm512_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2));
__m512 tmp = _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(1,0,3,2));
return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);
}
//Complex double
inline __m512d operator()(__m512d in, __m512d ret){
__m512d tmp = _mm512_shuffle_pd(tmp,tmp,_MM_SHUFFLE(1,0,3,2));
__m512d tmp = _mm512_shuffle_pd(tmp,tmp,0x55);
return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp);
}
@ -239,6 +237,36 @@ namespace Optimization {
// Gpermute utilities consider coalescing into 1 Gpermute
struct Permute{
static inline __m512 Permute0(__m512 in){
return _mm512_shuffle_f32x4(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
};
static inline __m512 Permute1(__m512 in){
return _mm512_shuffle_f32x4(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
};
static inline __m512 Permute2(__m512 in){
return _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
};
static inline __m512 Permute3(__m512 in){
return _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
};
static inline __m512d Permute0(__m512d in){
return _mm512_shuffle_f64x2(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
};
static inline __m512d Permute1(__m512d in){
return _mm512_shuffle_f64x2(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
};
static inline __m512d Permute2(__m512d in){
return _mm512_shuffle_pd(in,in,0x55);
};
static inline __m512d Permute3(__m512d in){
return in;
};
};
//////////////////////////////////////////////
@ -298,25 +326,6 @@ namespace Grid {
}
// Gpermute utilities consider coalescing into 1 Gpermute
template < typename VectorSIMD >
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
union {
__m512 f;
decltype(VectorSIMD::v) v;
} conv;
conv.v = b.v;
switch(perm){
case 3 : conv.f = _mm512_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
case 2 : conv.f = _mm512_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
case 1 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
case 0 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
default: assert(0); break;
}
y.v=conv.v;
};
// Function name aliases
typedef Optimization::Vsplat VsplatSIMD;

View File

@ -255,7 +255,36 @@ namespace Optimization {
};
struct Permute{
static inline __m512 Permute0(__m512 in){
return _mm512_permute4f128_ps(in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(1,0,3,2));
};
static inline __m512 Permute1(__m512 in){
return _mm512_permute4f128_ps(in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(2,3,0,1));
};
static inline __m512 Permute2(__m512 in){
return _mm512_swizzle_ps(in,_MM_SWIZ_REG_BADC);
};
static inline __m512 Permute3(__m512 in){
return _mm512_swizzle_ps(in,_MM_SWIZ_REG_CDAB);
};
static inline __m512d Permute0(__m512d in){// Hack no intrinsic for 256 swaps of __m512d
return (__m512d)_mm512_permute4f128_ps((__m512)in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(1,0,3,2));
};
static inline __m512d Permute1(__m512d in){
return _mm512_swizzle_pd(in,_MM_SWIZ_REG_BADC);
};
static inline __m512d Permute2(__m512d in){
return _mm512_swizzle_pd(in,_MM_SWIZ_REG_CDAB);
};
static inline __m512d Permute3(__m512d in){
return in;
};
};
//////////////////////////////////////////////
@ -315,25 +344,6 @@ namespace Grid {
}
// Gpermute utilities consider coalescing into 1 Gpermute
template < typename VectorSIMD >
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
union {
__m512 f;
decltype(VectorSIMD::v) v;
} conv;
conv.v = b.v;
switch(perm){
case 3: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_CDAB); break;
case 2: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_BADC); break;
case 1 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
case 0 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
default: assert(0); break;
}
y.v=conv.v;
};
// Function name aliases
typedef Optimization::Vsplat VsplatSIMD;

View File

@ -151,10 +151,10 @@ namespace Optimization {
// Complex float
inline __m128 operator()(__m128 a, __m128 b){
__m128 ymm0,ymm1,ymm2;
ymm0 = _mm_shuffle_ps(a,a,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar,
ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
ymm0 = _mm_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
ymm1 = _mm_shuffle_ps(b,b,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi
ymm2 = _mm_shuffle_ps(a,a,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai
ymm1 = _mm_shuffle_ps(b,b,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // ymm1 <- br,bi
ymm2 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ymm2 <- ai,ai
ymm1 = _mm_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi
return _mm_addsub_ps(ymm0,ymm1);
}
@ -201,7 +201,7 @@ namespace Optimization {
//Complex single
inline __m128 operator()(__m128 in, __m128 ret){
__m128 tmp =_mm_addsub_ps(_mm_setzero_ps(),in); // r,-i
return _mm_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));
return _mm_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1));
}
//Complex double
inline __m128d operator()(__m128d in, __m128d ret){
@ -215,7 +215,7 @@ namespace Optimization {
struct TimesI{
//Complex single
inline __m128 operator()(__m128 in, __m128 ret){
__m128 tmp =_mm_shuffle_ps(in,in,_MM_SHUFFLE(2,3,0,1));
__m128 tmp =_mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
return _mm_addsub_ps(_mm_setzero_ps(),tmp); // r,-i
}
//Complex double
@ -225,27 +225,45 @@ namespace Optimization {
}
};
struct Permute{
static inline __m128 Permute0(__m128 in){
return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
};
static inline __m128 Permute1(__m128 in){
return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
};
static inline __m128 Permute2(__m128 in){
return in;
};
static inline __m128 Permute3(__m128 in){
return in;
};
static inline __m128d Permute0(__m128d in){
return _mm_shuffle_pd(in,in,0x1);
};
static inline __m128d Permute1(__m128d in){
return in;
};
static inline __m128d Permute2(__m128d in){
return in;
};
static inline __m128d Permute3(__m128d in){
return in;
};
};
//////////////////////////////////////////////
// Some Template specialization
template < typename vtype >
void permute(vtype &a, vtype b, int perm) {
uconv<vtype> conv;
conv.v = b;
switch(perm){
case 3: break; //empty for SSE4
case 2: break; //empty for SSE4
case 1: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
case 0: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
default: assert(0); break;
}
a=conv.v;
};
//Complex float Reduce
template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, __m128>::operator()(__m128 in){
__m128 v1; // two complex
Optimization::permute(v1,in,0);
v1= Optimization::Permute::Permute0(in);
v1= _mm_add_ps(v1,in);
u128f conv; conv.v=v1;
return Grid::ComplexF(conv.f[0],conv.f[1]);
@ -254,9 +272,9 @@ namespace Optimization {
template<>
inline Grid::RealF Reduce<Grid::RealF, __m128>::operator()(__m128 in){
__m128 v1,v2; // quad single
Optimization::permute(v1,in,0);
v1= Optimization::Permute::Permute0(in);
v1= _mm_add_ps(v1,in);
Optimization::permute(v2,v1,1);
v2= Optimization::Permute::Permute1(v1);
v1 = _mm_add_ps(v1,v2);
u128f conv; conv.v=v1;
return conv.f[0];
@ -274,7 +292,7 @@ namespace Optimization {
template<>
inline Grid::RealD Reduce<Grid::RealD, __m128d>::operator()(__m128d in){
__m128d v1;
Optimization::permute(v1,in,0); // avx 256; quad double
v1 = Optimization::Permute::Permute0(in);
v1 = _mm_add_pd(v1,in);
u128d conv; conv.v = v1;
return conv.f[0];
@ -302,14 +320,6 @@ namespace Grid {
inline void prefetch_HINT_T0(const char *ptr){
_mm_prefetch(ptr,_MM_HINT_T0);
}
// Gpermute function
template < typename VectorSIMD >
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
Optimization::permute(y.v,b.v,perm);
}
// Function name aliases
typedef Optimization::Vsplat VsplatSIMD;

View File

@ -251,15 +251,30 @@ namespace Grid {
// all subtypes; may not be a good assumption, but could
// add the vector width as a template param for BG/Q for example
////////////////////////////////////////////////////////////////////
friend inline void permute0(Grid_simd &y,Grid_simd b){
y.v = Optimization::Permute::Permute0(b.v);
}
friend inline void permute1(Grid_simd &y,Grid_simd b){
y.v = Optimization::Permute::Permute1(b.v);
}
friend inline void permute2(Grid_simd &y,Grid_simd b){
y.v = Optimization::Permute::Permute2(b.v);
}
friend inline void permute3(Grid_simd &y,Grid_simd b){
y.v = Optimization::Permute::Permute3(b.v);
}
friend inline void permute(Grid_simd &y,Grid_simd b,int perm)
{
Gpermute<Grid_simd>(y,b,perm);
if (perm==3) permute3(y,b);
else if (perm==2) permute2(y,b);
else if (perm==1) permute1(y,b);
else if (perm==0) permute0(y,b);
}
};// end of Grid_simd class definition
///////////////////////
// Splat
///////////////////////