mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-10 03:17:07 +01:00
Replace the non-inlining, switch-based permute routines with per-level inline
Permute0..Permute3 helpers, hoping for better register use and improved performance.
This commit is contained in:
@ -183,11 +183,11 @@ namespace Optimization {
|
||||
// Complex float
|
||||
// Element-wise product a*b of packed single-precision complex numbers.
// Lane labels (ar/ai = real/imag of a, br/bi = real/imag of b) follow the
// original annotations. The selectors assume _MM_SELECT_FOUR_FOUR packs its
// arguments the same way _MM_SHUFFLE does -- TODO confirm against its definition.
inline __m256 operator()(__m256 a, __m256 b){
  __m256 ymm0,ymm1,ymm2;
  ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
  ymm0 = _mm256_mul_ps(ymm0,b);                                // ymm0 <- ar bi, ar br
  // FIXME AVX2 could MAC
  ymm1 = _mm256_shuffle_ps(b,b,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // ymm1 <- br,bi
  ymm2 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ymm2 <- ai,ai
  ymm1 = _mm256_mul_ps(ymm1,ymm2);                             // ymm1 <- br ai, ai bi
  // addsub subtracts in even lanes and adds in odd lanes, giving
  // (ar*br - ai*bi, ar*bi + ai*br) per complex element.
  return _mm256_addsub_ps(ymm0,ymm1);
}
|
||||
@ -270,7 +270,7 @@ namespace Optimization {
|
||||
//Complex single
|
||||
// Complex single: negate-and-swap kernel on packed (even, odd) float pairs.
// Writing each pair as (x0, x1) in low-to-high lane order, the result is
// (x1, -x0), assuming _MM_SELECT_FOUR_FOUR encodes selectors like _MM_SHUFFLE
// -- TODO confirm. The second argument `ret` is not used.
inline __m256 operator()(__m256 in, __m256 ret){
  // addsub(0,in): even lanes become 0 - x0, odd lanes become 0 + x1.
  __m256 tmp =_mm256_addsub_ps(_mm256_setzero_ps(),in);
  // Swap the two floats of every pair.
  return _mm256_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1));
}
|
||||
//Complex double
|
||||
inline __m256d operator()(__m256d in, __m256d ret){
|
||||
@ -282,7 +282,7 @@ namespace Optimization {
|
||||
struct TimesI{
|
||||
//Complex single
|
||||
// Complex single member of TimesI. Writing each packed float pair as
// (x0, x1) in low-to-high lane order, the result is (-x1, x0), assuming
// _MM_SELECT_FOUR_FOUR encodes selectors like _MM_SHUFFLE -- TODO confirm.
// The second argument `ret` is not used.
inline __m256 operator()(__m256 in, __m256 ret){
  // Swap the two floats of every pair: (x1, x0).
  __m256 tmp =_mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
  // addsub(0,tmp): even lanes become 0 - x1, odd lanes become 0 + x0.
  return _mm256_addsub_ps(_mm256_setzero_ps(),tmp);
}
|
||||
//Complex double
|
||||
@ -296,27 +296,44 @@ namespace Optimization {
|
||||
// Some Template specialization
|
||||
//////////////////////////////////////////////
|
||||
|
||||
template < typename vtype >
|
||||
void permute(vtype &a,vtype b, int perm) {
|
||||
uconv<vtype> conv;
|
||||
conv.v = b;
|
||||
switch (perm){
|
||||
// 8x32 bits=>3 permutes
|
||||
case 2: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
|
||||
case 1: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
|
||||
case 0: conv.f = _mm256_permute2f128_ps(conv.f,conv.f,0x01); break;
|
||||
default: assert(0); break;
|
||||
}
|
||||
a = conv.v;
|
||||
}
|
||||
struct Permute{
|
||||
|
||||
static inline __m256 Permute0(__m256 in){
|
||||
return _mm256_permute2f128_ps(in,in,0x01);
|
||||
};
|
||||
static inline __m256 Permute1(__m256 in){
|
||||
return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
|
||||
};
|
||||
static inline __m256 Permute2(__m256 in){
|
||||
return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
|
||||
};
|
||||
static inline __m256 Permute3(__m256 in){
|
||||
return in;
|
||||
};
|
||||
|
||||
static inline __m256d Permute0(__m256d in){
|
||||
return _mm256_permute2f128_pd(in,in,0x01);
|
||||
};
|
||||
static inline __m256d Permute1(__m256d in){
|
||||
return _mm256_shuffle_pd(in,in,0x5);
|
||||
};
|
||||
static inline __m256d Permute2(__m256d in){
|
||||
return in;
|
||||
};
|
||||
static inline __m256d Permute3(__m256d in){
|
||||
return in;
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
|
||||
//Complex float Reduce
|
||||
template<>
|
||||
inline Grid::ComplexF Reduce<Grid::ComplexF, __m256>::operator()(__m256 in){
|
||||
__m256 v1,v2;
|
||||
Optimization::permute(v1,in,0); // avx 256; quad complex single
|
||||
v1 = _mm256_add_ps(v1,in);
|
||||
Optimization::permute(v2,v1,1);
|
||||
v1=Optimization::Permute::Permute0(in); // avx 256; quad complex single
|
||||
v1= _mm256_add_ps(v1,in);
|
||||
v2=Optimization::Permute::Permute1(v1);
|
||||
v1 = _mm256_add_ps(v1,v2);
|
||||
u256f conv; conv.v = v1;
|
||||
return Grid::ComplexF(conv.f[0],conv.f[1]);
|
||||
@ -326,11 +343,11 @@ namespace Optimization {
|
||||
template<>
|
||||
inline Grid::RealF Reduce<Grid::RealF, __m256>::operator()(__m256 in){
|
||||
__m256 v1,v2;
|
||||
Optimization::permute(v1,in,0); // avx 256; octo-double
|
||||
v1 = Optimization::Permute::Permute0(in); // avx 256; octo-double
|
||||
v1 = _mm256_add_ps(v1,in);
|
||||
Optimization::permute(v2,v1,1);
|
||||
v2 = Optimization::Permute::Permute1(v1);
|
||||
v1 = _mm256_add_ps(v1,v2);
|
||||
Optimization::permute(v2,v1,2);
|
||||
v2 = Optimization::Permute::Permute2(v1);
|
||||
v1 = _mm256_add_ps(v1,v2);
|
||||
u256f conv; conv.v=v1;
|
||||
return conv.f[0];
|
||||
@ -341,7 +358,7 @@ namespace Optimization {
|
||||
template<>
|
||||
inline Grid::ComplexD Reduce<Grid::ComplexD, __m256d>::operator()(__m256d in){
|
||||
__m256d v1;
|
||||
Optimization::permute(v1,in,0); // sse 128; paired complex single
|
||||
v1 = Optimization::Permute::Permute0(in); // sse 128; paired complex single
|
||||
v1 = _mm256_add_pd(v1,in);
|
||||
u256d conv; conv.v = v1;
|
||||
return Grid::ComplexD(conv.f[0],conv.f[1]);
|
||||
@ -351,9 +368,9 @@ namespace Optimization {
|
||||
template<>
|
||||
inline Grid::RealD Reduce<Grid::RealD, __m256d>::operator()(__m256d in){
|
||||
__m256d v1,v2;
|
||||
Optimization::permute(v1,in,0); // avx 256; quad double
|
||||
v1 = Optimization::Permute::Permute0(in); // avx 256; quad double
|
||||
v1 = _mm256_add_pd(v1,in);
|
||||
Optimization::permute(v2,v1,1);
|
||||
v2 = Optimization::Permute::Permute1(v1);
|
||||
v1 = _mm256_add_pd(v1,v2);
|
||||
u256d conv; conv.v = v1;
|
||||
return conv.f[0];
|
||||
@ -387,13 +404,6 @@ namespace Grid {
|
||||
_mm_prefetch(ptr,_MM_HINT_T0);
|
||||
}
|
||||
|
||||
|
||||
|
||||
template < typename VectorSIMD >
|
||||
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
|
||||
Optimization::permute(y.v,b.v,perm);
|
||||
};
|
||||
|
||||
// Function name aliases
|
||||
typedef Optimization::Vsplat VsplatSIMD;
|
||||
typedef Optimization::Vstore VstoreSIMD;
|
||||
|
@ -174,7 +174,7 @@ namespace Optimization {
|
||||
// Element-wise product of packed double-precision values treated as
// (even, odd) pairs; with even = real and odd = imaginary this is the
// complex product a*b.
inline __m512d operator()(__m512d a, __m512d b){
  // Duplicate the even element of every pair of a.
  __m512d a_real = _mm512_shuffle_pd( a, a, 0x00 );
  // Duplicate the odd element of every pair of a.
  __m512d a_imag = _mm512_shuffle_pd( a, a, 0xFF );
  // Multiply by b with the two elements of each pair exchanged.
  // This must happen exactly once; repeating it would scale a_imag twice.
  a_imag = _mm512_mul_pd( a_imag, _mm512_permute_pd( b, 0x55 ) );
  // fmaddsub: even lanes a_real*b - a_imag, odd lanes a_real*b + a_imag.
  return _mm512_fmaddsub_pd( a_real, b, a_imag );
}
|
||||
};
|
||||
@ -211,26 +211,24 @@ namespace Optimization {
|
||||
//Complex single
|
||||
// Complex single: negate the odd lanes of `in`, then shuffle within each
// 128-bit lane. The second argument `ret` is not used.
// NOTE(review): if _MM_SELECT_FOUR_FOUR encodes like _MM_SHUFFLE, selector
// (1,0,3,2) is 0x4E and exchanges 64-bit pairs rather than neighbouring
// floats; the AVX and SSE counterparts use (2,3,0,1). Also, negating the odd
// lanes before the swap yields (-x1, x0) per pair, where the AVX kernel of
// the same role yields (x1, -x0). Confirm selector and mask before relying on this.
inline __m512 operator()(__m512 in, __m512 ret){
  // Lanes whose mask bit is set (odd lanes, 0xaaaa) become 0 - in; others keep in.
  __m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
  return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(1,0,3,2)); // 0x4E??
}
|
||||
//Complex double
|
||||
// Complex double: negate the odd lanes of `in`, then exchange the two
// doubles of every pair. The second argument `ret` is not used.
// NOTE(review): per pair (x0, x1) this yields (-x1, x0); the AVX kernel of
// the same role yields (x1, -x0). Confirm the mask (0xaa vs 0x55).
inline __m512d operator()(__m512d in, __m512d ret){
  // Lanes whose mask bit is set (odd lanes, 0xaa) become 0 - in; others keep in.
  __m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
  // 0x55 exchanges the two doubles in each 128-bit lane.
  return _mm512_shuffle_pd(tmp,tmp,0x55);
}
|
||||
};
|
||||
|
||||
struct TimesI{
|
||||
//Complex single
|
||||
// Complex single member of TimesI: shuffle `in` within each 128-bit lane,
// then negate the odd lanes. The second argument `ret` is not used.
// The shuffle must read `in`; initialising `tmp` from itself reads an
// indeterminate value.
// NOTE(review): if _MM_SELECT_FOUR_FOUR encodes like _MM_SHUFFLE, selector
// (1,0,3,2) exchanges 64-bit pairs rather than neighbouring floats (AVX/SSE
// use (2,3,0,1)), and mask 0xaaaa negates the odd lane after the swap --
// confirm both against the intended multiply-by-i result.
inline __m512 operator()(__m512 in, __m512 ret){
  __m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
  // Lanes whose mask bit is set (odd lanes) become 0 - tmp; others keep tmp.
  return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);
}
|
||||
//Complex double
|
||||
// Complex double member of TimesI: exchange the two doubles of every pair of
// `in`, then negate the odd lanes. The second argument `ret` is not used.
// The shuffle must read `in`; initialising `tmp` from itself reads an
// indeterminate value.
// NOTE(review): per pair (x0, x1) this yields (x1, -x0); the AVX TimesI
// kernel yields (-x1, x0). Confirm the mask (0xaa vs 0x55).
inline __m512d operator()(__m512d in, __m512d ret){
  // 0x55 exchanges the two doubles in each 128-bit lane.
  __m512d tmp = _mm512_shuffle_pd(in,in,0x55);
  // Lanes whose mask bit is set (odd lanes) become 0 - tmp; others keep tmp.
  return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp);
}
|
||||
|
||||
@ -239,6 +237,36 @@ namespace Optimization {
|
||||
|
||||
|
||||
|
||||
// Gpermute utilities consider coalescing into 1 Gpermute
|
||||
struct Permute{
|
||||
|
||||
static inline __m512 Permute0(__m512 in){
|
||||
return _mm512_shuffle_f32x4(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
|
||||
};
|
||||
static inline __m512 Permute1(__m512 in){
|
||||
return _mm512_shuffle_f32x4(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
|
||||
};
|
||||
static inline __m512 Permute2(__m512 in){
|
||||
return _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
|
||||
};
|
||||
static inline __m512 Permute3(__m512 in){
|
||||
return _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
|
||||
};
|
||||
|
||||
static inline __m512d Permute0(__m512d in){
|
||||
return _mm512_shuffle_f64x2(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
|
||||
};
|
||||
static inline __m512d Permute1(__m512d in){
|
||||
return _mm512_shuffle_f64x2(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
|
||||
};
|
||||
static inline __m512d Permute2(__m512d in){
|
||||
return _mm512_shuffle_pd(in,in,0x55);
|
||||
};
|
||||
static inline __m512d Permute3(__m512d in){
|
||||
return in;
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
|
||||
//////////////////////////////////////////////
|
||||
@ -298,25 +326,6 @@ namespace Grid {
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// Gpermute utilities consider coalescing into 1 Gpermute
|
||||
template < typename VectorSIMD >
|
||||
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
|
||||
union {
|
||||
__m512 f;
|
||||
decltype(VectorSIMD::v) v;
|
||||
} conv;
|
||||
conv.v = b.v;
|
||||
switch(perm){
|
||||
case 3 : conv.f = _mm512_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
|
||||
case 2 : conv.f = _mm512_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
|
||||
case 1 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
|
||||
case 0 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
|
||||
default: assert(0); break;
|
||||
}
|
||||
y.v=conv.v;
|
||||
};
|
||||
|
||||
// Function name aliases
|
||||
typedef Optimization::Vsplat VsplatSIMD;
|
||||
|
@ -255,7 +255,36 @@ namespace Optimization {
|
||||
};
|
||||
|
||||
|
||||
|
||||
struct Permute{
|
||||
|
||||
static inline __m512 Permute0(__m512 in){
|
||||
return _mm512_permute4f128_ps(in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(1,0,3,2));
|
||||
};
|
||||
static inline __m512 Permute1(__m512 in){
|
||||
return _mm512_permute4f128_ps(in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(2,3,0,1));
|
||||
};
|
||||
static inline __m512 Permute2(__m512 in){
|
||||
return _mm512_swizzle_ps(in,_MM_SWIZ_REG_BADC);
|
||||
};
|
||||
static inline __m512 Permute3(__m512 in){
|
||||
return _mm512_swizzle_ps(in,_MM_SWIZ_REG_CDAB);
|
||||
};
|
||||
|
||||
static inline __m512d Permute0(__m512d in){// Hack no intrinsic for 256 swaps of __m512d
|
||||
return (__m512d)_mm512_permute4f128_ps((__m512)in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(1,0,3,2));
|
||||
};
|
||||
static inline __m512d Permute1(__m512d in){
|
||||
return _mm512_swizzle_pd(in,_MM_SWIZ_REG_BADC);
|
||||
};
|
||||
static inline __m512d Permute2(__m512d in){
|
||||
return _mm512_swizzle_pd(in,_MM_SWIZ_REG_CDAB);
|
||||
};
|
||||
static inline __m512d Permute3(__m512d in){
|
||||
return in;
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
//////////////////////////////////////////////
|
||||
@ -315,25 +344,6 @@ namespace Grid {
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// Gpermute utilities consider coalescing into 1 Gpermute
|
||||
template < typename VectorSIMD >
|
||||
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
|
||||
union {
|
||||
__m512 f;
|
||||
decltype(VectorSIMD::v) v;
|
||||
} conv;
|
||||
conv.v = b.v;
|
||||
switch(perm){
|
||||
case 3: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_CDAB); break;
|
||||
case 2: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_BADC); break;
|
||||
case 1 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
|
||||
case 0 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
|
||||
default: assert(0); break;
|
||||
}
|
||||
y.v=conv.v;
|
||||
};
|
||||
|
||||
// Function name aliases
|
||||
typedef Optimization::Vsplat VsplatSIMD;
|
||||
|
@ -151,10 +151,10 @@ namespace Optimization {
|
||||
// Complex float
|
||||
// Element-wise product a*b of packed single-precision complex numbers.
// Lane labels (ar/ai = real/imag of a, br/bi = real/imag of b) follow the
// original annotations. The selectors assume _MM_SELECT_FOUR_FOUR packs its
// arguments the same way _MM_SHUFFLE does -- TODO confirm against its definition.
inline __m128 operator()(__m128 a, __m128 b){
  __m128 ymm0,ymm1,ymm2;
  ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
  ymm0 = _mm_mul_ps(ymm0,b);                                // ymm0 <- ar bi, ar br
  ymm1 = _mm_shuffle_ps(b,b,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // ymm1 <- br,bi
  ymm2 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ymm2 <- ai,ai
  ymm1 = _mm_mul_ps(ymm1,ymm2);                             // ymm1 <- br ai, ai bi
  // addsub subtracts in even lanes and adds in odd lanes, giving
  // (ar*br - ai*bi, ar*bi + ai*br) per complex element.
  return _mm_addsub_ps(ymm0,ymm1);
}
|
||||
@ -201,7 +201,7 @@ namespace Optimization {
|
||||
//Complex single
|
||||
// Complex single: negate-and-swap kernel on packed (even, odd) float pairs.
// Writing each pair as (x0, x1) in low-to-high lane order, the result is
// (x1, -x0), assuming _MM_SELECT_FOUR_FOUR encodes selectors like _MM_SHUFFLE
// -- TODO confirm. The second argument `ret` is not used.
inline __m128 operator()(__m128 in, __m128 ret){
  // addsub(0,in): even lanes become 0 - x0, odd lanes become 0 + x1.
  __m128 tmp =_mm_addsub_ps(_mm_setzero_ps(),in);
  // Swap the two floats of every pair.
  return _mm_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1));
}
|
||||
//Complex double
|
||||
inline __m128d operator()(__m128d in, __m128d ret){
|
||||
@ -215,7 +215,7 @@ namespace Optimization {
|
||||
struct TimesI{
|
||||
//Complex single
|
||||
// Complex single member of TimesI. Writing each packed float pair as
// (x0, x1) in low-to-high lane order, the result is (-x1, x0), assuming
// _MM_SELECT_FOUR_FOUR encodes selectors like _MM_SHUFFLE -- TODO confirm.
// The second argument `ret` is not used.
inline __m128 operator()(__m128 in, __m128 ret){
  // Swap the two floats of every pair: (x1, x0).
  __m128 tmp =_mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
  // addsub(0,tmp): even lanes become 0 - x1, odd lanes become 0 + x0.
  return _mm_addsub_ps(_mm_setzero_ps(),tmp);
}
|
||||
//Complex double
|
||||
@ -225,27 +225,45 @@ namespace Optimization {
|
||||
}
|
||||
};
|
||||
|
||||
struct Permute{
|
||||
|
||||
static inline __m128 Permute0(__m128 in){
|
||||
return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
|
||||
};
|
||||
static inline __m128 Permute1(__m128 in){
|
||||
return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
|
||||
};
|
||||
static inline __m128 Permute2(__m128 in){
|
||||
return in;
|
||||
};
|
||||
static inline __m128 Permute3(__m128 in){
|
||||
return in;
|
||||
};
|
||||
|
||||
static inline __m128d Permute0(__m128d in){
|
||||
return _mm_shuffle_pd(in,in,0x1);
|
||||
};
|
||||
static inline __m128d Permute1(__m128d in){
|
||||
return in;
|
||||
};
|
||||
static inline __m128d Permute2(__m128d in){
|
||||
return in;
|
||||
};
|
||||
static inline __m128d Permute3(__m128d in){
|
||||
return in;
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////
|
||||
// Some Template specialization
|
||||
template < typename vtype >
|
||||
void permute(vtype &a, vtype b, int perm) {
|
||||
uconv<vtype> conv;
|
||||
conv.v = b;
|
||||
switch(perm){
|
||||
case 3: break; //empty for SSE4
|
||||
case 2: break; //empty for SSE4
|
||||
case 1: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
|
||||
case 0: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
|
||||
default: assert(0); break;
|
||||
}
|
||||
a=conv.v;
|
||||
};
|
||||
|
||||
|
||||
//Complex float Reduce
|
||||
template<>
|
||||
inline Grid::ComplexF Reduce<Grid::ComplexF, __m128>::operator()(__m128 in){
|
||||
__m128 v1; // two complex
|
||||
Optimization::permute(v1,in,0);
|
||||
v1= Optimization::Permute::Permute0(in);
|
||||
v1= _mm_add_ps(v1,in);
|
||||
u128f conv; conv.v=v1;
|
||||
return Grid::ComplexF(conv.f[0],conv.f[1]);
|
||||
@ -254,9 +272,9 @@ namespace Optimization {
|
||||
template<>
|
||||
inline Grid::RealF Reduce<Grid::RealF, __m128>::operator()(__m128 in){
|
||||
__m128 v1,v2; // quad single
|
||||
Optimization::permute(v1,in,0);
|
||||
v1= Optimization::Permute::Permute0(in);
|
||||
v1= _mm_add_ps(v1,in);
|
||||
Optimization::permute(v2,v1,1);
|
||||
v2= Optimization::Permute::Permute1(v1);
|
||||
v1 = _mm_add_ps(v1,v2);
|
||||
u128f conv; conv.v=v1;
|
||||
return conv.f[0];
|
||||
@ -274,7 +292,7 @@ namespace Optimization {
|
||||
template<>
|
||||
inline Grid::RealD Reduce<Grid::RealD, __m128d>::operator()(__m128d in){
|
||||
__m128d v1;
|
||||
Optimization::permute(v1,in,0); // avx 256; quad double
|
||||
v1 = Optimization::Permute::Permute0(in);
|
||||
v1 = _mm_add_pd(v1,in);
|
||||
u128d conv; conv.v = v1;
|
||||
return conv.f[0];
|
||||
@ -302,14 +320,6 @@ namespace Grid {
|
||||
// Issues a prefetch for the address `ptr` using the _MM_HINT_T0 locality hint.
inline void prefetch_HINT_T0(const char *ptr)
{
  _mm_prefetch(ptr, _MM_HINT_T0);
}
|
||||
|
||||
|
||||
// Gpermute function
|
||||
template < typename VectorSIMD >
|
||||
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
|
||||
Optimization::permute(y.v,b.v,perm);
|
||||
}
|
||||
|
||||
|
||||
// Function name aliases
|
||||
typedef Optimization::Vsplat VsplatSIMD;
|
||||
|
@ -251,15 +251,30 @@ namespace Grid {
|
||||
// all subtypes; may not be a good assumption, but could
|
||||
// add the vector width as a template param for BG/Q for example
|
||||
////////////////////////////////////////////////////////////////////
|
||||
// Level-0 permutation: stores Optimization::Permute::Permute0 of b's vector in y.
friend inline void permute0(Grid_simd &y,Grid_simd b){
  const auto shuffled = Optimization::Permute::Permute0(b.v);
  y.v = shuffled;
}
|
||||
// Level-1 permutation: stores Optimization::Permute::Permute1 of b's vector in y.
friend inline void permute1(Grid_simd &y,Grid_simd b){
  const auto shuffled = Optimization::Permute::Permute1(b.v);
  y.v = shuffled;
}
|
||||
// Level-2 permutation: stores Optimization::Permute::Permute2 of b's vector in y.
friend inline void permute2(Grid_simd &y,Grid_simd b){
  const auto shuffled = Optimization::Permute::Permute2(b.v);
  y.v = shuffled;
}
|
||||
// Level-3 permutation: stores Optimization::Permute::Permute3 of b's vector in y.
friend inline void permute3(Grid_simd &y,Grid_simd b){
  const auto shuffled = Optimization::Permute::Permute3(b.v);
  y.v = shuffled;
}
|
||||
// Run-time dispatch to the fixed-level helpers permute0..permute3.
// `perm` selects the level; y receives the permuted copy of b.
// A `perm` outside 0..3 leaves y untouched (no assertion is raised here).
friend inline void permute(Grid_simd &y,Grid_simd b,int perm)
{
  if      (perm==3) permute3(y,b);
  else if (perm==2) permute2(y,b);
  else if (perm==1) permute1(y,b);
  else if (perm==0) permute0(y,b);
}
|
||||
|
||||
|
||||
|
||||
};// end of Grid_simd class definition
|
||||
|
||||
|
||||
///////////////////////
|
||||
// Splat
|
||||
///////////////////////
|
||||
|
Reference in New Issue
Block a user