mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-10 19:36:56 +01:00
Vectorise the XYZT face gathering better.
Hard coded for simd_layout <= 2 in any given spread out direction; full generality is inconsistent with efficiency.
This commit is contained in:
@ -469,9 +469,47 @@ namespace Optimization {
|
||||
static inline __m256d Permute3(__m256d in){
|
||||
return in;
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
struct Exchange{
|
||||
// 3210 ordering
|
||||
static inline void Exchange0(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
|
||||
out1= _mm256_permute2f128_ps(in1,in2,0x20);
|
||||
out2= _mm256_permute2f128_ps(in1,in2,0x31);
|
||||
};
|
||||
static inline void Exchange1(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
|
||||
out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
|
||||
out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
|
||||
};
|
||||
static inline void Exchange2(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
|
||||
out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
|
||||
out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
|
||||
};
|
||||
static inline void Exchange3(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
|
||||
assert(0);
|
||||
return;
|
||||
};
|
||||
|
||||
static inline void Exchange0(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
|
||||
out1= _mm256_permute2f128_pd(in1,in2,0x20);
|
||||
out2= _mm256_permute2f128_pd(in1,in2,0x31);
|
||||
return;
|
||||
};
|
||||
static inline void Exchange1(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
|
||||
out1= _mm256_shuffle_pd(in1,in2,0x0);
|
||||
out2= _mm256_shuffle_pd(in1,in2,0xF);
|
||||
};
|
||||
static inline void Exchange2(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
|
||||
assert(0);
|
||||
return;
|
||||
};
|
||||
static inline void Exchange3(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
|
||||
assert(0);
|
||||
return;
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
#if defined (AVX2)
|
||||
// Byte-wise alignr wrappers expressed in units of 32-bit / 64-bit elements.
// n is reduced modulo one 128-bit lane (16 bytes).  Every macro argument is
// parenthesised so that expression arguments (e.g. n = i+1) scale correctly.
#define _mm256_alignr_epi32_grid(ret,a,b,n) (ret)=(__m256) _mm256_alignr_epi8((__m256i)(a),(__m256i)(b),(((n)*4)%16))
#define _mm256_alignr_epi64_grid(ret,a,b,n) (ret)=(__m256d) _mm256_alignr_epi8((__m256i)(a),(__m256i)(b),(((n)*8)%16))
|
||||
|
@ -343,6 +343,46 @@ namespace Optimization {
|
||||
|
||||
};
|
||||
|
||||
// On extracting face: Ah Al , Bh Bl -> Ah Bh, Al Bl
|
||||
// On merging buffers: Ah,Bh , Al Bl -> Ah Al, Bh, Bl
|
||||
// The operation is its own inverse
|
||||
struct Exchange{
|
||||
// 3210 ordering
|
||||
static inline void Exchange0(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
|
||||
out1= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
|
||||
out2= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
|
||||
};
|
||||
static inline void Exchange1(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
|
||||
out1= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
|
||||
out2= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
|
||||
};
|
||||
static inline void Exchange2(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
|
||||
out1= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
|
||||
out2= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
|
||||
};
|
||||
static inline void Exchange3(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
|
||||
out1= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
|
||||
out2= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
|
||||
};
|
||||
|
||||
static inline void Exchange0(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
|
||||
out1= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
|
||||
out2= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
|
||||
};
|
||||
static inline void Exchange1(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
|
||||
out1= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
|
||||
out2= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
|
||||
};
|
||||
static inline void Exchange2(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
|
||||
out1 = _mm512_shuffle_pd(in1,in2,0x00);
|
||||
out2 = _mm512_shuffle_pd(in1,in2,0xFF);
|
||||
};
|
||||
static inline void Exchange3(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
|
||||
assert(0);
|
||||
return;
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
struct Rotate{
|
||||
|
||||
|
@ -326,7 +326,43 @@ namespace Optimization {
|
||||
static inline __m128d Permute3(__m128d in){
|
||||
return in;
|
||||
};
|
||||
};
|
||||
|
||||
struct Exchange{
|
||||
// 3210 ordering
|
||||
static inline void Exchange0(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
|
||||
out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
|
||||
out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
|
||||
};
|
||||
static inline void Exchange1(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
|
||||
out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
|
||||
out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
|
||||
};
|
||||
static inline void Exchange2(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
|
||||
assert(0);
|
||||
return;
|
||||
};
|
||||
static inline void Exchange3(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
|
||||
assert(0);
|
||||
return;
|
||||
};
|
||||
|
||||
static inline void Exchange0(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
|
||||
out1= _mm_shuffle_pd(in1,in2,0x0);
|
||||
out2= _mm_shuffle_pd(in1,in2,0x3);
|
||||
};
|
||||
static inline void Exchange1(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
|
||||
assert(0);
|
||||
return;
|
||||
};
|
||||
static inline void Exchange2(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
|
||||
assert(0);
|
||||
return;
|
||||
};
|
||||
static inline void Exchange3(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
|
||||
assert(0);
|
||||
return;
|
||||
};
|
||||
};
|
||||
|
||||
struct Rotate{
|
||||
|
@ -350,6 +350,18 @@ class Grid_simd {
|
||||
return ret;
|
||||
}
|
||||
|
||||
///////////////////////
|
||||
// Exchange
|
||||
// Al Ah , Bl Bh -> Al Bl Ah,Bh
|
||||
///////////////////////
|
||||
friend inline void exchange(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2,int n)
|
||||
{
|
||||
if (n==3) Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v);
|
||||
else if(n==2) Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v);
|
||||
else if(n==1) Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v);
|
||||
else if(n==0) Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////
|
||||
// General permute; assumes vector length is same across
|
||||
// all subtypes; may not be a good assumption, but could
|
||||
@ -372,23 +384,11 @@ class Grid_simd {
|
||||
int dist = perm & 0xF;
|
||||
y = rotate(b, dist);
|
||||
return;
|
||||
}
|
||||
switch (perm) {
|
||||
case 3:
|
||||
permute3(y, b);
|
||||
break;
|
||||
case 2:
|
||||
permute2(y, b);
|
||||
break;
|
||||
case 1:
|
||||
permute1(y, b);
|
||||
break;
|
||||
case 0:
|
||||
permute0(y, b);
|
||||
break;
|
||||
default:
|
||||
assert(0);
|
||||
}
|
||||
}
|
||||
else if(perm==3) permute3(y, b);
|
||||
else if(perm==2) permute2(y, b);
|
||||
else if(perm==1) permute1(y, b);
|
||||
else if(perm==0) permute0(y, b);
|
||||
}
|
||||
|
||||
}; // end of Grid_simd class definition
|
||||
@ -444,6 +444,8 @@ inline void rbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
|
||||
ret.v = unary<V>(real(typepun[lane]), VsplatSIMD());
|
||||
}
|
||||
|
||||
|
||||
|
||||
///////////////////////
|
||||
// Splat
|
||||
///////////////////////
|
||||
|
Reference in New Issue
Block a user