1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-08-01 20:27:07 +01:00

Vectorise the XYZT face gathering better.

Hard coded for simd_layout <= 2 in any given spread out direction; full generality is inconsistent
with efficiency.
This commit is contained in:
paboyle
2017-02-15 11:11:04 +00:00
parent aca7a3ef0a
commit bd600702cf
10 changed files with 510 additions and 34 deletions

View File

@@ -343,6 +343,46 @@ namespace Optimization {
};
// On extracting face: Ah Al , Bh Bl -> Ah Bh, Al Bl
// On merging buffers: Ah Bh , Al Bl -> Ah Al , Bh Bl
// The operation is its own inverse
// Exchanges alternating granules between two SIMD registers, at four
// successively finer granularities (Exchange0 is the coarsest level).
//
// At every level, out1 receives the even-numbered granules of both inputs
// and out2 the odd-numbered ones, interleaved as a,b,a,b,...  With that
// layout each level is its own inverse: calling it again on (out1,out2)
// returns (in1,in2), so the same routine both extracts and merges.
//
// Selector arguments are written in 3210 order (highest destination slot
// first). NOTE(review): this assumes _MM_SELECT_FOUR_FOUR packs its
// arguments the same way as _MM_SHUFFLE -- confirm against its definition.
// In the layout comments below, slots are listed lowest index first.
struct Exchange{
  // Single precision, 256-bit granules (slots are 128-bit lanes):
  //   out1 = {a0,a1,b0,b1}   out2 = {a2,a3,b2,b3}
  static inline void Exchange0(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
    out1= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
    out2= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
  }

  // Single precision, 128-bit granules (slots are 128-bit lanes):
  //   out1 = {a0,b0,a2,b2}   out2 = {a1,b1,a3,b3}
  static inline void Exchange1(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
    // The two-source shuffle always takes its two low slots from in1 and its
    // two high slots from in2, so this step yields {a0,a2,b0,b2} and
    // {a1,a3,b1,b3}.
    out1= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
    out2= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
    // Swap the two middle slots to interleave a and b. Without this step the
    // exchange is not self-inverse and a merge does not undo an extract.
    out1= _mm512_shuffle_f32x4(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0));
    out2= _mm512_shuffle_f32x4(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0));
  }

  // Single precision, 64-bit granules (slots are floats within each 128-bit
  // lane; the same pattern is applied to every lane):
  //   out1 = {a0,a1,b0,b1}   out2 = {a2,a3,b2,b3}
  static inline void Exchange2(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
    out1= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
    out2= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
  }

  // Single precision, 32-bit granules (slots are floats within each 128-bit
  // lane; the same pattern is applied to every lane):
  //   out1 = {a0,b0,a2,b2}   out2 = {a1,b1,a3,b3}
  static inline void Exchange3(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
    // First step gives {a0,a2,b0,b2} and {a1,a3,b1,b3} per lane ...
    out1= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
    out2= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
    // ... then swap the two middle slots so the result is interleaved and
    // the operation is self-inverse.
    out1= _mm512_shuffle_ps(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0));
    out2= _mm512_shuffle_ps(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0));
  }

  // Double precision, 256-bit granules (slots are 128-bit lanes):
  //   out1 = {a0,a1,b0,b1}   out2 = {a2,a3,b2,b3}
  static inline void Exchange0(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
    out1= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
    out2= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
  }

  // Double precision, 128-bit granules (slots are 128-bit lanes):
  //   out1 = {a0,b0,a2,b2}   out2 = {a1,b1,a3,b3}
  static inline void Exchange1(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
    // Same two-step scheme as the single-precision Exchange1: gather, then
    // swap the middle slots to interleave and keep the operation self-inverse.
    out1= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
    out2= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
    out1= _mm512_shuffle_f64x2(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0));
    out2= _mm512_shuffle_f64x2(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0));
  }

  // Double precision, 64-bit granules (slots are doubles within each 128-bit
  // lane; the same pattern is applied to every lane):
  //   out1 = {a0,b0}   out2 = {a1,b1}
  static inline void Exchange2(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
    out1 = _mm512_shuffle_pd(in1,in2,0x00);
    out2 = _mm512_shuffle_pd(in1,in2,0xFF);
  }

  // Double precision has no fourth level: a 512-bit register holds only
  // eight doubles, so three halvings already reach single elements.
  // Calling this is a programming error; the outputs are left untouched.
  static inline void Exchange3(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
    assert(0);
    return;
  }
};
struct Rotate{