1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-09 21:50:45 +01:00

clean up; Exch1 sp integrate, tested, working

This commit is contained in:
nmeyer-ur 2020-05-21 08:45:43 +02:00
parent f8c0a59221
commit cd27f1005d

View File

@ -60,36 +60,20 @@ typedef __SVUint64_t lutd __attribute__((arm_sve_vector_bits(512))); // LUTs fo
#pragma error("Oops. Illegal SVE vector size!?") #pragma error("Oops. Illegal SVE vector size!?")
#endif /* __ARM_FEATURE_SVE_BITS */ #endif /* __ARM_FEATURE_SVE_BITS */
// safety definition, not sure if it's necessary
//#define GEN_SIMD_WIDTH 64u
// low-level API // low-level API
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
NAMESPACE_BEGIN(Optimization); NAMESPACE_BEGIN(Optimization);
// convenience union types for tables eliminate loads // convenience union types for tables eliminating loads
union ulutf { union ulutf {
lutf v; lutf v;
uint32_t s[16]; uint32_t s[16];
}; };
union ulutd { union ulutd {
lutd v; lutd v;
uint64_t s[8]; uint64_t s[8];
}; };
// FIXME convenience union types for Exchange1
union uvecf {
vecf v;
float32_t s[16];
};
union uvecd {
vecd v;
float64_t s[8];
};
template <typename T> template <typename T>
struct acle{}; struct acle{};
@ -144,6 +128,18 @@ struct acle<float>{
const ulutf t = { .s = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13} }; const ulutf t = { .s = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13} };
return t.v; return t.v;
} }
// Exchange1 index table "a" (single precision).
// Viewing the 16 lanes as four groups of four, the table lists the groups in
// the order 0, 2, 1, 3: the outer groups stay in place and the two middle
// groups trade places.
static inline lutf tbl_exch1a(){
  // The indices are written through the uint32_t member of the union and the
  // vector member is returned.
  const ulutf perm = { .s = { 0,  1,  2,  3,
                              8,  9, 10, 11,
                              4,  5,  6,  7,
                             12, 13, 14, 15} };
  return perm.v;
}
// Exchange1 index table "b" (single precision).
// Viewing the 16 lanes as four groups of four, the table lists the groups in
// the order 1, 3, 0, 2.
static inline lutf tbl_exch1b(){
  // The indices are written through the uint32_t member of the union and the
  // vector member is returned.
  const ulutf perm = { .s = { 4,  5,  6,  7,
                             12, 13, 14, 15,
                              0,  1,  2,  3,
                              8,  9, 10, 11} };
  return perm.v;
}
// Exchange1 index table "c" (single precision).
// Viewing the 16 lanes as four groups of four, the table lists the groups in
// the order 2, 0, 3, 1.
static inline lutf tbl_exch1c(){
  // The indices are written through the uint32_t member of the union and the
  // vector member is returned.
  const ulutf perm = { .s = { 8,  9, 10, 11,
                              0,  1,  2,  3,
                             12, 13, 14, 15,
                              4,  5,  6,  7} };
  return perm.v;
}
static inline pred pg1(){return svptrue_b32();} static inline pred pg1(){return svptrue_b32();}
static inline pred pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());} static inline pred pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());}
static inline pred pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());} static inline pred pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());}
@ -191,7 +187,6 @@ struct Vsplat{
} }
// Integer // Integer
inline veci operator()(Integer a){ inline veci operator()(Integer a){
// Add check whether Integer is really a uint32_t???
return svdup_u32(a); return svdup_u32(a);
} }
}; };
@ -538,9 +533,6 @@ struct PrecisionChange {
} }
}; };
#define VECTOR_FOR(i, w, inc) \
for (unsigned int i = 0; i < w; i += inc)
struct Exchange{ struct Exchange{
// float // float
static inline void Exchange0(vecf &out1, vecf &out2, vecf in1, vecf in2){ static inline void Exchange0(vecf &out1, vecf &out2, vecf in1, vecf in2){
@ -550,25 +542,18 @@ struct Exchange{
out2 = svext(in1, r2_v, (uint64_t)8u); out2 = svext(in1, r2_v, (uint64_t)8u);
} }
static inline void Exchange1(vecf &out1, vecf &out2, vecf in1, vecf in2){ static inline void Exchange1(vecf &out1, vecf &out2, vecf in1, vecf in2){
// FIXME // this one is tricky; svtrn2q* from SVE2 fits best, but it is not available in SVE1
uvecf v1 = { .v = in1 }; // alternative: use 4-el structure; expect translation into 4x ldp + 4x stp -> SFI
uvecf v2 = { .v = in2 }; lutf tbl_exch1a = acle<float>::tbl_exch1a();
uvecf o1, o2; lutf tbl_exch1b = acle<float>::tbl_exch1b();
lutf tbl_exch1c = acle<float>::tbl_exch1c();
const int n = 1; vecf a1_v = svtbl(in1, tbl_exch1a);
const int w = 16; // w = W<T>::r vecf a2_v = svtbl(in2, tbl_exch1b);
unsigned int mask = w >> (n + 1); vecf b1_v = svext(a2_v, a1_v, (uint64_t)8u);
// std::cout << " Exchange "<<n<<" nsimd "<<w<<" mask 0x" <<std::hex<<mask<<std::dec<<std::endl; vecf b2_v = svext(a1_v, a2_v, (uint64_t)8u);
VECTOR_FOR(i, w, 1) { out1 = svtbl(b1_v, tbl_exch1c);
int j1 = i&(~mask); out2 = svtbl(b2_v, tbl_exch1a);
if ( (i&mask) == 0 ) { o1.s[i]=v1.s[j1];}
else { o1.s[i]=v2.s[j1];}
int j2 = i|mask;
if ( (i&mask) == 0 ) { o2.s[i]=v1.s[j2];}
else { o2.s[i]=v2.s[j2];}
}
out1 = o1.v;
out2 = o2.v;
} }
static inline void Exchange2(vecf &out1, vecf &out2, vecf in1, vecf in2){ static inline void Exchange2(vecf &out1, vecf &out2, vecf in1, vecf in2){
out1 = (vecf)svtrn1((vecd)in1, (vecd)in2); out1 = (vecf)svtrn1((vecd)in1, (vecd)in2);
@ -588,14 +573,15 @@ struct Exchange{
} }
static inline void Exchange1(vecd &out1, vecd &out2, vecd in1, vecd in2){ static inline void Exchange1(vecd &out1, vecd &out2, vecd in1, vecd in2){
// this one is tricky; svtrn2q* from SVE2 fits best, but it is not available in SVE1 // this one is tricky; svtrn2q* from SVE2 fits best, but it is not available in SVE1
// alternative: use 4-el structure; expect translation into 4x ldp + 4x stp -> SFI
lutd tbl_exch1a = acle<double>::tbl_exch1a(); lutd tbl_exch1a = acle<double>::tbl_exch1a();
lutd tbl_exch1b = acle<double>::tbl_exch1b(); lutd tbl_exch1b = acle<double>::tbl_exch1b();
lutd tbl_exch1c = acle<double>::tbl_exch1c(); lutd tbl_exch1c = acle<double>::tbl_exch1c();
vecd a1_v = svtbl(in1, tbl_exch1a); vecd a1_v = svtbl(in1, tbl_exch1a);
vecd a2_v = svtbl(in2, tbl_exch1b); vecd a2_v = svtbl(in2, tbl_exch1b);
vecd b1_v = svext(a2_v, a1_v, (uint64_t)4u); vecd b1_v = svext(a2_v, a1_v, (uint64_t)4u);
vecd b2_v = svext(a1_v, a2_v, (uint64_t)4u); vecd b2_v = svext(a1_v, a2_v, (uint64_t)4u);
out1 = svtbl(b1_v, tbl_exch1c); out1 = svtbl(b1_v, tbl_exch1c);
out2 = svtbl(b2_v, tbl_exch1a); out2 = svtbl(b2_v, tbl_exch1a);
} }