From a65ce237c1b8feb62f96b42b2dc57ed0d5168348 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Thu, 21 May 2020 09:48:06 +0200 Subject: [PATCH] clean up; Exch1 VLA sp+dp integrate, tested, working --- Grid/simd/Grid_a64fx-2.h | 174 ++++++++++++++------------------------- 1 file changed, 64 insertions(+), 110 deletions(-) diff --git a/Grid/simd/Grid_a64fx-2.h b/Grid/simd/Grid_a64fx-2.h index c03184df..0333299f 100644 --- a/Grid/simd/Grid_a64fx-2.h +++ b/Grid/simd/Grid_a64fx-2.h @@ -30,21 +30,8 @@ // Using SVE ACLE ///////////////////////////////////////////////////// -#ifndef GEN_SIMD_WIDTH -#define GEN_SIMD_WIDTH 64u -#endif - static_assert(GEN_SIMD_WIDTH % 64u == 0, "A64FX SIMD vector size is 64 bytes"); -#ifdef __ARM_FEATURE_SVE - #ifdef __clang__ - //#pragma message("Using clang compiler") - #include - #endif -#else - #pragma error "Missing SVE feature" -#endif /* __ARM_FEATURE_SVE */ - NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Optimization); @@ -104,16 +91,28 @@ struct acle{ static inline svbool_t pg2(){return svptrue_pat_b64(SV_VL4);} static inline svbool_t pg4(){return svptrue_pat_b64(SV_VL2);} static inline vec tbl_swap(){ - const vec t = {1, 0, 3, 2, 5, 4, 7, 6}; - return t; + const vec t = {1, 0, 3, 2, 5, 4, 7, 6}; + return t; } static inline vec tbl0(){ - const vec t = {4, 5, 6, 7, 0, 1, 2, 3}; - return t; + const vec t = {4, 5, 6, 7, 0, 1, 2, 3}; + return t; } static inline vec tbl1(){ - const vec t = {2, 3, 0, 1, 6, 7, 4, 5}; - return t; + const vec t = {2, 3, 0, 1, 6, 7, 4, 5}; + return t; + } + static inline vec tbl_exch1a(){ // Exchange1 + const vec t = {0, 1, 4, 5, 2, 3, 6, 7}; + return t; + } + static inline vec tbl_exch1b(){ // Exchange1 + const vec t = {2, 3, 6, 7, 0, 1, 4, 5}; + return t; + } + static inline vec tbl_exch1c(){ // Exchange1 + const vec t = {4, 5, 0, 1, 6, 7, 2, 3}; + return t; } static inline svbool_t pg_even(){return svzip1_b64(svptrue_b64(), svpfalse_b());} static inline svbool_t pg_odd() {return svzip1_b64(svpfalse_b(), svptrue_b64());} @@ -132,20 +131,32 @@ struct acle{ static inline svbool_t pg2(){return svptrue_pat_b32(SV_VL8);} // exchange neighboring elements static inline vec tbl_swap(){ - const vec t = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; - return t; + const vec t = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + return t; } static inline vec tbl0(){ - const vec t = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}; - return t; + const vec t = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}; + return t; } static inline vec tbl1(){ - const vec t = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}; - return t; + const vec t = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}; + return t; } static inline vec tbl2(){ - const vec t = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; - return t; + const vec t = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; + return t; + } + static inline vec tbl_exch1a(){ // Exchange1 + const vec t = {0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 }; + return t; + } + static inline vec tbl_exch1b(){ // Exchange1 + const vec t = {4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11 }; + return t; + } + static inline vec tbl_exch1c(){ // Exchange1 + const vec t = {8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7}; + return t; } static inline svbool_t pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());} static inline svbool_t pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());} @@ -186,7 +197,6 @@ struct acle{ struct Vsplat{ // Complex float inline vecf operator()(float a, float b){ - vecf out; svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svdup_f32(a); @@ -198,7 +208,6 @@ struct Vsplat{ // Real float inline vecf operator()(float a){ - vecf out; svbool_t pg1 = acle::pg1(); typename acle::vt r_v = svdup_f32(a); @@ -208,7 +217,6 @@ struct Vsplat{ // Complex double inline vecd operator()(double a, double b){ - vecd out; svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svdup_f64(a); @@ -220,7 +228,6 @@ struct Vsplat{ // Real double inline vecd operator()(double a){ - vecd out; svbool_t pg1 = acle::pg1(); typename acle::vt r_v = svdup_f64(a); @@ -230,7 +237,6 @@ struct Vsplat{ // Integer inline vec operator()(Integer a){ - vec out; svbool_t pg1 = acle::pg1(); // Add check whether Integer is really a uint32_t??? @@ -244,7 +250,6 @@ struct Vstore{ // Real template inline void operator()(vec a, T *D){ - svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svld1(pg1, (typename acle::pt*)&a.v); svst1(pg1, D, a_v); @@ -255,7 +260,6 @@ struct Vstream{ // Real template inline void operator()(T * a, vec b){ - svbool_t pg1 = acle::pg1(); typename acle::vt b_v = svld1(pg1, b.v); svstnt1(pg1, a, b_v); @@ -267,7 +271,6 @@ struct Vstream{ // Complex template inline vec operator()(std::complex *a){ - vec out; svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svld1(pg1, (T*)a); @@ -279,7 +282,6 @@ struct Vstream{ // Real template inline vec operator()(T *a){ - vec out; svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svld1(pg1, a); @@ -296,7 +298,6 @@ struct Vstream{ struct Sum{ template inline vec operator()(vec a, vec b){ - vec out; svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svld1(pg1, a.v); @@ -311,7 +312,6 @@ struct Sum{ struct Sub{ template inline vec operator()(vec a, vec b){ - vec out; svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svld1(pg1, a.v); @@ -326,7 +326,6 @@ struct Sub{ struct Mult{ template inline vec operator()(vec a, vec b){ - vec out; svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svld1(pg1, a.v); @@ -341,7 +340,6 @@ struct Mult{ struct MultRealPart{ template inline vec operator()(vec a, vec b){ - vec out; svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svld1(pg1, a.v); @@ -360,7 +358,6 @@ struct MultRealPart{ struct MaddRealPart{ template inline vec operator()(vec a, vec b, vec c){ - vec out; svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svld1(pg1, a.v); @@ -380,7 +377,6 @@ struct MultComplex{ // Complex a*b template inline vec operator()(vec a, vec b){ - vec out; svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svld1(pg1, a.v); @@ -400,8 +396,7 @@ struct MultComplex{ struct MultAddComplex{ // Complex a*b+c template - inline mac(const vec &a, const vec b, const vec c){ - + inline void mac(const vec &a, const vec b, const vec c){ vec out; svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svld1(pg1, a.v); @@ -420,7 +415,6 @@ struct Div{ // Real template inline vec operator()(vec a, vec b){ - vec out; svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svld1(pg1, a.v); @@ -436,7 +430,6 @@ struct Conj{ // Complex template inline vec operator()(vec a){ - vec out; svbool_t pg1 = acle::pg1(); svbool_t pg_odd = acle::pg_odd(); @@ -453,7 +446,6 @@ struct TimesMinusI{ // Complex template inline vec operator()(vec a, vec b){ - vec out; const vec::uint> tbl_swap = acle::tbl_swap(); svbool_t pg1 = acle::pg1(); @@ -473,7 +465,6 @@ struct TimesI{ // Complex template inline vec operator()(vec a, vec b){ - vec out; const vec::uint> tbl_swap = acle::tbl_swap(); svbool_t pg1 = acle::pg1(); @@ -492,7 +483,6 @@ struct TimesI{ struct PrecisionChange { static inline vech StoH (const vecf &sa,const vecf &sb) { - vech ret; svbool_t pg1s = acle::pg1(); svbool_t pg1h = acle::pg1(); @@ -502,10 +492,10 @@ struct PrecisionChange { typename acle::vt hb_v = svcvt_f16_x(pg1s, sb_v); typename acle::vt r_v = svuzp1(ha_v, hb_v); svst1(pg1h, (typename acle::pt*)&ret.v, r_v); + return ret; } static inline void HtoS(vech h,vecf &sa,vecf &sb) { - svbool_t pg1h = acle::pg1(); svbool_t pg1s = acle::pg1(); typename acle::vt h_v = svld1(pg1h, (typename acle::pt*)&h.v); @@ -517,7 +507,6 @@ struct PrecisionChange { svst1(pg1s, sb.v, sb_v); } static inline vecf DtoS (vecd a,vecd b) { - vecf ret; svbool_t pg1d = acle::pg1(); svbool_t pg1s = acle::pg1(); @@ -527,10 +516,10 @@ struct PrecisionChange { typename acle::vt sb_v = svcvt_f32_x(pg1d, b_v); typename acle::vt r_v = svuzp1(sa_v, sb_v); svst1(pg1s, ret.v, r_v); + return ret; } static inline void StoD (vecf s,vecd &a,vecd &b) { - svbool_t pg1s = acle::pg1(); svbool_t pg1d = acle::pg1(); typename acle::vt s_v = svld1(pg1s, s.v); @@ -542,7 +531,6 @@ struct PrecisionChange { svst1(pg1d, b.v, b_v); } static inline vech DtoH (vecd a,vecd b,vecd c,vecd d) { - vech ret; svbool_t pg1d = acle::pg1(); svbool_t pg1h = acle::pg1(); @@ -568,7 +556,6 @@ struct PrecisionChange { */ } static inline void HtoD(vech h,vecd &a,vecd &b,vecd &c,vecd &d) { - svbool_t pg1h = acle::pg1(); svbool_t pg1d = acle::pg1(); typename acle::vt h_v = svld1(pg1h, (typename acle::pt*)&h.v); @@ -600,7 +587,6 @@ struct Exchange{ // Exchange0 is valid for arbitrary SVE vector length template static inline void Exchange0(vec &out1, vec &out2, const vec &in1, const vec &in2){ - svbool_t pg1 = acle::pg1(); typename acle::vt a1_v = svld1(pg1, in1.v); typename acle::vt a2_v = svld1(pg1, in2.v); @@ -612,55 +598,35 @@ struct Exchange{ svst1(pg1, out2.v, r2_v); } - - -/* FIXME use svcreate etc. or switch to table lookup directly template static inline void Exchange1(vec &out1, vec &out2, const vec &in1, const vec &in2){ + // this one is tricky; svtrn2q* from SVE2 fits best, but it is not available in SVE1 + // alternative: use 4-el structure; expect translation into ldp + stp -> SFI + svbool_t pg1 = acle::pg1(); + const vec::uint> tbl_exch1a = acle::tbl_exch1a(); + const vec::uint> tbl_exch1b = acle::tbl_exch1b(); + const vec::uint> tbl_exch1c = acle::tbl_exch1c(); - svbool_t pg4 = acle::pg4(); - typename acle::vt4 in1_v4 = svld4(pg4, (typename acle::pt*)in1.v); - typename acle::vt4 in2_v4 = svld4(pg4, (typename acle::pt*)in2.v); - typename acle::vt4 out1_v4; - typename acle::vt4 out2_v4; - out1_v4.v0 = in1_v4.v0; - out1_v4.v1 = in1_v4.v1; - out1_v4.v2 = in2_v4.v0; - out1_v4.v3 = in2_v4.v1; - out2_v4.v0 = in1_v4.v2; - out2_v4.v1 = in1_v4.v3; - out2_v4.v2 = in2_v4.v2; - out2_v4.v3 = in2_v4.v3; - svst4(pg4, (typename acle::pt*)out1.v, out1_v4); - svst4(pg4, (typename acle::pt*)out2.v, out2_v4); + typename acle::svuint tbl_exch1a_v = svld1(pg1, tbl_exch1a.v); + typename acle::svuint tbl_exch1b_v = svld1(pg1, tbl_exch1b.v); + typename acle::svuint tbl_exch1c_v = svld1(pg1, tbl_exch1c.v); + + typename acle::vt in1_v = svld1(pg1, in1.v); + typename acle::vt in2_v = svld1(pg1, in2.v); + + typename acle::vt a1_v = svtbl(in1_v, tbl_exch1a_v); + typename acle::vt a2_v = svtbl(in2_v, tbl_exch1b_v); + typename acle::vt b1_v = svext(a2_v, a1_v, (uint64_t)(W::r / 2u)); + typename acle::vt b2_v = svext(a1_v, a2_v, (uint64_t)(W::r / 2u)); + typename acle::vt out1_v = svtbl(b1_v, tbl_exch1c_v); + typename acle::vt out2_v = svtbl(b2_v, tbl_exch1a_v); + + svst1(pg1, out1.v, out1_v); + svst1(pg1, out2.v, out2_v); } -*/ - - #define VECTOR_FOR(i, w, inc) \ - for (unsigned int i = 0; i < w; i += inc) - - template - static inline void Exchange1(vec &out1, vec &out2, const vec &in1, const vec &in2){ - // FIXME - const int n = 1; - const int w = W::r; - unsigned int mask = w >> (n + 1); - // std::cout << " Exchange "< static inline void Exchange2(vec &out1, vec &out2, const vec &in1, const vec &in2){ - svbool_t pg1 = acle::pg1(); typename acle::vt a1_v = svld1(pg1, (typename acle::pt*)in1.v); typename acle::vt a2_v = svld1(pg1, (typename acle::pt*)in2.v); @@ -671,7 +637,6 @@ struct Exchange{ } static inline void Exchange3(vecf &out1, vecf &out2, const vecf &in1, const vecf &in2){ - svbool_t pg1 = acle::pg1(); typename acle::vt a1_v = svld1(pg1, in1.v); typename acle::vt a2_v = svld1(pg1, in2.v); @@ -692,17 +657,16 @@ struct Permute{ // Permute0 is valid for any SVE vector width template static inline vec Permute0(vec in) { - vec out; svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svld1(pg1, in.v); typename acle::vt r_v = svext(a_v, a_v, (uint64_t)(W::r / 2u)); svst1(pg1, out.v, r_v); + return out; } static inline vecd Permute1(vecd in) { - vecd out; const vec::uint> tbl_swap = acle::tbl1(); svbool_t pg1 = acle::pg1(); @@ -715,7 +679,6 @@ struct Permute{ } static inline vecf Permute1(vecf in) { - vecf out; const vec::uint> tbl_swap = acle::tbl1(); svbool_t pg1 = acle::pg1(); @@ -728,7 +691,6 @@ struct Permute{ } static inline vecd Permute2(vecd in) { - vecd out; const vec::uint> tbl_swap = acle::tbl_swap(); svbool_t pg1 = acle::pg1(); @@ -741,7 +703,6 @@ struct Permute{ } static inline vecf Permute2(vecf in) { - vecf out; const vec::uint> tbl_swap = acle::tbl2(); svbool_t pg1 = acle::pg1(); @@ -754,7 +715,6 @@ struct Permute{ } static inline vecf Permute3(vecf in) { - vecf out; const vec::uint> tbl_swap = acle::tbl_swap(); svbool_t pg1 = acle::pg1(); @@ -775,7 +735,6 @@ struct Permute{ struct Rotate{ template static inline vec tRotate(vec in){ - vec out; svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svld1(pg1, in.v); @@ -833,7 +792,6 @@ struct Reduce{ //Complex float Reduce template <> inline Grid::ComplexF Reduce::operator()(vecf in){ - svbool_t pg1 = acle::pg1(); svbool_t pg_even = acle::pg_even(); svbool_t pg_odd = acle::pg_odd(); @@ -848,7 +806,6 @@ inline Grid::ComplexF Reduce::operator()(vecf in){ //Real float Reduce template <> inline Grid::RealF Reduce::operator()(vecf in){ - svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svld1(pg1, in.v); float a = svred(pg1, a_v); @@ -859,7 +816,6 @@ inline Grid::RealF Reduce::operator()(vecf in){ //Complex double Reduce template <> inline Grid::ComplexD Reduce::operator()(vecd in){ - svbool_t pg1 = acle::pg1(); svbool_t pg_even = acle::pg_even(); svbool_t pg_odd = acle::pg_odd(); @@ -873,7 +829,6 @@ inline Grid::ComplexD Reduce::operator()(vecd in){ //Real double Reduce template <> inline Grid::RealD Reduce::operator()(vecd in){ - svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svld1(pg1, in.v); double a = svred(pg1, a_v); @@ -884,7 +839,6 @@ inline Grid::RealD Reduce::operator()(vecd in){ //Integer Reduce template <> inline Integer Reduce::operator()(veci in){ - svbool_t pg1 = acle::pg1(); typename acle::vt a_v = svld1(pg1, in.v); Integer a = svred(pg1, a_v);