diff --git a/Grid/simd/Grid_a64fx-2.h b/Grid/simd/Grid_a64fx-2.h new file mode 100644 index 00000000..1bb67179 --- /dev/null +++ b/Grid/simd/Grid_a64fx-2.h @@ -0,0 +1,977 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/simd/Grid_a64fx-1.h + + Copyright (C) 2020 + +Author: Nils Meyer + + Copyright (C) 2015 + Copyright (C) 2017 + +Author: Antonin Portelli + Andrew Lawson + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ + +///////////////////////////////////////////////////// +// Using SVE ACLE +///////////////////////////////////////////////////// + +static_assert(GEN_SIMD_WIDTH % 64u == 0, "A64FX SIMD vector size is 64 bytes"); + +#ifdef __ARM_FEATURE_SVE + #ifdef __clang__ + //#pragma message("Using clang compiler") + #include + #endif +#else + #pragma error "Missing SVE feature" +#endif /* __ARM_FEATURE_SVE */ + +namespace Grid { +namespace Optimization { + + // type traits giving the number of elements for each vector type + template struct W; + template <> struct W { + constexpr static unsigned int c = GEN_SIMD_WIDTH/16u; + constexpr static unsigned int r = GEN_SIMD_WIDTH/8u; + }; + template <> struct W { + constexpr static unsigned int c = GEN_SIMD_WIDTH/8u; + constexpr static unsigned int r = GEN_SIMD_WIDTH/4u; + }; + template <> struct W { + constexpr static unsigned int r = GEN_SIMD_WIDTH/4u; + }; + template <> struct W { + constexpr static unsigned int c = GEN_SIMD_WIDTH/4u; + constexpr static unsigned int r = GEN_SIMD_WIDTH/2u; + }; + template <> struct W { + constexpr static unsigned int c = GEN_SIMD_WIDTH/16u; + constexpr static unsigned int r = GEN_SIMD_WIDTH/8u; + }; + + // SIMD vector types + template + struct vec { + alignas(GEN_SIMD_WIDTH) T v[W::r]; + }; + + typedef vec vecf; + typedef vec vecd; + typedef vec vech; // half precision comms + typedef vec veci; + +}} // Grid::Optimization + + +// low-level API +namespace Grid { +namespace Optimization { + +template +struct acle{}; + +template <> +struct acle{ + typedef svfloat64_t vt; + typedef svfloat64x2_t vt2; + typedef svfloat64x4_t vt4; + typedef float64_t pt; + typedef uint64_t uint; + typedef svuint64_t svuint; + + static inline svbool_t pg1(){return svptrue_b64();} + static inline svbool_t pg2(){return svptrue_pat_b64(SV_VL4);} + static inline svbool_t pg4(){return svptrue_pat_b64(SV_VL2);} + static inline vec tbl_swap(){ + const vec t = {1, 0, 3, 2, 5, 4, 7, 6}; + return t; + } + static inline vec tbl0(){ + const vec t = {4, 5, 6, 7, 0, 1, 2, 3}; + return t; + } + static inline vec tbl1(){ + const vec t = {2, 3, 0, 1, 6, 7, 4, 5}; + return t; + } + static inline svbool_t pg_even(){return svzip1_b64(svptrue_b64(), svpfalse_b());} + static inline svbool_t pg_odd() {return svzip1_b64(svpfalse_b(), svptrue_b64());} + static inline svfloat64_t zero(){return svdup_f64(0.);} +}; + +template <> +struct acle{ + typedef svfloat32_t vt; + typedef svfloat32x2_t vt2; + typedef float32_t pt; + typedef uint32_t uint; + typedef svuint32_t svuint; + + static inline svbool_t pg1(){return svptrue_b32();} + static inline svbool_t pg2(){return svptrue_pat_b32(SV_VL8);} + // exchange neighboring elements + static inline vec tbl_swap(){ + const vec t = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + return t; + } + static inline vec tbl0(){ + const vec t = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}; + return t; + } + static inline vec tbl1(){ + const vec t = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}; + return t; + } + static inline vec tbl2(){ + const vec t = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; + return t; + } + static inline svbool_t pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());} + static inline svbool_t pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());} + static inline svfloat32_t zero(){return svdup_f32(0.);} +}; + +template <> +struct acle{ + typedef svfloat16_t vt; + typedef float16_t pt; + typedef uint16_t uint; + typedef svuint16_t svuint; + + static inline svbool_t pg1(){return svptrue_b16();} + static inline svbool_t pg2(){return svptrue_pat_b16(SV_VL16);} + static inline svbool_t pg_even(){return svzip1_b16(svptrue_b16(), svpfalse_b());} + static inline svbool_t pg_odd() {return svzip1_b16(svpfalse_b(), svptrue_b16());} + static inline svfloat16_t zero(){return svdup_f16(0.);} +}; + +template <> +struct acle{ + typedef svuint32_t vt; + typedef svuint32x2_t vt2; + typedef Integer pt; + typedef uint32_t uint; + typedef svuint32_t svuint; + + //static inline svbool_t pg1(){return svptrue_b16();} + static inline svbool_t pg1(){return svptrue_b32();} + static inline svbool_t pg2(){return svptrue_pat_b32(SV_VL8);} + static inline svbool_t pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());} + static inline svbool_t pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());} +}; + +// --------------------------------------------------- + +struct Vsplat{ + // Complex float + inline vecf operator()(float a, float b){ + + vecf out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svdup_f32(a); + typename acle::vt b_v = svdup_f32(b); + typename acle::vt r_v = svzip1(a_v, b_v); + svst1(pg1, out.v, r_v); + return out; + } + + // Real float + inline vecf operator()(float a){ + + vecf out; + svbool_t pg1 = acle::pg1(); + typename acle::vt r_v = svdup_f32(a); + svst1(pg1, out.v, r_v); + return out; + } + + // Complex double + inline vecd operator()(double a, double b){ + + vecd out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svdup_f64(a); + typename acle::vt b_v = svdup_f64(b); + typename acle::vt r_v = svzip1(a_v, b_v); + svst1(pg1, out.v, r_v); + return out; + } + + // Real double + inline vecd operator()(double a){ + + vecd out; + svbool_t pg1 = acle::pg1(); + typename acle::vt r_v = svdup_f64(a); + svst1(pg1, out.v, r_v); + return out; + } + + // Integer + inline vec operator()(Integer a){ + + vec out; + svbool_t pg1 = acle::pg1(); + // Add check whether Integer is really a uint32_t??? + typename acle::vt r_v = svdup_u32(a); + svst1(pg1, out.v, r_v); + return out; + } +}; + + struct Vstore{ + // Real + template + inline void operator()(vec a, T *D){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, (typename acle::pt*)&a.v); + // NOTE illegal '&' here causes SIGBUS at runtime, related to CAS-35230-H2H6T1 + // svst1(pg1, (typename acle::pt*)&D, a_v); + svst1(pg1, D, a_v); + + // non temporal version + //svstnt1(pg1, D, a_v); + } + }; + + struct Vstream{ + // Real + template + inline void operator()(T * a, vec b){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt b_v = svld1(pg1, b.v); + // FIXME non-temporal store causes compiler crash CAS-35230-H2H6T1 + svstnt1(pg1, a, b_v); + //svst1(pg1, a, b_v); + } + }; + + struct Vset{ + // Complex + template + inline vec operator()(std::complex *a){ + + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, (T*)a); + svst1(pg1, out.v, a_v); + + return out; + } + + // Real + template + inline vec operator()(T *a){ + + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a); + svst1(pg1, out.v, a_v); + + return out; + } + }; + + ///////////////////////////////////////////////////// + // Arithmetic operations + ///////////////////////////////////////////////////// + + + struct Sum{ + template + inline vec operator()(vec a, vec b){ + + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + typename acle::vt r_v = svadd_x(pg1, a_v, b_v); + svst1(pg1, out.v, r_v); + + return out; + } + }; + + struct Sub{ + template + inline vec operator()(vec a, vec b){ + + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + typename acle::vt r_v = svsub_x(pg1, a_v, b_v); + svst1(pg1, out.v, r_v); + + return out; + } + }; + + +struct Mult{ + template + inline vec operator()(vec a, vec b){ + + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + typename acle::vt r_v = svmul_x(pg1, a_v, b_v); + svst1(pg1, out.v, r_v); + + return out; + } +}; + +struct MultRealPart{ + template + inline vec operator()(vec a, vec b){ + + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + + // using FCMLA + typename acle::vt z_v = acle::zero(); + typename acle::vt r_v = svcmla_x(pg1, z_v, a_v, b_v, 0); + + svst1(pg1, out.v, r_v); + + return out; + } +}; + +struct MaddRealPart{ + template + inline vec operator()(vec a, vec b, vec c){ + + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + typename acle::vt c_v = svld1(pg1, c.v); + + // using FCMLA + typename acle::vt r_v = svcmla_x(pg1, c_v, a_v, b_v, 0); + + svst1(pg1, out.v, r_v); + + return out; + } +}; + +struct MultComplex{ + // Complex a*b + template + inline vec operator()(vec a, vec b){ + + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + typename acle::vt z_v = __svzero(z_v); + + // using FCMLA + typename acle::vt r_v = svcmla_x(pg1, z_v, a_v, b_v, 90); + r_v = svcmla_x(pg1, r_v, a_v, b_v, 0); + + svst1(pg1, out.v, r_v); + + return out; + } +}; + +struct Div{ + // Real + template + inline vec operator()(vec a, vec b){ + + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + typename acle::vt r_v = svdiv_x(pg1, a_v, b_v); + svst1(pg1, out.v, r_v); + + return out; + } +}; + +struct Conj{ + // Complex + template + inline vec operator()(vec a){ + + vec out; + svbool_t pg1 = acle::pg1(); + svbool_t pg_odd = acle::pg_odd(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt r_v = svneg_x(pg_odd, a_v); + svst1(pg1, out.v, r_v); + + return out; + } +}; + + + struct TimesMinusI{ + // Complex + template + inline vec operator()(vec a, vec b){ + + vec out; + const vec::uint> tbl_swap = acle::tbl_swap(); + svbool_t pg1 = acle::pg1(); + svbool_t pg_odd = acle::pg_odd(); + + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt a_v = svld1(pg1, a.v); + a_v = svtbl(a_v, tbl_swap_v); + typename acle::vt r_v = svneg_x(pg_odd, a_v); + svst1(pg1, out.v, r_v); + + return out; + } + }; + + struct TimesI{ + // Complex + template + inline vec operator()(vec a, vec b){ + + vec out; + const vec::uint> tbl_swap = acle::tbl_swap(); + svbool_t pg1 = acle::pg1(); + svbool_t pg_even = acle::pg_even(); + + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt a_v = svld1(pg1, a.v); + a_v = svtbl(a_v, tbl_swap_v); + typename acle::vt r_v = svneg_x(pg_even, a_v); + svst1(pg1, out.v, r_v); + + return out; + } + }; + + +struct PrecisionChange { + static inline vech StoH (const vecf &sa,const vecf &sb) { + + vech ret; + svbool_t pg1s = acle::pg1(); + svbool_t pg1h = acle::pg1(); + typename acle::vt sa_v = svld1(pg1s, sa.v); + typename acle::vt sb_v = svld1(pg1s, sb.v); + typename acle::vt ha_v = svcvt_f16_x(pg1s, sa_v); + typename acle::vt hb_v = svcvt_f16_x(pg1s, sb_v); + typename acle::vt r_v = svuzp1(ha_v, hb_v); + svst1(pg1h, (typename acle::pt*)&ret.v, r_v); + return ret; + } + static inline void HtoS(vech h,vecf &sa,vecf &sb) { + + svbool_t pg1h = acle::pg1(); + svbool_t pg1s = acle::pg1(); + typename acle::vt h_v = svld1(pg1h, (typename acle::pt*)&h.v); + typename acle::vt ha_v = svzip1(h_v, h_v); + typename acle::vt hb_v = svzip2(h_v, h_v); + typename acle::vt sa_v = svcvt_f32_x(pg1s, ha_v); + typename acle::vt sb_v = svcvt_f32_x(pg1s, hb_v); + svst1(pg1s, sa.v, sa_v); + svst1(pg1s, sb.v, sb_v); + } + static inline vecf DtoS (vecd a,vecd b) { + + vecf ret; + svbool_t pg1d = acle::pg1(); + svbool_t pg1s = acle::pg1(); + typename acle::vt a_v = svld1(pg1d, a.v); + typename acle::vt b_v = svld1(pg1d, b.v); + typename acle::vt sa_v = svcvt_f32_x(pg1d, a_v); + typename acle::vt sb_v = svcvt_f32_x(pg1d, b_v); + typename acle::vt r_v = svuzp1(sa_v, sb_v); + svst1(pg1s, ret.v, r_v); + return ret; + } + static inline void StoD (vecf s,vecd &a,vecd &b) { + + svbool_t pg1s = acle::pg1(); + svbool_t pg1d = acle::pg1(); + typename acle::vt s_v = svld1(pg1s, s.v); + typename acle::vt sa_v = svzip1(s_v, s_v); + typename acle::vt sb_v = svzip2(s_v, s_v); + typename acle::vt a_v = svcvt_f64_x(pg1d, sa_v); + typename acle::vt b_v = svcvt_f64_x(pg1d, sb_v); + svst1(pg1d, a.v, a_v); + svst1(pg1d, b.v, b_v); + } + static inline vech DtoH (vecd a,vecd b,vecd c,vecd d) { + + vech ret; + svbool_t pg1d = acle::pg1(); + svbool_t pg1h = acle::pg1(); + typename acle::vt a_v = svld1(pg1d, a.v); + typename acle::vt b_v = svld1(pg1d, b.v); + typename acle::vt c_v = svld1(pg1d, c.v); + typename acle::vt d_v = svld1(pg1d, d.v); + typename acle::vt ha_v = svcvt_f16_x(pg1d, a_v); + typename acle::vt hb_v = svcvt_f16_x(pg1d, b_v); + typename acle::vt hc_v = svcvt_f16_x(pg1d, c_v); + typename acle::vt hd_v = svcvt_f16_x(pg1d, d_v); + typename acle::vt hab_v = svuzp1(ha_v, hb_v); + typename acle::vt hcd_v = svuzp1(hc_v, hd_v); + typename acle::vt r_v = svuzp1(hab_v, hcd_v); + svst1(pg1h, (typename acle::pt*)&ret.v, r_v); + + return ret; +/* + vecf sa,sb; + sa = DtoS(a,b); + sb = DtoS(c,d); + return StoH(sa,sb); +*/ + } + static inline void HtoD(vech h,vecd &a,vecd &b,vecd &c,vecd &d) { + + svbool_t pg1h = acle::pg1(); + svbool_t pg1d = acle::pg1(); + typename acle::vt h_v = svld1(pg1h, (typename acle::pt*)&h.v); + typename acle::vt sa_v = svzip1(h_v, h_v); + typename acle::vt sb_v = svzip2(h_v, h_v); + typename acle::vt da_v = svzip1(sa_v, sa_v); + typename acle::vt db_v = svzip2(sa_v, sa_v); + typename acle::vt dc_v = svzip1(sb_v, sb_v); + typename acle::vt dd_v = svzip2(sb_v, sb_v); + typename acle::vt a_v = svcvt_f64_x(pg1d, da_v); + typename acle::vt b_v = svcvt_f64_x(pg1d, db_v); + typename acle::vt c_v = svcvt_f64_x(pg1d, dc_v); + typename acle::vt d_v = svcvt_f64_x(pg1d, dd_v); + svst1(pg1d, a.v, a_v); + svst1(pg1d, b.v, b_v); + svst1(pg1d, c.v, c_v); + svst1(pg1d, d.v, d_v); +/* + vecf sa,sb; + HtoS(h,sa,sb); + StoD(sa,a,b); + StoD(sb,c,d); +*/ + } +}; + + + struct Exchange{ + + // Exchange0 is valid for arbitrary SVE vector length + template + static inline void Exchange0(vec &out1, vec &out2, const vec &in1, const vec &in2){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt a1_v = svld1(pg1, in1.v); + typename acle::vt a2_v = svld1(pg1, in2.v); + typename acle::vt r1_v = svext(a1_v, a1_v, (uint64_t)W::c); + r1_v = svext(r1_v, a2_v, (uint64_t)W::c); + typename acle::vt r2_v = svext(a2_v, a2_v, (uint64_t)W::c); + r2_v = svext(a1_v, r2_v, (uint64_t)W::c); + svst1(pg1, out1.v, r1_v); + svst1(pg1, out2.v, r2_v); + } + + + template + static inline void Exchange1(vec &out1, vec &out2, const vec &in1, const vec &in2){ + + svbool_t pg4 = acle::pg4(); + typename acle::vt4 in1_v4 = svld4(pg4, (typename acle::pt*)in1.v); + typename acle::vt4 in2_v4 = svld4(pg4, (typename acle::pt*)in2.v); + typename acle::vt4 out1_v4; + typename acle::vt4 out2_v4; + out1_v4.v0 = in1_v4.v0; + out1_v4.v1 = in1_v4.v1; + out1_v4.v2 = in2_v4.v0; + out1_v4.v3 = in2_v4.v1; + out2_v4.v0 = in1_v4.v2; + out2_v4.v1 = in1_v4.v3; + out2_v4.v2 = in2_v4.v2; + out2_v4.v3 = in2_v4.v3; + svst4(pg4, (typename acle::pt*)out1.v, out1_v4); + svst4(pg4, (typename acle::pt*)out2.v, out2_v4); + } + + template + static inline void Exchange2(vec &out1, vec &out2, const vec &in1, const vec &in2){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt a1_v = svld1(pg1, (typename acle::pt*)in1.v); + typename acle::vt a2_v = svld1(pg1, (typename acle::pt*)in2.v); + typename acle::vt r1_v = svtrn1(a1_v, a2_v); + typename acle::vt r2_v = svtrn2(a1_v, a2_v); + svst1(pg1, (typename acle::pt*)out1.v, r1_v); + svst1(pg1, (typename acle::pt*)out2.v, r2_v); + } + + static inline void Exchange3(vecf &out1, vecf &out2, const vecf &in1, const vecf &in2){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt a1_v = svld1(pg1, in1.v); + typename acle::vt a2_v = svld1(pg1, in2.v); + typename acle::vt r1_v = svtrn1(a1_v, a2_v); + typename acle::vt r2_v = svtrn2(a1_v, a2_v); + svst1(pg1, out1.v, r1_v); + svst1(pg1, out2.v, r2_v); + } + + static inline void Exchange3(vecd &out1, vecd &out2, const vecd &in1, const vecd &in2){ + assert(0); + return; + } +}; + + +struct Permute{ + + // Permute0 is valid for any SVE vector width + template + static inline vec Permute0(vec in) { + + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + typename acle::vt r_v = svext(a_v, a_v, (uint64_t)(W::r / 2u)); + svst1(pg1, out.v, r_v); + return out; + } + + static inline vecd Permute1(vecd in) { + + vecd out; + const vec::uint> tbl_swap = acle::tbl1(); + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt r_v = svtbl(a_v, tbl_swap_v); + svst1(pg1, out.v, r_v); + + return out; + } + + static inline vecf Permute1(vecf in) { + + vecf out; + const vec::uint> tbl_swap = acle::tbl1(); + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt r_v = svtbl(a_v, tbl_swap_v); + svst1(pg1, out.v, r_v); + + return out; + } + + static inline vecd Permute2(vecd in) { + + vecd out; + const vec::uint> tbl_swap = acle::tbl_swap(); + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt r_v = svtbl(a_v, tbl_swap_v); + svst1(pg1, out.v, r_v); + + return out; + } + + static inline vecf Permute2(vecf in) { + + vecf out; + const vec::uint> tbl_swap = acle::tbl2(); + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt r_v = svtbl(a_v, tbl_swap_v); + svst1(pg1, out.v, r_v); + + return out; + } + + static inline vecf Permute3(vecf in) { + + vecf out; + const vec::uint> tbl_swap = acle::tbl_swap(); + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt r_v = svtbl(a_v, tbl_swap_v); + svst1(pg1, out.v, r_v); + + return out; + } + + static inline vecd Permute3(vecd in) { + return in; + } + +}; + +struct Rotate{ + + template static inline vec tRotate(vec in){ + + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + typename acle::vt r_v = svext(a_v, a_v, (uint64_t)(n%W::r)); + svst1(pg1, out.v, r_v); + + return out; + } + + template + static inline vec rotate(vec in, int n){ + + switch(n){ + case 0: return tRotate<0, T>(in); break; + case 1: return tRotate<1, T>(in); break; + case 2: return tRotate<2, T>(in); break; + case 3: return tRotate<3, T>(in); break; + case 4: return tRotate<4, T>(in); break; + case 5: return tRotate<5, T>(in); break; + case 6: return tRotate<6, T>(in); break; + case 7: return tRotate<7, T>(in); break; + + case 8: return tRotate<8, T>(in); break; + case 9: return tRotate<9, T>(in); break; + case 10: return tRotate<10, T>(in); break; + case 11: return tRotate<11, T>(in); break; + case 12: return tRotate<12, T>(in); break; + case 13: return tRotate<13, T>(in); break; + case 14: return tRotate<14, T>(in); break; + case 15: return tRotate<15, T>(in); break; + default: assert(0); + } + } +}; + +// ======================================================================= +/* SVE ACLE reducedoes not compile, check later + +// tree-based reduction +#define svred(pg, v)\ +svaddv(pg, v); + +// left-to-right reduction +// #define svred(pg, v)\ +// svadda(pg, 0, v) + +template +struct Reduce{ + //Need templated class to overload output type + //General form must generate error if compiled + inline Out_type operator()(In_type &in){ + printf("Error, using wrong Reduce function\n"); + exit(1); + return 0; + } +}; + +//Complex float Reduce +template <> +inline Grid::ComplexF Reduce::operator()(vecf in){ + + svbool_t pg1 = acle::pg1(); + svbool_t pg_even = acle::pg_even(); + svbool_t pg_odd = acle::pg_odd(); + typename acle::vt a_v = svld1(pg1, in.v); + float a = svred(pg_even, a_v); + float b = svred(pg_odd, a_v); + + return Grid::ComplexF(a, b); + +} + +//Real float Reduce +template <> +inline Grid::RealF Reduce::operator()(vecf in){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + float a = svred(pg1, a_v); + + return a; +} + +//Complex double Reduce +template <> +inline Grid::ComplexD Reduce::operator()(vecd in){ + + svbool_t pg1 = acle::pg1(); + svbool_t pg_even = acle::pg_even(); + svbool_t pg_odd = acle::pg_odd(); + typename acle::vt a_v = svld1(pg1, in.v); + double a = svred(pg_even, a_v); + double b = svred(pg_odd, a_v); + + return Grid::ComplexD(a, b); +} + +//Real double Reduce +template <> +inline Grid::RealD Reduce::operator()(vecd in){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + double a = svred(pg1, a_v); + + return a; +} + +//Integer Reduce +template <> +inline Integer Reduce::operator()(veci in){ + + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + Integer a = svred(pg1, a_v); + + return a; +} + +#undef svred +*/ + +// ======================================================================= + + +#define acc(v, a, off, step, n)\ +for (unsigned int i = off; i < n; i += step)\ +{\ + a += v[i];\ +} + +template +struct Reduce{ + //Need templated class to overload output type + //General form must generate error if compiled + inline Out_type operator()(In_type in){ + printf("Error, using wrong Reduce function\n"); + exit(1); + return 0; + } +}; + +//Complex float Reduce +template <> +inline Grid::ComplexF Reduce::operator()(vecf in){ + float a = 0.f, b = 0.f; + + acc(in.v, a, 0, 2, W::r); + acc(in.v, b, 1, 2, W::r); + + return Grid::ComplexF(a, b); +} + +//Real float Reduce +template<> +inline Grid::RealF Reduce::operator()(vecf in){ + float a = 0.; + + acc(in.v, a, 0, 1, W::r); + + return a; +} + +//Complex double Reduce +template<> +inline Grid::ComplexD Reduce::operator()(vecd in){ + double a = 0., b = 0.; + + acc(in.v, a, 0, 2, W::r); + acc(in.v, b, 1, 2, W::r); + + return Grid::ComplexD(a, b); +} + +//Real double Reduce +template<> +inline Grid::RealD Reduce::operator()(vecd in){ + double a = 0.f; + + acc(in.v, a, 0, 1, W::r); + + return a; +} + +//Integer Reduce +template<> +inline Integer Reduce::operator()(veci in){ + Integer a = 0; + + acc(in.v, a, 0, 1, W::r); + + return a; +} + +#undef acc // EIGEN compatibility + + +} // Optimization + +////////////////////////////////////////////////////////////////////////////////////// +// Here assign types + + typedef Optimization::vech SIMD_Htype; // Reduced precision type + typedef Optimization::vecf SIMD_Ftype; // Single precision type + typedef Optimization::vecd SIMD_Dtype; // Double precision type + typedef Optimization::veci SIMD_Itype; // Integer type + + // prefetch utilities + inline void v_prefetch0(int size, const char *ptr){}; + inline void prefetch_HINT_T0(const char *ptr){}; + + // Function name aliases + typedef Optimization::Vsplat VsplatSIMD; + typedef Optimization::Vstore VstoreSIMD; + typedef Optimization::Vset VsetSIMD; + typedef Optimization::Vstream VstreamSIMD; + template using ReduceSIMD = Optimization::Reduce; + + // Arithmetic operations + typedef Optimization::Sum SumSIMD; + typedef Optimization::Sub SubSIMD; + typedef Optimization::Div DivSIMD; + typedef Optimization::Mult MultSIMD; + typedef Optimization::MultComplex MultComplexSIMD; + typedef Optimization::MultRealPart MultRealPartSIMD; + typedef Optimization::MaddRealPart MaddRealPartSIMD; + typedef Optimization::Conj ConjSIMD; + typedef Optimization::TimesMinusI TimesMinusISIMD; + typedef Optimization::TimesI TimesISIMD; + +} diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index e2b1fd07..c726660f 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -1,6 +1,6 @@ /************************************************************************************* - Grid physics library, www.github.com/paboyle/Grid + Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/simd/Grid_vector_types.h @@ -73,7 +73,7 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 }; unsigned int sign_mask = 0x80000000u; Grid_half o; - + o.x = static_cast(0x0u); unsigned int sign = f.u & sign_mask; f.u ^= sign; @@ -93,7 +93,7 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { o.x = static_cast(f.u - denorm_magic.u); } else { unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd - + // update exponent, rounding bias part 1 f.u += ((unsigned int)(15 - 127) << 23) + 0xfff; // rounding bias part 2 @@ -101,7 +101,7 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { // take the bits! o.x = static_cast(f.u >> 13); } - } + } o.x |= static_cast(sign >> 16); return o; } @@ -110,9 +110,21 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { #ifdef GPU_VEC #include "Grid_gpu_vec.h" #endif +/* #ifdef GEN #include "Grid_generic.h" #endif +*/ +#ifdef GEN + #if defined A64FX // breakout A64FX SVE ACLE here + #pragma message("building for A64FX / SVE ACLE") + #define ARMCLANGHOTFIX + #include "Grid_a64fx-2.h" + #endif +#else + #include "Grid_generic.h" +#endif + #ifdef SSE4 #include "Grid_sse4.h" #endif @@ -170,7 +182,7 @@ template struct is_real struct is_integer : public std::false_type {}; template struct is_integer::value, void>::type> : public std::true_type {}; - + template using IfReal = Invoke::value, int> >; template using IfComplex = Invoke::value, int> >; template using IfInteger = Invoke::value, int> >; @@ -223,6 +235,21 @@ public: return sizeof(Vector_type) / sizeof(Scalar_type); } +#ifdef ARMCLANGHOTFIX + accelerator_inline Grid_simd &operator=(const Grid_simd &&rhs) { + svint8_t tmp = svld1(svptrue_b8(), (int8_t*)&(rhs.v)); + svst1(svptrue_b8(), (int8_t*)this, tmp); + //v = rhs.v; + return *this; + }; + + accelerator_inline Grid_simd &operator=(const Grid_simd &rhs) { + svint8_t tmp = svld1(svptrue_b8(), (int8_t*)&(rhs.v)); + svst1(svptrue_b8(), (int8_t*)this, tmp); + //v = rhs.v; + return *this; + }; +#else accelerator_inline Grid_simd &operator=(const Grid_simd &&rhs) { v = rhs.v; return *this; @@ -231,7 +258,7 @@ public: v = rhs.v; return *this; }; // faster than not declaring it and leaving to the compiler - +#endif accelerator Grid_simd() = default; accelerator_inline Grid_simd(const Grid_simd &rhs) : v(rhs.v){}; // compiles in movaps @@ -263,7 +290,7 @@ public: const Grid_simd *__restrict__ x) { *y = (*a) * (*x) + (*y); }; - + friend accelerator_inline void mult(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ l, const Grid_simd *__restrict__ r) { @@ -412,7 +439,7 @@ public: Grid_simd ret; Grid_simd::conv_t conv; Grid_simd::scalar_type s; - + conv.v = v.v; for (int i = 0; i < Nsimd(); i++) { s = conv.s[i]; @@ -441,7 +468,7 @@ public: return ret; } /////////////////////// - // Exchange + // Exchange // Al Ah , Bl Bh -> Al Bl Ah,Bh /////////////////////// friend accelerator_inline void exchange(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2,int n) @@ -452,20 +479,20 @@ public: Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v); } else if(n==1) { Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v); - } else if(n==0) { + } else if(n==0) { Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v); } } - friend accelerator_inline void exchange0(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){ + friend accelerator_inline void exchange0(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){ Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v); } - friend accelerator_inline void exchange1(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){ + friend accelerator_inline void exchange1(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){ Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v); } - friend accelerator_inline void exchange2(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){ + friend accelerator_inline void exchange2(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){ Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v); } - friend accelerator_inline void exchange3(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){ + friend accelerator_inline void exchange3(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){ Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v); } //////////////////////////////////////////////////////////////////// @@ -490,7 +517,7 @@ public: int dist = perm & 0xF; y = rotate(b, dist); return; - } + } else if(perm==3) permute3(y, b); else if(perm==2) permute2(y, b); else if(perm==1) permute1(y, b); @@ -564,29 +591,29 @@ accelerator_inline Grid_simd rotate(Grid_simd b, int nrot) { ret.v = Optimization::Rotate::rotate(b.v, 2 * nrot); return ret; } -template =0> +template =0> accelerator_inline void rotate( Grid_simd &ret,Grid_simd b,int nrot) { nrot = nrot % Grid_simd::Nsimd(); ret.v = Optimization::Rotate::rotate(b.v,nrot); } -template =0> +template =0> accelerator_inline void rotate(Grid_simd &ret,Grid_simd b,int nrot) { nrot = nrot % Grid_simd::Nsimd(); ret.v = Optimization::Rotate::rotate(b.v,2*nrot); } -template +template accelerator_inline void vbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ S* typepun =(S*) &src; vsplat(ret,typepun[lane]); -} -template =0> +} +template =0> accelerator_inline void rbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ S* typepun =(S*) &src; ret.v = unary(real(typepun[lane]), VsplatSIMD()); -} +} @@ -877,7 +904,7 @@ accelerator_inline typename toComplexMapper::Complexified toComplex(const conv.v = in.v; for (int i = 0; i < Rsimd::Nsimd(); i += 2) { - assert(conv.s[i + 1] == conv.s[i]); + assert(conv.s[i + 1] == conv.s[i]); // trap any cases where real was not duplicated // indicating the SIMD grids of real and imag assignment did not correctly // match