diff --git a/Grid/simd/Grid_a64fx-2.h b/Grid/simd/Grid_a64fx-2.h index 1bb67179..6968ca7a 100644 --- a/Grid/simd/Grid_a64fx-2.h +++ b/Grid/simd/Grid_a64fx-2.h @@ -2,17 +2,11 @@ Grid physics library, www.github.com/paboyle/Grid - Source file: ./lib/simd/Grid_a64fx-1.h + Source file: Grid_a64fx-2.h Copyright (C) 2020 -Author: Nils Meyer <nils.meyer@ur.de> - - Copyright (C) 2015 - Copyright (C) 2017 - -Author: Antonin Portelli <antonin.portelli@me.com> - Andrew Lawson <andrew.lawson1991@gmail.com> + Author: Nils Meyer <nils.meyer@ur.de> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -47,8 +41,8 @@ static_assert(GEN_SIMD_WIDTH % 64u == 0, "A64FX SIMD vector size is 64 bytes"); #pragma error "Missing SVE feature" #endif /* __ARM_FEATURE_SVE */ -namespace Grid { -namespace Optimization { +NAMESPACE_BEGIN(Grid); +NAMESPACE_BEGIN(Optimization); // type traits giving the number of elements for each vector type template <typename T> struct W; @@ -83,12 +77,12 @@ namespace Optimization { typedef vec<uint16_t> vech; // half precision comms typedef vec<Integer> veci; -}} // Grid::Optimization - +NAMESPACE_END(Optimization) +NAMESPACE_END(Grid) // low-level API -namespace Grid { -namespace Optimization { +NAMESPACE_BEGIN(Grid); +NAMESPACE_BEGIN(Optimization); template <typename T> struct acle{}; @@ -242,21 +236,16 @@ struct Vsplat{ } }; - struct Vstore{ - // Real - template <typename T> - inline void operator()(vec<T> a, T *D){ +struct Vstore{ + // Real + template <typename T> + inline void operator()(vec<T> a, T *D){ - svbool_t pg1 = acle<T>::pg1(); - typename acle<T>::vt a_v = svld1(pg1, (typename acle<T>::pt*)&a.v); - // NOTE illegal '&' here causes SIGBUS at runtime, related to CAS-35230-H2H6T1 - // svst1(pg1, (typename acle<T>::pt*)&D, a_v); - svst1(pg1, D, a_v); - - // non temporal version - //svstnt1(pg1, D, a_v); - } - }; + svbool_t pg1 = acle<T>::pg1(); + typename acle<T>::vt a_v = svld1(pg1, (typename acle<T>::pt*)&a.v); + svst1(pg1, D, a_v); + } +}; struct Vstream{ // Real @@ -265,7 +254,6 @@ struct Vsplat{ svbool_t pg1 = acle<T>::pg1(); typename acle<T>::vt b_v = svld1(pg1, b.v); - // FIXME non-temporal store causes compiler crash CAS-35230-H2H6T1 svstnt1(pg1, a, b_v); //svst1(pg1, a, b_v); } @@ -297,40 +285,40 @@ struct Vsplat{ } }; - ///////////////////////////////////////////////////// - // Arithmetic operations - ///////////////////////////////////////////////////// +///////////////////////////////////////////////////// +// Arithmetic operations +///////////////////////////////////////////////////// - struct Sum{ - template <typename T> - inline vec<T> operator()(vec<T> a, vec<T> b){ +struct Sum{ + template <typename T> + inline vec<T> operator()(vec<T> a, vec<T> b){ - vec<T> out; - svbool_t pg1 = acle<T>::pg1(); - typename acle<T>::vt a_v = svld1(pg1, a.v); - typename acle<T>::vt b_v = svld1(pg1, b.v); - typename acle<T>::vt r_v = svadd_x(pg1, a_v, b_v); - svst1(pg1, out.v, r_v); + vec<T> out; + svbool_t pg1 = acle<T>::pg1(); + typename acle<T>::vt a_v = svld1(pg1, a.v); + typename acle<T>::vt b_v = svld1(pg1, b.v); + typename acle<T>::vt r_v = svadd_x(pg1, a_v, b_v); + svst1(pg1, out.v, r_v); - return out; - } - }; + return out; + } +}; - struct Sub{ - template <typename T> - inline vec<T> operator()(vec<T> a, vec<T> b){ +struct Sub{ + template <typename T> + inline vec<T> operator()(vec<T> a, vec<T> b){ - vec<T> out; - svbool_t pg1 = acle<T>::pg1(); - typename acle<T>::vt a_v = svld1(pg1, a.v); - typename acle<T>::vt b_v = svld1(pg1, b.v); - typename acle<T>::vt r_v = svsub_x(pg1, a_v, b_v); - svst1(pg1, out.v, r_v); + vec<T> out; + svbool_t pg1 = acle<T>::pg1(); + typename acle<T>::vt a_v = svld1(pg1, a.v); + typename acle<T>::vt b_v = svld1(pg1, b.v); + typename acle<T>::vt r_v = svsub_x(pg1, a_v, b_v); + svst1(pg1, out.v, r_v); - return out; - } - }; + return out; + } +}; struct Mult{ @@ -440,45 +428,45 @@ struct Conj{ }; - struct TimesMinusI{ - // Complex - template <typename T> - inline vec<T> operator()(vec<T> a, vec<T> b){ +struct TimesMinusI{ + // Complex + template <typename T> + inline vec<T> operator()(vec<T> a, vec<T> b){ - vec<T> out; - const vec<typename acle<T>::uint> tbl_swap = acle<T>::tbl_swap(); - svbool_t pg1 = acle<T>::pg1(); - svbool_t pg_odd = acle<T>::pg_odd(); + vec<T> out; + const vec<typename acle<T>::uint> tbl_swap = acle<T>::tbl_swap(); + svbool_t pg1 = acle<T>::pg1(); + svbool_t pg_odd = acle<T>::pg_odd(); - typename acle<T>::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); - typename acle<T>::vt a_v = svld1(pg1, a.v); - a_v = svtbl(a_v, tbl_swap_v); - typename acle<T>::vt r_v = svneg_x(pg_odd, a_v); - svst1(pg1, out.v, r_v); + typename acle<T>::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle<T>::vt a_v = svld1(pg1, a.v); + a_v = svtbl(a_v, tbl_swap_v); + typename acle<T>::vt r_v = svneg_x(pg_odd, a_v); + svst1(pg1, out.v, r_v); - return out; - } - }; + return out; + } +}; - struct TimesI{ - // Complex - template <typename T> - inline vec<T> operator()(vec<T> a, vec<T> b){ +struct TimesI{ + // Complex + template <typename T> + inline vec<T> operator()(vec<T> a, vec<T> b){ - vec<T> out; - const vec<typename acle<T>::uint> tbl_swap = acle<T>::tbl_swap(); - svbool_t pg1 = acle<T>::pg1(); - svbool_t pg_even = acle<T>::pg_even(); + vec<T> out; + const vec<typename acle<T>::uint> tbl_swap = acle<T>::tbl_swap(); + svbool_t pg1 = acle<T>::pg1(); + svbool_t pg_even = acle<T>::pg_even(); - typename acle<T>::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); - typename acle<T>::vt a_v = svld1(pg1, a.v); - a_v = svtbl(a_v, tbl_swap_v); - typename acle<T>::vt r_v = svneg_x(pg_even, a_v); - svst1(pg1, out.v, r_v); + typename acle<T>::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle<T>::vt a_v = svld1(pg1, a.v); + a_v = svtbl(a_v, tbl_swap_v); + typename acle<T>::vt r_v = svneg_x(pg_even, a_v); + svst1(pg1, out.v, r_v); - return out; - } - }; + return out; + } +}; struct PrecisionChange { @@ -587,71 +575,71 @@ struct PrecisionChange { }; - struct Exchange{ +struct Exchange{ - // Exchange0 is valid for arbitrary SVE vector length - template <typename T> - static inline void Exchange0(vec<T> &out1, vec<T> &out2, const vec<T> &in1, const vec<T> &in2){ + // Exchange0 is valid for arbitrary SVE vector length + template <typename T> + static inline void Exchange0(vec<T> &out1, vec<T> &out2, const vec<T> &in1, const vec<T> &in2){ - svbool_t pg1 = acle<T>::pg1(); - typename acle<T>::vt a1_v = svld1(pg1, in1.v); - typename acle<T>::vt a2_v = svld1(pg1, in2.v); - typename acle<T>::vt r1_v = svext(a1_v, a1_v, (uint64_t)W<T>::c); - r1_v = svext(r1_v, a2_v, (uint64_t)W<T>::c); - typename acle<T>::vt r2_v = svext(a2_v, a2_v, (uint64_t)W<T>::c); - r2_v = svext(a1_v, r2_v, (uint64_t)W<T>::c); - svst1(pg1, out1.v, r1_v); - svst1(pg1, out2.v, r2_v); - } + svbool_t pg1 = acle<T>::pg1(); + typename acle<T>::vt a1_v = svld1(pg1, in1.v); + typename acle<T>::vt a2_v = svld1(pg1, in2.v); + typename acle<T>::vt r1_v = svext(a1_v, a1_v, (uint64_t)W<T>::c); + r1_v = svext(r1_v, a2_v, (uint64_t)W<T>::c); + typename acle<T>::vt r2_v = svext(a2_v, a2_v, (uint64_t)W<T>::c); + r2_v = svext(a1_v, r2_v, (uint64_t)W<T>::c); + svst1(pg1, out1.v, r1_v); + svst1(pg1, out2.v, r2_v); + } - template <typename T> - static inline void Exchange1(vec<T> &out1, vec<T> &out2, const vec<T> &in1, const vec<T> &in2){ + template <typename T> + static inline void Exchange1(vec<T> &out1, vec<T> &out2, const vec<T> &in1, const vec<T> &in2){ - svbool_t pg4 = acle<double>::pg4(); - typename acle<double>::vt4 in1_v4 = svld4(pg4, (typename acle<double>::pt*)in1.v); - typename acle<double>::vt4 in2_v4 = svld4(pg4, (typename acle<double>::pt*)in2.v); - typename acle<double>::vt4 out1_v4; - typename acle<double>::vt4 out2_v4; - out1_v4.v0 = in1_v4.v0; - out1_v4.v1 = in1_v4.v1; - out1_v4.v2 = in2_v4.v0; - out1_v4.v3 = in2_v4.v1; - out2_v4.v0 = in1_v4.v2; - out2_v4.v1 = in1_v4.v3; - out2_v4.v2 = in2_v4.v2; - out2_v4.v3 = in2_v4.v3; - svst4(pg4, (typename acle<double>::pt*)out1.v, out1_v4); - svst4(pg4, (typename acle<double>::pt*)out2.v, out2_v4); - } + svbool_t pg4 = acle<double>::pg4(); + typename acle<double>::vt4 in1_v4 = svld4(pg4, (typename acle<double>::pt*)in1.v); + typename acle<double>::vt4 in2_v4 = svld4(pg4, (typename acle<double>::pt*)in2.v); + typename acle<double>::vt4 out1_v4; + typename acle<double>::vt4 out2_v4; + out1_v4.v0 = in1_v4.v0; + out1_v4.v1 = in1_v4.v1; + out1_v4.v2 = in2_v4.v0; + out1_v4.v3 = in2_v4.v1; + out2_v4.v0 = in1_v4.v2; + out2_v4.v1 = in1_v4.v3; + out2_v4.v2 = in2_v4.v2; + out2_v4.v3 = in2_v4.v3; + svst4(pg4, (typename acle<double>::pt*)out1.v, out1_v4); + svst4(pg4, (typename acle<double>::pt*)out2.v, out2_v4); + } - template <typename T> - static inline void Exchange2(vec<T> &out1, vec<T> &out2, const vec<T> &in1, const vec<T> &in2){ + template <typename T> + static inline void Exchange2(vec<T> &out1, vec<T> &out2, const vec<T> &in1, const vec<T> &in2){ - svbool_t pg1 = acle<double>::pg1(); - typename acle<double>::vt a1_v = svld1(pg1, (typename acle<double>::pt*)in1.v); - typename acle<double>::vt a2_v = svld1(pg1, (typename acle<double>::pt*)in2.v); - typename acle<double>::vt r1_v = svtrn1(a1_v, a2_v); - typename acle<double>::vt r2_v = svtrn2(a1_v, a2_v); - svst1(pg1, (typename acle<double>::pt*)out1.v, r1_v); - svst1(pg1, (typename acle<double>::pt*)out2.v, r2_v); - } + svbool_t pg1 = acle<double>::pg1(); + typename acle<double>::vt a1_v = svld1(pg1, (typename acle<double>::pt*)in1.v); + typename acle<double>::vt a2_v = svld1(pg1, (typename acle<double>::pt*)in2.v); + typename acle<double>::vt r1_v = svtrn1(a1_v, a2_v); + typename acle<double>::vt r2_v = svtrn2(a1_v, a2_v); + svst1(pg1, (typename acle<double>::pt*)out1.v, r1_v); + svst1(pg1, (typename acle<double>::pt*)out2.v, r2_v); + } - static inline void Exchange3(vecf &out1, vecf &out2, const vecf &in1, const vecf &in2){ + static inline void Exchange3(vecf &out1, vecf &out2, const vecf &in1, const vecf &in2){ - svbool_t pg1 = acle<float>::pg1(); - typename acle<float>::vt a1_v = svld1(pg1, in1.v); - typename acle<float>::vt a2_v = svld1(pg1, in2.v); - typename acle<float>::vt r1_v = svtrn1(a1_v, a2_v); - typename acle<float>::vt r2_v = svtrn2(a1_v, a2_v); - svst1(pg1, out1.v, r1_v); - svst1(pg1, out2.v, r2_v); - } + svbool_t pg1 = acle<float>::pg1(); + typename acle<float>::vt a1_v = svld1(pg1, in1.v); + typename acle<float>::vt a2_v = svld1(pg1, in2.v); + typename acle<float>::vt r1_v = svtrn1(a1_v, a2_v); + typename acle<float>::vt r2_v = svtrn2(a1_v, a2_v); + svst1(pg1, out1.v, r1_v); + svst1(pg1, out2.v, r2_v); + } - static inline void Exchange3(vecd &out1, vecd &out2, const vecd &in1, const vecd &in2){ - assert(0); - return; - } + static inline void Exchange3(vecd &out1, vecd &out2, const vecd &in1, const vecd &in2){ + assert(0); + return; + } }; @@ -780,7 +768,7 @@ struct Rotate{ }; // ======================================================================= -/* SVE ACLE reducedoes not compile, check later +// SVE ACLE reduce does not compile, check later // tree-based reduction #define svred(pg, v)\ @@ -864,11 +852,11 @@ inline Integer Reduce<Integer, veci>::operator()(veci in){ } #undef svred -*/ +// */ // ======================================================================= - +/* #define acc(v, a, off, step, n)\ for (unsigned int i = off; i < n; i += step)\ {\ @@ -939,39 +927,39 @@ inline Integer Reduce<Integer, veci>::operator()(veci in){ } #undef acc // EIGEN compatibility +*/ - -} // Optimization +NAMESPACE_END(Optimization) ////////////////////////////////////////////////////////////////////////////////////// // Here assign types - typedef Optimization::vech SIMD_Htype; // Reduced precision type - typedef Optimization::vecf SIMD_Ftype; // Single precision type - typedef Optimization::vecd SIMD_Dtype; // Double precision type - typedef Optimization::veci SIMD_Itype; // Integer type +typedef Optimization::vech SIMD_Htype; // Reduced precision type +typedef Optimization::vecf SIMD_Ftype; // Single precision type +typedef Optimization::vecd SIMD_Dtype; // Double precision type +typedef Optimization::veci SIMD_Itype; // Integer type - // prefetch utilities - inline void v_prefetch0(int size, const char *ptr){}; - inline void prefetch_HINT_T0(const char *ptr){}; +// prefetch utilities +inline void v_prefetch0(int size, const char *ptr){}; +inline void prefetch_HINT_T0(const char *ptr){}; - // Function name aliases - typedef Optimization::Vsplat VsplatSIMD; - typedef Optimization::Vstore VstoreSIMD; - typedef Optimization::Vset VsetSIMD; - typedef Optimization::Vstream VstreamSIMD; - template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>; +// Function name aliases +typedef Optimization::Vsplat VsplatSIMD; +typedef Optimization::Vstore VstoreSIMD; +typedef Optimization::Vset VsetSIMD; +typedef Optimization::Vstream VstreamSIMD; +template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>; - // Arithmetic operations - typedef Optimization::Sum SumSIMD; - typedef Optimization::Sub SubSIMD; - typedef Optimization::Div DivSIMD; - typedef Optimization::Mult MultSIMD; - typedef Optimization::MultComplex MultComplexSIMD; - typedef Optimization::MultRealPart MultRealPartSIMD; - typedef Optimization::MaddRealPart MaddRealPartSIMD; - typedef Optimization::Conj ConjSIMD; - typedef Optimization::TimesMinusI TimesMinusISIMD; - typedef Optimization::TimesI TimesISIMD; +// Arithmetic operations +typedef Optimization::Sum SumSIMD; +typedef Optimization::Sub SubSIMD; +typedef Optimization::Div DivSIMD; +typedef Optimization::Mult MultSIMD; +typedef Optimization::MultComplex MultComplexSIMD; +typedef Optimization::MultRealPart MultRealPartSIMD; +typedef Optimization::MaddRealPart MaddRealPartSIMD; +typedef Optimization::Conj ConjSIMD; +typedef Optimization::TimesMinusI TimesMinusISIMD; +typedef Optimization::TimesI TimesISIMD; -} +NAMESPACE_END(Grid)