From 5f8a76d490ecbc7c1b3a7ff8bfa9e5888f8397df Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 3 Apr 2020 19:18:24 +0200 Subject: [PATCH] clean up, reduction in acle --- Grid/simd/Grid_a64fx-2.h | 334 +++++++++++++++++++-------------------- 1 file changed, 161 insertions(+), 173 deletions(-) diff --git a/Grid/simd/Grid_a64fx-2.h b/Grid/simd/Grid_a64fx-2.h index 1bb67179..6968ca7a 100644 --- a/Grid/simd/Grid_a64fx-2.h +++ b/Grid/simd/Grid_a64fx-2.h @@ -2,17 +2,11 @@ Grid physics library, www.github.com/paboyle/Grid - Source file: ./lib/simd/Grid_a64fx-1.h + Source file: Grid_a64fx-2.h Copyright (C) 2020 -Author: Nils Meyer - - Copyright (C) 2015 - Copyright (C) 2017 - -Author: Antonin Portelli - Andrew Lawson + Author: Nils Meyer This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -47,8 +41,8 @@ static_assert(GEN_SIMD_WIDTH % 64u == 0, "A64FX SIMD vector size is 64 bytes"); #pragma error "Missing SVE feature" #endif /* __ARM_FEATURE_SVE */ -namespace Grid { -namespace Optimization { +NAMESPACE_BEGIN(Grid); +NAMESPACE_BEGIN(Optimization); // type traits giving the number of elements for each vector type template struct W; @@ -83,12 +77,12 @@ namespace Optimization { typedef vec vech; // half precision comms typedef vec veci; -}} // Grid::Optimization - +NAMESPACE_END(Optimization) +NAMESPACE_END(Grid) // low-level API -namespace Grid { -namespace Optimization { +NAMESPACE_BEGIN(Grid); +NAMESPACE_BEGIN(Optimization); template struct acle{}; @@ -242,21 +236,16 @@ struct Vsplat{ } }; - struct Vstore{ - // Real - template - inline void operator()(vec a, T *D){ +struct Vstore{ + // Real + template + inline void operator()(vec a, T *D){ - svbool_t pg1 = acle::pg1(); - typename acle::vt a_v = svld1(pg1, (typename acle::pt*)&a.v); - // NOTE illegal '&' here causes SIGBUS at runtime, related to CAS-35230-H2H6T1 - // svst1(pg1, (typename acle::pt*)&D, a_v); - svst1(pg1, D, a_v); - - // non temporal version - //svstnt1(pg1, D, a_v); - } - }; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, (typename acle::pt*)&a.v); + svst1(pg1, D, a_v); + } +}; struct Vstream{ // Real @@ -265,7 +254,6 @@ struct Vsplat{ svbool_t pg1 = acle::pg1(); typename acle::vt b_v = svld1(pg1, b.v); - // FIXME non-temporal store causes compiler crash CAS-35230-H2H6T1 svstnt1(pg1, a, b_v); //svst1(pg1, a, b_v); } @@ -297,40 +285,40 @@ struct Vsplat{ } }; - ///////////////////////////////////////////////////// - // Arithmetic operations - ///////////////////////////////////////////////////// +///////////////////////////////////////////////////// +// Arithmetic operations +///////////////////////////////////////////////////// - struct Sum{ - template - inline vec operator()(vec a, vec b){ +struct Sum{ + template + inline vec operator()(vec a, vec b){ - vec out; - svbool_t pg1 = acle::pg1(); - typename acle::vt a_v = svld1(pg1, a.v); - typename acle::vt b_v = svld1(pg1, b.v); - typename acle::vt r_v = svadd_x(pg1, a_v, b_v); - svst1(pg1, out.v, r_v); + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + typename acle::vt r_v = svadd_x(pg1, a_v, b_v); + svst1(pg1, out.v, r_v); - return out; - } - }; + return out; + } +}; - struct Sub{ - template - inline vec operator()(vec a, vec b){ +struct Sub{ + template + inline vec operator()(vec a, vec b){ - vec out; - svbool_t pg1 = acle::pg1(); - typename acle::vt a_v = svld1(pg1, a.v); - typename acle::vt b_v = svld1(pg1, b.v); - typename acle::vt r_v = svsub_x(pg1, a_v, b_v); - svst1(pg1, out.v, r_v); + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + typename acle::vt r_v = svsub_x(pg1, a_v, b_v); + svst1(pg1, out.v, r_v); - return out; - } - }; + return out; + } +}; struct Mult{ @@ -440,45 +428,45 @@ struct Conj{ }; - struct TimesMinusI{ - // Complex - template - inline vec operator()(vec a, vec b){ +struct TimesMinusI{ + // Complex + template + inline vec operator()(vec a, vec b){ - vec out; - const vec::uint> tbl_swap = acle::tbl_swap(); - svbool_t pg1 = acle::pg1(); - svbool_t pg_odd = acle::pg_odd(); + vec out; + const vec::uint> tbl_swap = acle::tbl_swap(); + svbool_t pg1 = acle::pg1(); + svbool_t pg_odd = acle::pg_odd(); - typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); - typename acle::vt a_v = svld1(pg1, a.v); - a_v = svtbl(a_v, tbl_swap_v); - typename acle::vt r_v = svneg_x(pg_odd, a_v); - svst1(pg1, out.v, r_v); + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt a_v = svld1(pg1, a.v); + a_v = svtbl(a_v, tbl_swap_v); + typename acle::vt r_v = svneg_x(pg_odd, a_v); + svst1(pg1, out.v, r_v); - return out; - } - }; + return out; + } +}; - struct TimesI{ - // Complex - template - inline vec operator()(vec a, vec b){ +struct TimesI{ + // Complex + template + inline vec operator()(vec a, vec b){ - vec out; - const vec::uint> tbl_swap = acle::tbl_swap(); - svbool_t pg1 = acle::pg1(); - svbool_t pg_even = acle::pg_even(); + vec out; + const vec::uint> tbl_swap = acle::tbl_swap(); + svbool_t pg1 = acle::pg1(); + svbool_t pg_even = acle::pg_even(); - typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); - typename acle::vt a_v = svld1(pg1, a.v); - a_v = svtbl(a_v, tbl_swap_v); - typename acle::vt r_v = svneg_x(pg_even, a_v); - svst1(pg1, out.v, r_v); + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt a_v = svld1(pg1, a.v); + a_v = svtbl(a_v, tbl_swap_v); + typename acle::vt r_v = svneg_x(pg_even, a_v); + svst1(pg1, out.v, r_v); - return out; - } - }; + return out; + } +}; struct PrecisionChange { @@ -587,71 +575,71 @@ struct PrecisionChange { }; - struct Exchange{ +struct Exchange{ - // Exchange0 is valid for arbitrary SVE vector length - template - static inline void Exchange0(vec &out1, vec &out2, const vec &in1, const vec &in2){ + // Exchange0 is valid for arbitrary SVE vector length + template + static inline void Exchange0(vec &out1, vec &out2, const vec &in1, const vec &in2){ - svbool_t pg1 = acle::pg1(); - typename acle::vt a1_v = svld1(pg1, in1.v); - typename acle::vt a2_v = svld1(pg1, in2.v); - typename acle::vt r1_v = svext(a1_v, a1_v, (uint64_t)W::c); - r1_v = svext(r1_v, a2_v, (uint64_t)W::c); - typename acle::vt r2_v = svext(a2_v, a2_v, (uint64_t)W::c); - r2_v = svext(a1_v, r2_v, (uint64_t)W::c); - svst1(pg1, out1.v, r1_v); - svst1(pg1, out2.v, r2_v); - } + svbool_t pg1 = acle::pg1(); + typename acle::vt a1_v = svld1(pg1, in1.v); + typename acle::vt a2_v = svld1(pg1, in2.v); + typename acle::vt r1_v = svext(a1_v, a1_v, (uint64_t)W::c); + r1_v = svext(r1_v, a2_v, (uint64_t)W::c); + typename acle::vt r2_v = svext(a2_v, a2_v, (uint64_t)W::c); + r2_v = svext(a1_v, r2_v, (uint64_t)W::c); + svst1(pg1, out1.v, r1_v); + svst1(pg1, out2.v, r2_v); + } - template - static inline void Exchange1(vec &out1, vec &out2, const vec &in1, const vec &in2){ + template + static inline void Exchange1(vec &out1, vec &out2, const vec &in1, const vec &in2){ - svbool_t pg4 = acle::pg4(); - typename acle::vt4 in1_v4 = svld4(pg4, (typename acle::pt*)in1.v); - typename acle::vt4 in2_v4 = svld4(pg4, (typename acle::pt*)in2.v); - typename acle::vt4 out1_v4; - typename acle::vt4 out2_v4; - out1_v4.v0 = in1_v4.v0; - out1_v4.v1 = in1_v4.v1; - out1_v4.v2 = in2_v4.v0; - out1_v4.v3 = in2_v4.v1; - out2_v4.v0 = in1_v4.v2; - out2_v4.v1 = in1_v4.v3; - out2_v4.v2 = in2_v4.v2; - out2_v4.v3 = in2_v4.v3; - svst4(pg4, (typename acle::pt*)out1.v, out1_v4); - svst4(pg4, (typename acle::pt*)out2.v, out2_v4); - } + svbool_t pg4 = acle::pg4(); + typename acle::vt4 in1_v4 = svld4(pg4, (typename acle::pt*)in1.v); + typename acle::vt4 in2_v4 = svld4(pg4, (typename acle::pt*)in2.v); + typename acle::vt4 out1_v4; + typename acle::vt4 out2_v4; + out1_v4.v0 = in1_v4.v0; + out1_v4.v1 = in1_v4.v1; + out1_v4.v2 = in2_v4.v0; + out1_v4.v3 = in2_v4.v1; + out2_v4.v0 = in1_v4.v2; + out2_v4.v1 = in1_v4.v3; + out2_v4.v2 = in2_v4.v2; + out2_v4.v3 = in2_v4.v3; + svst4(pg4, (typename acle::pt*)out1.v, out1_v4); + svst4(pg4, (typename acle::pt*)out2.v, out2_v4); + } - template - static inline void Exchange2(vec &out1, vec &out2, const vec &in1, const vec &in2){ + template + static inline void Exchange2(vec &out1, vec &out2, const vec &in1, const vec &in2){ - svbool_t pg1 = acle::pg1(); - typename acle::vt a1_v = svld1(pg1, (typename acle::pt*)in1.v); - typename acle::vt a2_v = svld1(pg1, (typename acle::pt*)in2.v); - typename acle::vt r1_v = svtrn1(a1_v, a2_v); - typename acle::vt r2_v = svtrn2(a1_v, a2_v); - svst1(pg1, (typename acle::pt*)out1.v, r1_v); - svst1(pg1, (typename acle::pt*)out2.v, r2_v); - } + svbool_t pg1 = acle::pg1(); + typename acle::vt a1_v = svld1(pg1, (typename acle::pt*)in1.v); + typename acle::vt a2_v = svld1(pg1, (typename acle::pt*)in2.v); + typename acle::vt r1_v = svtrn1(a1_v, a2_v); + typename acle::vt r2_v = svtrn2(a1_v, a2_v); + svst1(pg1, (typename acle::pt*)out1.v, r1_v); + svst1(pg1, (typename acle::pt*)out2.v, r2_v); + } - static inline void Exchange3(vecf &out1, vecf &out2, const vecf &in1, const vecf &in2){ + static inline void Exchange3(vecf &out1, vecf &out2, const vecf &in1, const vecf &in2){ - svbool_t pg1 = acle::pg1(); - typename acle::vt a1_v = svld1(pg1, in1.v); - typename acle::vt a2_v = svld1(pg1, in2.v); - typename acle::vt r1_v = svtrn1(a1_v, a2_v); - typename acle::vt r2_v = svtrn2(a1_v, a2_v); - svst1(pg1, out1.v, r1_v); - svst1(pg1, out2.v, r2_v); - } + svbool_t pg1 = acle::pg1(); + typename acle::vt a1_v = svld1(pg1, in1.v); + typename acle::vt a2_v = svld1(pg1, in2.v); + typename acle::vt r1_v = svtrn1(a1_v, a2_v); + typename acle::vt r2_v = svtrn2(a1_v, a2_v); + svst1(pg1, out1.v, r1_v); + svst1(pg1, out2.v, r2_v); + } - static inline void Exchange3(vecd &out1, vecd &out2, const vecd &in1, const vecd &in2){ - assert(0); - return; - } + static inline void Exchange3(vecd &out1, vecd &out2, const vecd &in1, const vecd &in2){ + assert(0); + return; + } }; @@ -780,7 +768,7 @@ struct Rotate{ }; // ======================================================================= -/* SVE ACLE reducedoes not compile, check later +// SVE ACLE reduce does not compile, check later // tree-based reduction #define svred(pg, v)\ @@ -864,11 +852,11 @@ inline Integer Reduce::operator()(veci in){ } #undef svred -*/ +// */ // ======================================================================= - +/* #define acc(v, a, off, step, n)\ for (unsigned int i = off; i < n; i += step)\ {\ @@ -939,39 +927,39 @@ inline Integer Reduce::operator()(veci in){ } #undef acc // EIGEN compatibility +*/ - -} // Optimization +NAMESPACE_END(Optimization) ////////////////////////////////////////////////////////////////////////////////////// // Here assign types - typedef Optimization::vech SIMD_Htype; // Reduced precision type - typedef Optimization::vecf SIMD_Ftype; // Single precision type - typedef Optimization::vecd SIMD_Dtype; // Double precision type - typedef Optimization::veci SIMD_Itype; // Integer type +typedef Optimization::vech SIMD_Htype; // Reduced precision type +typedef Optimization::vecf SIMD_Ftype; // Single precision type +typedef Optimization::vecd SIMD_Dtype; // Double precision type +typedef Optimization::veci SIMD_Itype; // Integer type - // prefetch utilities - inline void v_prefetch0(int size, const char *ptr){}; - inline void prefetch_HINT_T0(const char *ptr){}; +// prefetch utilities +inline void v_prefetch0(int size, const char *ptr){}; +inline void prefetch_HINT_T0(const char *ptr){}; - // Function name aliases - typedef Optimization::Vsplat VsplatSIMD; - typedef Optimization::Vstore VstoreSIMD; - typedef Optimization::Vset VsetSIMD; - typedef Optimization::Vstream VstreamSIMD; - template using ReduceSIMD = Optimization::Reduce; +// Function name aliases +typedef Optimization::Vsplat VsplatSIMD; +typedef Optimization::Vstore VstoreSIMD; +typedef Optimization::Vset VsetSIMD; +typedef Optimization::Vstream VstreamSIMD; +template using ReduceSIMD = Optimization::Reduce; - // Arithmetic operations - typedef Optimization::Sum SumSIMD; - typedef Optimization::Sub SubSIMD; - typedef Optimization::Div DivSIMD; - typedef Optimization::Mult MultSIMD; - typedef Optimization::MultComplex MultComplexSIMD; - typedef Optimization::MultRealPart MultRealPartSIMD; - typedef Optimization::MaddRealPart MaddRealPartSIMD; - typedef Optimization::Conj ConjSIMD; - typedef Optimization::TimesMinusI TimesMinusISIMD; - typedef Optimization::TimesI TimesISIMD; +// Arithmetic operations +typedef Optimization::Sum SumSIMD; +typedef Optimization::Sub SubSIMD; +typedef Optimization::Div DivSIMD; +typedef Optimization::Mult MultSIMD; +typedef Optimization::MultComplex MultComplexSIMD; +typedef Optimization::MultRealPart MultRealPartSIMD; +typedef Optimization::MaddRealPart MaddRealPartSIMD; +typedef Optimization::Conj ConjSIMD; +typedef Optimization::TimesMinusI TimesMinusISIMD; +typedef Optimization::TimesI TimesISIMD; -} +NAMESPACE_END(Grid)