From 2111052fbe5b9623ca8033e9f04f9b5d5efd4461 Mon Sep 17 00:00:00 2001 From: nmeyer-ur Date: Fri, 12 Jun 2020 14:49:19 +0200 Subject: [PATCH] apply VLA patch for memcpy reduction suggested by Arm, CAS-162542-D6W7Z7 --- Grid/simd/Grid_a64fx-2.h | 46 +++++++++++++++++++++---- Grid/simd/Grid_vector_types.h | 65 +++++++++++++++++++++++++---------- 2 files changed, 87 insertions(+), 24 deletions(-) diff --git a/Grid/simd/Grid_a64fx-2.h b/Grid/simd/Grid_a64fx-2.h index a0463a10..6f7229ec 100644 --- a/Grid/simd/Grid_a64fx-2.h +++ b/Grid/simd/Grid_a64fx-2.h @@ -57,11 +57,39 @@ NAMESPACE_BEGIN(Optimization); constexpr static unsigned int r = GEN_SIMD_WIDTH/8u; }; + #ifdef ARMCLANGCOMPAT + // SIMD vector immediate types + template + struct vec_imm { + alignas(GEN_SIMD_WIDTH) T v[W::r]; + }; + + // SIMD vector types + template + struct vec { + alignas(GEN_SIMD_WIDTH) T v[W::r]; + vec() = default; + vec(const vec &rhs) { this->operator=(rhs); } + vec(const vec_imm &rhs) { + // v = rhs.v + svst1(svptrue_b8(), (T*)this, svld1(svptrue_b8(), (T*)rhs.v)); + } + + inline vec &operator=(const vec &rhs) { + // v = rhs.v + svst1(svptrue_b8(), (T*)this, svld1(svptrue_b8(), (T*)rhs.v)); + return *this; + }; + }; + + #else // not defines ARMCLANGCOMPAT + #define vec_imm vec // SIMD vector types template struct vec { alignas(GEN_SIMD_WIDTH) T v[W::r]; }; + #endif typedef vec vecf; typedef vec vecd; @@ -91,27 +119,33 @@ struct acle{ static inline svbool_t pg2(){return svptrue_pat_b64(SV_VL4);} static inline svbool_t pg4(){return svptrue_pat_b64(SV_VL2);} static inline vec tbl_swap(){ - const vec t = {1, 0, 3, 2, 5, 4, 7, 6}; + //const vec t = {1, 0, 3, 2, 5, 4, 7, 6}; + const vec_imm t = {1, 0, 3, 2, 5, 4, 7, 6}; return t; } static inline vec tbl0(){ - const vec t = {4, 5, 6, 7, 0, 1, 2, 3}; + //const vec t = {4, 5, 6, 7, 0, 1, 2, 3}; + const vec_imm t = {4, 5, 6, 7, 0, 1, 2, 3}; return t; } static inline vec tbl1(){ - const vec t = {2, 3, 0, 1, 6, 7, 4, 5}; + //const vec t = {2, 3, 0, 1, 6, 7, 4, 5}; + const vec_imm t = {2, 3, 0, 1, 6, 7, 4, 5}; return t; } static inline vec tbl_exch1a(){ // Exchange1 - const vec t = {0, 1, 4, 5, 2, 3, 6, 7}; + //const vec t = {0, 1, 4, 5, 2, 3, 6, 7}; + const vec_imm t = {0, 1, 4, 5, 2, 3, 6, 7}; return t; } static inline vec tbl_exch1b(){ // Exchange1 - const vec t = {2, 3, 6, 7, 0, 1, 4, 5}; + //const vec t = {2, 3, 6, 7, 0, 1, 4, 5}; + const vec_imm t = {2, 3, 6, 7, 0, 1, 4, 5}; return t; } static inline vec tbl_exch1c(){ // Exchange1 - const vec t = {4, 5, 0, 1, 6, 7, 2, 3}; + //const vec t = {4, 5, 0, 1, 6, 7, 2, 3}; + const vec_imm t = {4, 5, 0, 1, 6, 7, 2, 3}; return t; } static inline svbool_t pg_even(){return svzip1_b64(svptrue_b64(), svpfalse_b());} diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index e1eb330d..527fde18 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -122,7 +122,7 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { #if defined(A64FX) // VLA #pragma message("building for A64FX / SVE ACLE VLA") #if defined(ARMCLANGCOMPAT) - #pragma message("applying armclang fix") + #pragma message("applying armclang patch") #endif #include "Grid_a64fx-2.h" #endif @@ -247,21 +247,37 @@ public: return sizeof(Vector_type) / sizeof(Scalar_type); } -#ifdef ARMCLANGCOMPAT - accelerator_inline Grid_simd &operator=(const Grid_simd &&rhs) { - svint8_t tmp = svld1(svptrue_b8(), (int8_t*)&(rhs.v)); - svst1(svptrue_b8(), (int8_t*)this, tmp); - //v = rhs.v; - return *this; - }; + #ifdef ARMCLANGCOMPAT + template + accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &&rhs) { + //v = rhs.v; + svst1(svptrue_b8(), (Scalar_type*)this, svld1(svptrue_b8(), (Scalar_type*)&(rhs.v))); + return *this; + }; + + template + accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &rhs) { + //v = rhs.v; + svst1(svptrue_b8(), (Scalar_type*)this, svld1(svptrue_b8(), (Scalar_type*)&(rhs.v))); + return *this; + }; + + template + accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &&rhs) { + //v = rhs.v; + svst1(svptrue_b8(), (int8_t*)this, svld1(svptrue_b8(), (int8_t*)&(rhs.v))); + return *this; + }; + + template + accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &rhs) { + //v = rhs.v; + svst1(svptrue_b8(), (int8_t*)this, svld1(svptrue_b8(), (int8_t*)&(rhs.v))); + return *this; + }; + + #else - accelerator_inline Grid_simd &operator=(const Grid_simd &rhs) { - svint8_t tmp = svld1(svptrue_b8(), (int8_t*)&(rhs.v)); - svst1(svptrue_b8(), (int8_t*)this, tmp); - //v = rhs.v; - return *this; - }; -#else accelerator_inline Grid_simd &operator=(const Grid_simd &&rhs) { v = rhs.v; return *this; @@ -270,11 +286,24 @@ public: v = rhs.v; return *this; }; // faster than not declaring it and leaving to the compiler -#endif + + #endif accelerator Grid_simd() = default; - accelerator_inline Grid_simd(const Grid_simd &rhs) : v(rhs.v){}; // compiles in movaps - accelerator_inline Grid_simd(const Grid_simd &&rhs) : v(rhs.v){}; + + #ifdef ARMCLANGCOMPAT + template + accelerator_inline Grid_simd(const Grid_simd::value, S>::type, Vector_type> &rhs) { this->operator=(rhs); } + template + accelerator_inline Grid_simd(const Grid_simd::value, S>::type, Vector_type> &&rhs) { this->operator=(rhs); } + template + accelerator_inline Grid_simd(const Grid_simd::value, S>::type, Vector_type> &rhs) { this->operator=(rhs); } + template + accelerator_inline Grid_simd(const Grid_simd::value, S>::type, Vector_type> &&rhs) { this->operator=(rhs); } + #else + accelerator_inline Grid_simd(const Grid_simd &rhs) : v(rhs.v){}; // compiles in movaps + accelerator_inline Grid_simd(const Grid_simd &&rhs) : v(rhs.v){}; + #endif accelerator_inline Grid_simd(const Real a) { vsplat(*this, Scalar_type(a)); }; // Enable if complex type template accelerator_inline