apply VLA patch for memcpy reduction suggested by Arm, CAS-162542-D6W7Z7

2025-11-19 05:59:31 +00:00 · 2020-06-12 14:49:19 +02:00
parent 433766ac62
commit 2111052fbe
2 changed files with 87 additions and 24 deletions
--- a/Grid/simd/Grid_a64fx-2.h
+++ b/Grid/simd/Grid_a64fx-2.h
@@ -57,11 +57,39 @@ NAMESPACE_BEGIN(Optimization);
    constexpr static unsigned int r = GEN_SIMD_WIDTH/8u;
  };

+  #ifdef ARMCLANGCOMPAT
+  // SIMD vector immediate type: a constructor-free aggregate of W<T>::r lanes, so it can be brace-initialized; vec<T> below is constructible from it.
+  template <typename T>
+  struct vec_imm {
+    alignas(GEN_SIMD_WIDTH) T v[W<T>::r];
+  };
+
+  // SIMD vector type for the ARMCLANGCOMPAT build: copy construction, conversion from vec_imm and assignment are written with explicit SVE load/store intrinsics.
+  template <typename T>
+  struct vec {
+    alignas(GEN_SIMD_WIDTH) T v[W<T>::r];
+    vec() = default;
+    vec(const vec &rhs) { this->operator=(rhs); }  // copy construction reuses the load/store assignment below
+    vec(const vec_imm<T> &rhs) {
+      // v = rhs.v, expressed as an svld1 from rhs.v with an all-true predicate followed by an svst1 to the address of this object
+      svst1(svptrue_b8(), (T*)this, svld1(svptrue_b8(), (T*)rhs.v));
+    }
+
+    inline vec &operator=(const vec &rhs) {
+      // v = rhs.v, expressed as an svld1 from rhs.v with an all-true predicate followed by an svst1 to the address of this object
+      svst1(svptrue_b8(), (T*)this, svld1(svptrue_b8(), (T*)rhs.v));
+      return *this;
+    };
+  };
+
+  #else // ARMCLANGCOMPAT not defined
+  #define vec_imm vec  // vec has no constructors in this branch, so it is brace-initializable itself and vec_imm is just an alias for it
  // SIMD vector types
  template <typename T>
  struct vec {
    alignas(GEN_SIMD_WIDTH) T v[W<T>::r];
  };
+  #endif // ARMCLANGCOMPAT

  typedef vec<float>     vecf;
  typedef vec<double>    vecd;
@@ -91,27 +119,33 @@ struct acle<double>{
  static inline svbool_t pg2(){return svptrue_pat_b64(SV_VL4);}
  static inline svbool_t pg4(){return svptrue_pat_b64(SV_VL2);}
  static inline vec<uint64_t> tbl_swap(){
-    const vec<uint64_t> t = {1, 0, 3, 2, 5, 4, 7, 6};
+    // Brace-initialize through vec_imm (vec declares constructors under ARMCLANGCOMPAT); returning t converts it to vec<uint64_t>.
+    const vec_imm<uint64_t> t = {1, 0, 3, 2, 5, 4, 7, 6};
    return t;
  }
  static inline vec<uint64_t> tbl0(){
-    const vec<uint64_t> t = {4, 5, 6, 7, 0, 1, 2, 3};
+    // Brace-initialize through vec_imm (vec declares constructors under ARMCLANGCOMPAT); returning t converts it to vec<uint64_t>.
+    const vec_imm<uint64_t> t = {4, 5, 6, 7, 0, 1, 2, 3};
    return t;
  }
  static inline vec<uint64_t> tbl1(){
-    const vec<uint64_t> t = {2, 3, 0, 1, 6, 7, 4, 5};
+    // Brace-initialize through vec_imm (vec declares constructors under ARMCLANGCOMPAT); returning t converts it to vec<uint64_t>.
+    const vec_imm<uint64_t> t = {2, 3, 0, 1, 6, 7, 4, 5};
    return t;
  }
  static inline vec<uint64_t> tbl_exch1a(){ // Exchange1
-    const vec<uint64_t> t = {0, 1, 4, 5, 2, 3, 6, 7};
+    // Brace-initialize through vec_imm (vec declares constructors under ARMCLANGCOMPAT); returning t converts it to vec<uint64_t>.
+    const vec_imm<uint64_t> t = {0, 1, 4, 5, 2, 3, 6, 7};
    return t;
  }
  static inline vec<uint64_t> tbl_exch1b(){ // Exchange1
-    const vec<uint64_t> t = {2, 3, 6, 7, 0, 1, 4, 5};
+    // Brace-initialize through vec_imm (vec declares constructors under ARMCLANGCOMPAT); returning t converts it to vec<uint64_t>.
+    const vec_imm<uint64_t> t = {2, 3, 6, 7, 0, 1, 4, 5};
    return t;
  }
  static inline vec<uint64_t> tbl_exch1c(){ // Exchange1
-    const vec<uint64_t> t = {4, 5, 0, 1, 6, 7, 2, 3};
+    // Brace-initialize through vec_imm (vec declares constructors under ARMCLANGCOMPAT); returning t converts it to vec<uint64_t>.
+    const vec_imm<uint64_t> t = {4, 5, 0, 1, 6, 7, 2, 3};
    return t;
  }
  static inline svbool_t pg_even(){return svzip1_b64(svptrue_b64(), svpfalse_b());}
--- a/Grid/simd/Grid_vector_types.h
+++ b/Grid/simd/Grid_vector_types.h
@@ -122,7 +122,7 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) {
    #if defined(A64FX) // VLA
      #pragma message("building for A64FX / SVE ACLE VLA")
      #if defined(ARMCLANGCOMPAT)
-        #pragma message("applying armclang fix")
+        #pragma message("applying armclang patch")
      #endif
      #include "Grid_a64fx-2.h"
    #endif
@@ -248,20 +248,36 @@ public:
  }

  #ifdef ARMCLANGCOMPAT
-  accelerator_inline Grid_simd &operator=(const Grid_simd &&rhs) {
-    svint8_t tmp = svld1(svptrue_b8(), (int8_t*)&(rhs.v));
-    svst1(svptrue_b8(), (int8_t*)this, tmp);
+    template <class S = Scalar_type>  // rvalue overload for non-complex scalars: enable_if removes it when S (default Scalar_type) is complex
+    accelerator_inline Grid_simd &operator=(const Grid_simd<typename std::enable_if<!is_complex<S>::value, S>::type, Vector_type> &&rhs) {
      //v = rhs.v;
+      svst1(svptrue_b8(), (Scalar_type*)this, svld1(svptrue_b8(), (Scalar_type*)&(rhs.v)));  // all-true-predicate load of rhs.v, stored at the address of this object, through Scalar_type pointers
      return *this;
    };

-  accelerator_inline Grid_simd &operator=(const Grid_simd &rhs) {
-    svint8_t tmp = svld1(svptrue_b8(), (int8_t*)&(rhs.v));
-    svst1(svptrue_b8(), (int8_t*)this, tmp);
+    template <class S = Scalar_type>  // lvalue overload for non-complex scalars. NOTE(review): a function template is never the copy assignment operator, so the implicit one may still be selected — confirm
+    accelerator_inline Grid_simd &operator=(const Grid_simd<typename std::enable_if<!is_complex<S>::value, S>::type, Vector_type> &rhs) {
      //v = rhs.v;
+      svst1(svptrue_b8(), (Scalar_type*)this, svld1(svptrue_b8(), (Scalar_type*)&(rhs.v)));  // all-true-predicate load of rhs.v, stored at the address of this object, through Scalar_type pointers
      return *this;
    };
+
+    template <class S = Scalar_type>  // rvalue overload for complex scalars: enable_if removes it when S (default Scalar_type) is not complex
+    accelerator_inline Grid_simd &operator=(const Grid_simd<typename std::enable_if<is_complex<S>::value, S>::type, Vector_type> &&rhs) {
+      // v = rhs.v, done as an all-true-predicate SVE load/store through int8_t pointers (presumably because svld1/svst1 have no complex-pointer overloads — confirm)
+      svst1(svptrue_b8(), (int8_t*)this, svld1(svptrue_b8(), (int8_t*)&(rhs.v)));
+      return *this;
+    };
+
+    template <class S = Scalar_type>  // lvalue overload for complex scalars. NOTE(review): a function template is never the copy assignment operator, so the implicit one may still be selected — confirm
+    accelerator_inline Grid_simd &operator=(const Grid_simd<typename std::enable_if<is_complex<S>::value, S>::type, Vector_type> &rhs) {
+      // v = rhs.v, done as an all-true-predicate SVE load/store through int8_t pointers (presumably because svld1/svst1 have no complex-pointer overloads — confirm)
+      svst1(svptrue_b8(), (int8_t*)this, svld1(svptrue_b8(), (int8_t*)&(rhs.v)));
+      return *this;
+    };
+
  #else
+
  accelerator_inline Grid_simd &operator=(const Grid_simd &&rhs) {
    v = rhs.v;
    return *this;
@@ -270,11 +286,24 @@ public:
    v = rhs.v;
    return *this;
  };  // faster than not declaring it and leaving to the compiler
+
  #endif

  accelerator Grid_simd() = default;
+
+  #ifdef ARMCLANGCOMPAT  // four enable_if-selected constructors (complex / non-complex, lvalue / rvalue); each one delegates to operator=
+    template <class S = Scalar_type>
+    accelerator_inline Grid_simd(const Grid_simd<typename std::enable_if<!is_complex<S>::value, S>::type, Vector_type> &rhs) { this->operator=(rhs); }
+    template <class S = Scalar_type>
+    accelerator_inline Grid_simd(const Grid_simd<typename std::enable_if<!is_complex<S>::value, S>::type, Vector_type> &&rhs) { this->operator=(rhs); }
+    template <class S = Scalar_type>
+    accelerator_inline Grid_simd(const Grid_simd<typename std::enable_if<is_complex<S>::value, S>::type, Vector_type> &rhs) { this->operator=(rhs); }
+    template <class S = Scalar_type>
+    accelerator_inline Grid_simd(const Grid_simd<typename std::enable_if<is_complex<S>::value, S>::type, Vector_type> &&rhs) { this->operator=(rhs); }
+  #else  // ARMCLANGCOMPAT not defined: the constructors below initialize v directly from rhs.v
    accelerator_inline Grid_simd(const Grid_simd &rhs) : v(rhs.v){};  // compiles in movaps
    accelerator_inline Grid_simd(const Grid_simd &&rhs) : v(rhs.v){};
+  #endif  // ARMCLANGCOMPAT
  accelerator_inline Grid_simd(const Real a) { vsplat(*this, Scalar_type(a)); };
  // Enable if complex type
  template <typename S = Scalar_type> accelerator_inline