From 2111052fbe5b9623ca8033e9f04f9b5d5efd4461 Mon Sep 17 00:00:00 2001
From: nmeyer-ur <nils.meyer@ur.de>
Date: Fri, 12 Jun 2020 14:49:19 +0200
Subject: [PATCH] apply VLA patch for memcpy reduction suggested by Arm,
 CAS-162542-D6W7Z7

---
 Grid/simd/Grid_a64fx-2.h      | 46 +++++++++++++++++++++----
 Grid/simd/Grid_vector_types.h | 65 +++++++++++++++++++++++++----------
 2 files changed, 87 insertions(+), 24 deletions(-)
diff --git a/Grid/simd/Grid_a64fx-2.h b/Grid/simd/Grid_a64fx-2.h
index a0463a10..6f7229ec 100644
--- a/Grid/simd/Grid_a64fx-2.h
+++ b/Grid/simd/Grid_a64fx-2.h
@@ -57,11 +57,39 @@ NAMESPACE_BEGIN(Optimization);
     constexpr static unsigned int r = GEN_SIMD_WIDTH/8u;
   };
 
+  #ifdef ARMCLANGCOMPAT
+  // Constant-holder type: a plain aggregate with the same storage as vec, so it can be brace-initialized
+  template <typename T>
+  struct vec_imm {
+    alignas(GEN_SIMD_WIDTH) T v[W<T>::r];
+  };
+
+  // SIMD vector type (ARMCLANGCOMPAT variant): copies are spelled out as SVE load/store calls
+  template <typename T>
+  struct vec {
+    alignas(GEN_SIMD_WIDTH) T v[W<T>::r];
+    vec() = default;
+    vec(const vec &rhs) { this->operator=(rhs); }  // copy-construct by reusing operator= below
+    vec(const vec_imm<T> &rhs) {  // non-explicit: lets a vec_imm constant be returned as a vec
+      // v = rhs.v, expressed as an svld1/svst1 pair under an all-true predicate
+      svst1(svptrue_b8(), (T*)this, svld1(svptrue_b8(), (T*)rhs.v));
+    }
+
+    inline vec &operator=(const vec &rhs) {
+      // v = rhs.v, expressed as an svld1/svst1 pair under an all-true predicate
+      svst1(svptrue_b8(), (T*)this, svld1(svptrue_b8(), (T*)rhs.v));
+      return *this;
+    };
+  };
+
+  #else // ARMCLANGCOMPAT not defined
+  #define vec_imm vec  // vec is itself an aggregate in this branch, so no separate constant type is needed
   // SIMD vector types
   template <typename T>
   struct vec {
     alignas(GEN_SIMD_WIDTH) T v[W<T>::r];
   };
+  #endif
 
   typedef vec<float>     vecf;
   typedef vec<double>    vecd;
@@ -91,27 +119,33 @@ struct acle<double>{
   static inline svbool_t pg2(){return svptrue_pat_b64(SV_VL4);}
   static inline svbool_t pg4(){return svptrue_pat_b64(SV_VL2);}
   static inline vec<uint64_t> tbl_swap(){
-    const vec<uint64_t> t = {1, 0, 3, 2, 5, 4, 7, 6};
+    // Table constant is built as vec_imm so brace-init works in both ARMCLANGCOMPAT and default builds.
+    const vec_imm<uint64_t> t = {1, 0, 3, 2, 5, 4, 7, 6};
     return t;
   }
   static inline vec<uint64_t> tbl0(){
-    const vec<uint64_t> t = {4, 5, 6, 7, 0, 1, 2, 3};
+    // Table constant is built as vec_imm so brace-init works in both ARMCLANGCOMPAT and default builds.
+    const vec_imm<uint64_t> t = {4, 5, 6, 7, 0, 1, 2, 3};
     return t;
   }
   static inline vec<uint64_t> tbl1(){
-    const vec<uint64_t> t = {2, 3, 0, 1, 6, 7, 4, 5};
+    // Table constant is built as vec_imm so brace-init works in both ARMCLANGCOMPAT and default builds.
+    const vec_imm<uint64_t> t = {2, 3, 0, 1, 6, 7, 4, 5};
     return t;
   }
   static inline vec<uint64_t> tbl_exch1a(){ // Exchange1
-    const vec<uint64_t> t = {0, 1, 4, 5, 2, 3, 6, 7};
+    // Table constant is built as vec_imm so brace-init works in both ARMCLANGCOMPAT and default builds.
+    const vec_imm<uint64_t> t = {0, 1, 4, 5, 2, 3, 6, 7};
     return t;
   }
   static inline vec<uint64_t> tbl_exch1b(){ // Exchange1
-    const vec<uint64_t> t = {2, 3, 6, 7, 0, 1, 4, 5};
+    // Table constant is built as vec_imm so brace-init works in both ARMCLANGCOMPAT and default builds.
+    const vec_imm<uint64_t> t = {2, 3, 6, 7, 0, 1, 4, 5};
     return t;
   }
   static inline vec<uint64_t> tbl_exch1c(){ // Exchange1
-    const vec<uint64_t> t = {4, 5, 0, 1, 6, 7, 2, 3};
+    // Table constant is built as vec_imm so brace-init works in both ARMCLANGCOMPAT and default builds.
+    const vec_imm<uint64_t> t = {4, 5, 0, 1, 6, 7, 2, 3};
     return t;
   }
   static inline svbool_t pg_even(){return svzip1_b64(svptrue_b64(), svpfalse_b());}
diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h
index e1eb330d..527fde18 100644
--- a/Grid/simd/Grid_vector_types.h
+++ b/Grid/simd/Grid_vector_types.h
@@ -122,7 +122,7 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) {
     #if defined(A64FX) // VLA
       #pragma message("building for A64FX / SVE ACLE VLA")
       #if defined(ARMCLANGCOMPAT)
-        #pragma message("applying armclang fix")
+        #pragma message("applying armclang patch")
       #endif
       #include "Grid_a64fx-2.h"
     #endif
@@ -247,21 +247,37 @@ public:
     return sizeof(Vector_type) / sizeof(Scalar_type);
   }
 
-#ifdef ARMCLANGCOMPAT
-  accelerator_inline Grid_simd &operator=(const Grid_simd &&rhs) {
-    svint8_t tmp = svld1(svptrue_b8(), (int8_t*)&(rhs.v));
-    svst1(svptrue_b8(), (int8_t*)this, tmp);
-    //v = rhs.v;
-    return *this;
-  };
+  #ifdef ARMCLANGCOMPAT
+    template <class S = Scalar_type>
+    accelerator_inline Grid_simd &operator=(const Grid_simd<typename std::enable_if<!is_complex<S>::value, S>::type, Vector_type> &&rhs) {
+      // Non-complex Scalar_type, rvalue source: v = rhs.v via svld1/svst1 through Scalar_type pointers.
+      svst1(svptrue_b8(), (Scalar_type*)this, svld1(svptrue_b8(), (Scalar_type*)&(rhs.v)));
+      return *this;
+    };
+
+    template <class S = Scalar_type>  // NOTE(review): a template is never the copy-assignment operator, so the implicit one is still declared; confirm this overload is actually selected
+    accelerator_inline Grid_simd &operator=(const Grid_simd<typename std::enable_if<!is_complex<S>::value, S>::type, Vector_type> &rhs) {
+      // Non-complex Scalar_type, lvalue source: v = rhs.v via svld1/svst1 through Scalar_type pointers.
+      svst1(svptrue_b8(), (Scalar_type*)this, svld1(svptrue_b8(), (Scalar_type*)&(rhs.v)));
+      return *this;
+    };
+
+    template <class S = Scalar_type>
+    accelerator_inline Grid_simd &operator=(const Grid_simd<typename std::enable_if<is_complex<S>::value, S>::type, Vector_type> &&rhs) {
+      // Complex Scalar_type, rvalue source: v = rhs.v copied through int8_t pointers (presumably because svld1/svst1 take no complex element type; confirm).
+      svst1(svptrue_b8(), (int8_t*)this, svld1(svptrue_b8(), (int8_t*)&(rhs.v)));
+      return *this;
+    };
+
+    template <class S = Scalar_type>
+    accelerator_inline Grid_simd &operator=(const Grid_simd<typename std::enable_if<is_complex<S>::value, S>::type, Vector_type> &rhs) {
+      // Complex Scalar_type, lvalue source: v = rhs.v copied through int8_t pointers (presumably because svld1/svst1 take no complex element type; confirm).
+      svst1(svptrue_b8(), (int8_t*)this, svld1(svptrue_b8(), (int8_t*)&(rhs.v)));
+      return *this;
+    };
+
+  #else
 
-  accelerator_inline Grid_simd &operator=(const Grid_simd &rhs) {
-    svint8_t tmp = svld1(svptrue_b8(), (int8_t*)&(rhs.v));
-    svst1(svptrue_b8(), (int8_t*)this, tmp);
-    //v = rhs.v;
-    return *this;
-  };
-#else
   accelerator_inline Grid_simd &operator=(const Grid_simd &&rhs) {
     v = rhs.v;
     return *this;
@@ -270,11 +286,24 @@ public:
     v = rhs.v;
     return *this;
   };  // faster than not declaring it and leaving to the compiler
-#endif
+
+  #endif
 
   accelerator Grid_simd() = default;
-  accelerator_inline Grid_simd(const Grid_simd &rhs) : v(rhs.v){};  // compiles in movaps
-  accelerator_inline Grid_simd(const Grid_simd &&rhs) : v(rhs.v){};
+
+  #ifdef ARMCLANGCOMPAT  // construction from another Grid_simd forwards to operator= instead of member-initializing v
+    template <class S = Scalar_type>
+    accelerator_inline Grid_simd(const Grid_simd<typename std::enable_if<!is_complex<S>::value, S>::type, Vector_type> &rhs) { this->operator=(rhs); }  // non-complex, lvalue source
+    template <class S = Scalar_type>
+    accelerator_inline Grid_simd(const Grid_simd<typename std::enable_if<!is_complex<S>::value, S>::type, Vector_type> &&rhs) { this->operator=(rhs); }  // non-complex, rvalue source; rhs is passed on as an lvalue
+    template <class S = Scalar_type>
+    accelerator_inline Grid_simd(const Grid_simd<typename std::enable_if<is_complex<S>::value, S>::type, Vector_type> &rhs) { this->operator=(rhs); }  // complex, lvalue source
+    template <class S = Scalar_type>
+    accelerator_inline Grid_simd(const Grid_simd<typename std::enable_if<is_complex<S>::value, S>::type, Vector_type> &&rhs) { this->operator=(rhs); }  // complex, rvalue source; rhs is passed on as an lvalue
+  #else  // default build: copy v directly in the member-initializer list
+    accelerator_inline Grid_simd(const Grid_simd &rhs) : v(rhs.v){};  // compiles in movaps
+    accelerator_inline Grid_simd(const Grid_simd &&rhs) : v(rhs.v){};  // const rvalue source: also copies v
+  #endif
   accelerator_inline Grid_simd(const Real a) { vsplat(*this, Scalar_type(a)); };
   // Enable if complex type
   template <typename S = Scalar_type> accelerator_inline