mirror of https://github.com/paboyle/Grid.git synced 2024-09-20 09:15:38 +01:00

Apply the VLA patch suggested by Arm (CAS-162542-D6W7Z7) to reduce memcpy calls

commit 2111052fbe
parent 433766ac62
Author: nmeyer-ur
Date:   2020-06-12 14:49:19 +02:00
2 changed files with 87 additions and 24 deletions
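
Under ARMCLANGCOMPAT the patch gives the fixed-size SVE wrapper types user-defined copy operations built on predicated svld1/svst1 pairs, so that armclang does not lower their copies to memcpy calls. Below is a minimal standalone sketch of that pattern, not Grid code: demo_vec and W_BYTES are illustrative names, and the sketch assumes the wrapper size equals the hardware SVE vector length, as Grid arranges via GEN_SIMD_WIDTH on A64FX.

#include <arm_sve.h>

static constexpr unsigned W_BYTES = 64;   // assumed: bytes per SVE register (A64FX: 512-bit)

struct alignas(W_BYTES) demo_vec {
  double v[W_BYTES / sizeof(double)];

  demo_vec() = default;
  demo_vec(const demo_vec &rhs) { *this = rhs; }

  demo_vec &operator=(const demo_vec &rhs) {
    // One predicated byte-granularity load/store pair instead of the default
    // member-wise copy, which armclang may otherwise lower to a memcpy call.
    svst1(svptrue_b8(), (int8_t *)this,
          svld1(svptrue_b8(), (const int8_t *)&rhs));
    return *this;
  }
};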


@@ -57,11 +57,39 @@ NAMESPACE_BEGIN(Optimization);
constexpr static unsigned int r = GEN_SIMD_WIDTH/8u;
};
#ifdef ARMCLANGCOMPAT
// SIMD vector immediate types
template <typename T>
struct vec_imm {
alignas(GEN_SIMD_WIDTH) T v[W<T>::r];
};
// SIMD vector types
template <typename T>
struct vec {
alignas(GEN_SIMD_WIDTH) T v[W<T>::r];
vec() = default;
vec(const vec &rhs) { this->operator=(rhs); }
vec(const vec_imm<T> &rhs) {
// v = rhs.v
svst1(svptrue_b8(), (T*)this, svld1(svptrue_b8(), (T*)rhs.v));
}
inline vec &operator=(const vec &rhs) {
// v = rhs.v
svst1(svptrue_b8(), (T*)this, svld1(svptrue_b8(), (T*)rhs.v));
return *this;
};
};
#else // ARMCLANGCOMPAT not defined
#define vec_imm vec
// SIMD vector types
template <typename T>
struct vec {
alignas(GEN_SIMD_WIDTH) T v[W<T>::r];
};
#endif
typedef vec<float> vecf;
typedef vec<double> vecd;
@@ -91,27 +119,33 @@ struct acle<double>{
static inline svbool_t pg2(){return svptrue_pat_b64(SV_VL4);}
static inline svbool_t pg4(){return svptrue_pat_b64(SV_VL2);}
static inline vec<uint64_t> tbl_swap(){
const vec<uint64_t> t = {1, 0, 3, 2, 5, 4, 7, 6};
//const vec<uint64_t> t = {1, 0, 3, 2, 5, 4, 7, 6};
const vec_imm<uint64_t> t = {1, 0, 3, 2, 5, 4, 7, 6};
return t;
}
static inline vec<uint64_t> tbl0(){
const vec<uint64_t> t = {4, 5, 6, 7, 0, 1, 2, 3};
//const vec<uint64_t> t = {4, 5, 6, 7, 0, 1, 2, 3};
const vec_imm<uint64_t> t = {4, 5, 6, 7, 0, 1, 2, 3};
return t;
}
static inline vec<uint64_t> tbl1(){
const vec<uint64_t> t = {2, 3, 0, 1, 6, 7, 4, 5};
//const vec<uint64_t> t = {2, 3, 0, 1, 6, 7, 4, 5};
const vec_imm<uint64_t> t = {2, 3, 0, 1, 6, 7, 4, 5};
return t;
}
static inline vec<uint64_t> tbl_exch1a(){ // Exchange1
const vec<uint64_t> t = {0, 1, 4, 5, 2, 3, 6, 7};
//const vec<uint64_t> t = {0, 1, 4, 5, 2, 3, 6, 7};
const vec_imm<uint64_t> t = {0, 1, 4, 5, 2, 3, 6, 7};
return t;
}
static inline vec<uint64_t> tbl_exch1b(){ // Exchange1
const vec<uint64_t> t = {2, 3, 6, 7, 0, 1, 4, 5};
//const vec<uint64_t> t = {2, 3, 6, 7, 0, 1, 4, 5};
const vec_imm<uint64_t> t = {2, 3, 6, 7, 0, 1, 4, 5};
return t;
}
static inline vec<uint64_t> tbl_exch1c(){ // Exchange1
const vec<uint64_t> t = {4, 5, 0, 1, 6, 7, 2, 3};
//const vec<uint64_t> t = {4, 5, 0, 1, 6, 7, 2, 3};
const vec_imm<uint64_t> t = {4, 5, 0, 1, 6, 7, 2, 3};
return t;
}
static inline svbool_t pg_even(){return svzip1_b64(svptrue_b64(), svpfalse_b());}
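
The vec_imm helper above exists because a vec with user-defined copy operations is no longer an aggregate, so the brace-initialized index tables cannot initialize vec directly under ARMCLANGCOMPAT; they initialize the still-aggregate vec_imm and reach vec through its converting constructor. A short illustration, reusing the vec and vec_imm definitions from the hunk above (variable names are illustrative):

// Brace initialization targets the aggregate vec_imm ...
const vec_imm<uint64_t> idx = {1, 0, 3, 2, 5, 4, 7, 6};
// ... and vec's converting constructor copies it with svld1/svst1,
// so the copy stays in SVE loads/stores rather than a memcpy call.
vec<uint64_t> tbl = idx;
// Without ARMCLANGCOMPAT, vec_imm is #define'd to vec and the original
// direct brace initialization is unchanged.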


@@ -122,7 +122,7 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) {
#if defined(A64FX) // VLA
#pragma message("building for A64FX / SVE ACLE VLA")
#if defined(ARMCLANGCOMPAT)
#pragma message("applying armclang fix")
#pragma message("applying armclang patch")
#endif
#include "Grid_a64fx-2.h"
#endif
@@ -247,21 +247,37 @@ public:
return sizeof(Vector_type) / sizeof(Scalar_type);
}
#ifdef ARMCLANGCOMPAT
accelerator_inline Grid_simd &operator=(const Grid_simd &&rhs) {
svint8_t tmp = svld1(svptrue_b8(), (int8_t*)&(rhs.v));
svst1(svptrue_b8(), (int8_t*)this, tmp);
#ifdef ARMCLANGCOMPAT
template <class S = Scalar_type>
accelerator_inline Grid_simd &operator=(const Grid_simd<typename std::enable_if<!is_complex<S>::value, S>::type, Vector_type> &&rhs) {
//v = rhs.v;
svst1(svptrue_b8(), (Scalar_type*)this, svld1(svptrue_b8(), (Scalar_type*)&(rhs.v)));
return *this;
};
accelerator_inline Grid_simd &operator=(const Grid_simd &rhs) {
svint8_t tmp = svld1(svptrue_b8(), (int8_t*)&(rhs.v));
svst1(svptrue_b8(), (int8_t*)this, tmp);
template <class S = Scalar_type>
accelerator_inline Grid_simd &operator=(const Grid_simd<typename std::enable_if<!is_complex<S>::value, S>::type, Vector_type> &rhs) {
//v = rhs.v;
svst1(svptrue_b8(), (Scalar_type*)this, svld1(svptrue_b8(), (Scalar_type*)&(rhs.v)));
return *this;
};
#else
template <class S = Scalar_type>
accelerator_inline Grid_simd &operator=(const Grid_simd<typename std::enable_if<is_complex<S>::value, S>::type, Vector_type> &&rhs) {
//v = rhs.v;
svst1(svptrue_b8(), (int8_t*)this, svld1(svptrue_b8(), (int8_t*)&(rhs.v)));
return *this;
};
template <class S = Scalar_type>
accelerator_inline Grid_simd &operator=(const Grid_simd<typename std::enable_if<is_complex<S>::value, S>::type, Vector_type> &rhs) {
//v = rhs.v;
svst1(svptrue_b8(), (int8_t*)this, svld1(svptrue_b8(), (int8_t*)&(rhs.v)));
return *this;
};
#else
accelerator_inline Grid_simd &operator=(const Grid_simd &&rhs) {
v = rhs.v;
return *this;
@@ -270,11 +286,24 @@ public:
v = rhs.v;
return *this;
}; // faster than not declaring it and leaving to the compiler
#endif
#endif
accelerator Grid_simd() = default;
#ifdef ARMCLANGCOMPAT
template <class S = Scalar_type>
accelerator_inline Grid_simd(const Grid_simd<typename std::enable_if<!is_complex<S>::value, S>::type, Vector_type> &rhs) { this->operator=(rhs); }
template <class S = Scalar_type>
accelerator_inline Grid_simd(const Grid_simd<typename std::enable_if<!is_complex<S>::value, S>::type, Vector_type> &&rhs) { this->operator=(rhs); }
template <class S = Scalar_type>
accelerator_inline Grid_simd(const Grid_simd<typename std::enable_if<is_complex<S>::value, S>::type, Vector_type> &rhs) { this->operator=(rhs); }
template <class S = Scalar_type>
accelerator_inline Grid_simd(const Grid_simd<typename std::enable_if<is_complex<S>::value, S>::type, Vector_type> &&rhs) { this->operator=(rhs); }
#else
accelerator_inline Grid_simd(const Grid_simd &rhs) : v(rhs.v){}; // compiles in movaps
accelerator_inline Grid_simd(const Grid_simd &&rhs) : v(rhs.v){};
#endif
accelerator_inline Grid_simd(const Real a) { vsplat(*this, Scalar_type(a)); };
// Enable if complex type
template <typename S = Scalar_type> accelerator_inline