1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-12 20:27:06 +01:00

Merge remote-tracking branch 'grid/develop' into feature/arm-neon

This commit is contained in:
Nils Meyer
2017-08-29 17:47:36 +02:00
165 changed files with 8491 additions and 5406 deletions

View File

@ -701,9 +701,28 @@ namespace Optimization {
//Integer Reduce
template<>
inline Integer Reduce<Integer, __m256i>::operator()(__m256i in){
// FIXME unimplemented
printf("Reduce : Missing integer implementation -> FIX\n");
assert(0);
__m128i ret;
#if defined (AVX2)
// AVX2 horizontal adds within upper and lower halves of register; use
// SSE to add upper and lower halves for result.
__m256i v1, v2;
__m128i u1, u2;
v1 = _mm256_hadd_epi32(in, in);
v2 = _mm256_hadd_epi32(v1, v1);
u1 = _mm256_castsi256_si128(v2); // upper half
u2 = _mm256_extracti128_si256(v2, 1); // lower half
ret = _mm_add_epi32(u1, u2);
#else
// No AVX horizontal add; extract upper and lower halves of register & use
// SSE intrinsics.
__m128i u1, u2, u3;
u1 = _mm256_extractf128_si256(in, 0); // upper half
u2 = _mm256_extractf128_si256(in, 1); // lower half
u3 = _mm_add_epi32(u1, u2);
u1 = _mm_hadd_epi32(u3, u3);
ret = _mm_hadd_epi32(u1, u1);
#endif
return _mm_cvtsi128_si32(ret);
}
}

View File

@ -543,6 +543,24 @@ namespace Optimization {
u512d conv; conv.v = v1;
return conv.f[0];
}
//Integer Reduce
template<>
inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
// No full vector reduce, use AVX to add upper and lower halves of register
// and perform AVX reduction.
__m256i v1, v2, v3;
__m128i u1, u2, ret;
v1 = _mm512_castsi512_si256(in); // upper half
v2 = _mm512_extracti32x8_epi32(in, 1); // lower half
v3 = _mm256_add_epi32(v1, v2);
v1 = _mm256_hadd_epi32(v3, v3);
v2 = _mm256_hadd_epi32(v1, v1);
u1 = _mm256_castsi256_si128(v2) // upper half
u2 = _mm256_extracti128_si256(v2, 1); // lower half
ret = _mm_add_epi32(u1, u2);
return _mm_cvtsi128_si32(ret);
}
#else
//Complex float Reduce
template<>
@ -570,9 +588,7 @@ namespace Optimization {
//Integer Reduce
template<>
inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
// FIXME unimplemented
printf("Reduce : Missing integer implementation -> FIX\n");
assert(0);
return _mm512_reduce_add_epi32(in);
}
#endif

View File

@ -401,9 +401,7 @@ namespace Optimization {
//Integer Reduce
template<>
inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
// FIXME unimplemented
printf("Reduce : Missing integer implementation -> FIX\n");
assert(0);
return _mm512_reduce_add_epi32(in);
}

View File

@ -596,3 +596,4 @@ namespace Optimization {
typedef Optimization::TimesI TimesISIMD;
}

View File

@ -374,6 +374,84 @@ namespace Optimization {
// Complex float
FLOAT_WRAP_2(operator(), inline)
};
#define USE_FP16
struct PrecisionChange {
static inline vech StoH (const vector4float &a, const vector4float &b) {
vech ret;
std::cout << GridLogError << "QPX single to half precision conversion not yet supported." << std::endl;
assert(0);
return ret;
}
static inline void HtoS (vech h, vector4float &sa, vector4float &sb) {
std::cout << GridLogError << "QPX half to single precision conversion not yet supported." << std::endl;
assert(0);
}
static inline vector4float DtoS (vector4double a, vector4double b) {
vector4float ret;
std::cout << GridLogError << "QPX double to single precision conversion not yet supported." << std::endl;
assert(0);
return ret;
}
static inline void StoD (vector4float s, vector4double &a, vector4double &b) {
std::cout << GridLogError << "QPX single to double precision conversion not yet supported." << std::endl;
assert(0);
}
static inline vech DtoH (vector4double a, vector4double b,
vector4double c, vector4double d) {
vech ret;
std::cout << GridLogError << "QPX double to half precision conversion not yet supported." << std::endl;
assert(0);
return ret;
}
static inline void HtoD (vech h, vector4double &a, vector4double &b,
vector4double &c, vector4double &d) {
std::cout << GridLogError << "QPX half to double precision conversion not yet supported." << std::endl;
assert(0);
}
};
//////////////////////////////////////////////
// Exchange support
#define FLOAT_WRAP_EXCHANGE(fn) \
static inline void fn(vector4float &out1, vector4float &out2, \
vector4float in1, vector4float in2) \
{ \
vector4double out1d, out2d, in1d, in2d; \
in1d = Vset()(in1); \
in2d = Vset()(in2); \
fn(out1d, out2d, in1d, in2d); \
Vstore()(out1d, out1); \
Vstore()(out2d, out2); \
}
struct Exchange{
// double precision
static inline void Exchange0(vector4double &out1, vector4double &out2,
vector4double in1, vector4double in2) {
out1 = vec_perm(in1, in2, vec_gpci(0145));
out2 = vec_perm(in1, in2, vec_gpci(02367));
}
static inline void Exchange1(vector4double &out1, vector4double &out2,
vector4double in1, vector4double in2) {
out1 = vec_perm(in1, in2, vec_gpci(0426));
out2 = vec_perm(in1, in2, vec_gpci(01537));
}
static inline void Exchange2(vector4double &out1, vector4double &out2,
vector4double in1, vector4double in2) {
assert(0);
}
static inline void Exchange3(vector4double &out1, vector4double &out2,
vector4double in1, vector4double in2) {
assert(0);
}
// single precision
FLOAT_WRAP_EXCHANGE(Exchange0);
FLOAT_WRAP_EXCHANGE(Exchange1);
FLOAT_WRAP_EXCHANGE(Exchange2);
FLOAT_WRAP_EXCHANGE(Exchange3);
};
struct Permute{
//Complex double
@ -497,15 +575,19 @@ namespace Optimization {
//Integer Reduce
template<>
inline Integer Reduce<Integer, int>::operator()(int in){
// FIXME unimplemented
printf("Reduce : Missing integer implementation -> FIX\n");
assert(0);
inline Integer Reduce<Integer, veci>::operator()(veci in){
Integer a = 0;
for (unsigned int i = 0; i < W<Integer>::r; ++i)
{
a += in.v[i];
}
return a;
}
}
////////////////////////////////////////////////////////////////////////////////
// Here assign types
typedef Optimization::vech SIMD_Htype; // Half precision type
typedef Optimization::vector4float SIMD_Ftype; // Single precision type
typedef vector4double SIMD_Dtype; // Double precision type
typedef Optimization::veci SIMD_Itype; // Integer type

View File

@ -570,9 +570,9 @@ namespace Optimization {
//Integer Reduce
template<>
inline Integer Reduce<Integer, __m128i>::operator()(__m128i in){
// FIXME unimplemented
printf("Reduce : Missing integer implementation -> FIX\n");
assert(0);
__m128i v1 = _mm_hadd_epi32(in, in);
__m128i v2 = _mm_hadd_epi32(v1, v1);
return _mm_cvtsi128_si32(v2);
}
}

View File

@ -53,7 +53,7 @@ directory
#if defined IMCI
#include "Grid_imci.h"
#endif
#ifdef NEONv8
#ifdef NEONV8
#include "Grid_neon.h"
#endif
#if defined QPX

View File

@ -179,13 +179,6 @@ inline Grid_simd<S, V> div(const Grid_simd<S, V> &r, Integer y) {
////////////////////////////////////////////////////////////////////////////
// Allows us to assign into **conformable** real vectors from complex
////////////////////////////////////////////////////////////////////////////
// template < class S, class V >
// inline auto ComplexRemove(const Grid_simd<S,V> &c) ->
// Grid_simd<Grid_simd<S,V>::Real,V> {
// Grid_simd<Grid_simd<S,V>::Real,V> ret;
// ret.v = c.v;
// return ret;
// }
template <class scalar>
struct AndFunctor {
scalar operator()(const scalar &x, const scalar &y) const { return x & y; }