1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-27 22:25:56 +01:00

NAMESPACE

This commit is contained in:
paboyle 2018-01-12 18:24:16 +00:00
parent 6ab744c720
commit ec89714cce

View File

@ -25,8 +25,8 @@
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
/* /*
@ -45,29 +45,29 @@
#include "Grid_generic_types.h" #include "Grid_generic_types.h"
#include <arm_neon.h> #include <arm_neon.h>
namespace Grid { NAMESPACE_BEGIN(Grid);
namespace Optimization { NAMESPACE_BEGIN(Optimization);
template<class vtype> template<class vtype>
union uconv { union uconv {
float32x4_t f; float32x4_t f;
vtype v; vtype v;
}; };
union u128f { union u128f {
float32x4_t v; float32x4_t v;
float f[4]; float f[4];
}; };
union u128d { union u128d {
float64x2_t v; float64x2_t v;
double f[2]; double f[2];
}; };
// half precision // half precision
union u128h { union u128h {
float16x8_t v; float16x8_t v;
uint16_t f[8]; uint16_t f[8];
}; };
struct Vsplat{ struct Vsplat{
//Complex float //Complex float
inline float32x4_t operator()(float a, float b){ inline float32x4_t operator()(float a, float b){
float tmp[4]={a,b,a,b}; float tmp[4]={a,b,a,b};
@ -90,9 +90,9 @@ namespace Optimization {
inline uint32x4_t operator()(Integer a){ inline uint32x4_t operator()(Integer a){
return vdupq_n_u32(a); return vdupq_n_u32(a);
} }
}; };
struct Vstore{ struct Vstore{
//Float //Float
inline void operator()(float32x4_t a, float* F){ inline void operator()(float32x4_t a, float* F){
vst1q_f32(F, a); vst1q_f32(F, a);
@ -106,9 +106,9 @@ namespace Optimization {
vst1q_u32(I, a); vst1q_u32(I, a);
} }
}; };
struct Vstream{ // N:equivalents to _mm_stream_p* in NEON? struct Vstream{ // N:equivalents to _mm_stream_p* in NEON?
//Float // N:generic //Float // N:generic
inline void operator()(float * a, float32x4_t b){ inline void operator()(float * a, float32x4_t b){
memcpy(a,&b,4*sizeof(float)); memcpy(a,&b,4*sizeof(float));
@ -117,13 +117,11 @@ namespace Optimization {
inline void operator()(double * a, float64x2_t b){ inline void operator()(double * a, float64x2_t b){
memcpy(a,&b,2*sizeof(double)); memcpy(a,&b,2*sizeof(double));
} }
};
// Nils: Vset untested; not used currently in Grid at all;
}; // git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b
struct Vset{
// Nils: Vset untested; not used currently in Grid at all;
// git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b
struct Vset{
// Complex float // Complex float
inline float32x4_t operator()(Grid::ComplexF *a){ inline float32x4_t operator()(Grid::ComplexF *a){
float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()}; float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()};
@ -148,10 +146,10 @@ namespace Optimization {
inline uint32x4_t operator()(Integer *a){ inline uint32x4_t operator()(Integer *a){
return vld1q_dup_u32(a); return vld1q_dup_u32(a);
} }
}; };
template <typename Out_type, typename In_type> template <typename Out_type, typename In_type>
struct Reduce{ struct Reduce{
//Need templated class to overload output type //Need templated class to overload output type
//General form must generate error if compiled //General form must generate error if compiled
inline Out_type operator()(In_type in){ inline Out_type operator()(In_type in){
@ -159,12 +157,12 @@ namespace Optimization {
exit(1); exit(1);
return 0; return 0;
} }
}; };
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// Arithmetic operations // Arithmetic operations
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
struct Sum{ struct Sum{
//Complex/Real float //Complex/Real float
inline float32x4_t operator()(float32x4_t a, float32x4_t b){ inline float32x4_t operator()(float32x4_t a, float32x4_t b){
return vaddq_f32(a,b); return vaddq_f32(a,b);
@ -177,9 +175,9 @@ namespace Optimization {
inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){ inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
return vaddq_u32(a,b); return vaddq_u32(a,b);
} }
}; };
struct Sub{ struct Sub{
//Complex/Real float //Complex/Real float
inline float32x4_t operator()(float32x4_t a, float32x4_t b){ inline float32x4_t operator()(float32x4_t a, float32x4_t b){
return vsubq_f32(a,b); return vsubq_f32(a,b);
@ -192,9 +190,9 @@ namespace Optimization {
inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){ inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
return vsubq_u32(a,b); return vsubq_u32(a,b);
} }
}; };
struct MultRealPart{ struct MultRealPart{
inline float32x4_t operator()(float32x4_t a, float32x4_t b){ inline float32x4_t operator()(float32x4_t a, float32x4_t b){
float32x4_t re = vtrn1q_f32(a, a); float32x4_t re = vtrn1q_f32(a, a);
return vmulq_f32(re, b); return vmulq_f32(re, b);
@ -203,9 +201,9 @@ namespace Optimization {
float64x2_t re = vzip1q_f64(a, a); float64x2_t re = vzip1q_f64(a, a);
return vmulq_f64(re, b); return vmulq_f64(re, b);
} }
}; };
struct MaddRealPart{ struct MaddRealPart{
inline float32x4_t operator()(float32x4_t a, float32x4_t b, float32x4_t c){ inline float32x4_t operator()(float32x4_t a, float32x4_t b, float32x4_t c){
float32x4_t re = vtrn1q_f32(a, a); float32x4_t re = vtrn1q_f32(a, a);
return vfmaq_f32(c, re, b); return vfmaq_f32(c, re, b);
@ -214,9 +212,9 @@ namespace Optimization {
float64x2_t re = vzip1q_f64(a, a); float64x2_t re = vzip1q_f64(a, a);
return vfmaq_f64(c, re, b); return vfmaq_f64(c, re, b);
} }
}; };
struct Div{ struct Div{
// Real float // Real float
inline float32x4_t operator()(float32x4_t a, float32x4_t b){ inline float32x4_t operator()(float32x4_t a, float32x4_t b){
return vdivq_f32(a, b); return vdivq_f32(a, b);
@ -225,9 +223,9 @@ namespace Optimization {
inline float64x2_t operator()(float64x2_t a, float64x2_t b){ inline float64x2_t operator()(float64x2_t a, float64x2_t b){
return vdivq_f64(a, b); return vdivq_f64(a, b);
} }
}; };
struct MultComplex{ struct MultComplex{
// Complex float // Complex float
inline float32x4_t operator()(float32x4_t a, float32x4_t b){ inline float32x4_t operator()(float32x4_t a, float32x4_t b){
@ -275,9 +273,9 @@ namespace Optimization {
// r5 = vmulq_f64(r0, a); // r5 = vmulq_f64(r0, a);
// return vaddq_f64(r4, r5); // return vaddq_f64(r4, r5);
} }
}; };
struct Mult{ struct Mult{
// Real float // Real float
inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){ inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){
//return vaddq_f32(vmulq_f32(b,c),a); //return vaddq_f32(vmulq_f32(b,c),a);
@ -298,9 +296,9 @@ namespace Optimization {
inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){ inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
return vmulq_u32(a,b); return vmulq_u32(a,b);
} }
}; };
struct Conj{ struct Conj{
// Complex single // Complex single
inline float32x4_t operator()(float32x4_t in){ inline float32x4_t operator()(float32x4_t in){
// ar ai br bi -> ar -ai br -bi // ar ai br bi -> ar -ai br -bi
@ -318,9 +316,9 @@ namespace Optimization {
return vextq_f64(r0, r1, 1); // ar -ai return vextq_f64(r0, r1, 1); // ar -ai
} }
// do not define for integer input // do not define for integer input
}; };
struct TimesMinusI{ struct TimesMinusI{
//Complex single //Complex single
inline float32x4_t operator()(float32x4_t in, float32x4_t ret){ inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
// ar ai br bi -> ai -ar ai -br // ar ai br bi -> ai -ar ai -br
@ -336,9 +334,9 @@ namespace Optimization {
tmp = vnegq_f64(in); tmp = vnegq_f64(in);
return vextq_f64(in, tmp, 1); return vextq_f64(in, tmp, 1);
} }
}; };
struct TimesI{ struct TimesI{
//Complex single //Complex single
inline float32x4_t operator()(float32x4_t in, float32x4_t ret){ inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
// ar ai br bi -> -ai ar -bi br // ar ai br bi -> -ai ar -bi br
@ -354,9 +352,9 @@ namespace Optimization {
tmp = vnegq_f64(in); tmp = vnegq_f64(in);
return vextq_f64(tmp, in, 1); return vextq_f64(tmp, in, 1);
} }
}; };
struct Permute{ struct Permute{
static inline float32x4_t Permute0(float32x4_t in){ // N:ok static inline float32x4_t Permute0(float32x4_t in){ // N:ok
// AB CD -> CD AB // AB CD -> CD AB
@ -387,9 +385,9 @@ namespace Optimization {
return in; return in;
}; };
}; };
struct Rotate{ struct Rotate{
static inline float32x4_t rotate(float32x4_t in,int n){ // N:ok static inline float32x4_t rotate(float32x4_t in,int n){ // N:ok
switch(n){ switch(n){
@ -423,9 +421,9 @@ namespace Optimization {
template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); }; template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); };
template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); }; template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); };
}; };
struct PrecisionChange { struct PrecisionChange {
static inline float16x8_t StoH (const float32x4_t &a,const float32x4_t &b) { static inline float16x8_t StoH (const float32x4_t &a,const float32x4_t &b) {
float16x4_t h = vcvt_f16_f32(a); float16x4_t h = vcvt_f16_f32(a);
@ -464,12 +462,12 @@ namespace Optimization {
StoD(s1, a, b); StoD(s1, a, b);
StoD(s2, c, d); StoD(s2, c, d);
} }
}; };
////////////////////////////////////////////// //////////////////////////////////////////////
// Exchange support // Exchange support
struct Exchange{ struct Exchange{
static inline void Exchange0(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){ static inline void Exchange0(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
// in1: ABCD -> out1: ABEF // in1: ABCD -> out1: ABEF
// in2: EFGH -> out2: CDGH // in2: EFGH -> out2: CDGH
@ -518,82 +516,80 @@ namespace Optimization {
assert(0); assert(0);
return; return;
}; };
}; };
////////////////////////////////////////////// //////////////////////////////////////////////
// Some Template specialization // Some Template specialization
//Complex float Reduce //Complex float Reduce
template<> template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, float32x4_t>::operator()(float32x4_t in){ inline Grid::ComplexF Reduce<Grid::ComplexF, float32x4_t>::operator()(float32x4_t in){
float32x4_t v1; // two complex float32x4_t v1; // two complex
v1 = Optimization::Permute::Permute0(in); v1 = Optimization::Permute::Permute0(in);
v1 = vaddq_f32(v1,in); v1 = vaddq_f32(v1,in);
u128f conv; conv.v=v1; u128f conv; conv.v=v1;
return Grid::ComplexF(conv.f[0],conv.f[1]); return Grid::ComplexF(conv.f[0],conv.f[1]);
} }
//Real float Reduce //Real float Reduce
template<> template<>
inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){ inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){
return vaddvq_f32(in); return vaddvq_f32(in);
} }
//Complex double Reduce //Complex double Reduce
template<> template<>
inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){ inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
u128d conv; conv.v = in; u128d conv; conv.v = in;
return Grid::ComplexD(conv.f[0],conv.f[1]); return Grid::ComplexD(conv.f[0],conv.f[1]);
}
//Real double Reduce
template<>
inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){
return vaddvq_f64(in);
}
//Integer Reduce
template<>
inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){
return vaddvq_u32(in);
}
} }
//Real double Reduce
template<>
inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){
return vaddvq_f64(in);
}
//Integer Reduce
template<>
inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){
return vaddvq_u32(in);
}
NAMESPACE_END(Optimization);
////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////
// Here assign types // Here assign types
// typedef Optimization::vech SIMD_Htype; // Reduced precision type // typedef Optimization::vech SIMD_Htype; // Reduced precision type
typedef float16x8_t SIMD_Htype; // Half precision type typedef float16x8_t SIMD_Htype; // Half precision type
typedef float32x4_t SIMD_Ftype; // Single precision type typedef float32x4_t SIMD_Ftype; // Single precision type
typedef float64x2_t SIMD_Dtype; // Double precision type typedef float64x2_t SIMD_Dtype; // Double precision type
typedef uint32x4_t SIMD_Itype; // Integer type typedef uint32x4_t SIMD_Itype; // Integer type
inline void v_prefetch0(int size, const char *ptr){}; // prefetch utilities inline void v_prefetch0(int size, const char *ptr){}; // prefetch utilities
inline void prefetch_HINT_T0(const char *ptr){}; inline void prefetch_HINT_T0(const char *ptr){};
// Function name aliases // Function name aliases
typedef Optimization::Vsplat VsplatSIMD; typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD; typedef Optimization::Vstore VstoreSIMD;
typedef Optimization::Vset VsetSIMD; typedef Optimization::Vset VsetSIMD;
typedef Optimization::Vstream VstreamSIMD; typedef Optimization::Vstream VstreamSIMD;
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>; template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
// Arithmetic operations
typedef Optimization::Sum SumSIMD;
typedef Optimization::Sub SubSIMD;
typedef Optimization::Div DivSIMD;
typedef Optimization::Mult MultSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::MaddRealPart MaddRealPartSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;
NAMESPACE_END(Grid);
// Arithmetic operations
typedef Optimization::Sum SumSIMD;
typedef Optimization::Sub SubSIMD;
typedef Optimization::Div DivSIMD;
typedef Optimization::Mult MultSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::MaddRealPart MaddRealPartSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;
}