mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
NAMESPACE
This commit is contained in:
parent
6ab744c720
commit
ec89714cce
@ -25,8 +25,8 @@
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
/*
|
||||
|
||||
@ -45,29 +45,29 @@
|
||||
#include "Grid_generic_types.h"
|
||||
#include <arm_neon.h>
|
||||
|
||||
namespace Grid {
|
||||
namespace Optimization {
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
NAMESPACE_BEGIN(Optimization);
|
||||
|
||||
template<class vtype>
|
||||
union uconv {
|
||||
template<class vtype>
|
||||
union uconv {
|
||||
float32x4_t f;
|
||||
vtype v;
|
||||
};
|
||||
union u128f {
|
||||
};
|
||||
union u128f {
|
||||
float32x4_t v;
|
||||
float f[4];
|
||||
};
|
||||
union u128d {
|
||||
};
|
||||
union u128d {
|
||||
float64x2_t v;
|
||||
double f[2];
|
||||
};
|
||||
// half precision
|
||||
union u128h {
|
||||
};
|
||||
// half precision
|
||||
union u128h {
|
||||
float16x8_t v;
|
||||
uint16_t f[8];
|
||||
};
|
||||
};
|
||||
|
||||
struct Vsplat{
|
||||
struct Vsplat{
|
||||
//Complex float
|
||||
inline float32x4_t operator()(float a, float b){
|
||||
float tmp[4]={a,b,a,b};
|
||||
@ -90,9 +90,9 @@ namespace Optimization {
|
||||
inline uint32x4_t operator()(Integer a){
|
||||
return vdupq_n_u32(a);
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
struct Vstore{
|
||||
struct Vstore{
|
||||
//Float
|
||||
inline void operator()(float32x4_t a, float* F){
|
||||
vst1q_f32(F, a);
|
||||
@ -106,9 +106,9 @@ namespace Optimization {
|
||||
vst1q_u32(I, a);
|
||||
}
|
||||
|
||||
};
|
||||
};
|
||||
|
||||
struct Vstream{ // N:equivalents to _mm_stream_p* in NEON?
|
||||
struct Vstream{ // N:equivalents to _mm_stream_p* in NEON?
|
||||
//Float // N:generic
|
||||
inline void operator()(float * a, float32x4_t b){
|
||||
memcpy(a,&b,4*sizeof(float));
|
||||
@ -117,13 +117,11 @@ namespace Optimization {
|
||||
inline void operator()(double * a, float64x2_t b){
|
||||
memcpy(a,&b,2*sizeof(double));
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
};
|
||||
|
||||
// Nils: Vset untested; not used currently in Grid at all;
|
||||
// git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b
|
||||
struct Vset{
|
||||
// Nils: Vset untested; not used currently in Grid at all;
|
||||
// git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b
|
||||
struct Vset{
|
||||
// Complex float
|
||||
inline float32x4_t operator()(Grid::ComplexF *a){
|
||||
float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()};
|
||||
@ -148,10 +146,10 @@ namespace Optimization {
|
||||
inline uint32x4_t operator()(Integer *a){
|
||||
return vld1q_dup_u32(a);
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
template <typename Out_type, typename In_type>
|
||||
struct Reduce{
|
||||
template <typename Out_type, typename In_type>
|
||||
struct Reduce{
|
||||
//Need templated class to overload output type
|
||||
//General form must generate error if compiled
|
||||
inline Out_type operator()(In_type in){
|
||||
@ -159,12 +157,12 @@ namespace Optimization {
|
||||
exit(1);
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////
|
||||
// Arithmetic operations
|
||||
/////////////////////////////////////////////////////
|
||||
struct Sum{
|
||||
/////////////////////////////////////////////////////
|
||||
// Arithmetic operations
|
||||
/////////////////////////////////////////////////////
|
||||
struct Sum{
|
||||
//Complex/Real float
|
||||
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
|
||||
return vaddq_f32(a,b);
|
||||
@ -177,9 +175,9 @@ namespace Optimization {
|
||||
inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
|
||||
return vaddq_u32(a,b);
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
struct Sub{
|
||||
struct Sub{
|
||||
//Complex/Real float
|
||||
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
|
||||
return vsubq_f32(a,b);
|
||||
@ -192,9 +190,9 @@ namespace Optimization {
|
||||
inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
|
||||
return vsubq_u32(a,b);
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
struct MultRealPart{
|
||||
struct MultRealPart{
|
||||
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
|
||||
float32x4_t re = vtrn1q_f32(a, a);
|
||||
return vmulq_f32(re, b);
|
||||
@ -203,9 +201,9 @@ namespace Optimization {
|
||||
float64x2_t re = vzip1q_f64(a, a);
|
||||
return vmulq_f64(re, b);
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
struct MaddRealPart{
|
||||
struct MaddRealPart{
|
||||
inline float32x4_t operator()(float32x4_t a, float32x4_t b, float32x4_t c){
|
||||
float32x4_t re = vtrn1q_f32(a, a);
|
||||
return vfmaq_f32(c, re, b);
|
||||
@ -214,9 +212,9 @@ namespace Optimization {
|
||||
float64x2_t re = vzip1q_f64(a, a);
|
||||
return vfmaq_f64(c, re, b);
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
struct Div{
|
||||
struct Div{
|
||||
// Real float
|
||||
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
|
||||
return vdivq_f32(a, b);
|
||||
@ -225,9 +223,9 @@ namespace Optimization {
|
||||
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
|
||||
return vdivq_f64(a, b);
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
struct MultComplex{
|
||||
struct MultComplex{
|
||||
// Complex float
|
||||
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
|
||||
|
||||
@ -275,9 +273,9 @@ namespace Optimization {
|
||||
// r5 = vmulq_f64(r0, a);
|
||||
// return vaddq_f64(r4, r5);
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
struct Mult{
|
||||
struct Mult{
|
||||
// Real float
|
||||
inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){
|
||||
//return vaddq_f32(vmulq_f32(b,c),a);
|
||||
@ -298,9 +296,9 @@ namespace Optimization {
|
||||
inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
|
||||
return vmulq_u32(a,b);
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
struct Conj{
|
||||
struct Conj{
|
||||
// Complex single
|
||||
inline float32x4_t operator()(float32x4_t in){
|
||||
// ar ai br bi -> ar -ai br -bi
|
||||
@ -318,9 +316,9 @@ namespace Optimization {
|
||||
return vextq_f64(r0, r1, 1); // ar -ai
|
||||
}
|
||||
// do not define for integer input
|
||||
};
|
||||
};
|
||||
|
||||
struct TimesMinusI{
|
||||
struct TimesMinusI{
|
||||
//Complex single
|
||||
inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
|
||||
// ar ai br bi -> ai -ar bi -br
|
||||
@ -336,9 +334,9 @@ namespace Optimization {
|
||||
tmp = vnegq_f64(in);
|
||||
return vextq_f64(in, tmp, 1);
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
struct TimesI{
|
||||
struct TimesI{
|
||||
//Complex single
|
||||
inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
|
||||
// ar ai br bi -> -ai ar -bi br
|
||||
@ -354,9 +352,9 @@ namespace Optimization {
|
||||
tmp = vnegq_f64(in);
|
||||
return vextq_f64(tmp, in, 1);
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
struct Permute{
|
||||
struct Permute{
|
||||
|
||||
static inline float32x4_t Permute0(float32x4_t in){ // N:ok
|
||||
// AB CD -> CD AB
|
||||
@ -387,9 +385,9 @@ namespace Optimization {
|
||||
return in;
|
||||
};
|
||||
|
||||
};
|
||||
};
|
||||
|
||||
struct Rotate{
|
||||
struct Rotate{
|
||||
|
||||
static inline float32x4_t rotate(float32x4_t in,int n){ // N:ok
|
||||
switch(n){
|
||||
@ -423,9 +421,9 @@ namespace Optimization {
|
||||
template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); };
|
||||
template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); };
|
||||
|
||||
};
|
||||
};
|
||||
|
||||
struct PrecisionChange {
|
||||
struct PrecisionChange {
|
||||
|
||||
static inline float16x8_t StoH (const float32x4_t &a,const float32x4_t &b) {
|
||||
float16x4_t h = vcvt_f16_f32(a);
|
||||
@ -464,12 +462,12 @@ namespace Optimization {
|
||||
StoD(s1, a, b);
|
||||
StoD(s2, c, d);
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////
|
||||
// Exchange support
|
||||
//////////////////////////////////////////////
|
||||
// Exchange support
|
||||
|
||||
struct Exchange{
|
||||
struct Exchange{
|
||||
static inline void Exchange0(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
|
||||
// in1: ABCD -> out1: ABEF
|
||||
// in2: EFGH -> out2: CDGH
|
||||
@ -518,82 +516,80 @@ namespace Optimization {
|
||||
assert(0);
|
||||
return;
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////
|
||||
// Some Template specialization
|
||||
//////////////////////////////////////////////
|
||||
// Some Template specialization
|
||||
|
||||
|
||||
//Complex float Reduce
|
||||
template<>
|
||||
inline Grid::ComplexF Reduce<Grid::ComplexF, float32x4_t>::operator()(float32x4_t in){
|
||||
//Complex float Reduce
|
||||
template<>
|
||||
inline Grid::ComplexF Reduce<Grid::ComplexF, float32x4_t>::operator()(float32x4_t in){
|
||||
float32x4_t v1; // two complex
|
||||
v1 = Optimization::Permute::Permute0(in);
|
||||
v1 = vaddq_f32(v1,in);
|
||||
u128f conv; conv.v=v1;
|
||||
return Grid::ComplexF(conv.f[0],conv.f[1]);
|
||||
}
|
||||
//Real float Reduce
|
||||
template<>
|
||||
inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){
|
||||
}
|
||||
//Real float Reduce
|
||||
template<>
|
||||
inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){
|
||||
return vaddvq_f32(in);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//Complex double Reduce
|
||||
template<>
|
||||
inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
|
||||
//Complex double Reduce
|
||||
template<>
|
||||
inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
|
||||
u128d conv; conv.v = in;
|
||||
return Grid::ComplexD(conv.f[0],conv.f[1]);
|
||||
}
|
||||
|
||||
//Real double Reduce
|
||||
template<>
|
||||
inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){
|
||||
return vaddvq_f64(in);
|
||||
}
|
||||
|
||||
//Integer Reduce
|
||||
template<>
|
||||
inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){
|
||||
return vaddvq_u32(in);
|
||||
}
|
||||
}
|
||||
|
||||
//Real double Reduce
|
||||
template<>
|
||||
inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){
|
||||
return vaddvq_f64(in);
|
||||
}
|
||||
|
||||
//Integer Reduce
|
||||
template<>
|
||||
inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){
|
||||
return vaddvq_u32(in);
|
||||
}
|
||||
|
||||
NAMESPACE_END(Optimization);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
// Here assign types
|
||||
|
||||
// typedef Optimization::vech SIMD_Htype; // Reduced precision type
|
||||
typedef float16x8_t SIMD_Htype; // Half precision type
|
||||
typedef float32x4_t SIMD_Ftype; // Single precision type
|
||||
typedef float64x2_t SIMD_Dtype; // Double precision type
|
||||
typedef uint32x4_t SIMD_Itype; // Integer type
|
||||
typedef float16x8_t SIMD_Htype; // Half precision type
|
||||
typedef float32x4_t SIMD_Ftype; // Single precision type
|
||||
typedef float64x2_t SIMD_Dtype; // Double precision type
|
||||
typedef uint32x4_t SIMD_Itype; // Integer type
|
||||
|
||||
inline void v_prefetch0(int size, const char *ptr){}; // prefetch utilities
|
||||
inline void prefetch_HINT_T0(const char *ptr){};
|
||||
inline void v_prefetch0(int size, const char *ptr){}; // prefetch utilities
|
||||
inline void prefetch_HINT_T0(const char *ptr){};
|
||||
|
||||
|
||||
// Function name aliases
|
||||
typedef Optimization::Vsplat VsplatSIMD;
|
||||
typedef Optimization::Vstore VstoreSIMD;
|
||||
typedef Optimization::Vset VsetSIMD;
|
||||
typedef Optimization::Vstream VstreamSIMD;
|
||||
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
|
||||
// Function name aliases
|
||||
typedef Optimization::Vsplat VsplatSIMD;
|
||||
typedef Optimization::Vstore VstoreSIMD;
|
||||
typedef Optimization::Vset VsetSIMD;
|
||||
typedef Optimization::Vstream VstreamSIMD;
|
||||
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
|
||||
|
||||
// Arithmetic operations
|
||||
typedef Optimization::Sum SumSIMD;
|
||||
typedef Optimization::Sub SubSIMD;
|
||||
typedef Optimization::Div DivSIMD;
|
||||
typedef Optimization::Mult MultSIMD;
|
||||
typedef Optimization::MultComplex MultComplexSIMD;
|
||||
typedef Optimization::MultRealPart MultRealPartSIMD;
|
||||
typedef Optimization::MaddRealPart MaddRealPartSIMD;
|
||||
typedef Optimization::Conj ConjSIMD;
|
||||
typedef Optimization::TimesMinusI TimesMinusISIMD;
|
||||
typedef Optimization::TimesI TimesISIMD;
|
||||
|
||||
|
||||
|
||||
// Arithmetic operations
|
||||
typedef Optimization::Sum SumSIMD;
|
||||
typedef Optimization::Sub SubSIMD;
|
||||
typedef Optimization::Div DivSIMD;
|
||||
typedef Optimization::Mult MultSIMD;
|
||||
typedef Optimization::MultComplex MultComplexSIMD;
|
||||
typedef Optimization::MultRealPart MultRealPartSIMD;
|
||||
typedef Optimization::MaddRealPart MaddRealPartSIMD;
|
||||
typedef Optimization::Conj ConjSIMD;
|
||||
typedef Optimization::TimesMinusI TimesMinusISIMD;
|
||||
typedef Optimization::TimesI TimesISIMD;
|
||||
|
||||
}
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user