NAMESPACE

2025-11-24 00:19:32 +00:00 · 2018-01-12 18:24:16 +00:00
parent 6ab744c720
commit ec89714cce
1 changed file with 528 additions and 532 deletions
--- a/lib/simd/Grid_neon.h
+++ b/lib/simd/Grid_neon.h
@@ -25,8 +25,8 @@
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */

 /*

@@ -45,29 +45,29 @@
 #include "Grid_generic_types.h"
 #include <arm_neon.h>

-namespace Grid {
-namespace Optimization {
+NAMESPACE_BEGIN(Grid);
+NAMESPACE_BEGIN(Optimization);

-  template<class vtype>
-  union uconv {
+template<class vtype>
+union uconv {
  float32x4_t f;
  vtype v;
-  };
-  union u128f {
+};
+union u128f {
  float32x4_t v;
  float f[4];
-  };
-  union u128d {
+};
+union u128d {
  float64x2_t v;
  double f[2];
-  };
-  // half precision
-  union u128h {
+};
+// half precision
+union u128h {
  float16x8_t v;
  uint16_t f[8];
-  };
+};

-  struct Vsplat{
+struct Vsplat{
  //Complex float
  inline float32x4_t operator()(float a, float b){
    float tmp[4]={a,b,a,b};
@@ -90,9 +90,9 @@ namespace Optimization {
  inline uint32x4_t operator()(Integer a){
    return vdupq_n_u32(a);
  }
-  };
+};

-  struct Vstore{
+struct Vstore{
  //Float
  inline void operator()(float32x4_t a, float* F){
    vst1q_f32(F, a);
@@ -106,9 +106,9 @@ namespace Optimization {
    vst1q_u32(I, a);
  }

-  };
+};

-  struct Vstream{ // N:equivalents to _mm_stream_p* in NEON?
+struct Vstream{ // N:equivalents to _mm_stream_p* in NEON?
  //Float // N:generic
  inline void operator()(float * a, float32x4_t b){
    memcpy(a,&b,4*sizeof(float));
@@ -117,13 +117,11 @@ namespace Optimization {
  inline void operator()(double * a, float64x2_t b){
    memcpy(a,&b,2*sizeof(double));
  }
+};

-
-  };
-
-  // Nils: Vset untested; not used currently in Grid at all;
-  // git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b
-  struct Vset{
+// Nils: Vset untested; not used currently in Grid at all;
+// git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b
+struct Vset{
  // Complex float
  inline float32x4_t operator()(Grid::ComplexF *a){
    float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()};
@@ -148,10 +146,10 @@ namespace Optimization {
  inline uint32x4_t operator()(Integer *a){
    return vld1q_dup_u32(a);
  }
-  };
+};

-  template <typename Out_type, typename In_type>
-  struct Reduce{
+template <typename Out_type, typename In_type>
+struct Reduce{
  //Need templated class to overload output type
  //General form must generate error if compiled
  inline Out_type operator()(In_type in){
@@ -159,12 +157,12 @@ namespace Optimization {
    exit(1);
    return 0;
  }
-  };
+};

-  /////////////////////////////////////////////////////
-  // Arithmetic operations
-  /////////////////////////////////////////////////////
-  struct Sum{
+/////////////////////////////////////////////////////
+// Arithmetic operations
+/////////////////////////////////////////////////////
+struct Sum{
  //Complex/Real float
  inline float32x4_t operator()(float32x4_t a, float32x4_t b){
    return vaddq_f32(a,b);
@@ -177,9 +175,9 @@ namespace Optimization {
  inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
    return vaddq_u32(a,b);
  }
-  };
+};

-  struct Sub{
+struct Sub{
  //Complex/Real float
  inline float32x4_t operator()(float32x4_t a, float32x4_t b){
    return vsubq_f32(a,b);
@@ -192,9 +190,9 @@ namespace Optimization {
  inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
    return vsubq_u32(a,b);
  }
-  };
+};

-  struct MultRealPart{
+struct MultRealPart{
  inline float32x4_t operator()(float32x4_t a, float32x4_t b){
    float32x4_t re = vtrn1q_f32(a, a);
    return vmulq_f32(re, b);
@@ -203,9 +201,9 @@ namespace Optimization {
    float64x2_t re = vzip1q_f64(a, a);
    return vmulq_f64(re, b);
  }
-  };
+};

-  struct MaddRealPart{
+struct MaddRealPart{
  inline float32x4_t operator()(float32x4_t a, float32x4_t b, float32x4_t c){
    float32x4_t re = vtrn1q_f32(a, a);
    return vfmaq_f32(c, re, b);
@@ -214,9 +212,9 @@ namespace Optimization {
    float64x2_t re = vzip1q_f64(a, a);
    return vfmaq_f64(c, re, b);
  }
-  };
+};

-  struct Div{
+struct Div{
  // Real float
  inline float32x4_t operator()(float32x4_t a, float32x4_t b){
    return vdivq_f32(a, b);
@@ -225,9 +223,9 @@ namespace Optimization {
  inline float64x2_t operator()(float64x2_t a, float64x2_t b){
    return vdivq_f64(a, b);
  }
-  };
+};

-  struct MultComplex{
+struct MultComplex{
  // Complex float
  inline float32x4_t operator()(float32x4_t a, float32x4_t b){

@@ -275,9 +273,9 @@ namespace Optimization {
    // r5 = vmulq_f64(r0, a);
    // return vaddq_f64(r4, r5);
  }
-  };
+};

-  struct Mult{
+struct Mult{
  // Real float
  inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){
    //return vaddq_f32(vmulq_f32(b,c),a);
@@ -298,9 +296,9 @@ namespace Optimization {
  inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
    return vmulq_u32(a,b);
  }
-  };
+};

-  struct Conj{
+struct Conj{
  // Complex single
  inline float32x4_t operator()(float32x4_t in){
    // ar ai br bi -> ar -ai br -bi
@@ -318,9 +316,9 @@ namespace Optimization {
    return vextq_f64(r0, r1, 1);  //  ar -ai
  }
  // do not define for integer input
-  };
+};

-  struct TimesMinusI{
+struct TimesMinusI{
  //Complex single
  inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
    // ar ai br bi -> ai -ar ai -br
@@ -336,9 +334,9 @@ namespace Optimization {
    tmp = vnegq_f64(in);
    return vextq_f64(in, tmp, 1);
  }
-  };
+};

-  struct TimesI{
+struct TimesI{
  //Complex single
  inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
    // ar ai br bi -> -ai ar -bi br
@@ -354,9 +352,9 @@ namespace Optimization {
    tmp = vnegq_f64(in);
    return vextq_f64(tmp, in, 1);
  }
-  };
+};

-  struct Permute{
+struct Permute{

  static inline float32x4_t Permute0(float32x4_t in){ // N:ok
    // AB CD -> CD AB
@@ -387,9 +385,9 @@ namespace Optimization {
    return in;
  };

-  };
+};

-  struct Rotate{
+struct Rotate{

  static inline float32x4_t rotate(float32x4_t in,int n){ // N:ok
    switch(n){
@@ -423,9 +421,9 @@ namespace Optimization {
  template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); };
  template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); };

-  };
+};

-  struct PrecisionChange {
+struct PrecisionChange {

  static inline float16x8_t StoH (const float32x4_t &a,const float32x4_t &b) {
    float16x4_t h = vcvt_f16_f32(a);
@@ -464,12 +462,12 @@ namespace Optimization {
    StoD(s1, a, b);
    StoD(s2, c, d);
  }
-  };
+};

-  //////////////////////////////////////////////
-  // Exchange support
+//////////////////////////////////////////////
+// Exchange support

-  struct Exchange{
+struct Exchange{
  static inline void Exchange0(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
    // in1: ABCD -> out1: ABEF
    // in2: EFGH -> out2: CDGH
@@ -518,82 +516,80 @@ namespace Optimization {
    assert(0);
    return;
  };
-  };
+};

-  //////////////////////////////////////////////
-  // Some Template specialization
+//////////////////////////////////////////////
+// Some Template specialization


-  //Complex float Reduce
-  template<>
-  inline Grid::ComplexF Reduce<Grid::ComplexF, float32x4_t>::operator()(float32x4_t in){
+//Complex float Reduce
+template<>
+inline Grid::ComplexF Reduce<Grid::ComplexF, float32x4_t>::operator()(float32x4_t in){
  float32x4_t v1; // two complex
  v1 = Optimization::Permute::Permute0(in);
  v1 = vaddq_f32(v1,in);
  u128f conv;    conv.v=v1;
  return Grid::ComplexF(conv.f[0],conv.f[1]);
-  }
-  //Real float Reduce
-  template<>
-  inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){
+}
+//Real float Reduce
+template<>
+inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){
  return vaddvq_f32(in);
-  }
+}


-  //Complex double Reduce
-  template<>
-  inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
+//Complex double Reduce
+template<>
+inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
  u128d conv; conv.v = in;
  return Grid::ComplexD(conv.f[0],conv.f[1]);
-  }
-
-  //Real double Reduce
-  template<>
-  inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){
-    return vaddvq_f64(in);
-  }
-
-  //Integer Reduce
-  template<>
-  inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){
-    return vaddvq_u32(in);
-  }
 }

+//Real double Reduce
+template<>
+inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){
+  return vaddvq_f64(in);
+}
+
+//Integer Reduce
+template<>
+inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){
+  return vaddvq_u32(in);
+}
+
+NAMESPACE_END(Optimization);
+
 //////////////////////////////////////////////////////////////////////////////////////
 // Here assign types

 // typedef Optimization::vech SIMD_Htype; // Reduced precision type
-  typedef float16x8_t  SIMD_Htype; // Half precision type
-  typedef float32x4_t  SIMD_Ftype; // Single precision type
-  typedef float64x2_t  SIMD_Dtype; // Double precision type
-  typedef uint32x4_t   SIMD_Itype; // Integer type
+typedef float16x8_t  SIMD_Htype; // Half precision type
+typedef float32x4_t  SIMD_Ftype; // Single precision type
+typedef float64x2_t  SIMD_Dtype; // Double precision type
+typedef uint32x4_t   SIMD_Itype; // Integer type

-  inline void v_prefetch0(int size, const char *ptr){};  // prefetch utilities
-  inline void prefetch_HINT_T0(const char *ptr){};
+inline void v_prefetch0(int size, const char *ptr){};  // prefetch utilities
+inline void prefetch_HINT_T0(const char *ptr){};


-  // Function name aliases
-  typedef Optimization::Vsplat   VsplatSIMD;
-  typedef Optimization::Vstore   VstoreSIMD;
-  typedef Optimization::Vset     VsetSIMD;
-  typedef Optimization::Vstream  VstreamSIMD;
-  template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
+// Function name aliases
+typedef Optimization::Vsplat   VsplatSIMD;
+typedef Optimization::Vstore   VstoreSIMD;
+typedef Optimization::Vset     VsetSIMD;
+typedef Optimization::Vstream  VstreamSIMD;
+template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;

+// Arithmetic operations
+typedef Optimization::Sum         SumSIMD;
+typedef Optimization::Sub         SubSIMD;
+typedef Optimization::Div         DivSIMD;
+typedef Optimization::Mult        MultSIMD;
+typedef Optimization::MultComplex MultComplexSIMD;
+typedef Optimization::MultRealPart MultRealPartSIMD;
+typedef Optimization::MaddRealPart MaddRealPartSIMD;
+typedef Optimization::Conj        ConjSIMD;
+typedef Optimization::TimesMinusI TimesMinusISIMD;
+typedef Optimization::TimesI      TimesISIMD;

-
-
-  // Arithmetic operations
-  typedef Optimization::Sum         SumSIMD;
-  typedef Optimization::Sub         SubSIMD;
-  typedef Optimization::Div         DivSIMD;
-  typedef Optimization::Mult        MultSIMD;
-  typedef Optimization::MultComplex MultComplexSIMD;
-  typedef Optimization::MultRealPart MultRealPartSIMD;
-  typedef Optimization::MaddRealPart MaddRealPartSIMD;
-  typedef Optimization::Conj        ConjSIMD;
-  typedef Optimization::TimesMinusI TimesMinusISIMD;
-  typedef Optimization::TimesI      TimesISIMD;
-
-}
+NAMESPACE_END(Grid);