More NEON functionalities

2026-02-03 13:53:29 +00:00 · 2015-07-21 11:52:15 +09:00
parent 97afe4125f
commit 9adaeb061a
10 changed files with 88 additions and 87 deletions
--- a/lib/simd/Grid_neon.h
+++ b/lib/simd/Grid_neon.h
@@ -1,14 +1,16 @@
 //----------------------------------------------------------------------
 /*! @file Grid_sse4.h
-  @brief Optimization libraries for NEON (ARM) instructions set ARMv7
+  @brief Optimization libraries for the NEON (ARM) instruction set, ARMv8

  Experimental - Using intrinsics - DEVELOPING! 
 */
-// Time-stamp: <2015-06-09 15:25:40 neo>
+// Time-stamp: <2015-07-10 17:45:09 neo>
 //----------------------------------------------------------------------

 #include <arm_neon.h>

+// ARMv8 supports double precision
+
 namespace Optimization {

  template<class vtype>
@@ -22,50 +24,47 @@ namespace Optimization {
    float f[4];
  };
  union u128d {
-    float32x4_t v;
-    float f[4];
+    float64x2_t v;
+    double f[2]; // scalar view of v: float64x2_t has exactly two 64-bit lanes (16 bytes)
  };
  
  struct Vsplat{
    //Complex float
    inline float32x4_t operator()(float a, float b){
-      float32x4_t foo;
-      return foo;
+      float tmp[4]={a,b,a,b}; // lanes are (a,b,a,b): the pair repeated twice
+      return vld1q_f32(tmp);
    }
    // Real float
    inline float32x4_t operator()(float a){
-      float32x4_t foo;
-      return foo;
+      return vld1q_dup_f32(&a); // broadcast a into all four lanes
    }
    //Complex double
    inline float32x4_t operator()(double a, double b){
-      float32x4_t foo;
-      return foo;
+      float tmp[4]={(float)a,(float)b,(float)a,(float)b}; // narrowed to float because this overload returns float32x4_t
+      return vld1q_f32(tmp);
    }
    //Real double
    inline float32x4_t operator()(double a){
-      float32x4_t foo;
-      return foo;
+      return vdupq_n_f32((float)a); // narrow first: the f32 load intrinsics take const float*, not double*
    }
    //Integer
    inline uint32x4_t operator()(Integer a){
-      uint32x4_t foo;
-      return foo;
+      return vld1q_dup_u32(&a); // broadcast a into all four lanes
    }
  };

  struct Vstore{
    //Float 
    inline void operator()(float32x4_t a, float* F){
-      
+      vst1q_f32(F, a); // writes the four lanes of a to F[0..3]
    }
    //Double
    inline void operator()(float32x4_t a, double* D){
-      
+      vst1q_f32((float*)D, a); // NOTE(review): stores four raw floats (16 bytes) through D, no float-to-double conversion - confirm intent
    }
    //Integer
    inline void operator()(uint32x4_t a, Integer* I){
-     
+      vst1q_u32(I, a); // writes the four lanes of a to I[0..3]
    }

  };
@@ -130,36 +129,30 @@ namespace Optimization {
  struct Sum{
    //Complex/Real float
    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-      float32x4_t foo;
-      return foo;
+      return vaddq_f32(a,b); // lane-wise a+b over four floats
    }
    //Complex/Real double
-    //inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-    //  float32x4_t foo;
-    //  return foo;
-    //}
+    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
+      return vaddq_f64(a,b); // lane-wise a+b over two doubles
+    }
    //Integer
    inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
-      uint32x4_t foo;
-      return foo;
+      return vaddq_u32(a,b); // lane-wise a+b over four 32-bit unsigned integers
    }
  };

  struct Sub{
    //Complex/Real float
    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-      float32x4_t foo;
-      return foo;
+      return vsubq_f32(a,b); // lane-wise a-b over four floats
    }
    //Complex/Real double
-    //inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-    //  float32x4_t foo;
-    //  return foo;
-    //}
+    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
+      return vsubq_f64(a,b); // lane-wise a-b over two doubles
+    }
    //Integer
    inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
-      uint32x4_t foo;
-      return foo;
+      return vsubq_u32(a,b); // lane-wise a-b over four 32-bit unsigned integers
    }
  };

@@ -170,24 +163,24 @@ namespace Optimization {
      return foo;
    }
    // Complex double
-    //inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-    //  float32x4_t foo;
-    //  return foo;
-    //}
+    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
+      const float64x2_t cross = vmulq_f64(vmulq_n_f64(vextq_f64(b,b,1), vgetq_lane_f64(a,1)), vsetq_lane_f64(-1.0, vdupq_n_f64(1.0), 0)); // {-ai*bi, ai*br}, assuming lanes are (re, im)
+      return vaddq_f64(vmulq_n_f64(b, vgetq_lane_f64(a,0)), cross); // {ar*br - ai*bi, ar*bi + ai*br}
+    }
  };

  struct Mult{
    // Real float
    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-      return a;
+      return vmulq_f32(a,b); // lane-wise a*b over four floats
    }
    // Real double
-    //inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-    //  return 0;
-    //}
+    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
+      return vmulq_f64(a,b); // lane-wise a*b over two doubles
+    }
    // Integer
    inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
-      return a;
+      return vmulq_u32(a,b); // lane-wise a*b, keeping the low 32 bits of each product
    }
  };

@@ -219,6 +212,7 @@ namespace Optimization {
  struct TimesI{
    //Complex single
    inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
+      //need shuffle
      return in;
    }
    //Complex double
@@ -242,20 +236,25 @@ namespace Optimization {
  //Real float Reduce
  template<>
  inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){
-    return 0;
+    const float32x2_t upper = vget_high_f32(in);
+    const float32x2_t lower = vget_low_f32(in);
+    const float32x2_t pairs = vadd_f32(lower, upper);  // {in[0]+in[2], in[1]+in[3]}
+    const float32x2_t total = vpadd_f32(pairs, pairs); // both lanes now hold the sum of all four inputs
+    return vget_lane_f32(total,0);
  }
  
  
  //Complex double Reduce
  template<>
-  inline Grid::ComplexD Reduce<Grid::ComplexD, float32x4_t>::operator()(float32x4_t in){
+  inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){ // NOTE(review): body is still a placeholder - returns 0 and never reads in
    return 0;
  }
  
  //Real double Reduce
  template<>
-  inline Grid::RealD Reduce<Grid::RealD, float32x4_t>::operator()(float32x4_t in){
-    return 0;
+  inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){
+    const float64x2_t pairwise = vpaddq_f64(in, in); // each lane becomes in[0]+in[1]
+    return vgetq_lane_f64(pairwise,0);
  }

  //Integer Reduce
@@ -272,7 +271,7 @@ namespace Optimization {
 namespace Grid {

  typedef float32x4_t  SIMD_Ftype; // Single precision type
-  typedef float32x4_t  SIMD_Dtype; // Double precision type - no double on ARMv7
+  typedef float64x2_t  SIMD_Dtype; // Double precision type
  typedef uint32x4_t   SIMD_Itype; // Integer type

  inline void v_prefetch0(int size, const char *ptr){};  // prefetch utilities
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -2,7 +2,7 @@
 /*! @file Grid_vector_types.h
  @brief Defines templated class Grid_simd to deal with inner vector types
 */
-// Time-stamp: <2015-06-09 15:00:47 neo>
+// Time-stamp: <2015-07-10 17:45:33 neo>
 //---------------------------------------------------------------------------
 #ifndef GRID_VECTOR_TYPES
 #define GRID_VECTOR_TYPES
@@ -22,7 +22,7 @@
 #if defined QPX
 #include "Grid_qpx.h"
 #endif
-#ifdef NEONv7
+#ifdef NEONv8
 #include "Grid_neon.h"
 #endif