1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-10 11:26:56 +01:00

More NEON functionalities

This commit is contained in:
neo
2015-07-21 11:52:15 +09:00
parent 97afe4125f
commit 9adaeb061a
10 changed files with 88 additions and 87 deletions

View File

@ -1,14 +1,16 @@
//----------------------------------------------------------------------
/*! @file Grid_neon.h
@brief Optimization libraries for the NEON (ARM) instruction set, ARMv8
Experimental - Using intrinsics - DEVELOPING!
*/
// Time-stamp: <2015-06-09 15:25:40 neo>
// Time-stamp: <2015-07-10 17:45:09 neo>
//----------------------------------------------------------------------
#include <arm_neon.h>
// ARMv8 supports double precision
namespace Optimization {
template<class vtype>
@ -22,50 +24,47 @@ namespace Optimization {
float f[4];
};
// Overlay of one 128-bit NEON double-precision register with a plain array,
// used to access individual lanes.
// NOTE(review): float64x2_t holds two doubles, so only f[0] and f[1] overlap
// v; the array bound of 4 is kept from the original declaration -- confirm
// whether it should be 2.
union u128d {
  float64x2_t v;
  double f[4];
};
// Functor that builds a SIMD register from scalar values.
struct Vsplat{
  // Complex float: the pair (a,b) is repeated, giving lanes {a, b, a, b}.
  inline float32x4_t operator()(float a, float b){
    const float lanes[4] = {a, b, a, b};
    return vld1q_f32(lanes);
  }
  // Real float: a is copied to all four lanes.
  inline float32x4_t operator()(float a){
    return vld1q_dup_f32(&a);
  }
  // Complex double: one (a,b) pair fills the two 64-bit lanes.
  // Returns float64x2_t so the value keeps full double precision and matches
  // the double-precision overloads of Sum/Sub/Mult.
  inline float64x2_t operator()(double a, double b){
    const double lanes[2] = {a, b};
    return vld1q_f64(lanes);
  }
  // Real double: a is copied to both lanes.
  // (vld1q_dup_f32 cannot be used here: it expects a pointer to float.)
  inline float64x2_t operator()(double a){
    return vld1q_dup_f64(&a);
  }
  // Integer: a is copied to all four lanes.
  // NOTE(review): vld1q_dup_u32 takes a pointer to uint32_t, so this assumes
  // Integer is a 32-bit unsigned type -- confirm against its typedef.
  inline uint32x4_t operator()(Integer a){
    return vld1q_dup_u32(&a);
  }
};
// Functor that writes a SIMD register out to memory.
struct Vstore{
  // Float: stores the four lanes of a to F[0..3].
  inline void operator()(float32x4_t a, float* F){
    vst1q_f32(F, a);
  }
  // Double: stores the two lanes of a to D[0..1].
  // Takes float64x2_t so that doubles are written as doubles instead of
  // float bit patterns pushed through a casted pointer.
  inline void operator()(float64x2_t a, double* D){
    vst1q_f64(D, a);
  }
  // Integer: stores the four lanes of a to I[0..3].
  // NOTE(review): assumes Integer is a 32-bit unsigned type, as required by
  // vst1q_u32 -- confirm against its typedef.
  inline void operator()(uint32x4_t a, Integer* I){
    vst1q_u32(I, a);
  }
};
@ -130,36 +129,30 @@ namespace Optimization {
// Lane-wise addition. Complex and real data share the same overloads because
// addition acts independently on every lane.
struct Sum{
  // Complex/Real float
  inline float32x4_t operator()(float32x4_t a, float32x4_t b){
    return vaddq_f32(a, b);
  }
  // Complex/Real double
  inline float64x2_t operator()(float64x2_t a, float64x2_t b){
    return vaddq_f64(a, b);
  }
  // Integer
  inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
    return vaddq_u32(a, b);
  }
};
// Lane-wise subtraction (a - b). Complex and real data share the same
// overloads because subtraction acts independently on every lane.
struct Sub{
  // Complex/Real float
  inline float32x4_t operator()(float32x4_t a, float32x4_t b){
    return vsubq_f32(a, b);
  }
  // Complex/Real double
  inline float64x2_t operator()(float64x2_t a, float64x2_t b){
    return vsubq_f64(a, b);
  }
  // Integer
  inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
    return vsubq_u32(a, b);
  }
};
@ -170,24 +163,24 @@ namespace Optimization {
return foo;
}
// Complex double: product of the single complex number held in a with the
// one held in b.
// NOTE(review): assumes lane 0 is the real part and lane 1 the imaginary
// part -- confirm against how the registers are filled.
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
  // (ar*br, ar*bi)
  const float64x2_t scaled_by_re = vmulq_n_f64(b, vgetq_lane_f64(a, 0));
  // (ai*bi, ai*br): b with its lanes swapped, scaled by the imaginary part of a
  const float64x2_t scaled_by_im = vmulq_n_f64(vextq_f64(b, b, 1), vgetq_lane_f64(a, 1));
  // Flip the sign of lane 0 so the sum is (ar*br - ai*bi, ar*bi + ai*br).
  const double sign_lanes[2] = {-1.0, 1.0};
  return vaddq_f64(scaled_by_re, vmulq_f64(scaled_by_im, vld1q_f64(sign_lanes)));
}
};
// Lane-wise multiplication of real (non-complex) data.
struct Mult{
  // Real float
  inline float32x4_t operator()(float32x4_t a, float32x4_t b){
    return vmulq_f32(a, b);
  }
  // Real double
  inline float64x2_t operator()(float64x2_t a, float64x2_t b){
    return vmulq_f64(a, b);
  }
  // Integer
  inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
    return vmulq_u32(a, b);
  }
};
@ -219,6 +212,7 @@ namespace Optimization {
struct TimesI{
// Complex single: multiplies each complex number in `in` by i, i.e.
// (re, im) -> (-im, re). The second parameter is not used; it is kept so the
// call signature stays the same.
// NOTE(review): assumes lanes are laid out as {re0, im0, re1, im1} --
// confirm against how the registers are filled.
inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
  // Swap the two 32-bit halves of each 64-bit pair: {im0, re0, im1, re1}.
  const float32x4_t swapped = vrev64q_f32(in);
  // Negate the lanes that now hold the imaginary parts.
  const float sign_lanes[4] = {-1.0f, 1.0f, -1.0f, 1.0f};
  return vmulq_f32(swapped, vld1q_f32(sign_lanes));
}
//Complex double
@ -242,20 +236,25 @@ namespace Optimization {
// Real float Reduce: returns the sum of the four lanes of `in`.
template<>
inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){
  // Fold the upper half onto the lower half: {in0+in2, in1+in3}.
  const float32x2_t halves = vadd_f32(vget_low_f32(in), vget_high_f32(in));
  // Pairwise add leaves the full sum in every lane.
  const float32x2_t total = vpadd_f32(halves, halves);
  return vget_lane_f32(total, 0);
}
// Complex double Reduce: a float64x2_t holds exactly one complex number, so
// the reduction is that number itself.
// NOTE(review): assumes lane 0 is the real part, lane 1 the imaginary part,
// and that Grid::ComplexD can be constructed from (real, imag) -- confirm.
template<>
inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
  return Grid::ComplexD(vgetq_lane_f64(in, 0), vgetq_lane_f64(in, 1));
}
// Real double Reduce: returns the sum of the two lanes of `in`.
template<>
inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){
  // Pairwise add of in with itself puts in0+in1 in both lanes.
  const float64x2_t total = vpaddq_f64(in, in);
  return vgetq_lane_f64(total, 0);
}
//Integer Reduce
@ -272,7 +271,7 @@ namespace Optimization {
namespace Grid {
// SIMD register types used for each scalar kind.
typedef float32x4_t SIMD_Ftype; // Single precision type
typedef float64x2_t SIMD_Dtype; // Double precision type
typedef uint32x4_t SIMD_Itype; // Integer type
// Prefetch utility: intentionally does nothing in this implementation.
inline void v_prefetch0(int size, const char *ptr){}

View File

@ -2,7 +2,7 @@
/*! @file Grid_vector_types.h
@brief Defines templated class Grid_simd to deal with inner vector types
*/
// Time-stamp: <2015-06-09 15:00:47 neo>
// Time-stamp: <2015-07-10 17:45:33 neo>
//---------------------------------------------------------------------------
#ifndef GRID_VECTOR_TYPES
#define GRID_VECTOR_TYPES
@ -22,7 +22,7 @@
#if defined QPX
#include "Grid_qpx.h"
#endif
// Pull in the NEON back end when building for ARMv8.
#ifdef NEONv8
#include "Grid_neon.h"
#endif