Mirror of https://github.com/paboyle/Grid.git

Added integer reduce functionality

commit 7a53dc3715 (parent 0933aeefd4)
Author: Nils Meyer <nils.meyer@ur.de>
Date:   2017-07-24 11:12:59 +02:00

lib/simd/Grid_neon.h

@@ -6,9 +6,9 @@
     Copyright (C) 2015

     Author: Nils Meyer <nils.meyer@ur.de>
     Author: Peter Boyle <paboyle@ph.ed.ac.uk>
     Author: neo <cossu@post.kek.jp>

     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -27,7 +27,7 @@ Author: neo <cossu@post.kek.jp>
     See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /* END LEGAL */
-//----------------------------------------------------------------------
 /*
   ARMv8 NEON intrinsics layer by
@ -37,9 +37,6 @@ Author: neo <cossu@post.kek.jp>
SFB/TRR55 SFB/TRR55
*/ */
//----------------------------------------------------------------------
//#ifndef ARM_NEON
//#define ARM_NEON
#ifndef GEN_SIMD_WIDTH #ifndef GEN_SIMD_WIDTH
#define GEN_SIMD_WIDTH 16u #define GEN_SIMD_WIDTH 16u
@@ -85,11 +82,11 @@ namespace Optimization {
       double tmp[2]={a,b};
       return vld1q_f64(tmp);
     }
-    //Real double // N:tbc
+    //Real double
     inline float64x2_t operator()(double a){
       return vdupq_n_f64(a);
     }
-    //Integer // N:tbc
+    //Integer
     inline uint32x4_t operator()(Integer a){
       return vdupq_n_u32(a);
     }
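For reference, a minimal standalone sketch (not part of the commit) of what the Vsplat operators above rely on: vdupq_n_f64 and vdupq_n_u32 broadcast one scalar into every lane of the vector.

#include <arm_neon.h>
#include <cstdio>

int main(void){
  float64x2_t vd = vdupq_n_f64(3.5);   // both double lanes = 3.5
  uint32x4_t  vi = vdupq_n_u32(7u);    // all four integer lanes = 7
  double d[2]; uint32_t u[4];
  vst1q_f64(d, vd);                    // store lanes back to memory
  vst1q_u32(u, vi);
  printf("%f %f  %u %u %u %u\n", d[0], d[1], u[0], u[1], u[2], u[3]);
  return 0;                            // prints 3.500000 3.500000  7 7 7 7
}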
@@ -127,33 +124,32 @@ namespace Optimization {
   // Nils: Vset untested; not used currently in Grid at all;
   // git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b
   struct Vset{
-    // Complex float // N:ok
+    // Complex float
     inline float32x4_t operator()(Grid::ComplexF *a){
       float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()};
       return vld1q_f32(tmp);
     }
-    // Complex double // N:ok
+    // Complex double
     inline float64x2_t operator()(Grid::ComplexD *a){
       double tmp[2]={a[0].imag(),a[0].real()};
       return vld1q_f64(tmp);
     }
-    // Real float // N:ok
+    // Real float
     inline float32x4_t operator()(float *a){
       float tmp[4]={a[3],a[2],a[1],a[0]};
       return vld1q_f32(tmp);
     }
-    // Real double // N:ok
+    // Real double
     inline float64x2_t operator()(double *a){
       double tmp[2]={a[1],a[0]};
       return vld1q_f64(tmp);
     }
-    // Integer // N:ok
+    // Integer
     inline uint32x4_t operator()(Integer *a){
       return vld1q_dup_u32(a);
     }
   };
-  // N:leaving as is
   template <typename Out_type, typename In_type>
   struct Reduce{
     //Need templated class to overload output type
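A note on the lane order Vset produces, with a small sketch (assumed test values, not from the commit): the tmp arrays are written highest element first, so after vld1q_f32 lane 0 holds a[3] and lane 3 holds a[0].

#include <arm_neon.h>
#include <cstdio>

int main(void){
  float a[4]   = {10.f, 11.f, 12.f, 13.f};
  float tmp[4] = {a[3], a[2], a[1], a[0]};   // reversed, as in Vset
  float32x4_t v = vld1q_f32(tmp);            // lane 0 = a[3], lane 3 = a[0]
  printf("%f %f\n", vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 3));
  return 0;                                  // prints 13.000000 10.000000
}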
@@ -252,9 +248,9 @@ namespace Optimization {
       return vfmaq_f32(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi ...
       // no fma, use mul and add
-      //float32x4_t r5;
-      //r5 = vmulq_f32(r0, a);
-      //return vaddq_f32(r4, r5);
+      // float32x4_t r5;
+      // r5 = vmulq_f32(r0, a);
+      // return vaddq_f32(r4, r5);
     }
     // Complex double
     inline float64x2_t operator()(float64x2_t a, float64x2_t b){
@@ -275,9 +271,9 @@ namespace Optimization {
       return vfmaq_f64(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi
       // no fma, use mul and add
-      //float64x2_t r5;
-      //r5 = vmulq_f64(r0, a);
-      //return vaddq_f64(r4, r5);
+      // float64x2_t r5;
+      // r5 = vmulq_f64(r0, a);
+      // return vaddq_f64(r4, r5);
     }
   };
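The commented-out fallback above is behaviorally equivalent to the fused call up to rounding: vfmaq_f64(r4, r0, a) computes r4 + r0*a with a single rounding step, while the mul-and-add pair rounds twice. A sketch of the fallback as a helper (hypothetical name, same operands as in MultComplex):

#include <arm_neon.h>

// Same result as vfmaq_f64(r4, r0, a) except for one extra rounding step.
static inline float64x2_t fma_fallback_f64(float64x2_t r4, float64x2_t r0,
                                           float64x2_t a){
  float64x2_t r5 = vmulq_f64(r0, a);   // lane-wise product r0*a
  return vaddq_f64(r4, r5);            // then the accumulate
}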
@ -424,11 +420,6 @@ namespace Optimization {
} }
} }
// working, but no restriction on n
// template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n); };
// template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n); };
// restriction on n
template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); }; template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); };
template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); }; template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); };
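Background on the n%4 / n%2 guards kept above: vextq_f32(in, in, n) concatenates the two operands and extracts four consecutive lanes starting at lane n, so with identical operands it is a rotation by n lanes, and n must be an immediate in 0..3 (0..1 for the two-lane double variant). A small usage sketch (assumed values, not from the commit):

#include <arm_neon.h>
#include <cstdio>

int main(void){
  float vals[4] = {0.f, 1.f, 2.f, 3.f};
  float32x4_t v = vld1q_f32(vals);
  float32x4_t r = vextq_f32(v, v, 1);   // rotate by one lane: {1, 2, 3, 0}
  float out[4];
  vst1q_f32(out, r);
  printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]);
  return 0;
}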
@@ -444,7 +435,7 @@ namespace Optimization {
       sb = vcvt_high_f32_f16(h);
       // there is no direct conversion from lower float32x4_t to float64x2_t
       // vextq_f16 not supported by clang 3.8 / 4.0 / arm clang
-      //float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang
+      // float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang
       // workaround for clang
       uint32x4_t h1u = reinterpret_cast<uint32x4_t>(h);
       float16x8_t h1 = reinterpret_cast<float16x8_t>(vextq_u32(h1u, h1u, 2));
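Why the reinterpret trick works (a sketch, not from the commit, assuming a toolchain with NEON fp16 support): a float16x8_t is 128 bits, i.e. four 32-bit words, so rotating those words by two with vextq_u32 moves halves 4..7 into the low 64 bits, where the low-half conversion can reach them. Written with the vreinterpretq intrinsics instead of reinterpret_cast:

#include <arm_neon.h>

// Convert the upper four fp16 lanes of h to fp32 without vextq_f16.
static inline float32x4_t upper_f16_to_f32(float16x8_t h){
  uint32x4_t  hu = vreinterpretq_u32_f16(h);                    // view as 4x u32
  float16x8_t h1 = vreinterpretq_f16_u32(vextq_u32(hu, hu, 2)); // rotate words by 2
  return vcvt_f32_f16(vget_low_f16(h1));                        // low four halves -> fp32
}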
@@ -550,7 +541,7 @@ namespace Optimization {
   //Complex double Reduce
-  template<> // N:by Boyle
+  template<>
   inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
     u128d conv; conv.v = in;
     return Grid::ComplexD(conv.f[0],conv.f[1]);
@@ -565,9 +556,7 @@ namespace Optimization {
   //Integer Reduce
   template<>
   inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){
-    // FIXME unimplemented
-    printf("Reduce : Missing integer implementation -> FIX\n");
-    assert(0);
+    return vaddvq_u32(in);
   }
 }
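This hunk is the heart of the commit: the previously stubbed-out integer reduction now uses the AArch64 across-vector add, vaddvq_u32, which sums all four 32-bit lanes into a scalar. A minimal standalone check (a sketch with assumed test values, not part of the commit):

#include <arm_neon.h>
#include <cstdio>

int main(void){
  uint32_t data[4] = {1, 2, 3, 4};
  uint32x4_t v = vld1q_u32(data);   // load four 32-bit lanes
  uint32_t sum = vaddvq_u32(v);     // across-vector add: 1+2+3+4
  printf("reduce = %u\n", sum);     // expected output: reduce = 10
  return 0;
}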
@@ -607,5 +596,3 @@ namespace Optimization {
   typedef Optimization::TimesI TimesISIMD;
 }
-//#endif // ARM_NEON