From 7a53dc3715fbf2606aa1eb8ce943be35367e8ebb Mon Sep 17 00:00:00 2001 From: Nils Meyer Date: Mon, 24 Jul 2017 11:12:59 +0200 Subject: [PATCH] Added integer reduce functionality --- lib/simd/Grid_neon.h | 53 +++++++++++++++++--------------------------- 1 file changed, 20 insertions(+), 33 deletions(-) diff --git a/lib/simd/Grid_neon.h b/lib/simd/Grid_neon.h index 38815389..cadb4df8 100644 --- a/lib/simd/Grid_neon.h +++ b/lib/simd/Grid_neon.h @@ -6,9 +6,9 @@ Copyright (C) 2015 -Author: Nils Meyer -Author: Peter Boyle -Author: neo + Author: Nils Meyer + Author: Peter Boyle + Author: neo This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -27,7 +27,7 @@ Author: neo See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ -//---------------------------------------------------------------------- + /* ARMv8 NEON intrinsics layer by @@ -37,9 +37,6 @@ Author: neo SFB/TRR55 */ -//---------------------------------------------------------------------- -//#ifndef ARM_NEON -//#define ARM_NEON #ifndef GEN_SIMD_WIDTH #define GEN_SIMD_WIDTH 16u @@ -85,11 +82,11 @@ namespace Optimization { double tmp[2]={a,b}; return vld1q_f64(tmp); } - //Real double // N:tbc + //Real double inline float64x2_t operator()(double a){ return vdupq_n_f64(a); } - //Integer // N:tbc + //Integer inline uint32x4_t operator()(Integer a){ return vdupq_n_u32(a); } @@ -127,33 +124,32 @@ namespace Optimization { // Nils: Vset untested; not used currently in Grid at all; // git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b struct Vset{ - // Complex float // N:ok + // Complex float inline float32x4_t operator()(Grid::ComplexF *a){ float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()}; return vld1q_f32(tmp); } - // Complex double // N:ok + // Complex double inline float64x2_t operator()(Grid::ComplexD *a){ double tmp[2]={a[0].imag(),a[0].real()}; return vld1q_f64(tmp); } - // Real float // N:ok + // Real float inline float32x4_t operator()(float *a){ float tmp[4]={a[3],a[2],a[1],a[0]}; return vld1q_f32(tmp); } - // Real double // N:ok + // Real double inline float64x2_t operator()(double *a){ double tmp[2]={a[1],a[0]}; return vld1q_f64(tmp); } - // Integer // N:ok + // Integer inline uint32x4_t operator()(Integer *a){ return vld1q_dup_u32(a); } }; - // N:leaving as is template struct Reduce{ //Need templated class to overload output type @@ -252,9 +248,9 @@ namespace Optimization { return vfmaq_f32(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi ... // no fma, use mul and add - //float32x4_t r5; - //r5 = vmulq_f32(r0, a); - //return vaddq_f32(r4, r5); + // float32x4_t r5; + // r5 = vmulq_f32(r0, a); + // return vaddq_f32(r4, r5); } // Complex double inline float64x2_t operator()(float64x2_t a, float64x2_t b){ @@ -275,9 +271,9 @@ namespace Optimization { return vfmaq_f64(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi // no fma, use mul and add - //float64x2_t r5; - //r5 = vmulq_f64(r0, a); - //return vaddq_f64(r4, r5); + // float64x2_t r5; + // r5 = vmulq_f64(r0, a); + // return vaddq_f64(r4, r5); } }; @@ -424,11 +420,6 @@ namespace Optimization { } } -// working, but no restriction on n -// template static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n); }; -// template static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n); }; - -// restriction on n template static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); }; template static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); }; @@ -444,7 +435,7 @@ namespace Optimization { sb = vcvt_high_f32_f16(h); // there is no direct conversion from lower float32x4_t to float64x2_t // vextq_f16 not supported by clang 3.8 / 4.0 / arm clang - //float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang + // float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang // workaround for clang uint32x4_t h1u = reinterpret_cast(h); float16x8_t h1 = reinterpret_cast(vextq_u32(h1u, h1u, 2)); @@ -550,7 +541,7 @@ namespace Optimization { //Complex double Reduce - template<> // N:by Boyle + template<> inline Grid::ComplexD Reduce::operator()(float64x2_t in){ u128d conv; conv.v = in; return Grid::ComplexD(conv.f[0],conv.f[1]); @@ -565,9 +556,7 @@ namespace Optimization { //Integer Reduce template<> inline Integer Reduce::operator()(uint32x4_t in){ - // FIXME unimplemented - printf("Reduce : Missing integer implementation -> FIX\n"); - assert(0); + return vaddvq_u32(in); } } @@ -607,5 +596,3 @@ namespace Optimization { typedef Optimization::TimesI TimesISIMD; } - -//#endif // ARM_NEON