From 7a53dc3715fbf2606aa1eb8ce943be35367e8ebb Mon Sep 17 00:00:00 2001
From: Nils Meyer <nils.meyer@ur.de>
Date: Mon, 24 Jul 2017 11:12:59 +0200
Subject: [PATCH] Added integer reduce functionality

---
 lib/simd/Grid_neon.h | 53 +++++++++++++++++---------------------------
 1 file changed, 20 insertions(+), 33 deletions(-)
diff --git a/lib/simd/Grid_neon.h b/lib/simd/Grid_neon.h
index 38815389..cadb4df8 100644
--- a/lib/simd/Grid_neon.h
+++ b/lib/simd/Grid_neon.h
@@ -6,9 +6,9 @@
 
     Copyright (C) 2015
 
-Author: Nils Meyer <nils.meyer@ur.de>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: neo <cossu@post.kek.jp>
+    Author: Nils Meyer <nils.meyer@ur.de>
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+    Author: neo <cossu@post.kek.jp>
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -27,7 +27,7 @@ Author: neo <cossu@post.kek.jp>
     See the full license in the file "LICENSE" in the top level distribution directory
     *************************************************************************************/
     /*  END LEGAL */
-//----------------------------------------------------------------------
+
 /*
 
   ARMv8 NEON intrinsics layer by
@@ -37,9 +37,6 @@ Author: neo <cossu@post.kek.jp>
   SFB/TRR55
 
 */
-//----------------------------------------------------------------------
-//#ifndef ARM_NEON
-//#define ARM_NEON
 
 #ifndef GEN_SIMD_WIDTH
 #define GEN_SIMD_WIDTH 16u
@@ -85,11 +82,11 @@ namespace Optimization {
       double tmp[2]={a,b};
       return vld1q_f64(tmp);
     }
-    //Real double // N:tbc
+    //Real double: vdupq_n_f64 copies the scalar a into both 64-bit lanes
     inline float64x2_t operator()(double a){
       return vdupq_n_f64(a);
     }
-    //Integer // N:tbc
+    //Integer: vdupq_n_u32 copies the scalar a into all four 32-bit lanes
     inline uint32x4_t operator()(Integer a){
       return vdupq_n_u32(a);
     }
@@ -127,33 +124,32 @@ namespace Optimization {
   // Nils: Vset untested; not used currently in Grid at all;
   // git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b
   struct Vset{
-    // Complex float // N:ok
+    // Complex float: tmp holds a[1].imag, a[1].real, a[0].imag, a[0].real, then loaded as one vector
     inline float32x4_t operator()(Grid::ComplexF *a){
       float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()};
       return vld1q_f32(tmp);
     }
-    // Complex double // N:ok
+    // Complex double: tmp holds a[0].imag, a[0].real, then loaded as one vector
     inline float64x2_t operator()(Grid::ComplexD *a){
       double tmp[2]={a[0].imag(),a[0].real()};
       return vld1q_f64(tmp);
     }
-    // Real float // N:ok
+    // Real float: tmp holds a[3], a[2], a[1], a[0], i.e. the four inputs in reversed order
     inline float32x4_t operator()(float *a){
       float tmp[4]={a[3],a[2],a[1],a[0]};
       return vld1q_f32(tmp);
     }
-    // Real double // N:ok
+    // Real double: tmp holds a[1], a[0], i.e. the two inputs in reversed order
     inline float64x2_t operator()(double *a){
       double tmp[2]={a[1],a[0]};
       return vld1q_f64(tmp);
     }
-    // Integer // N:ok
+    // Integer: NOTE(review) vld1q_dup_u32 loads only a[0] and replicates it to all lanes - confirm intended
     inline uint32x4_t operator()(Integer *a){
       return vld1q_dup_u32(a);
     }
   };
 
-  // N:leaving as is
   template <typename Out_type, typename In_type>
   struct Reduce{
     //Need templated class to overload output type
@@ -252,9 +248,9 @@ namespace Optimization {
       return vfmaq_f32(r4, r0, a); //  ar*br-ai*bi ai*br+ar*bi ...
 
       // no fma, use mul and add
-      //float32x4_t r5;
-      //r5 = vmulq_f32(r0, a);
-      //return vaddq_f32(r4, r5);
+      // float32x4_t r5;
+      // r5 = vmulq_f32(r0, a);
+      // return vaddq_f32(r4, r5);
     }
     // Complex double
     inline float64x2_t operator()(float64x2_t a, float64x2_t b){
@@ -275,9 +271,9 @@ namespace Optimization {
       return vfmaq_f64(r4, r0, a); //  ar*br-ai*bi ai*br+ar*bi
 
       // no fma, use mul and add
-      //float64x2_t r5;
-      //r5 = vmulq_f64(r0, a);
-      //return vaddq_f64(r4, r5);
+      // float64x2_t r5;
+      // r5 = vmulq_f64(r0, a);
+      // return vaddq_f64(r4, r5);
     }
   };
 
@@ -424,11 +420,6 @@ namespace Optimization {
       }
     }
 
-// working, but no restriction on n
-//    template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n); };
-//    template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n); };
-
-// restriction on n
     template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); };
     template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); };
 
@@ -444,7 +435,7 @@ namespace Optimization {
       sb = vcvt_high_f32_f16(h);
       // there is no direct conversion from lower float32x4_t to float64x2_t
       // vextq_f16 not supported by clang 3.8 / 4.0 / arm clang
-      //float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang
+      // float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang
       // workaround for clang
       uint32x4_t h1u = reinterpret_cast<uint32x4_t>(h);
       float16x8_t h1 = reinterpret_cast<float16x8_t>(vextq_u32(h1u, h1u, 2));
@@ -550,7 +541,7 @@ namespace Optimization {
 
 
   //Complex double Reduce
-  template<> // N:by Boyle
+  template<>
   inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
     u128d conv; conv.v = in;
     return Grid::ComplexD(conv.f[0],conv.f[1]);
@@ -565,9 +556,7 @@ namespace Optimization {
   //Integer Reduce
   template<>
   inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){
-    // FIXME unimplemented
-    printf("Reduce : Missing integer implementation -> FIX\n");
-    assert(0);
+    return vaddvq_u32(in); // add across vector: returns the sum of all four 32-bit lanes
   }
 }
 
@@ -607,5 +596,3 @@ namespace Optimization {
   typedef Optimization::TimesI      TimesISIMD;
 
 }
-
-//#endif // ARM_NEON