From 3d04dc33c6c64f2ed761ef62094519a81668aaee Mon Sep 17 00:00:00 2001 From: Nils Meyer Date: Tue, 13 Jun 2017 13:26:59 +0200 Subject: [PATCH] ARM neon intrinsics support --- configure.ac | 3 + lib/simd/Grid_generic_types.h | 2 +- lib/simd/Grid_neon.h | 460 +++++++++++++++++++++++++++------- lib/simd/Grid_vector_types.h | 44 ++-- 4 files changed, 396 insertions(+), 113 deletions(-) diff --git a/configure.ac b/configure.ac index 62b7545b..20f71128 100644 --- a/configure.ac +++ b/configure.ac @@ -244,6 +244,9 @@ case ${ax_cv_cxx_compiler_vendor} in [generic SIMD vector width (in bytes)]) SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)" SIMD_FLAGS='';; + NEONv8) + AC_DEFINE([NEONV8],[1],[ARMv8 NEON]) + SIMD_FLAGS='';; QPX|BGQ) AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q]) SIMD_FLAGS='';; diff --git a/lib/simd/Grid_generic_types.h b/lib/simd/Grid_generic_types.h index 642f6ffe..eac65e09 100644 --- a/lib/simd/Grid_generic_types.h +++ b/lib/simd/Grid_generic_types.h @@ -26,7 +26,7 @@ Author: Antonin Portelli See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ - +#define GEN_SIMD_WIDTH 16 static_assert(GEN_SIMD_WIDTH % 16u == 0, "SIMD vector size is not an integer multiple of 16 bytes"); //#define VECTOR_LOOPS diff --git a/lib/simd/Grid_neon.h b/lib/simd/Grid_neon.h index 7c1ad443..f3f802e7 100644 --- a/lib/simd/Grid_neon.h +++ b/lib/simd/Grid_neon.h @@ -1,11 +1,12 @@ - /************************************************************************************* +/************************************************************************************* - Grid physics library, www.github.com/paboyle/Grid + Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/simd/Grid_neon.h Copyright (C) 2015 +Author: Nils Meyer Author: Peter Boyle Author: neo @@ -27,18 +28,23 @@ Author: neo *************************************************************************************/ /* END LEGAL */ //---------------------------------------------------------------------- -/*! @file Grid_sse4.h - @brief Optimization libraries for NEON (ARM) instructions set ARMv8 +/* + + ARMv8 NEON intrinsics layer by + + Nils Meyer , + University of Regensburg, Germany + SFB/TRR55 - Experimental - Using intrinsics - DEVELOPING! 
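+
+  Usage sketch (assumed invocation; only --enable-simd=NEONv8 follows
+  from the configure.ac hunk above, the compiler choice is illustrative):
+
+      ./configure --enable-simd=NEONv8 CXX=clang++
+
+  which defines NEONV8 and selects this intrinsics layer.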
*/
-// Time-stamp: <2015-07-10 17:45:09 neo>
 //----------------------------------------------------------------------
+//#ifndef ARM_NEON
+//#define ARM_NEON
+#include "Grid_generic_types.h"
 #include <arm_neon.h>

-// ARMv8 supports double precision
-
+namespace Grid {
 namespace Optimization {

   template<class vtype>
@@ -46,16 +52,20 @@ namespace Optimization {
     float32x4_t f;
     vtype v;
   };
-
   union u128f {
     float32x4_t v;
     float f[4];
   };
   union u128d {
     float64x2_t v;
-    double f[4];
+    double f[2];
   };
-
+  // half precision
+  union u128h {
+    float16x8_t v;
+    uint16_t f[8];
+  };
+
   struct Vsplat{
     //Complex float
     inline float32x4_t operator()(float a, float b){
       float tmp[4]={a,b,a,b};
       return vld1q_f32(tmp);
     }
     // Real float
     inline float32x4_t operator()(float a){
-      return vld1q_dup_f32(&a);
+      return vdupq_n_f32(a);
     }
     //Complex double
-    inline float32x4_t operator()(double a, double b){
-      float tmp[4]={(float)a,(float)b,(float)a,(float)b};
-      return vld1q_f32(tmp);
+    inline float64x2_t operator()(double a, double b){
+      double tmp[2]={a,b};
+      return vld1q_f64(tmp);
     }
-    //Real double
-    inline float32x4_t operator()(double a){
-      return vld1q_dup_f32(&a);
+    //Real double // N:tbc
+    inline float64x2_t operator()(double a){
+      return vdupq_n_f64(a);
     }
-    //Integer
+    //Integer // N:tbc
     inline uint32x4_t operator()(Integer a){
-      return vld1q_dup_u32(&a);
+      return vdupq_n_u32(a);
     }
   };

   struct Vstore{
-    //Float
+    //Float
     inline void operator()(float32x4_t a, float* F){
       vst1q_f32(F, a);
     }
     //Double
-    inline void operator()(float32x4_t a, double* D){
-      vst1q_f32((float*)D, a);
+    inline void operator()(float64x2_t a, double* D){
+      vst1q_f64(D, a);
     }
     //Integer
     inline void operator()(uint32x4_t a, Integer* I){
@@ -97,54 +107,54 @@
   };

-  struct Vstream{
-    //Float
+  struct Vstream{ // N:equivalents to _mm_stream_p* in NEON?
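+    // Note: to the best of my knowledge ACLE exposes no non-temporal
+    // store intrinsic (AArch64 STNP has no counterpart in arm_neon.h),
+    // so a plain copy is the portable fallback; for a fixed 16-byte
+    // size the compiler lowers memcpy to an ordinary vector store.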
+    //Float // N:generic
     inline void operator()(float * a, float32x4_t b){
-
+      memcpy(a,&b,4*sizeof(float));
     }
-    //Double
-    inline void operator()(double * a, float32x4_t b){
-
+    //Double // N:generic
+    inline void operator()(double * a, float64x2_t b){
+      memcpy(a,&b,2*sizeof(double));
     }
   };

+  // Nils: Vset untested; not used currently in Grid at all;
+  // git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b
   struct Vset{
-    // Complex float
+    // Complex float // N:ok
     inline float32x4_t operator()(Grid::ComplexF *a){
-      float32x4_t foo;
-      return foo;
+      float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()};
+      return vld1q_f32(tmp);
     }
-    // Complex double
-    inline float32x4_t operator()(Grid::ComplexD *a){
-      float32x4_t foo;
-      return foo;
+    // Complex double // N:ok
+    inline float64x2_t operator()(Grid::ComplexD *a){
+      double tmp[2]={a[0].imag(),a[0].real()};
+      return vld1q_f64(tmp);
     }
-    // Real float
+    // Real float // N:ok
    inline float32x4_t operator()(float *a){
-      float32x4_t foo;
-      return foo;
+      float tmp[4]={a[3],a[2],a[1],a[0]};
+      return vld1q_f32(tmp);
     }
-    // Real double
-    inline float32x4_t operator()(double *a){
-      float32x4_t foo;
-      return foo;
+    // Real double // N:ok
+    inline float64x2_t operator()(double *a){
+      double tmp[2]={a[1],a[0]};
+      return vld1q_f64(tmp);
     }
-    // Integer
+    // Integer // N:ok
     inline uint32x4_t operator()(Integer *a){
-      uint32x4_t foo;
-      return foo;
+      return vld1q_dup_u32(a);
     }
-
   };

+  // N:leaving as is
   template < typename Out_type, typename In_type >
   struct Reduce{
     //Need templated class to overload output type
     //General form must generate error if compiled
-    inline Out_type operator()(In_type in){
+    inline Out_type operator()(In_type in){
       printf("Error, using wrong Reduce function\n");
       exit(1);
       return 0;
     }
   };
@@ -184,26 +194,98 @@
     }
   };

+  struct MultRealPart{
+    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
+      float32x4_t re = vtrn1q_f32(a, a);
+      return vmulq_f32(re, b);
+    }
+    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
+      float64x2_t re = vzip1q_f64(a, a);
+      return vmulq_f64(re, b);
+    }
+  };
+
+  struct MaddRealPart{
+    inline float32x4_t operator()(float32x4_t a, float32x4_t b, float32x4_t c){
+      float32x4_t re = vtrn1q_f32(a, a);
+      return vfmaq_f32(c, re, b);
+    }
+    inline float64x2_t operator()(float64x2_t a, float64x2_t b, float64x2_t c){
+      float64x2_t re = vzip1q_f64(a, a);
+      return vfmaq_f64(c, re, b);
+    }
+  };
+
+  struct Div{
+    // Real float
+    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
+      return vdivq_f32(a, b);
+    }
+    // Real double
+    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
+      return vdivq_f64(a, b);
+    }
+  };
+
   struct MultComplex{
     // Complex float
     inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-      float32x4_t foo;
-      return foo;
+
+      float32x4_t r0, r1, r2, r3, r4;
+
+      // a = ar ai Ar Ai
+      // b = br bi Br Bi
+      // collect real/imag part, negate bi and Bi
+      r0 = vtrn1q_f32(b, b);    // br br Br Br
+      r1 = vnegq_f32(b);        // -br -bi -Br -Bi
+      r2 = vtrn2q_f32(b, r1);   // bi -bi Bi -Bi
+
+      // the fun part
+      r3 = vmulq_f32(r2, a);    // bi*ar -bi*ai ...
+      r4 = vrev64q_f32(r3);     // -bi*ai bi*ar ...
+
+      // fma(a,b,c) = a+b*c
+      return vfmaq_f32(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi ...
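+
+      // worked example for one complex pair (assumed values):
+      //   a = 1+2i, b = 3+4i
+      //   r0 = ( 3, 3)   r2 = ( 4,-4)
+      //   r3 = ( 4,-8)   r4 = (-8, 4)
+      //   r4 + r0*a = (-8+3*1, 4+3*2) = (-5,10), i.e. (1+2i)*(3+4i) = -5+10i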
+
+      // no fma, use mul and add
+      //float32x4_t r5;
+      //r5 = vmulq_f32(r0, a);
+      //return vaddq_f32(r4, r5);
     }
     // Complex double
     inline float64x2_t operator()(float64x2_t a, float64x2_t b){
-      float32x4_t foo;
-      return foo;
+
+      float64x2_t r0, r1, r2, r3, r4;
+
+      // b = br bi
+      // collect real/imag part, negate bi
+      r0 = vtrn1q_f64(b, b);    // br br
+      r1 = vnegq_f64(b);        // -br -bi
+      r2 = vtrn2q_f64(b, r1);   // bi -bi
+
+      // the fun part
+      r3 = vmulq_f64(r2, a);    // bi*ar -bi*ai
+      r4 = vextq_f64(r3,r3,1);  // -bi*ai bi*ar
+
+      // fma(a,b,c) = a+b*c
+      return vfmaq_f64(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi
+
+      // no fma, use mul and add
+      //float64x2_t r5;
+      //r5 = vmulq_f64(r0, a);
+      //return vaddq_f64(r4, r5);
     }
   };

   struct Mult{
     // Real float
     inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){
-      return vaddq_f32(vmulq_f32(b,c),a);
+      //return vaddq_f32(vmulq_f32(b,c),a);
+      return vfmaq_f32(a, b, c);
     }
     inline float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c){
-      return vaddq_f64(vmulq_f64(b,c),a);
+      //return vaddq_f64(vmulq_f64(b,c),a);
+      return vfmaq_f64(a, b, c);
     }
     inline float32x4_t operator()(float32x4_t a, float32x4_t b){
       return vmulq_f32(a,b);
@@ -221,89 +303,275 @@
   struct Conj{
     // Complex single
     inline float32x4_t operator()(float32x4_t in){
-      return in;
+      // ar ai br bi -> ar -ai br -bi
+      float32x4_t r0, r1;
+      r0 = vnegq_f32(in);        // -ar -ai -br -bi
+      r1 = vrev64q_f32(r0);      // -ai -ar -bi -br
+      return vtrn1q_f32(in, r1); // ar -ai br -bi
     }
     // Complex double
-    //inline float32x4_t operator()(float32x4_t in){
-    //  return 0;
-    //}
+    inline float64x2_t operator()(float64x2_t in){
+
+      float64x2_t r0, r1;
+      r0 = vextq_f64(in, in, 1);   // ai ar
+      r1 = vnegq_f64(r0);          // -ai -ar
+      return vextq_f64(r0, r1, 1); // ar -ai
+    }
     // do not define for integer input
   };

   struct TimesMinusI{
     //Complex single
     inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
-      return in;
+      // ar ai br bi -> ai -ar bi -br
+      float32x4_t r0, r1;
+      r0 = vnegq_f32(in);        // -ar -ai -br -bi
+      r1 = vrev64q_f32(in);      // ai ar bi br
+      return vtrn1q_f32(r1, r0); // ai -ar bi -br
     }
     //Complex double
-    //inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
-    //  return in;
-    //}
-
-
+    inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
+      // a ib -> b -ia
+      float64x2_t tmp;
+      tmp = vnegq_f64(in);
+      return vextq_f64(in, tmp, 1);
+    }
   };

   struct TimesI{
     //Complex single
     inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
-      //need shuffle
-      return in;
+      // ar ai br bi -> -ai ar -bi br
+      float32x4_t r0, r1;
+      r0 = vnegq_f32(in);        // -ar -ai -br -bi
+      r1 = vrev64q_f32(r0);      // -ai -ar -bi -br
+      return vtrn1q_f32(r1, in); // -ai ar -bi br
     }
     //Complex double
-    //inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
-    //  return 0;
-    //}
+    inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
+      // a ib -> -b ia
+      float64x2_t tmp;
+      tmp = vnegq_f64(in);
+      return vextq_f64(tmp, in, 1);
+    }
   };

+  struct Permute{
+
+    static inline float32x4_t Permute0(float32x4_t in){ // N:ok
+      // AB CD -> CD AB
+      return vextq_f32(in, in, 2);
+    };
+    static inline float32x4_t Permute1(float32x4_t in){ // N:ok
+      // AB CD -> BA DC
+      return vrev64q_f32(in);
+    };
+    static inline float32x4_t Permute2(float32x4_t in){ // N:not used by Boyle
+      return in;
+    };
+    static inline float32x4_t Permute3(float32x4_t in){ // N:not used by Boyle
+      return in;
+    };
+
+    static inline float64x2_t Permute0(float64x2_t in){ // N:ok
+      // AB -> BA
+      return vextq_f64(in, in, 1);
+    };
+
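+    // Lane convention, for orientation: Permute0 swaps the two 64-bit
+    // halves (floats 0123 -> 2301, doubles AB -> BA); Permute1 reverses
+    // within each 64-bit half (floats 0123 -> 1032), a no-op for doubles.
+    // Finer levels do not exist in a 128-bit register, hence the
+    // identity cases below.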
+    static inline float64x2_t Permute1(float64x2_t in){ // N:not used by Boyle
+      return in;
+    };
+    static inline float64x2_t Permute2(float64x2_t in){ // N:not used by Boyle
+      return in;
+    };
+    static inline float64x2_t Permute3(float64x2_t in){ // N:not used by Boyle
+      return in;
+    };
+
+  };
+
+  struct Rotate{
+
+    static inline float32x4_t rotate(float32x4_t in,int n){ // N:ok
+      switch(n){
+        case 0: // AB CD -> AB CD
+          return tRotate<0>(in);
+          break;
+        case 1: // AB CD -> BC DA
+          return tRotate<1>(in);
+          break;
+        case 2: // AB CD -> CD AB
+          return tRotate<2>(in);
+          break;
+        case 3: // AB CD -> DA BC
+          return tRotate<3>(in);
+          break;
+        default: assert(0);
+      }
+    }
+    static inline float64x2_t rotate(float64x2_t in,int n){ // N:ok
+      switch(n){
+        case 0: // AB -> AB
+          return tRotate<0>(in);
+          break;
+        case 1: // AB -> BA
+          return tRotate<1>(in);
+          break;
+        default: assert(0);
+      }
+    }
+
+// working, but no restriction on n
+//    template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n); };
+//    template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n); };
+
+// restriction on n
+    template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); };
+    template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); };
+
+  };
+
+  struct PrecisionChange {
+
+    static inline float16x8_t StoH (const float32x4_t &a,const float32x4_t &b) {
+      float16x4_t h = vcvt_f16_f32(a);
+      return vcvt_high_f16_f32(h, b);
+    }
+    static inline void HtoS (float16x8_t h,float32x4_t &sa,float32x4_t &sb) {
+      sb = vcvt_high_f32_f16(h);
+      // there is no direct conversion from the lower half of float16x8_t to float32x4_t
+      // vextq_f16 not supported by clang 3.8 / 4.0 / arm clang
+      //float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang
+      // workaround for clang
+      uint32x4_t h1u = reinterpret_cast<uint32x4_t>(h);
+      float16x8_t h1 = reinterpret_cast<float16x8_t>(vextq_u32(h1u, h1u, 2));
+      sa = vcvt_high_f32_f16(h1);
+    }
+    static inline float32x4_t DtoS (float64x2_t a,float64x2_t b) {
+      float32x2_t s = vcvt_f32_f64(a);
+      return vcvt_high_f32_f64(s, b);
+    }
+    static inline void StoD (float32x4_t s,float64x2_t &a,float64x2_t &b) {
+      b = vcvt_high_f64_f32(s);
+      // there is no direct conversion from lower float32x4_t to float64x2_t
+      float32x4_t s1 = vextq_f32(s, s, 2);
+      a = vcvt_high_f64_f32(s1);
+    }
+    static inline float16x8_t DtoH (float64x2_t a,float64x2_t b,float64x2_t c,float64x2_t d) {
+      float32x4_t s1 = DtoS(a, b);
+      float32x4_t s2 = DtoS(c, d);
+      return StoH(s1, s2);
+    }
+    static inline void HtoD (float16x8_t h,float64x2_t &a,float64x2_t &b,float64x2_t &c,float64x2_t &d) {
+      float32x4_t s1, s2;
+      HtoS(h, s1, s2);
+      StoD(s1, a, b);
+      StoD(s2, c, d);
+    }
+  };
+
+  //////////////////////////////////////////////
+  // Exchange support
+
+  struct Exchange{
+    static inline void Exchange0(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
+      // in1: ABCD -> out1: ABEF
+      // in2: EFGH -> out2: CDGH
+
+      // z: CDAB
+      float32x4_t z = vextq_f32(in1, in1, 2);
+      // out1: ABEF
+      out1 = vextq_f32(z, in2, 2);
+
+      // z: GHEF
+      z = vextq_f32(in2, in2, 2);
+      // out2: CDGH
+      out2 = vextq_f32(in1, z, 2);
+    };
+
+    static inline void Exchange1(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
+      // in1: ABCD -> out1: AECG
+      // in2: EFGH -> out2: BFDH
+      out1 = vtrn1q_f32(in1, in2);
+      out2 = vtrn2q_f32(in1, in2);
+    };
+    static inline void Exchange2(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
+      assert(0);
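+      // unreachable by construction: with four floats per 128-bit
+      // register Grid only exchanges at block size 2 (Exchange0) or
+      // 1 (Exchange1); Exchange2/3 would need wider vectors.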
+      return;
+    };
+    static inline void Exchange3(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
+      assert(0);
+      return;
+    };
+    // double precision
+    static inline void Exchange0(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
+      // in1: AB -> out1: AC
+      // in2: CD -> out2: BD
+      out1 = vzip1q_f64(in1, in2);
+      out2 = vzip2q_f64(in1, in2);
+    };
+    static inline void Exchange1(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
+      assert(0);
+      return;
+    };
+    static inline void Exchange2(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
+      assert(0);
+      return;
+    };
+    static inline void Exchange3(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
+      assert(0);
+      return;
+    };
+  };

   //////////////////////////////////////////////
   // Some Template specialization
-  template < typename vtype >
-    void permute(vtype &a, vtype b, int perm) {
-  };

   //Complex float Reduce
   template<>
   inline Grid::ComplexF Reduce<Grid::ComplexF, float32x4_t>::operator()(float32x4_t in){
-    return 0;
+    float32x4_t v1; // two complex
+    v1 = Optimization::Permute::Permute0(in);
+    v1 = vaddq_f32(v1,in);
+    u128f conv; conv.v=v1;
+    return Grid::ComplexF(conv.f[0],conv.f[1]);
   }
   //Real float Reduce
   template<>
   inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){
-    float32x2_t high = vget_high_f32(in);
-    float32x2_t low = vget_low_f32(in);
-    float32x2_t tmp = vadd_f32(low, high);
-    float32x2_t sum = vpadd_f32(tmp, tmp);
-    return vget_lane_f32(sum,0);
+    return vaddvq_f32(in);
   }

   //Complex double Reduce
-  template<>
+  template<> // N:by Boyle
   inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
-    return 0;
+    u128d conv; conv.v = in;
+    return Grid::ComplexD(conv.f[0],conv.f[1]);
   }

   //Real double Reduce
   template<>
   inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){
-    float64x2_t sum = vpaddq_f64(in, in);
-    return vgetq_lane_f64(sum,0);
+    return vaddvq_f64(in);
   }

   //Integer Reduce
   template<>
   inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){
     // FIXME unimplemented
-    printf("Reduce : Missing integer implementation -> FIX\n");
+    printf("Reduce : Missing integer implementation -> FIX\n");
     assert(0);
   }
 }

//////////////////////////////////////////////////////////////////////////////////////
-// Here assign types
-namespace Grid {
+// Here assign types
+
+// typedef Optimization::vech SIMD_Htype; // Reduced precision type
+  typedef float16x8_t SIMD_Htype; // Half precision type
   typedef float32x4_t SIMD_Ftype; // Single precision type
   typedef float64x2_t SIMD_Dtype; // Double precision type
   typedef uint32x4_t SIMD_Itype; // Integer type
@@ -312,13 +580,6 @@ namespace Grid {

   inline void prefetch_HINT_T0(const char *ptr){};

-  // Gpermute function
-  template < typename VectorSIMD >
-  inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
-    Optimization::permute(y.v,b.v,perm);
-  }
-
-
   // Function name aliases
   typedef Optimization::Vsplat VsplatSIMD;
   typedef Optimization::Vstore VstoreSIMD;
   typedef Optimization::Vstream VstreamSIMD;
   template < typename Out_type, typename In_type > using ReduceSIMD = Optimization::Reduce<Out_type, In_type>;
-
+
   // Arithmetic operations
   typedef Optimization::Sum SumSIMD;
   typedef Optimization::Sub SubSIMD;
+  typedef Optimization::Div DivSIMD;
   typedef Optimization::Mult MultSIMD;
   typedef Optimization::MultComplex MultComplexSIMD;
+  typedef Optimization::MultRealPart MultRealPartSIMD;
+  typedef Optimization::MaddRealPart MaddRealPartSIMD;
   typedef Optimization::Conj ConjSIMD;
   typedef Optimization::TimesMinusI TimesMinusISIMD;
   typedef Optimization::TimesI TimesISIMD;
 }

+//#endif // ARM_NEON
diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h
index 0048382f..424b5573 100644
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -328,15 +328,15 @@ class Grid_simd {
   ///////////////////////////////////////
   //#if (__GNUC__ == 5 ) || ( ( __GNUC__ == 6 ) && __GNUC_MINOR__ < 3 )
-  //#pragma GCC push_options
-  //#pragma GCC optimize ("O0")
+  //#pragma GCC push_options
+  //#pragma GCC optimize ("O0")
   //#endif
   template <class functor>
   friend inline Grid_simd SimdApply(const functor &func, const Grid_simd &v) {
     Grid_simd ret;
     Grid_simd::conv_t conv;
     Grid_simd::scalar_type s;
-
+
     conv.v = v.v;
     for (int i = 0; i < Nsimd(); i++) {
       s = conv.s[i];
@@ -368,7 +368,7 @@ class Grid_simd {
   //#pragma GCC pop_options
   //#endif
   ///////////////////////
-  // Exchange
+  // Exchange // Al Ah , Bl Bh -> Al Bl Ah,Bh
   ///////////////////////
   friend inline void exchange(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2,int n)
   {
@@ -379,7 +379,7 @@ class Grid_simd {
       Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v);
     } else if(n==1) {
       Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v);
-    } else if(n==0) {
+    } else if(n==0) {
       Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v);
     }
   }
@@ -406,7 +406,7 @@ class Grid_simd {
       int dist = perm & 0xF;
       y = rotate(b, dist);
       return;
-    }
+    }
     else if(perm==3) permute3(y, b);
     else if(perm==2) permute2(y, b);
     else if(perm==1) permute1(y, b);
@@ -425,7 +425,7 @@ class Grid_simd {
   }

-
+
 };  // end of Grid_simd class definition

@@ -451,29 +451,29 @@ inline Grid_simd<S, V> rotate(Grid_simd<S, V> b, int nrot) {
   ret.v = Optimization::Rotate::rotate(b.v, 2 * nrot);
   return ret;
 }
-template <class S, class V, IfNotComplex<S> =0>
+template <class S, class V, IfNotComplex<S> =0>
 inline void rotate( Grid_simd<S, V> &ret,Grid_simd<S, V> b,int nrot)
 {
   nrot = nrot % Grid_simd<S, V>::Nsimd();
   ret.v = Optimization::Rotate::rotate(b.v,nrot);
 }
-template <class S, class V, IfComplex<S> =0>
+template <class S, class V, IfComplex<S> =0>
 inline void rotate(Grid_simd<S, V> &ret,Grid_simd<S, V> b,int nrot)
 {
   nrot = nrot % Grid_simd<S, V>::Nsimd();
   ret.v = Optimization::Rotate::rotate(b.v,2*nrot);
 }
-template <class S, class V>
+template <class S, class V>
 inline void vbroadcast(Grid_simd<S, V> &ret,const Grid_simd<S, V> &src,int lane){
   S* typepun =(S*) &src;
   vsplat(ret,typepun[lane]);
-}
+}
-template <class S, class V, IfComplex<S> =0>
+template <class S, class V, IfComplex<S> =0>
 inline void rbroadcast(Grid_simd<S, V> &ret,const Grid_simd<S, V> &src,int lane){
   S* typepun =(S*) &src;
   ret.v = unary<V>(real(typepun[lane]), VsplatSIMD());
-}
+}

@@ -604,13 +604,27 @@ inline Grid_simd<S, V> real_mult(Grid_simd<S, V> a, Grid_simd<S, V> b) {
   ret.v = binary<V>(a.v, b.v, MultRealPartSIMD());
   return ret;
 };
+// TEST for Test_simd
+template <class S, class V, IfComplex<S> = 0>
+inline Grid_simd<S, V> real_mult(std::complex<S> a, std::complex<S> b) {
+  Grid_simd<S, V> ret;
+  //ret.v = binary<V>(a.v, b.v, MultRealPartSIMD());
+  return ret;
+};
+
 template <class S, class V, IfComplex<S> = 0>
 inline Grid_simd<S, V> real_madd(Grid_simd<S, V> a, Grid_simd<S, V> b, Grid_simd<S, V> c) {
   Grid_simd<S, V> ret;
   ret.v = trinary<V>(a.v, b.v, c.v, MaddRealPartSIMD());
   return ret;
 };
+// TEST for Test_simd
+template <class S, class V, IfComplex<S> = 0>
+inline Grid_simd<S, V> real_madd(std::complex<S> a, std::complex<S> b) {
+  Grid_simd<S, V> ret;
+  //ret.v = binary<V>(a.v, b.v, MultRealPartSIMD());
+  return ret;
+};

 // Distinguish between complex types and others
 template <class S, class V, IfComplex<S> = 0>
@@ -640,7 +654,7 @@ inline Grid_simd<S, V> operator/(Grid_simd<S, V> a, Grid_simd<S, V> b) {
   ret = a * conjugate(b) ;
   den = b * conjugate(b) ;
-
+
   auto real_den = toReal(den);

   ret.v=binary<V>(ret.v, real_den.v, DivSIMD());
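
A quick standalone sketch for exercising the MultComplex shuffle sequence
and the vaddvq-based reduction used above; the file name, compiler flags
and test values are illustrative only, and an AArch64 toolchain with
arm_neon.h is assumed:

// check_neon.cpp -- illustrative, e.g.: g++ -O2 -o check_neon check_neon.cpp
#include <arm_neon.h>
#include <complex>
#include <cstdio>

int main(void) {
  // two complex floats per 128-bit register: a = (1+2i, 3+4i), b = (5+6i, 7+8i)
  const float av[4] = {1.f, 2.f, 3.f, 4.f};
  const float bv[4] = {5.f, 6.f, 7.f, 8.f};
  float32x4_t a = vld1q_f32(av);
  float32x4_t b = vld1q_f32(bv);

  // same shuffle sequence as Optimization::MultComplex above
  float32x4_t r0 = vtrn1q_f32(b, b);             // br br Br Br
  float32x4_t r2 = vtrn2q_f32(b, vnegq_f32(b));  // bi -bi Bi -Bi
  float32x4_t r4 = vrev64q_f32(vmulq_f32(r2, a));
  float32x4_t c  = vfmaq_f32(r4, r0, a);         // complex product a*b

  float cv[4];
  vst1q_f32(cv, c);

  // reference values from std::complex
  std::complex<float> ref0 = std::complex<float>(1, 2) * std::complex<float>(5, 6);
  std::complex<float> ref1 = std::complex<float>(3, 4) * std::complex<float>(7, 8);
  std::printf("neon: (%g,%g) (%g,%g)   ref: (%g,%g) (%g,%g)\n",
              cv[0], cv[1], cv[2], cv[3],
              ref0.real(), ref0.imag(), ref1.real(), ref1.imag());

  // horizontal sum as in the Reduce<RealF, float32x4_t> specialization
  std::printf("vaddvq_f32 = %g (expect %g)\n",
              vaddvq_f32(c), cv[0] + cv[1] + cv[2] + cv[3]);
  return 0;
}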