Merge branch 'develop' of https://github.com/paboyle/Grid into develop

2025-10-25 10:09:34 +01:00 · 2017-06-30 10:53:50 +01:00
parent b5a6e4f1fd 09d09d0fe5
commit a0be3f7330
5 changed files with 385 additions and 114 deletions
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@
 License: GPL v2.
-Last update Nov 2016.
+Last update June 2017.
 _Please do not send pull requests to the `master` branch which is reserved for releases._
@@ -78,13 +78,16 @@ optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a signifi
 for most programmers.
 The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
-Presently SSE4 (128 bit) AVX, AVX2, QPX (256 bit), IMCI, and AVX512 (512 bit) targets are supported (ARM NEON on the way).
+Presently SSE4, ARM NEON (128 bits) AVX, AVX2, QPX (256 bits), IMCI and AVX512 (512 bits) targets are supported.
-These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. These may be useful in themselves for other programmers.
+These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. 
 The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.
 MPI, OpenMP, and SIMD parallelism are present in the library.
-Please see https://arxiv.org/abs/1512.03487 for more detail.
+Please see [this paper](https://arxiv.org/abs/1512.03487) for more detail.
 ### Required libraries
 Grid requires [GMP](https://gmplib.org/), [MPFR](http://www.mpfr.org/) and optionally [HDF5](https://support.hdfgroup.org/HDF5/) and [LIME](http://usqcd-software.github.io/c-lime/) (for ILDG file format support) to be installed.
 ### Quick start
 First, start by cloning the repository:
@@ -173,7 +176,8 @@ The following options can be use with the `--enable-simd=` option to target diff
 | `AVXFMA4`   | AVX (256 bit) + FMA4                   |
 | `AVX2`      | AVX 2 (256 bit)                        |
 | `AVX512`    | AVX 512 bit                            |
-| `QPX`       | QPX (256 bit)                          |
+| `NEONv8`    | [ARM NEON](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.den0024a/ch07s03.html) (128 bit)                     |
 | `QPX`       | IBM QPX (256 bit)                      |
 Alternatively, some CPU codenames can be directly used:
--- a/configure.ac
+++ b/configure.ac
@@ -249,6 +249,9 @@ case ${ax_cv_cxx_compiler_vendor} in
                           [generic SIMD vector width (in bytes)])
        SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)"
        SIMD_FLAGS='';;
      NEONv8)
        AC_DEFINE([NEONV8],[1],[ARMv8 NEON])
        SIMD_FLAGS='-march=armv8-a';;
      QPX|BGQ)
        AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q])
        SIMD_FLAGS='';;
--- a/lib/qcd/smearing/WilsonFlow.h
+++ b/lib/qcd/smearing/WilsonFlow.h
@@ -108,7 +108,7 @@ void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real
    if (maxTau - taus < epsilon){
        epsilon = maxTau-taus;
    }
-    std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl;
+    //std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl;
    GaugeField Z(U._grid);
    GaugeField Zprime(U._grid);
    GaugeField tmp(U._grid), Uprime(U._grid);
@@ -138,10 +138,10 @@ void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real
    // adjust integration step
    taus += epsilon;
-    std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl;
+    //std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl;
    epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.);
-    std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl;
+    //std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl;
 }
@@ -166,7 +166,6 @@ void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const {
    out = in;
    for (unsigned int step = 1; step <= Nstep; step++) {
        auto start = std::chrono::high_resolution_clock::now();
        std::cout << GridLogMessage << "Evolution time :"<< tau(step) << std::endl;
        evolve_step(out);
        auto end = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> diff = end - start;
@@ -191,7 +190,7 @@ void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, Re
    unsigned int step = 0;
    do{
        step++;
-        std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl;
+        //std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl;
        evolve_step_adaptive(out, maxTau);
        std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
            << step << "  "
--- a/lib/simd/Grid_neon.h
+++ b/lib/simd/Grid_neon.h
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
@@ -6,8 +6,9 @@
    Copyright (C) 2015
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+    Author: Nils Meyer <nils.meyer@ur.de>
-Author: neo <cossu@post.kek.jp>
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: neo <cossu@post.kek.jp>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -26,19 +27,25 @@ Author: neo <cossu@post.kek.jp>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 //----------------------------------------------------------------------
 /*! @file Grid_sse4.h
  @brief Optimization libraries for NEON (ARM) instructions set ARMv8
-  Experimental - Using intrinsics - DEVELOPING! 
+/*
  ARMv8 NEON intrinsics layer by
  Nils Meyer <nils.meyer@ur.de>,
  University of Regensburg, Germany
  SFB/TRR55
 */
 // Time-stamp: <2015-07-10 17:45:09 neo>
 //----------------------------------------------------------------------
 #ifndef GEN_SIMD_WIDTH
 #define GEN_SIMD_WIDTH 16u
 #endif
 #include "Grid_generic_types.h"
 #include <arm_neon.h>
-// ARMv8 supports double precision
+namespace Grid {
 namespace Optimization {
  template<class vtype>
@@ -46,14 +53,18 @@ namespace Optimization {
    float32x4_t f;
    vtype v;
  };
  union u128f {
    float32x4_t v;
    float f[4];
  };
  union u128d {
    float64x2_t v;
-    double f[4];
+    double f[2];
  };
  // half precision
  union u128h {
    float16x8_t v;
    uint16_t f[8];
  };
  struct Vsplat{
@@ -64,20 +75,20 @@ namespace Optimization {
    }
    // Real float
    inline float32x4_t operator()(float a){
-      return vld1q_dup_f32(&a);
+      return vdupq_n_f32(a);
    }
    //Complex double
-    inline float32x4_t operator()(double a, double b){
+    inline float64x2_t operator()(double a, double b){
-      float tmp[4]={(float)a,(float)b,(float)a,(float)b};
+      double tmp[2]={a,b};
-      return vld1q_f32(tmp);
+      return vld1q_f64(tmp);
    }
-    //Real double
+    //Real double // N:tbc
-    inline float32x4_t operator()(double a){
+    inline float64x2_t operator()(double a){
-      return vld1q_dup_f32(&a);
+      return vdupq_n_f64(a);
    }
-    //Integer
+    //Integer // N:tbc
    inline uint32x4_t operator()(Integer a){
-      return vld1q_dup_u32(&a);
+      return vdupq_n_u32(a);
    }
  };
@@ -87,8 +98,8 @@ namespace Optimization {
      vst1q_f32(F, a);
    }
    //Double
-    inline void operator()(float32x4_t a, double* D){
+    inline void operator()(float64x2_t a, double* D){
-      vst1q_f32((float*)D, a);
+      vst1q_f64(D, a);
    }
    //Integer
    inline void operator()(uint32x4_t a, Integer* I){
@@ -97,54 +108,54 @@ namespace Optimization {
  };
-  struct Vstream{
+  struct Vstream{ // N:equivalents to _mm_stream_p* in NEON?
-    //Float
+    //Float // N:generic
    inline void operator()(float * a, float32x4_t b){
-    
+      memcpy(a,&b,4*sizeof(float));
    }
-    //Double
+    //Double // N:generic
-    inline void operator()(double * a, float32x4_t b){
+    inline void operator()(double * a, float64x2_t b){
-  
+      memcpy(a,&b,2*sizeof(double));
    }
  };
  // Nils: Vset untested; not used currently in Grid at all;
  // git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b
  struct Vset{
-    // Complex float 
+    // Complex float // N:ok
    inline float32x4_t operator()(Grid::ComplexF *a){
-      float32x4_t foo;
+      float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()};
-      return foo;
+      return vld1q_f32(tmp);
    }
-    // Complex double 
+    // Complex double // N:ok
-    inline float32x4_t operator()(Grid::ComplexD *a){
+    inline float64x2_t operator()(Grid::ComplexD *a){
-      float32x4_t foo;
+      double tmp[2]={a[0].imag(),a[0].real()};
-      return foo;
+      return vld1q_f64(tmp);
    }
-    // Real float 
+    // Real float // N:ok
    inline float32x4_t operator()(float *a){
-      float32x4_t foo;
+      float tmp[4]={a[3],a[2],a[1],a[0]};
-      return foo;
+      return vld1q_f32(tmp);
    }
-    // Real double
+    // Real double // N:ok
-    inline float32x4_t operator()(double *a){
+    inline float64x2_t operator()(double *a){
-      float32x4_t foo;
+      double tmp[2]={a[1],a[0]};
-      return foo;
+      return vld1q_f64(tmp);
    }
-    // Integer
+    // Integer // N:ok
    inline uint32x4_t operator()(Integer *a){
-      uint32x4_t foo;
+      return vld1q_dup_u32(a);
      return foo;
    }
  };
  // N:leaving as is
  template <typename Out_type, typename In_type>
  struct Reduce{
    //Need templated class to overload output type
    //General form must generate error if compiled
-    inline Out_type operator()(In_type in){
+      inline Out_type operator()(In_type in){
      printf("Error, using wrong Reduce function\n");
      exit(1);
      return 0;
@@ -184,26 +195,98 @@ namespace Optimization {
    }
  };
  struct MultRealPart{
    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
      float32x4_t re = vtrn1q_f32(a, a);
      return vmulq_f32(re, b);
    }
    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
      float64x2_t re = vzip1q_f64(a, a);
      return vmulq_f64(re, b);
    }
  };
  struct MaddRealPart{
    inline float32x4_t operator()(float32x4_t a, float32x4_t b, float32x4_t c){
      float32x4_t re = vtrn1q_f32(a, a);
      return vfmaq_f32(c, re, b);
    }
    inline float64x2_t operator()(float64x2_t a, float64x2_t b, float64x2_t c){
      float64x2_t re = vzip1q_f64(a, a);
      return vfmaq_f64(c, re, b);
    }
  };
  struct Div{
    // Real float
    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
      return vdivq_f32(a, b);
    }
    // Real double
    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
      return vdivq_f64(a, b);
    }
  };
  struct MultComplex{
    // Complex float
    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-      float32x4_t foo;
+
-      return foo;
+      float32x4_t r0, r1, r2, r3, r4;
      // a = ar ai Ar Ai
      // b = br bi Br Bi
      // collect real/imag part, negate bi and Bi
      r0 = vtrn1q_f32(b, b);       //  br  br  Br  Br
      r1 = vnegq_f32(b);           // -br -bi -Br -Bi
      r2 = vtrn2q_f32(b, r1);      //  bi -bi  Bi -Bi
      // the fun part
      r3 = vmulq_f32(r2, a);       //  bi*ar -bi*ai ...
      r4 = vrev64q_f32(r3);        // -bi*ai  bi*ar ...
      // fma(a,b,c) = a+b*c
      return vfmaq_f32(r4, r0, a); //  ar*br-ai*bi ai*br+ar*bi ...
      // no fma, use mul and add
      //float32x4_t r5;
      //r5 = vmulq_f32(r0, a);
      //return vaddq_f32(r4, r5);
    }
    // Complex double
    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
-      float32x4_t foo;
+
-      return foo;
+      float64x2_t r0, r1, r2, r3, r4;
      // b = br bi
      // collect real/imag part, negate bi
      r0 = vtrn1q_f64(b, b);       //  br  br
      r1 = vnegq_f64(b);           // -br -bi
      r2 = vtrn2q_f64(b, r1);      //  bi -bi
      // the fun part
      r3 = vmulq_f64(r2, a);       //  bi*ar -bi*ai
      r4 = vextq_f64(r3,r3,1);     // -bi*ai  bi*ar
      // fma(a,b,c) = a+b*c
      return vfmaq_f64(r4, r0, a); //  ar*br-ai*bi ai*br+ar*bi
      // no fma, use mul and add
      //float64x2_t r5;
      //r5 = vmulq_f64(r0, a);
      //return vaddq_f64(r4, r5);
    }
  };
  struct Mult{
    // Real float
    inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){
-      return vaddq_f32(vmulq_f32(b,c),a);
+      //return vaddq_f32(vmulq_f32(b,c),a);
      return vfmaq_f32(a, b, c);
    }
    inline float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c){
-      return vaddq_f64(vmulq_f64(b,c),a);
+      //return vaddq_f64(vmulq_f64(b,c),a);
      return vfmaq_f64(a, b, c);
    }
    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
      return vmulq_f32(a,b);
@@ -221,89 +304,275 @@ namespace Optimization {
  struct Conj{
    // Complex single
    inline float32x4_t operator()(float32x4_t in){
-      return in;
+      // ar ai br bi -> ar -ai br -bi
      float32x4_t r0, r1;
      r0 = vnegq_f32(in);        // -ar -ai -br -bi
      r1 = vrev64q_f32(r0);      // -ai -ar -bi -br
      return vtrn1q_f32(in, r1); //  ar -ai  br -bi
    }
    // Complex double
-    //inline float32x4_t operator()(float32x4_t in){
+    inline float64x2_t operator()(float64x2_t in){
-    // return 0;
+
-    //}
+      float64x2_t r0, r1;
      r0 = vextq_f64(in, in, 1);    //  ai  ar
      r1 = vnegq_f64(r0);           // -ai -ar
      return vextq_f64(r0, r1, 1);  //  ar -ai
    }
    // do not define for integer input
  };
  struct TimesMinusI{
    //Complex single
    inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
-      return in;
+      // ar ai br bi -> ai -ar ai -br
      float32x4_t r0, r1;
      r0 = vnegq_f32(in);        // -ar -ai -br -bi
      r1 = vrev64q_f32(in);      //  ai  ar  bi  br
      return vtrn1q_f32(r1, r0); //  ar -ai  br -bi
    }
    //Complex double
-    //inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
+    inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
-    //  return in;
+      // a ib -> b -ia
-    //}
+      float64x2_t tmp;
-
+      tmp = vnegq_f64(in);
-
+      return vextq_f64(in, tmp, 1);
    }
  };
  struct TimesI{
    //Complex single
    inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
-      //need shuffle
+      // ar ai br bi -> -ai ar -bi br
-      return in;
+      float32x4_t r0, r1;
      r0 = vnegq_f32(in);        // -ar -ai -br -bi
      r1 = vrev64q_f32(r0);      // -ai -ar -bi -br
      return vtrn1q_f32(r1, in); // -ai  ar -bi  br
    }
    //Complex double
-    //inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
+    inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
-    //  return 0;
+      // a ib -> -b ia
-    //}
+      float64x2_t tmp;
      tmp = vnegq_f64(in);
      return vextq_f64(tmp, in, 1);
    }
  };
  struct Permute{
    static inline float32x4_t Permute0(float32x4_t in){ // N:ok
      // AB CD -> CD AB
      return vextq_f32(in, in, 2);
    };
    static inline float32x4_t Permute1(float32x4_t in){ // N:ok
      // AB CD -> BA DC
      return vrev64q_f32(in);
    };
    static inline float32x4_t Permute2(float32x4_t in){ // N:not used by Boyle
      return in;
    };
    static inline float32x4_t Permute3(float32x4_t in){ // N:not used by Boyle
      return in;
    };
    static inline float64x2_t Permute0(float64x2_t in){ // N:ok
      // AB -> BA
      return vextq_f64(in, in, 1);
    };
    static inline float64x2_t Permute1(float64x2_t in){ // N:not used by Boyle
      return in;
    };
    static inline float64x2_t Permute2(float64x2_t in){ // N:not used by Boyle
      return in;
    };
    static inline float64x2_t Permute3(float64x2_t in){ // N:not used by Boyle
      return in;
    };
  };
  struct Rotate{
    static inline float32x4_t rotate(float32x4_t in,int n){ // N:ok
      switch(n){
      case 0: // AB CD -> AB CD
        return tRotate<0>(in);
        break;
      case 1: // AB CD -> BC DA
        return tRotate<1>(in);
        break;
      case 2: // AB CD -> CD AB
        return tRotate<2>(in);
        break;
      case 3: // AB CD -> DA BC
        return tRotate<3>(in);
        break;
      default: assert(0);
      }
    }
    static inline float64x2_t rotate(float64x2_t in,int n){ // N:ok
      switch(n){
      case 0: // AB -> AB
        return tRotate<0>(in);
        break;
      case 1: // AB -> BA
        return tRotate<1>(in);
        break;
      default: assert(0);
      }
    }
 // working, but no restriction on n
 //    template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n); };
 //    template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n); };
 // restriction on n
    template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); };
    template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); };
  };
  struct PrecisionChange {
    static inline float16x8_t StoH (const float32x4_t &a,const float32x4_t &b) {
      float16x4_t h = vcvt_f16_f32(a);
      return vcvt_high_f16_f32(h, b);
    }
    static inline void  HtoS (float16x8_t h,float32x4_t &sa,float32x4_t &sb) {
      sb = vcvt_high_f32_f16(h);
      // there is no direct conversion from lower float32x4_t to float64x2_t
      // vextq_f16 not supported by clang 3.8 / 4.0 / arm clang
      //float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang
      // workaround for clang
      uint32x4_t h1u = reinterpret_cast<uint32x4_t>(h);
      float16x8_t h1 = reinterpret_cast<float16x8_t>(vextq_u32(h1u, h1u, 2));
      sa = vcvt_high_f32_f16(h1);
    }
    static inline float32x4_t DtoS (float64x2_t a,float64x2_t b) {
      float32x2_t s = vcvt_f32_f64(a);
      return vcvt_high_f32_f64(s, b);
    }
    static inline void StoD (float32x4_t s,float64x2_t &a,float64x2_t &b) {
      b = vcvt_high_f64_f32(s);
      // there is no direct conversion from lower float32x4_t to float64x2_t
      float32x4_t s1 = vextq_f32(s, s, 2);
      a = vcvt_high_f64_f32(s1);
    }
    static inline float16x8_t DtoH (float64x2_t a,float64x2_t b,float64x2_t c,float64x2_t d) {
      float32x4_t s1 = DtoS(a, b);
      float32x4_t s2 = DtoS(c, d);
      return StoH(s1, s2);
    }
    static inline void HtoD (float16x8_t h,float64x2_t &a,float64x2_t &b,float64x2_t &c,float64x2_t &d) {
      float32x4_t s1, s2;
      HtoS(h, s1, s2);
      StoD(s1, a, b);
      StoD(s2, c, d);
    }
  };
  //////////////////////////////////////////////
  // Exchange support
  struct Exchange{
    static inline void Exchange0(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
      // in1: ABCD -> out1: ABEF
      // in2: EFGH -> out2: CDGH
      // z: CDAB
      float32x4_t z = vextq_f32(in1, in1, 2);
      // out1: ABEF
      out1 = vextq_f32(z, in2, 2);
      // z: GHEF
      z = vextq_f32(in2, in2, 2);
      // out2: CDGH
      out2 = vextq_f32(in1, z, 2);
    };
    static inline void Exchange1(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
      // in1: ABCD -> out1: AECG
      // in2: EFGH -> out2: BFDH
      out1 = vtrn1q_f32(in1, in2);
      out2 = vtrn2q_f32(in1, in2);
    };
    static inline void Exchange2(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
      assert(0);
      return;
    };
    static inline void Exchange3(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
      assert(0);
      return;
    };
    // double precision
    static inline void Exchange0(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
      // in1: AB -> out1: AC
      // in2: CD -> out2: BD
      out1 = vzip1q_f64(in1, in2);
      out2 = vzip2q_f64(in1, in2);
    };
    static inline void Exchange1(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
      assert(0);
      return;
    };
    static inline void Exchange2(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
      assert(0);
      return;
    };
    static inline void Exchange3(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
      assert(0);
      return;
    };
  };
  //////////////////////////////////////////////
  // Some Template specialization
  template < typename vtype > 
    void permute(vtype &a, vtype b, int perm) {
  }; 
  //Complex float Reduce
  template<>
  inline Grid::ComplexF Reduce<Grid::ComplexF, float32x4_t>::operator()(float32x4_t in){
-    return 0;
+    float32x4_t v1; // two complex
    v1 = Optimization::Permute::Permute0(in);
    v1 = vaddq_f32(v1,in);
    u128f conv;    conv.v=v1;
    return Grid::ComplexF(conv.f[0],conv.f[1]);
  }
  //Real float Reduce
  template<>
  inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){
-    float32x2_t high = vget_high_f32(in);
+    return vaddvq_f32(in);
    float32x2_t low = vget_low_f32(in);
    float32x2_t tmp = vadd_f32(low, high);
    float32x2_t sum = vpadd_f32(tmp, tmp);
    return vget_lane_f32(sum,0);
  }
  //Complex double Reduce
-  template<>
+  template<> // N:by Boyle
  inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
-    return 0;
+    u128d conv; conv.v = in;
    return Grid::ComplexD(conv.f[0],conv.f[1]);
  }
  //Real double Reduce
  template<>
  inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){
-    float64x2_t sum = vpaddq_f64(in, in);
+    return vaddvq_f64(in);
    return vgetq_lane_f64(sum,0);
  }
  //Integer Reduce
  template<>
  inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){
    // FIXME unimplemented
-   printf("Reduce : Missing integer implementation -> FIX\n");
+    printf("Reduce : Missing integer implementation -> FIX\n");
    assert(0);
  }
 }
 //////////////////////////////////////////////////////////////////////////////////////
 // Here assign types
 namespace Grid {
 // typedef Optimization::vech SIMD_Htype; // Reduced precision type
  typedef float16x8_t  SIMD_Htype; // Half precision type
  typedef float32x4_t  SIMD_Ftype; // Single precision type
  typedef float64x2_t  SIMD_Dtype; // Double precision type
  typedef uint32x4_t   SIMD_Itype; // Integer type
@@ -312,13 +581,6 @@ namespace Grid {
  inline void prefetch_HINT_T0(const char *ptr){};
  // Gpermute function
  template < typename VectorSIMD > 
    inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
    Optimization::permute(y.v,b.v,perm);
  }
  // Function name aliases
  typedef Optimization::Vsplat   VsplatSIMD;
  typedef Optimization::Vstore   VstoreSIMD;
@@ -332,8 +594,11 @@ namespace Grid {
  // Arithmetic operations
  typedef Optimization::Sum         SumSIMD;
  typedef Optimization::Sub         SubSIMD;
  typedef Optimization::Div         DivSIMD;
  typedef Optimization::Mult        MultSIMD;
  typedef Optimization::MultComplex MultComplexSIMD;
  typedef Optimization::MultRealPart MultRealPartSIMD;
  typedef Optimization::MaddRealPart MaddRealPartSIMD;
  typedef Optimization::Conj        ConjSIMD;
  typedef Optimization::TimesMinusI TimesMinusISIMD;
  typedef Optimization::TimesI      TimesISIMD;
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -53,7 +53,7 @@ directory
 #if defined IMCI
 #include "Grid_imci.h"
 #endif
-#ifdef NEONv8
+#ifdef NEONV8
 #include "Grid_neon.h"
 #endif
 #if defined QPX