From 3d04dc33c6c64f2ed761ef62094519a81668aaee Mon Sep 17 00:00:00 2001
From: Nils Meyer <nils.meyer@ur.de>
Date: Tue, 13 Jun 2017 13:26:59 +0200
Subject: [PATCH 1/8] ARM neon intrinsics support

---
 configure.ac                  |   3 +
 lib/simd/Grid_generic_types.h |   2 +-
 lib/simd/Grid_neon.h          | 460 +++++++++++++++++++++++++++-------
 lib/simd/Grid_vector_types.h  |  44 ++--
 4 files changed, 396 insertions(+), 113 deletions(-)
diff --git a/configure.ac b/configure.ac
index 62b7545b..20f71128 100644
--- a/configure.ac
+++ b/configure.ac
@@ -244,6 +244,9 @@ case ${ax_cv_cxx_compiler_vendor} in
                            [generic SIMD vector width (in bytes)])
         SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)"
         SIMD_FLAGS='';;
+      NEONv8)
+        AC_DEFINE([NEONV8],[1],[ARMv8 NEON])
+        SIMD_FLAGS='';;
       QPX|BGQ)
         AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q])
         SIMD_FLAGS='';;
diff --git a/lib/simd/Grid_generic_types.h b/lib/simd/Grid_generic_types.h
index 642f6ffe..eac65e09 100644
--- a/lib/simd/Grid_generic_types.h
+++ b/lib/simd/Grid_generic_types.h
@@ -26,7 +26,7 @@ Author: Antonin Portelli <antonin.portelli@me.com>
     See the full license in the file "LICENSE" in the top level distribution directory
     *************************************************************************************/
     /*  END LEGAL */
-
+#define GEN_SIMD_WIDTH 16
 static_assert(GEN_SIMD_WIDTH % 16u == 0, "SIMD vector size is not an integer multiple of 16 bytes");
 
 //#define VECTOR_LOOPS
diff --git a/lib/simd/Grid_neon.h b/lib/simd/Grid_neon.h
index 7c1ad443..f3f802e7 100644
--- a/lib/simd/Grid_neon.h
+++ b/lib/simd/Grid_neon.h
@@ -1,11 +1,12 @@
-    /*************************************************************************************
+/*************************************************************************************
 
-    Grid physics library, www.github.com/paboyle/Grid 
+    Grid physics library, www.github.com/paboyle/Grid
 
     Source file: ./lib/simd/Grid_neon.h
 
     Copyright (C) 2015
 
+Author: Nils Meyer <nils.meyer@ur.de>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
 
@@ -27,18 +28,23 @@ Author: neo <cossu@post.kek.jp>
     *************************************************************************************/
     /*  END LEGAL */
 //----------------------------------------------------------------------
-/*! @file Grid_sse4.h
-  @brief Optimization libraries for NEON (ARM) instructions set ARMv8
+/*
+
+  ARMv8 NEON intrinsics layer by
+
+  Nils Meyer <nils.meyer@ur.de>,
+  University of Regensburg, Germany
+  SFB/TRR55
 
-  Experimental - Using intrinsics - DEVELOPING! 
 */
-// Time-stamp: <2015-07-10 17:45:09 neo>
 //----------------------------------------------------------------------
+//#ifndef ARM_NEON
+//#define ARM_NEON
 
+#include "Grid_generic_types.h"
 #include <arm_neon.h>
 
-// ARMv8 supports double precision
-
+namespace Grid {
 namespace Optimization {
 
   template<class vtype>
@@ -46,16 +52,20 @@ namespace Optimization {
     float32x4_t f;
     vtype v;
   };
-
   union u128f {
     float32x4_t v;
     float f[4];
   };
   union u128d {
     float64x2_t v;
-    double f[4];
+    double f[2];
   };
-  
+  // half precision
+  union u128h {
+    float16x8_t v;
+    uint16_t f[8];
+  };
+
   struct Vsplat{
     //Complex float
     inline float32x4_t operator()(float a, float b){
@@ -64,31 +74,31 @@ namespace Optimization {
     }
     // Real float
     inline float32x4_t operator()(float a){
-      return vld1q_dup_f32(&a);
+      return vdupq_n_f32(a);
     }
     //Complex double
-    inline float32x4_t operator()(double a, double b){
-      float tmp[4]={(float)a,(float)b,(float)a,(float)b};
-      return vld1q_f32(tmp);
+    inline float64x2_t operator()(double a, double b){
+      double tmp[2]={a,b};
+      return vld1q_f64(tmp);
     }
-    //Real double
-    inline float32x4_t operator()(double a){
-      return vld1q_dup_f32(&a);
+    //Real double // N:tbc
+    inline float64x2_t operator()(double a){
+      return vdupq_n_f64(a);
     }
-    //Integer
+    //Integer // N:tbc
     inline uint32x4_t operator()(Integer a){
-      return vld1q_dup_u32(&a);
+      return vdupq_n_u32(a);
     }
   };
 
   struct Vstore{
-    //Float 
+    //Float
     inline void operator()(float32x4_t a, float* F){
       vst1q_f32(F, a);
     }
     //Double
-    inline void operator()(float32x4_t a, double* D){
-      vst1q_f32((float*)D, a);
+    inline void operator()(float64x2_t a, double* D){
+      vst1q_f64(D, a);
     }
     //Integer
     inline void operator()(uint32x4_t a, Integer* I){
@@ -97,54 +107,54 @@ namespace Optimization {
 
   };
 
-  struct Vstream{
-    //Float
+  struct Vstream{ // N:equivalents to _mm_stream_p* in NEON?
+    //Float // N:generic
     inline void operator()(float * a, float32x4_t b){
-    
+      memcpy(a,&b,4*sizeof(float));
     }
-    //Double
-    inline void operator()(double * a, float32x4_t b){
-  
+    //Double // N:generic
+    inline void operator()(double * a, float64x2_t b){
+      memcpy(a,&b,2*sizeof(double));
     }
 
 
   };
 
+  // Nils: Vset untested; not used currently in Grid at all;
+  // git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b
   struct Vset{
-    // Complex float 
+    // Complex float // N:ok
     inline float32x4_t operator()(Grid::ComplexF *a){
-      float32x4_t foo;
-      return foo;
+      float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()};
+      return vld1q_f32(tmp);
     }
-    // Complex double 
-    inline float32x4_t operator()(Grid::ComplexD *a){
-      float32x4_t foo;
-      return foo;
+    // Complex double // N:ok
+    inline float64x2_t operator()(Grid::ComplexD *a){
+      double tmp[2]={a[0].imag(),a[0].real()};
+      return vld1q_f64(tmp);
     }
-    // Real float 
+    // Real float // N:ok
     inline float32x4_t operator()(float *a){
-      float32x4_t foo;
-      return foo;
+      float tmp[4]={a[3],a[2],a[1],a[0]};
+      return vld1q_f32(tmp);
     }
-    // Real double
-    inline float32x4_t operator()(double *a){
-      float32x4_t foo;
-      return foo;
+    // Real double // N:ok
+    inline float64x2_t operator()(double *a){
+      double tmp[2]={a[1],a[0]};
+      return vld1q_f64(tmp);
     }
-    // Integer
+    // Integer // N:ok
     inline uint32x4_t operator()(Integer *a){
-      uint32x4_t foo;
-      return foo;
+      return vld1q_dup_u32(a);
     }
-
-
   };
 
+  // N:leaving as is
   template <typename Out_type, typename In_type>
   struct Reduce{
     //Need templated class to overload output type
     //General form must generate error if compiled
-    inline Out_type operator()(In_type in){
+      inline Out_type operator()(In_type in){
       printf("Error, using wrong Reduce function\n");
       exit(1);
       return 0;
@@ -184,26 +194,98 @@ namespace Optimization {
     }
   };
 
+  struct MultRealPart{
+    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
+      float32x4_t re = vtrn1q_f32(a, a);
+      return vmulq_f32(re, b);
+    }
+    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
+      float64x2_t re = vzip1q_f64(a, a);
+      return vmulq_f64(re, b);
+    }
+  };
+
+  struct MaddRealPart{
+    inline float32x4_t operator()(float32x4_t a, float32x4_t b, float32x4_t c){
+      float32x4_t re = vtrn1q_f32(a, a);
+      return vfmaq_f32(c, re, b);
+    }
+    inline float64x2_t operator()(float64x2_t a, float64x2_t b, float64x2_t c){
+      float64x2_t re = vzip1q_f64(a, a);
+      return vfmaq_f64(c, re, b);
+    }
+  };
+
+  struct Div{
+    // Real float
+    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
+      return vdivq_f32(a, b);
+    }
+    // Real double
+    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
+      return vdivq_f64(a, b);
+    }
+  };
+
   struct MultComplex{
     // Complex float
     inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-      float32x4_t foo;
-      return foo;
+
+      float32x4_t r0, r1, r2, r3, r4;
+
+      // a = ar ai Ar Ai
+      // b = br bi Br Bi
+      // collect real/imag part, negate bi and Bi
+      r0 = vtrn1q_f32(b, b);       //  br  br  Br  Br
+      r1 = vnegq_f32(b);           // -br -bi -Br -Bi
+      r2 = vtrn2q_f32(b, r1);      //  bi -bi  Bi -Bi
+
+      // the fun part
+      r3 = vmulq_f32(r2, a);       //  bi*ar -bi*ai ...
+      r4 = vrev64q_f32(r3);        // -bi*ai  bi*ar ...
+
+      // fma(a,b,c) = a+b*c
+      return vfmaq_f32(r4, r0, a); //  ar*br-ai*bi ai*br+ar*bi ...
+
+      // no fma, use mul and add
+      //float32x4_t r5;
+      //r5 = vmulq_f32(r0, a);
+      //return vaddq_f32(r4, r5);
     }
     // Complex double
     inline float64x2_t operator()(float64x2_t a, float64x2_t b){
-      float32x4_t foo;
-      return foo;
+
+      float64x2_t r0, r1, r2, r3, r4;
+
+      // b = br bi
+      // collect real/imag part, negate bi
+      r0 = vtrn1q_f64(b, b);       //  br  br
+      r1 = vnegq_f64(b);           // -br -bi
+      r2 = vtrn2q_f64(b, r1);      //  bi -bi
+
+      // the fun part
+      r3 = vmulq_f64(r2, a);       //  bi*ar -bi*ai
+      r4 = vextq_f64(r3,r3,1);     // -bi*ai  bi*ar
+
+      // fma(a,b,c) = a+b*c
+      return vfmaq_f64(r4, r0, a); //  ar*br-ai*bi ai*br+ar*bi
+
+      // no fma, use mul and add
+      //float64x2_t r5;
+      //r5 = vmulq_f64(r0, a);
+      //return vaddq_f64(r4, r5);
     }
   };
 
   struct Mult{
     // Real float
     inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){
-      return vaddq_f32(vmulq_f32(b,c),a);
+      //return vaddq_f32(vmulq_f32(b,c),a);
+      return vfmaq_f32(a, b, c);
     }
     inline float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c){
-      return vaddq_f64(vmulq_f64(b,c),a);
+      //return vaddq_f64(vmulq_f64(b,c),a);
+      return vfmaq_f64(a, b, c);
     }
     inline float32x4_t operator()(float32x4_t a, float32x4_t b){
       return vmulq_f32(a,b);
@@ -221,89 +303,275 @@ namespace Optimization {
   struct Conj{
     // Complex single
     inline float32x4_t operator()(float32x4_t in){
-      return in;
+      // ar ai br bi -> ar -ai br -bi
+      float32x4_t r0, r1;
+      r0 = vnegq_f32(in);        // -ar -ai -br -bi
+      r1 = vrev64q_f32(r0);      // -ai -ar -bi -br
+      return vtrn1q_f32(in, r1); //  ar -ai  br -bi
     }
     // Complex double
-    //inline float32x4_t operator()(float32x4_t in){
-    // return 0;
-    //}
+    inline float64x2_t operator()(float64x2_t in){
+
+      float64x2_t r0, r1;
+      r0 = vextq_f64(in, in, 1);    //  ai  ar
+      r1 = vnegq_f64(r0);           // -ai -ar
+      return vextq_f64(r0, r1, 1);  //  ar -ai
+    }
     // do not define for integer input
   };
 
   struct TimesMinusI{
     //Complex single
     inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
-      return in;
+      // ar ai br bi -> ai -ar ai -br
+      float32x4_t r0, r1;
+      r0 = vnegq_f32(in);        // -ar -ai -br -bi
+      r1 = vrev64q_f32(in);      //  ai  ar  bi  br
+      return vtrn1q_f32(r1, r0); //  ar -ai  br -bi
     }
     //Complex double
-    //inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
-    //  return in;
-    //}
-
-
+    inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
+      // a ib -> b -ia
+      float64x2_t tmp;
+      tmp = vnegq_f64(in);
+      return vextq_f64(in, tmp, 1);
+    }
   };
 
   struct TimesI{
     //Complex single
     inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
-      //need shuffle
-      return in;
+      // ar ai br bi -> -ai ar -bi br
+      float32x4_t r0, r1;
+      r0 = vnegq_f32(in);        // -ar -ai -br -bi
+      r1 = vrev64q_f32(r0);      // -ai -ar -bi -br
+      return vtrn1q_f32(r1, in); // -ai  ar -bi  br
     }
     //Complex double
-    //inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
-    //  return 0;
-    //}
+    inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
+      // a ib -> -b ia
+      float64x2_t tmp;
+      tmp = vnegq_f64(in);
+      return vextq_f64(tmp, in, 1);
+    }
+  };
+
+  struct Permute{
+
+    static inline float32x4_t Permute0(float32x4_t in){ // N:ok
+      // AB CD -> CD AB
+      return vextq_f32(in, in, 2);
+    };
+    static inline float32x4_t Permute1(float32x4_t in){ // N:ok
+      // AB CD -> BA DC
+      return vrev64q_f32(in);
+    };
+    static inline float32x4_t Permute2(float32x4_t in){ // N:not used by Boyle
+      return in;
+    };
+    static inline float32x4_t Permute3(float32x4_t in){ // N:not used by Boyle
+      return in;
+    };
+
+    static inline float64x2_t Permute0(float64x2_t in){ // N:ok
+      // AB -> BA
+      return vextq_f64(in, in, 1);
+    };
+    static inline float64x2_t Permute1(float64x2_t in){ // N:not used by Boyle
+      return in;
+    };
+    static inline float64x2_t Permute2(float64x2_t in){ // N:not used by Boyle
+      return in;
+    };
+    static inline float64x2_t Permute3(float64x2_t in){ // N:not used by Boyle
+      return in;
+    };
+
+  };
+
+  struct Rotate{
+
+    static inline float32x4_t rotate(float32x4_t in,int n){ // N:ok
+      switch(n){
+      case 0: // AB CD -> AB CD
+        return tRotate<0>(in);
+        break;
+      case 1: // AB CD -> BC DA
+        return tRotate<1>(in);
+        break;
+      case 2: // AB CD -> CD AB
+        return tRotate<2>(in);
+        break;
+      case 3: // AB CD -> DA BC
+        return tRotate<3>(in);
+        break;
+      default: assert(0);
+      }
+    }
+    static inline float64x2_t rotate(float64x2_t in,int n){ // N:ok
+      switch(n){
+      case 0: // AB -> AB
+        return tRotate<0>(in);
+        break;
+      case 1: // AB -> BA
+        return tRotate<1>(in);
+        break;
+      default: assert(0);
+      }
+    }
+
+// working, but no restriction on n
+//    template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n); };
+//    template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n); };
+
+// restriction on n
+    template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); };
+    template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); };
+
+  };
+
+  struct PrecisionChange {
+
+    static inline float16x8_t StoH (const float32x4_t &a,const float32x4_t &b) {
+      float16x4_t h = vcvt_f16_f32(a);
+      return vcvt_high_f16_f32(h, b);
+    }
+    static inline void  HtoS (float16x8_t h,float32x4_t &sa,float32x4_t &sb) {
+      sb = vcvt_high_f32_f16(h);
+      // there is no direct conversion from lower float32x4_t to float64x2_t
+      // vextq_f16 not supported by clang 3.8 / 4.0 / arm clang
+      //float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang
+      // workaround for clang
+      uint32x4_t h1u = reinterpret_cast<uint32x4_t>(h);
+      float16x8_t h1 = reinterpret_cast<float16x8_t>(vextq_u32(h1u, h1u, 2));
+      sa = vcvt_high_f32_f16(h1);
+    }
+    static inline float32x4_t DtoS (float64x2_t a,float64x2_t b) {
+      float32x2_t s = vcvt_f32_f64(a);
+      return vcvt_high_f32_f64(s, b);
+
+    }
+    static inline void StoD (float32x4_t s,float64x2_t &a,float64x2_t &b) {
+      b = vcvt_high_f64_f32(s);
+      // there is no direct conversion from lower float32x4_t to float64x2_t
+      float32x4_t s1 = vextq_f32(s, s, 2);
+      a = vcvt_high_f64_f32(s1);
+
+    }
+    static inline float16x8_t DtoH (float64x2_t a,float64x2_t b,float64x2_t c,float64x2_t d) {
+      float32x4_t s1 = DtoS(a, b);
+      float32x4_t s2 = DtoS(c, d);
+      return StoH(s1, s2);
+    }
+    static inline void HtoD (float16x8_t h,float64x2_t &a,float64x2_t &b,float64x2_t &c,float64x2_t &d) {
+      float32x4_t s1, s2;
+      HtoS(h, s1, s2);
+      StoD(s1, a, b);
+      StoD(s2, c, d);
+    }
+  };
+
+  //////////////////////////////////////////////
+  // Exchange support
+
+  struct Exchange{
+    static inline void Exchange0(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
+      // in1: ABCD -> out1: ABEF
+      // in2: EFGH -> out2: CDGH
+
+      // z: CDAB
+      float32x4_t z = vextq_f32(in1, in1, 2);
+      // out1: ABEF
+      out1 = vextq_f32(z, in2, 2);
+
+      // z: GHEF
+      z = vextq_f32(in2, in2, 2);
+      // out2: CDGH
+      out2 = vextq_f32(in1, z, 2);
+    };
+
+    static inline void Exchange1(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
+      // in1: ABCD -> out1: AECG
+      // in2: EFGH -> out2: BFDH
+      out1 = vtrn1q_f32(in1, in2);
+      out2 = vtrn2q_f32(in1, in2);
+    };
+    static inline void Exchange2(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
+      assert(0);
+      return;
+    };
+    static inline void Exchange3(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
+      assert(0);
+      return;
+    };
+    // double precision
+    static inline void Exchange0(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
+      // in1: AB -> out1: AC
+      // in2: CD -> out2: BD
+      out1 = vzip1q_f64(in1, in2);
+      out2 = vzip2q_f64(in1, in2);
+    };
+    static inline void Exchange1(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
+      assert(0);
+      return;
+    };
+    static inline void Exchange2(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
+      assert(0);
+      return;
+    };
+    static inline void Exchange3(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
+      assert(0);
+      return;
+    };
   };
 
   //////////////////////////////////////////////
   // Some Template specialization
-  template < typename vtype > 
-    void permute(vtype &a, vtype b, int perm) {
 
-  }; 
 
   //Complex float Reduce
   template<>
   inline Grid::ComplexF Reduce<Grid::ComplexF, float32x4_t>::operator()(float32x4_t in){
-    return 0;
+    float32x4_t v1; // two complex
+    v1 = Optimization::Permute::Permute0(in);
+    v1 = vaddq_f32(v1,in);
+    u128f conv;    conv.v=v1;
+    return Grid::ComplexF(conv.f[0],conv.f[1]);
   }
   //Real float Reduce
   template<>
   inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){
-    float32x2_t high = vget_high_f32(in);
-    float32x2_t low = vget_low_f32(in);
-    float32x2_t tmp = vadd_f32(low, high);
-    float32x2_t sum = vpadd_f32(tmp, tmp);
-    return vget_lane_f32(sum,0);
+    return vaddvq_f32(in);
   }
-  
-  
+
+
   //Complex double Reduce
-  template<>
+  template<> // N:by Boyle
   inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
-    return 0;
+    u128d conv; conv.v = in;
+    return Grid::ComplexD(conv.f[0],conv.f[1]);
   }
-  
+
   //Real double Reduce
   template<>
   inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){
-    float64x2_t sum = vpaddq_f64(in, in);
-    return vgetq_lane_f64(sum,0);
+    return vaddvq_f64(in);
   }
 
   //Integer Reduce
   template<>
   inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){
     // FIXME unimplemented
-   printf("Reduce : Missing integer implementation -> FIX\n");
+    printf("Reduce : Missing integer implementation -> FIX\n");
     assert(0);
   }
 }
 
 //////////////////////////////////////////////////////////////////////////////////////
-// Here assign types 
-namespace Grid {
+// Here assign types
 
+// typedef Optimization::vech SIMD_Htype; // Reduced precision type
+  typedef float16x8_t  SIMD_Htype; // Half precision type
   typedef float32x4_t  SIMD_Ftype; // Single precision type
   typedef float64x2_t  SIMD_Dtype; // Double precision type
   typedef uint32x4_t   SIMD_Itype; // Integer type
@@ -312,13 +580,6 @@ namespace Grid {
   inline void prefetch_HINT_T0(const char *ptr){};
 
 
-  // Gpermute function
-  template < typename VectorSIMD > 
-    inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
-    Optimization::permute(y.v,b.v,perm);
-  }
-
-
   // Function name aliases
   typedef Optimization::Vsplat   VsplatSIMD;
   typedef Optimization::Vstore   VstoreSIMD;
@@ -326,16 +587,21 @@ namespace Grid {
   typedef Optimization::Vstream  VstreamSIMD;
   template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
 
- 
+
 
 
   // Arithmetic operations
   typedef Optimization::Sum         SumSIMD;
   typedef Optimization::Sub         SubSIMD;
+  typedef Optimization::Div         DivSIMD;
   typedef Optimization::Mult        MultSIMD;
   typedef Optimization::MultComplex MultComplexSIMD;
+  typedef Optimization::MultRealPart MultRealPartSIMD;
+  typedef Optimization::MaddRealPart MaddRealPartSIMD;
   typedef Optimization::Conj        ConjSIMD;
   typedef Optimization::TimesMinusI TimesMinusISIMD;
   typedef Optimization::TimesI      TimesISIMD;
 
 }
+
+//#endif // ARM_NEON
diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h
index 0048382f..424b5573 100644
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -328,15 +328,15 @@ class Grid_simd {
   ///////////////////////////////////////
 
   //#if (__GNUC__ == 5 ) || ( ( __GNUC__ == 6 ) && __GNUC_MINOR__ < 3 )
-  //#pragma GCC push_options 
-  //#pragma GCC optimize ("O0") 
+  //#pragma GCC push_options
+  //#pragma GCC optimize ("O0")
   //#endif
   template <class functor>
   friend inline Grid_simd SimdApply(const functor &func, const Grid_simd &v) {
     Grid_simd ret;
     Grid_simd::conv_t conv;
     Grid_simd::scalar_type s;
-    
+
     conv.v = v.v;
     for (int i = 0; i < Nsimd(); i++) {
       s = conv.s[i];
@@ -368,7 +368,7 @@ class Grid_simd {
   //#pragma GCC pop_options
   //#endif
   ///////////////////////
-  // Exchange 
+  // Exchange
   // Al Ah , Bl Bh -> Al Bl Ah,Bh
   ///////////////////////
   friend inline void exchange(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2,int n)
@@ -379,7 +379,7 @@ class Grid_simd {
       Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v);
     } else if(n==1) {
       Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v);
-    } else if(n==0) { 
+    } else if(n==0) {
       Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v);
     }
   }
@@ -406,7 +406,7 @@ class Grid_simd {
       int dist = perm & 0xF;
       y = rotate(b, dist);
       return;
-    } 
+    }
     else if(perm==3) permute3(y, b);
     else if(perm==2) permute2(y, b);
     else if(perm==1) permute1(y, b);
@@ -425,7 +425,7 @@ class Grid_simd {
   }
 
 
-  
+
 };  // end of Grid_simd class definition
 
 
@@ -451,29 +451,29 @@ inline Grid_simd<S, V> rotate(Grid_simd<S, V> b, int nrot) {
   ret.v = Optimization::Rotate::rotate(b.v, 2 * nrot);
   return ret;
 }
-template <class S, class V, IfNotComplex<S> =0> 
+template <class S, class V, IfNotComplex<S> =0>
 inline void rotate( Grid_simd<S,V> &ret,Grid_simd<S,V> b,int nrot)
 {
   nrot = nrot % Grid_simd<S,V>::Nsimd();
   ret.v = Optimization::Rotate::rotate(b.v,nrot);
 }
-template <class S, class V, IfComplex<S> =0> 
+template <class S, class V, IfComplex<S> =0>
 inline void rotate(Grid_simd<S,V> &ret,Grid_simd<S,V> b,int nrot)
 {
   nrot = nrot % Grid_simd<S,V>::Nsimd();
   ret.v = Optimization::Rotate::rotate(b.v,2*nrot);
 }
 
-template <class S, class V> 
+template <class S, class V>
 inline void vbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
   S* typepun =(S*) &src;
   vsplat(ret,typepun[lane]);
-}    
-template <class S, class V, IfComplex<S> =0> 
+}
+template <class S, class V, IfComplex<S> =0>
 inline void rbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
   S* typepun =(S*) &src;
   ret.v = unary<V>(real(typepun[lane]), VsplatSIMD());
-}    
+}
 
 
 
@@ -604,13 +604,27 @@ inline Grid_simd<S, V> real_mult(Grid_simd<S, V> a, Grid_simd<S, V> b) {
   ret.v = binary<V>(a.v, b.v, MultRealPartSIMD());
   return ret;
 };
+// TEST for Test_simd
+template <class S, class V, IfComplex<S> = 0>
+inline Grid_simd<S, V> real_mult(std::complex<S> a, std::complex<S> b) {
+  Grid_simd<S, V> ret;
+  //ret.v = binary<V>(a.v, b.v, MultRealPartSIMD());
+  return ret;
+};
+
 template <class S, class V, IfComplex<S> = 0>
 inline Grid_simd<S, V> real_madd(Grid_simd<S, V> a, Grid_simd<S, V> b, Grid_simd<S,V> c) {
   Grid_simd<S, V> ret;
   ret.v = trinary<V>(a.v, b.v, c.v, MaddRealPartSIMD());
   return ret;
 };
-
+// TEST for Test_simd
+template <class S, class V, IfComplex<S> = 0>
+inline Grid_simd<S, V> real_madd(std::complex<S> a, std::complex<S> b) {
+  Grid_simd<S, V> ret;
+  //ret.v = binary<V>(a.v, b.v, MultRealPartSIMD());
+  return ret;
+};
 
 // Distinguish between complex types and others
 template <class S, class V, IfComplex<S> = 0>
@@ -640,7 +654,7 @@ inline Grid_simd<S, V> operator/(Grid_simd<S, V> a, Grid_simd<S, V> b) {
   ret = a * conjugate(b) ;
   den = b * conjugate(b) ;
 
-  
+
   auto real_den = toReal(den);
 
   ret.v=binary<V>(ret.v, real_den.v, DivSIMD());

From bf729766ddd36c9ebe8c6be35d7527353aff2963 Mon Sep 17 00:00:00 2001
From: Nils Meyer <nils.meyer@ur.de>
Date: Tue, 27 Jun 2017 20:32:24 +0200
Subject: [PATCH 2/8] removed collision with QPX implementation

---
 lib/simd/Grid_generic_types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/simd/Grid_generic_types.h b/lib/simd/Grid_generic_types.h
index eac65e09..642f6ffe 100644
--- a/lib/simd/Grid_generic_types.h
+++ b/lib/simd/Grid_generic_types.h
@@ -26,7 +26,7 @@ Author: Antonin Portelli <antonin.portelli@me.com>
     See the full license in the file "LICENSE" in the top level distribution directory
     *************************************************************************************/
     /*  END LEGAL */
-#define GEN_SIMD_WIDTH 16
+
 static_assert(GEN_SIMD_WIDTH % 16u == 0, "SIMD vector size is not an integer multiple of 16 bytes");
 
 //#define VECTOR_LOOPS

From e43a8b6b8ad52b88ed8e9ae1c140dc34a3da18d4 Mon Sep 17 00:00:00 2001
From: Nils Meyer <nils.meyer@ur.de>
Date: Tue, 27 Jun 2017 20:58:48 +0200
Subject: [PATCH 3/8] removed comments

---
 Grid_vector_types.h | 857 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 857 insertions(+)
 create mode 100644 Grid_vector_types.h

diff --git a/Grid_vector_types.h b/Grid_vector_types.h
new file mode 100644
index 00000000..e05fecc4
--- /dev/null
+++ b/Grid_vector_types.h
@@ -0,0 +1,857 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/simd/Grid_vector_type.h
+
+Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Guido Cossu <cossu@iroiro-pc.kek.jp>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: neo <cossu@post.kek.jp>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+//---------------------------------------------------------------------------
+/*! @file Grid_vector_types.h
+  @brief Defines templated class Grid_simd to deal with inner vector types
+*/
+// Time-stamp: <2015-07-10 17:45:33 neo>
+//---------------------------------------------------------------------------
+#ifndef GRID_VECTOR_TYPES
+#define GRID_VECTOR_TYPES
+
+#ifdef GEN
+#include "Grid_generic.h"
+#endif
+#ifdef SSE4
+#include "Grid_sse4.h"
+#endif
+#if defined(AVX1) || defined (AVXFMA) || defined(AVX2) || defined(AVXFMA4)
+#include "Grid_avx.h"
+#endif
+#if defined AVX512
+#include "Grid_avx512.h"
+#endif
+#if defined IMCI
+#include "Grid_imci.h"
+#endif
+#ifdef NEONv8
+#include "Grid_neon.h"
+#endif
+#if defined QPX
+#include "Grid_qpx.h"
+#endif
+
+#include "l1p.h"
+
+namespace Grid {
+
+//////////////////////////////////////
+// To take the floating point type of real/complex type
+//////////////////////////////////////
+template <typename T>
+struct RealPart {
+  typedef T type;
+};
+template <typename T>
+struct RealPart<std::complex<T> > {
+  typedef T type;
+};
+
+#include <type_traits>
+
+//////////////////////////////////////
+// demote a vector to real type
+//////////////////////////////////////
+// type alias used to simplify the syntax of std::enable_if
+template <typename T> using Invoke = typename T::type;
+template <typename Condition, typename ReturnType> using EnableIf    = Invoke<std::enable_if<Condition::value, ReturnType> >;
+template <typename Condition, typename ReturnType> using NotEnableIf = Invoke<std::enable_if<!Condition::value, ReturnType> >;
+
+////////////////////////////////////////////////////////
+// Check for complexity with type traits
+template <typename T> struct is_complex : public std::false_type {};
+template <> struct is_complex<std::complex<double> > : public std::true_type {};
+template <> struct is_complex<std::complex<float> > : public std::true_type {};
+
+template <typename T>              using IfReal    = Invoke<std::enable_if<std::is_floating_point<T>::value, int> >;
+template <typename T>              using IfComplex = Invoke<std::enable_if<is_complex<T>::value, int> >;
+template <typename T>              using IfInteger = Invoke<std::enable_if<std::is_integral<T>::value, int> >;
+template <typename T1,typename T2> using IfSame    = Invoke<std::enable_if<std::is_same<T1,T2>::value, int> >;
+
+template <typename T>              using IfNotReal    = Invoke<std::enable_if<!std::is_floating_point<T>::value, int> >;
+template <typename T>              using IfNotComplex = Invoke<std::enable_if<!is_complex<T>::value, int> >;
+template <typename T>              using IfNotInteger = Invoke<std::enable_if<!std::is_integral<T>::value, int> >;
+template <typename T1,typename T2> using IfNotSame    = Invoke<std::enable_if<!std::is_same<T1,T2>::value, int> >;
+
+////////////////////////////////////////////////////////
+// Define the operation templates functors
+// general forms to allow for vsplat syntax
+// need explicit declaration of types when used since
+// clang cannot automatically determine the output type sometimes
+template <class Out, class Input1, class Input2, class Input3, class Operation>
+Out trinary(Input1 src_1, Input2 src_2, Input3 src_3, Operation op) {
+  return op(src_1, src_2, src_3);
+}
+
+template <class Out, class Input1, class Input2, class Operation>
+Out binary(Input1 src_1, Input2 src_2, Operation op) {
+  return op(src_1, src_2);
+}
+
+template <class Out, class Input, class Operation>
+Out unary(Input src, Operation op) {
+  return op(src);
+}
+///////////////////////////////////////////////
+
+/*
+  @brief Grid_simd class for the SIMD vector type operations
+ */
+template <class Scalar_type, class Vector_type>
+class Grid_simd {
+ public:
+  typedef typename RealPart<Scalar_type>::type Real;
+  typedef Vector_type vector_type;
+  typedef Scalar_type scalar_type;
+
+  typedef union conv_t_union {
+    Vector_type v;
+    Scalar_type s[sizeof(Vector_type) / sizeof(Scalar_type)];
+    conv_t_union(){};
+  } conv_t;
+
+  Vector_type v;
+
+  static inline constexpr int Nsimd(void) {
+    return sizeof(Vector_type) / sizeof(Scalar_type);
+  }
+
+  Grid_simd &operator=(const Grid_simd &&rhs) {
+    v = rhs.v;
+    return *this;
+  };
+  Grid_simd &operator=(const Grid_simd &rhs) {
+    v = rhs.v;
+    return *this;
+  };  // faster than not declaring it and leaving to the compiler
+  Grid_simd() = default;
+  Grid_simd(const Grid_simd &rhs) : v(rhs.v){};  // compiles in movaps
+  Grid_simd(const Grid_simd &&rhs) : v(rhs.v){};
+
+  /////////////////////////////
+  // Constructors
+  /////////////////////////////
+  Grid_simd &operator=(Zero &z) {
+    vzero(*this);
+    return (*this);
+  }
+
+  // Enable if complex type
+  template <typename S = Scalar_type>
+  Grid_simd(const typename std::enable_if<is_complex<S>::value, S>::type a) {
+    vsplat(*this, a);
+  };
+
+  Grid_simd(const Real a) { vsplat(*this, Scalar_type(a)); };
+
+  ///////////////////////////////////////////////
+  // mac, mult, sub, add, adj
+  ///////////////////////////////////////////////
+
+  // FIXME -- alias this to an inline MAC struct.
+  friend inline void mac(Grid_simd *__restrict__ y,
+                         const Grid_simd *__restrict__ a,
+                         const Grid_simd *__restrict__ x) {
+    *y = (*a) * (*x) + (*y);
+  };
+
+  friend inline void mult(Grid_simd *__restrict__ y,
+                          const Grid_simd *__restrict__ l,
+                          const Grid_simd *__restrict__ r) {
+    *y = (*l) * (*r);
+  }
+
+  friend inline void sub(Grid_simd *__restrict__ y,
+                         const Grid_simd *__restrict__ l,
+                         const Grid_simd *__restrict__ r) {
+    *y = (*l) - (*r);
+  }
+  friend inline void add(Grid_simd *__restrict__ y,
+                         const Grid_simd *__restrict__ l,
+                         const Grid_simd *__restrict__ r) {
+    *y = (*l) + (*r);
+  }
+  friend inline void mac(Grid_simd *__restrict__ y,
+                         const Scalar_type *__restrict__ a,
+                         const Grid_simd *__restrict__ x) {
+    *y = (*a) * (*x) + (*y);
+  };
+  friend inline void mult(Grid_simd *__restrict__ y,
+                          const Scalar_type *__restrict__ l,
+                          const Grid_simd *__restrict__ r) {
+    *y = (*l) * (*r);
+  }
+  friend inline void sub(Grid_simd *__restrict__ y,
+                         const Scalar_type *__restrict__ l,
+                         const Grid_simd *__restrict__ r) {
+    *y = (*l) - (*r);
+  }
+  friend inline void add(Grid_simd *__restrict__ y,
+                         const Scalar_type *__restrict__ l,
+                         const Grid_simd *__restrict__ r) {
+    *y = (*l) + (*r);
+  }
+
+  friend inline void mac(Grid_simd *__restrict__ y,
+                         const Grid_simd *__restrict__ a,
+                         const Scalar_type *__restrict__ x) {
+    *y = (*a) * (*x) + (*y);
+  };
+  friend inline void mult(Grid_simd *__restrict__ y,
+                          const Grid_simd *__restrict__ l,
+                          const Scalar_type *__restrict__ r) {
+    *y = (*l) * (*r);
+  }
+  friend inline void sub(Grid_simd *__restrict__ y,
+                         const Grid_simd *__restrict__ l,
+                         const Scalar_type *__restrict__ r) {
+    *y = (*l) - (*r);
+  }
+  friend inline void add(Grid_simd *__restrict__ y,
+                         const Grid_simd *__restrict__ l,
+                         const Scalar_type *__restrict__ r) {
+    *y = (*l) + (*r);
+  }
+
+  ////////////////////////////////////////////////////////////////////////
+  // FIXME:  gonna remove these load/store, get, set, prefetch
+  ////////////////////////////////////////////////////////////////////////
+  friend inline void vset(Grid_simd &ret, Scalar_type *a) {
+    ret.v = unary<Vector_type>(a, VsetSIMD());
+  }
+
+  ///////////////////////
+  // Vstore
+  ///////////////////////
+  friend inline void vstore(const Grid_simd &ret, Scalar_type *a) {
+    binary<void>(ret.v, (Real *)a, VstoreSIMD());
+  }
+
+  ///////////////////////
+  // Vprefetch
+  ///////////////////////
+  friend inline void vprefetch(const Grid_simd &v) {
+    prefetch_HINT_T0((const char *)&v.v);
+  }
+
+  ///////////////////////
+  // Reduce
+  ///////////////////////
+  friend inline Scalar_type Reduce(const Grid_simd &in) {
+    return unary<Scalar_type>(in.v, ReduceSIMD<Scalar_type, Vector_type>());
+  }
+
+  ////////////////////////////
+  // operator scalar * simd
+  ////////////////////////////
+  friend inline Grid_simd operator*(const Scalar_type &a, Grid_simd b) {
+    Grid_simd va;
+    vsplat(va, a);
+    return va * b;
+  }
+  friend inline Grid_simd operator*(Grid_simd b, const Scalar_type &a) {
+    return a * b;
+  }
+
+  //////////////////////////////////
+  // Divides
+  //////////////////////////////////
+  friend inline Grid_simd operator/(const Scalar_type &a, Grid_simd b) {
+    Grid_simd va;
+    vsplat(va, a);
+    return va / b;
+  }
+  friend inline Grid_simd operator/(Grid_simd b, const Scalar_type &a) {
+    Grid_simd va;
+    vsplat(va, a);
+    return b / a;
+  }
+
+  ///////////////////////
+  // Unary negation
+  ///////////////////////
+  friend inline Grid_simd operator-(const Grid_simd &r) {
+    Grid_simd ret;
+    vzero(ret);
+    ret = ret - r;
+    return ret;
+  }
+  // *=,+=,-= operators
+  inline Grid_simd &operator*=(const Grid_simd &r) {
+    *this = (*this) * r;
+    return *this;
+    // return (*this)*r; ?
+  }
+  inline Grid_simd &operator+=(const Grid_simd &r) {
+    *this = *this + r;
+    return *this;
+  }
+  inline Grid_simd &operator-=(const Grid_simd &r) {
+    *this = *this - r;
+    return *this;
+  }
+
+  ///////////////////////////////////////
+  // Not all functions are supported
+  // through SIMD and must breakout to
+  // scalar type and back again. This
+  // provides support
+  ///////////////////////////////////////
+
+  template <class functor>
+  friend inline Grid_simd SimdApply(const functor &func, const Grid_simd &v) {
+    Grid_simd ret;
+    Grid_simd::conv_t conv;
+    Grid_simd::scalar_type s;
+    
+    conv.v = v.v;
+    for (int i = 0; i < Nsimd(); i++) {
+      s = conv.s[i];
+      conv.s[i] = func(s);
+    }
+    ret.v = conv.v;
+    return ret;
+  }
+  template <class functor>
+  friend inline Grid_simd SimdApplyBinop(const functor &func,
+                                         const Grid_simd &x,
+                                         const Grid_simd &y) {
+    Grid_simd ret;
+    Grid_simd::conv_t cx;
+    Grid_simd::conv_t cy;
+    Grid_simd::scalar_type sx,sy;
+
+    cx.v = x.v;
+    cy.v = y.v;
+    for (int i = 0; i < Nsimd(); i++) {
+      sx = cx.s[i];
+      sy = cy.s[i];
+      cx.s[i] = func(sx,sy);
+    }
+    ret.v = cx.v;
+    return ret;
+  }
+  ///////////////////////
+  // Exchange 
+  // Al Ah , Bl Bh -> Al Bl Ah,Bh
+  ///////////////////////
+  friend inline void exchange(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2,int n)
+  {
+    if       (n==3) {
+      Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v);
+    } else if(n==2) {
+      Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v);
+    } else if(n==1) {
+      Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v);
+    } else if(n==0) { 
+      Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v);
+    }
+  }
+
+  ////////////////////////////////////////////////////////////////////
+  // General permute; assumes vector length is same across
+  // all subtypes; may not be a good assumption, but could
+  // add the vector width as a template param for BG/Q for example
+  ////////////////////////////////////////////////////////////////////
+  friend inline void permute0(Grid_simd &y, Grid_simd b) {
+    y.v = Optimization::Permute::Permute0(b.v);
+  }
+  friend inline void permute1(Grid_simd &y, Grid_simd b) {
+    y.v = Optimization::Permute::Permute1(b.v);
+  }
+  friend inline void permute2(Grid_simd &y, Grid_simd b) {
+    y.v = Optimization::Permute::Permute2(b.v);
+  }
+  friend inline void permute3(Grid_simd &y, Grid_simd b) {
+    y.v = Optimization::Permute::Permute3(b.v);
+  }
+  friend inline void permute(Grid_simd &y, Grid_simd b, int perm) {
+    if (perm & RotateBit) {
+      int dist = perm & 0xF;
+      y = rotate(b, dist);
+      return;
+    } 
+    else if(perm==3) permute3(y, b);
+    else if(perm==2) permute2(y, b);
+    else if(perm==1) permute1(y, b);
+    else if(perm==0) permute0(y, b);
+  }
+
+  ///////////////////////////////
+  // Getting single lanes
+  ///////////////////////////////
+  inline Scalar_type getlane(int lane) {
+    return ((Scalar_type*)&v)[lane];
+  }
+
+  inline void putlane(const Scalar_type &S, int lane){
+    ((Scalar_type*)&v)[lane] = S;
+  }
+
+
+  
+};  // end of Grid_simd class definition
+
+inline void permute(ComplexD &y,ComplexD b, int perm) {  y=b; }
+inline void permute(ComplexF &y,ComplexF b, int perm) {  y=b; }
+inline void permute(RealD &y,RealD b, int perm) {  y=b; }
+inline void permute(RealF &y,RealF b, int perm) {  y=b; }
+
+////////////////////////////////////////////////////////////////////
+// General rotate
+////////////////////////////////////////////////////////////////////
+template <class S, class V, IfNotComplex<S> = 0>
+inline Grid_simd<S, V> rotate(Grid_simd<S, V> b, int nrot) {
+  nrot = nrot % Grid_simd<S, V>::Nsimd();
+  Grid_simd<S, V> ret;
+  ret.v = Optimization::Rotate::rotate(b.v, nrot);
+  return ret;
+}
+template <class S, class V, IfComplex<S> = 0>
+inline Grid_simd<S, V> rotate(Grid_simd<S, V> b, int nrot) {
+  nrot = nrot % Grid_simd<S, V>::Nsimd();
+  Grid_simd<S, V> ret;
+  ret.v = Optimization::Rotate::rotate(b.v, 2 * nrot);
+  return ret;
+}
+template <class S, class V, IfNotComplex<S> =0> 
+inline void rotate( Grid_simd<S,V> &ret,Grid_simd<S,V> b,int nrot)
+{
+  nrot = nrot % Grid_simd<S,V>::Nsimd();
+  ret.v = Optimization::Rotate::rotate(b.v,nrot);
+}
+template <class S, class V, IfComplex<S> =0> 
+inline void rotate(Grid_simd<S,V> &ret,Grid_simd<S,V> b,int nrot)
+{
+  nrot = nrot % Grid_simd<S,V>::Nsimd();
+  ret.v = Optimization::Rotate::rotate(b.v,2*nrot);
+}
+
+template <class S, class V> 
+inline void vbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
+  S* typepun =(S*) &src;
+  vsplat(ret,typepun[lane]);
+}    
+template <class S, class V, IfComplex<S> =0> 
+inline void rbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
+  S* typepun =(S*) &src;
+  ret.v = unary<V>(real(typepun[lane]), VsplatSIMD());
+}    
+
+
+
+///////////////////////
+// Splat
+///////////////////////
+
+// this is only for the complex version
+template <class S, class V, IfComplex<S> = 0, class ABtype>
+inline void vsplat(Grid_simd<S, V> &ret, ABtype a, ABtype b) {
+  ret.v = binary<V>(a, b, VsplatSIMD());
+}
+
+// overload if complex
+template <class S, class V>
+inline void vsplat(Grid_simd<S, V> &ret, EnableIf<is_complex<S>, S> c) {
+  vsplat(ret, real(c), imag(c));
+}
+template <class S, class V>
+inline void rsplat(Grid_simd<S, V> &ret, EnableIf<is_complex<S>, S> c) {
+  vsplat(ret, real(c), real(c));
+}
+
+// if real fill with a, if complex fill with a in the real part (first function
+// above)
+template <class S, class V>
+inline void vsplat(Grid_simd<S, V> &ret, NotEnableIf<is_complex<S>, S> a) {
+  ret.v = unary<V>(a, VsplatSIMD());
+}
+//////////////////////////
+
+///////////////////////////////////////////////
+// Initialise to 1,0,i for the correct types
+///////////////////////////////////////////////
+// For complex types
+template <class S, class V, IfComplex<S> = 0>
+inline void vone(Grid_simd<S, V> &ret) {
+  vsplat(ret, S(1.0, 0.0));
+}
+template <class S, class V, IfComplex<S> = 0>
+inline void vzero(Grid_simd<S, V> &ret) {
+  vsplat(ret, S(0.0, 0.0));
+}  // use xor?
+template <class S, class V, IfComplex<S> = 0>
+inline void vcomplex_i(Grid_simd<S, V> &ret) {
+  vsplat(ret, S(0.0, 1.0));
+}
+
+template <class S, class V, IfComplex<S> = 0>
+inline void visign(Grid_simd<S, V> &ret) {
+  vsplat(ret, S(1.0, -1.0));
+}
+template <class S, class V, IfComplex<S> = 0>
+inline void vrsign(Grid_simd<S, V> &ret) {
+  vsplat(ret, S(-1.0, 1.0));
+}
+
+// if not complex overload here
+template <class S, class V, IfReal<S> = 0>
+inline void vone(Grid_simd<S, V> &ret) {
+  vsplat(ret, S(1.0));
+}
+template <class S, class V, IfReal<S> = 0>
+inline void vzero(Grid_simd<S, V> &ret) {
+  vsplat(ret, S(0.0));
+}
+
+// For integral types
+template <class S, class V, IfInteger<S> = 0>
+inline void vone(Grid_simd<S, V> &ret) {
+  vsplat(ret, 1);
+}
+template <class S, class V, IfInteger<S> = 0>
+inline void vzero(Grid_simd<S, V> &ret) {
+  vsplat(ret, 0);
+}
+template <class S, class V, IfInteger<S> = 0>
+inline void vtrue(Grid_simd<S, V> &ret) {
+  vsplat(ret, 0xFFFFFFFF);
+}
+template <class S, class V, IfInteger<S> = 0>
+inline void vfalse(Grid_simd<S, V> &ret) {
+  vsplat(ret, 0);
+}
+template <class S, class V>
+inline void zeroit(Grid_simd<S, V> &z) {
+  vzero(z);
+}
+
+///////////////////////
+// Vstream
+///////////////////////
+template <class S, class V, IfReal<S> = 0>
+inline void vstream(Grid_simd<S, V> &out, const Grid_simd<S, V> &in) {
+  binary<void>((S *)&out.v, in.v, VstreamSIMD());
+}
+template <class S, class V, IfComplex<S> = 0>
+inline void vstream(Grid_simd<S, V> &out, const Grid_simd<S, V> &in) {
+  typedef typename S::value_type T;
+  binary<void>((T *)&out.v, in.v, VstreamSIMD());
+}
+template <class S, class V, IfInteger<S> = 0>
+inline void vstream(Grid_simd<S, V> &out, const Grid_simd<S, V> &in) {
+  out = in;
+}
+
+////////////////////////////////////
+// Arithmetic operator overloads +,-,*
+////////////////////////////////////
+template <class S, class V>
+inline Grid_simd<S, V> operator+(Grid_simd<S, V> a, Grid_simd<S, V> b) {
+  Grid_simd<S, V> ret;
+  ret.v = binary<V>(a.v, b.v, SumSIMD());
+  return ret;
+};
+
+template <class S, class V>
+inline Grid_simd<S, V> operator-(Grid_simd<S, V> a, Grid_simd<S, V> b) {
+  Grid_simd<S, V> ret;
+  ret.v = binary<V>(a.v, b.v, SubSIMD());
+  return ret;
+};
+
+// Distinguish between complex types and others
+template <class S, class V, IfComplex<S> = 0>
+inline Grid_simd<S, V> real_mult(Grid_simd<S, V> a, Grid_simd<S, V> b) {
+  Grid_simd<S, V> ret;
+  ret.v = binary<V>(a.v, b.v, MultRealPartSIMD());
+  return ret;
+};
+template <class S, class V, IfComplex<S> = 0>
+inline Grid_simd<S, V> real_madd(Grid_simd<S, V> a, Grid_simd<S, V> b, Grid_simd<S,V> c) {
+  Grid_simd<S, V> ret;
+  ret.v = trinary<V>(a.v, b.v, c.v, MaddRealPartSIMD());
+  return ret;
+};
+
+
+// Distinguish between complex types and others
+template <class S, class V, IfComplex<S> = 0>
+inline Grid_simd<S, V> operator*(Grid_simd<S, V> a, Grid_simd<S, V> b) {
+  Grid_simd<S, V> ret;
+  ret.v = binary<V>(a.v, b.v, MultComplexSIMD());
+  return ret;
+};
+
+// Real/Integer types
+template <class S, class V, IfNotComplex<S> = 0>
+inline Grid_simd<S, V> operator*(Grid_simd<S, V> a, Grid_simd<S, V> b) {
+  Grid_simd<S, V> ret;
+  ret.v = binary<V>(a.v, b.v, MultSIMD());
+  return ret;
+};
+
+// Distinguish between complex types and others
+template <class S, class V, IfComplex<S> = 0>
+inline Grid_simd<S, V> operator/(Grid_simd<S, V> a, Grid_simd<S, V> b) {
+  typedef Grid_simd<S, V> simd;
+
+  simd ret;
+  simd den;
+  typename simd::conv_t conv;
+
+  ret = a * conjugate(b) ;
+  den = b * conjugate(b) ;
+
+  
+  auto real_den = toReal(den);
+
+  ret.v=binary<V>(ret.v, real_den.v, DivSIMD());
+
+  return ret;
+};
+
+// Real/Integer types
+template <class S, class V, IfNotComplex<S> = 0>
+inline Grid_simd<S, V> operator/(Grid_simd<S, V> a, Grid_simd<S, V> b) {
+  Grid_simd<S, V> ret;
+  ret.v = binary<V>(a.v, b.v, DivSIMD());
+  return ret;
+};
+
+///////////////////////
+// Conjugate
+///////////////////////
+template <class S, class V, IfComplex<S> = 0>
+inline Grid_simd<S, V> conjugate(const Grid_simd<S, V> &in) {
+  Grid_simd<S, V> ret;
+  ret.v = unary<V>(in.v, ConjSIMD());
+  return ret;
+}
+template <class S, class V, IfNotComplex<S> = 0>
+inline Grid_simd<S, V> conjugate(const Grid_simd<S, V> &in) {
+  return in;  // for real objects
+}
+// Suppress adj for integer types... // odd; why conjugate above but not adj??
+template <class S, class V, IfNotInteger<S> = 0>
+inline Grid_simd<S, V> adj(const Grid_simd<S, V> &in) {
+  return conjugate(in);
+}
+
+///////////////////////
+// timesMinusI
+///////////////////////
+template <class S, class V, IfComplex<S> = 0>
+inline void timesMinusI(Grid_simd<S, V> &ret, const Grid_simd<S, V> &in) {
+  ret.v = binary<V>(in.v, ret.v, TimesMinusISIMD());
+}
+template <class S, class V, IfComplex<S> = 0>
+inline Grid_simd<S, V> timesMinusI(const Grid_simd<S, V> &in) {
+  Grid_simd<S, V> ret;
+  timesMinusI(ret, in);
+  return ret;
+}
+template <class S, class V, IfNotComplex<S> = 0>
+inline Grid_simd<S, V> timesMinusI(const Grid_simd<S, V> &in) {
+  return in;
+}
+
+///////////////////////
+// timesI
+///////////////////////
+template <class S, class V, IfComplex<S> = 0>
+inline void timesI(Grid_simd<S, V> &ret, const Grid_simd<S, V> &in) {
+  ret.v = binary<V>(in.v, ret.v, TimesISIMD());
+}
+template <class S, class V, IfComplex<S> = 0>
+inline Grid_simd<S, V> timesI(const Grid_simd<S, V> &in) {
+  Grid_simd<S, V> ret;
+  timesI(ret, in);
+  return ret;
+}
+template <class S, class V, IfNotComplex<S> = 0>
+inline Grid_simd<S, V> timesI(const Grid_simd<S, V> &in) {
+  return in;
+}
+
+/////////////////////
+// Inner, outer
+/////////////////////
+
+template <class S, class V>
+inline Grid_simd<S, V> innerProduct(const Grid_simd<S, V> &l,
+                                    const Grid_simd<S, V> &r) {
+  return conjugate(l) * r;
+}
+template <class S, class V>
+inline Grid_simd<S, V> outerProduct(const Grid_simd<S, V> &l,
+                                    const Grid_simd<S, V> &r) {
+  return l * conjugate(r);
+}
+
+template <class S, class V>
+inline Grid_simd<S, V> trace(const Grid_simd<S, V> &arg) {
+  return arg;
+}
+
+////////////////////////////////////////////////////////////
+// copy/splat complex real parts into real;
+// insert real into complex and zero imag;
+////////////////////////////////////////////////////////////
+
+// real = toReal( complex )
+template <class S, class V, IfReal<S> = 0>
+inline Grid_simd<S, V> toReal(const Grid_simd<std::complex<S>, V> &in) {
+  typedef Grid_simd<S, V> simd;
+  simd ret;
+  typename simd::conv_t conv;
+  conv.v = in.v;  // copy the vector content (bytewise)
+  for (int i = 0; i < simd::Nsimd(); i += 2) {
+    conv.s[i + 1] = conv.s[i];  // duplicate (r,r);(r,r);(r,r); etc...
+  }
+  ret.v = conv.v;
+  return ret;
+}
+
+// complex = toComplex( real )
+template <class R, class V, IfReal<R> = 0>  // must be a real arg
+inline Grid_simd<std::complex<R>, V> toComplex(const Grid_simd<R, V> &in) {
+  typedef Grid_simd<R, V> Rsimd;
+  typedef Grid_simd<std::complex<R>, V> Csimd;
+  typename Rsimd::conv_t conv;  // address as real
+
+  conv.v = in.v;
+  for (int i = 0; i < Rsimd::Nsimd(); i += 2) {
+    assert(conv.s[i + 1] == conv.s[i]);  
+    // trap any cases where real was not duplicated
+    // indicating the SIMD grids of real and imag assignment did not correctly
+    // match
+    conv.s[i + 1] = 0.0;  // zero imaginary parts
+  }
+  Csimd ret;
+  ret.v = conv.v;
+  return ret;
+}
+
+///////////////////////////////
+// Define available types
+///////////////////////////////
+typedef Grid_simd<float, SIMD_Ftype> vRealF;
+typedef Grid_simd<double, SIMD_Dtype> vRealD;
+typedef Grid_simd<std::complex<float>, SIMD_Ftype> vComplexF;
+typedef Grid_simd<std::complex<double>, SIMD_Dtype> vComplexD;
+typedef Grid_simd<Integer, SIMD_Itype> vInteger;
+
+// Half precision; no arithmetic support
+typedef Grid_simd<uint16_t, SIMD_Htype>               vRealH;
+typedef Grid_simd<std::complex<uint16_t>, SIMD_Htype> vComplexH;
+
+inline void precisionChange(vRealF    *out,vRealD    *in,int nvec)
+{
+  assert((nvec&0x1)==0);
+  for(int m=0;m*2<nvec;m++){
+    int n=m*2;
+    out[m].v=Optimization::PrecisionChange::DtoS(in[n].v,in[n+1].v);
+  }
+}
+inline void precisionChange(vRealH    *out,vRealD    *in,int nvec)
+{
+  assert((nvec&0x3)==0);
+  for(int m=0;m*4<nvec;m++){
+    int n=m*4;
+    out[m].v=Optimization::PrecisionChange::DtoH(in[n].v,in[n+1].v,in[n+2].v,in[n+3].v);
+  }
+}
+inline void precisionChange(vRealH    *out,vRealF    *in,int nvec)
+{
+  assert((nvec&0x1)==0);
+  for(int m=0;m*2<nvec;m++){
+    int n=m*2;
+    out[m].v=Optimization::PrecisionChange::StoH(in[n].v,in[n+1].v);
+  }
+}
+inline void precisionChange(vRealD    *out,vRealF    *in,int nvec)
+{
+  assert((nvec&0x1)==0);
+  for(int m=0;m*2<nvec;m++){
+    int n=m*2;
+    Optimization::PrecisionChange::StoD(in[m].v,out[n].v,out[n+1].v);
+  }
+}
+inline void precisionChange(vRealD    *out,vRealH    *in,int nvec)
+{
+  assert((nvec&0x3)==0);
+  for(int m=0;m*4<nvec;m++){
+    int n=m*4;
+    Optimization::PrecisionChange::HtoD(in[m].v,out[n].v,out[n+1].v,out[n+2].v,out[n+3].v);
+  }
+}
+inline void precisionChange(vRealF    *out,vRealH    *in,int nvec)
+{
+  assert((nvec&0x1)==0);
+  for(int m=0;m*2<nvec;m++){
+    int n=m*2;
+    Optimization::PrecisionChange::HtoS(in[m].v,out[n].v,out[n+1].v);
+  }
+}
+inline void precisionChange(vComplexF *out,vComplexD *in,int nvec){ precisionChange((vRealF *)out,(vRealD *)in,nvec);}
+inline void precisionChange(vComplexH *out,vComplexD *in,int nvec){ precisionChange((vRealH *)out,(vRealD *)in,nvec);}
+inline void precisionChange(vComplexH *out,vComplexF *in,int nvec){ precisionChange((vRealH *)out,(vRealF *)in,nvec);}
+inline void precisionChange(vComplexD *out,vComplexF *in,int nvec){ precisionChange((vRealD *)out,(vRealF *)in,nvec);}
+inline void precisionChange(vComplexD *out,vComplexH *in,int nvec){ precisionChange((vRealD *)out,(vRealH *)in,nvec);}
+inline void precisionChange(vComplexF *out,vComplexH *in,int nvec){ precisionChange((vRealF *)out,(vRealH *)in,nvec);}
+
+// Check our vector types are of an appropriate size.
+#if defined QPX
+static_assert(2*sizeof(SIMD_Ftype) == sizeof(SIMD_Dtype), "SIMD vector lengths incorrect");
+static_assert(2*sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths incorrect");
+#else
+static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Dtype), "SIMD vector lengths incorrect");
+static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths incorrect");
+#endif
+
+/////////////////////////////////////////
+// Some traits to recognise the types
+/////////////////////////////////////////
+template <typename T>
+struct is_simd : public std::false_type {};
+template <> struct is_simd<vRealF>     : public std::true_type {};
+template <> struct is_simd<vRealD>     : public std::true_type {};
+template <> struct is_simd<vComplexF>  : public std::true_type {};
+template <> struct is_simd<vComplexD>  : public std::true_type {};
+template <> struct is_simd<vInteger>   : public std::true_type {};
+
+template <typename T> using IfSimd    = Invoke<std::enable_if<is_simd<T>::value, int> >;
+template <typename T> using IfNotSimd = Invoke<std::enable_if<!is_simd<T>::value, unsigned> >;
+}
+
+#endif

From a9c816a26883056c367ea5972bca41fc1b24b3a3 Mon Sep 17 00:00:00 2001
From: Nils Meyer <nils.meyer@ur.de>
Date: Tue, 27 Jun 2017 21:39:15 +0200
Subject: [PATCH 4/8] moved file to correct folder

---
 Grid_vector_types.h          | 857 -----------------------------------
 lib/simd/Grid_vector_types.h |  75 +--
 2 files changed, 22 insertions(+), 910 deletions(-)
 delete mode 100644 Grid_vector_types.h

diff --git a/Grid_vector_types.h b/Grid_vector_types.h
deleted file mode 100644
index e05fecc4..00000000
--- a/Grid_vector_types.h
+++ /dev/null
@@ -1,857 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/simd/Grid_vector_type.h
-
-Copyright (C) 2015
-
-Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
-Author: Guido Cossu <cossu@iroiro-pc.kek.jp>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: neo <cossu@post.kek.jp>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-//---------------------------------------------------------------------------
-/*! @file Grid_vector_types.h
-  @brief Defines templated class Grid_simd to deal with inner vector types
-*/
-// Time-stamp: <2015-07-10 17:45:33 neo>
-//---------------------------------------------------------------------------
-#ifndef GRID_VECTOR_TYPES
-#define GRID_VECTOR_TYPES
-
-#ifdef GEN
-#include "Grid_generic.h"
-#endif
-#ifdef SSE4
-#include "Grid_sse4.h"
-#endif
-#if defined(AVX1) || defined (AVXFMA) || defined(AVX2) || defined(AVXFMA4)
-#include "Grid_avx.h"
-#endif
-#if defined AVX512
-#include "Grid_avx512.h"
-#endif
-#if defined IMCI
-#include "Grid_imci.h"
-#endif
-#ifdef NEONv8
-#include "Grid_neon.h"
-#endif
-#if defined QPX
-#include "Grid_qpx.h"
-#endif
-
-#include "l1p.h"
-
-namespace Grid {
-
-//////////////////////////////////////
-// To take the floating point type of real/complex type
-//////////////////////////////////////
-template <typename T>
-struct RealPart {
-  typedef T type;
-};
-template <typename T>
-struct RealPart<std::complex<T> > {
-  typedef T type;
-};
-
-#include <type_traits>
-
-//////////////////////////////////////
-// demote a vector to real type
-//////////////////////////////////////
-// type alias used to simplify the syntax of std::enable_if
-template <typename T> using Invoke = typename T::type;
-template <typename Condition, typename ReturnType> using EnableIf    = Invoke<std::enable_if<Condition::value, ReturnType> >;
-template <typename Condition, typename ReturnType> using NotEnableIf = Invoke<std::enable_if<!Condition::value, ReturnType> >;
-
-////////////////////////////////////////////////////////
-// Check for complexity with type traits
-template <typename T> struct is_complex : public std::false_type {};
-template <> struct is_complex<std::complex<double> > : public std::true_type {};
-template <> struct is_complex<std::complex<float> > : public std::true_type {};
-
-template <typename T>              using IfReal    = Invoke<std::enable_if<std::is_floating_point<T>::value, int> >;
-template <typename T>              using IfComplex = Invoke<std::enable_if<is_complex<T>::value, int> >;
-template <typename T>              using IfInteger = Invoke<std::enable_if<std::is_integral<T>::value, int> >;
-template <typename T1,typename T2> using IfSame    = Invoke<std::enable_if<std::is_same<T1,T2>::value, int> >;
-
-template <typename T>              using IfNotReal    = Invoke<std::enable_if<!std::is_floating_point<T>::value, int> >;
-template <typename T>              using IfNotComplex = Invoke<std::enable_if<!is_complex<T>::value, int> >;
-template <typename T>              using IfNotInteger = Invoke<std::enable_if<!std::is_integral<T>::value, int> >;
-template <typename T1,typename T2> using IfNotSame    = Invoke<std::enable_if<!std::is_same<T1,T2>::value, int> >;
-
-////////////////////////////////////////////////////////
-// Define the operation templates functors
-// general forms to allow for vsplat syntax
-// need explicit declaration of types when used since
-// clang cannot automatically determine the output type sometimes
-template <class Out, class Input1, class Input2, class Input3, class Operation>
-Out trinary(Input1 src_1, Input2 src_2, Input3 src_3, Operation op) {
-  return op(src_1, src_2, src_3);
-}
-
-template <class Out, class Input1, class Input2, class Operation>
-Out binary(Input1 src_1, Input2 src_2, Operation op) {
-  return op(src_1, src_2);
-}
-
-template <class Out, class Input, class Operation>
-Out unary(Input src, Operation op) {
-  return op(src);
-}
-///////////////////////////////////////////////
-
-/*
-  @brief Grid_simd class for the SIMD vector type operations
- */
-template <class Scalar_type, class Vector_type>
-class Grid_simd {
- public:
-  typedef typename RealPart<Scalar_type>::type Real;
-  typedef Vector_type vector_type;
-  typedef Scalar_type scalar_type;
-
-  typedef union conv_t_union {
-    Vector_type v;
-    Scalar_type s[sizeof(Vector_type) / sizeof(Scalar_type)];
-    conv_t_union(){};
-  } conv_t;
-
-  Vector_type v;
-
-  static inline constexpr int Nsimd(void) {
-    return sizeof(Vector_type) / sizeof(Scalar_type);
-  }
-
-  Grid_simd &operator=(const Grid_simd &&rhs) {
-    v = rhs.v;
-    return *this;
-  };
-  Grid_simd &operator=(const Grid_simd &rhs) {
-    v = rhs.v;
-    return *this;
-  };  // faster than not declaring it and leaving to the compiler
-  Grid_simd() = default;
-  Grid_simd(const Grid_simd &rhs) : v(rhs.v){};  // compiles in movaps
-  Grid_simd(const Grid_simd &&rhs) : v(rhs.v){};
-
-  /////////////////////////////
-  // Constructors
-  /////////////////////////////
-  Grid_simd &operator=(Zero &z) {
-    vzero(*this);
-    return (*this);
-  }
-
-  // Enable if complex type
-  template <typename S = Scalar_type>
-  Grid_simd(const typename std::enable_if<is_complex<S>::value, S>::type a) {
-    vsplat(*this, a);
-  };
-
-  Grid_simd(const Real a) { vsplat(*this, Scalar_type(a)); };
-
-  ///////////////////////////////////////////////
-  // mac, mult, sub, add, adj
-  ///////////////////////////////////////////////
-
-  // FIXME -- alias this to an inline MAC struct.
-  friend inline void mac(Grid_simd *__restrict__ y,
-                         const Grid_simd *__restrict__ a,
-                         const Grid_simd *__restrict__ x) {
-    *y = (*a) * (*x) + (*y);
-  };
-
-  friend inline void mult(Grid_simd *__restrict__ y,
-                          const Grid_simd *__restrict__ l,
-                          const Grid_simd *__restrict__ r) {
-    *y = (*l) * (*r);
-  }
-
-  friend inline void sub(Grid_simd *__restrict__ y,
-                         const Grid_simd *__restrict__ l,
-                         const Grid_simd *__restrict__ r) {
-    *y = (*l) - (*r);
-  }
-  friend inline void add(Grid_simd *__restrict__ y,
-                         const Grid_simd *__restrict__ l,
-                         const Grid_simd *__restrict__ r) {
-    *y = (*l) + (*r);
-  }
-  friend inline void mac(Grid_simd *__restrict__ y,
-                         const Scalar_type *__restrict__ a,
-                         const Grid_simd *__restrict__ x) {
-    *y = (*a) * (*x) + (*y);
-  };
-  friend inline void mult(Grid_simd *__restrict__ y,
-                          const Scalar_type *__restrict__ l,
-                          const Grid_simd *__restrict__ r) {
-    *y = (*l) * (*r);
-  }
-  friend inline void sub(Grid_simd *__restrict__ y,
-                         const Scalar_type *__restrict__ l,
-                         const Grid_simd *__restrict__ r) {
-    *y = (*l) - (*r);
-  }
-  friend inline void add(Grid_simd *__restrict__ y,
-                         const Scalar_type *__restrict__ l,
-                         const Grid_simd *__restrict__ r) {
-    *y = (*l) + (*r);
-  }
-
-  friend inline void mac(Grid_simd *__restrict__ y,
-                         const Grid_simd *__restrict__ a,
-                         const Scalar_type *__restrict__ x) {
-    *y = (*a) * (*x) + (*y);
-  };
-  friend inline void mult(Grid_simd *__restrict__ y,
-                          const Grid_simd *__restrict__ l,
-                          const Scalar_type *__restrict__ r) {
-    *y = (*l) * (*r);
-  }
-  friend inline void sub(Grid_simd *__restrict__ y,
-                         const Grid_simd *__restrict__ l,
-                         const Scalar_type *__restrict__ r) {
-    *y = (*l) - (*r);
-  }
-  friend inline void add(Grid_simd *__restrict__ y,
-                         const Grid_simd *__restrict__ l,
-                         const Scalar_type *__restrict__ r) {
-    *y = (*l) + (*r);
-  }
-
-  ////////////////////////////////////////////////////////////////////////
-  // FIXME:  gonna remove these load/store, get, set, prefetch
-  ////////////////////////////////////////////////////////////////////////
-  friend inline void vset(Grid_simd &ret, Scalar_type *a) {
-    ret.v = unary<Vector_type>(a, VsetSIMD());
-  }
-
-  ///////////////////////
-  // Vstore
-  ///////////////////////
-  friend inline void vstore(const Grid_simd &ret, Scalar_type *a) {
-    binary<void>(ret.v, (Real *)a, VstoreSIMD());
-  }
-
-  ///////////////////////
-  // Vprefetch
-  ///////////////////////
-  friend inline void vprefetch(const Grid_simd &v) {
-    prefetch_HINT_T0((const char *)&v.v);
-  }
-
-  ///////////////////////
-  // Reduce
-  ///////////////////////
-  friend inline Scalar_type Reduce(const Grid_simd &in) {
-    return unary<Scalar_type>(in.v, ReduceSIMD<Scalar_type, Vector_type>());
-  }
-
-  ////////////////////////////
-  // operator scalar * simd
-  ////////////////////////////
-  friend inline Grid_simd operator*(const Scalar_type &a, Grid_simd b) {
-    Grid_simd va;
-    vsplat(va, a);
-    return va * b;
-  }
-  friend inline Grid_simd operator*(Grid_simd b, const Scalar_type &a) {
-    return a * b;
-  }
-
-  //////////////////////////////////
-  // Divides
-  //////////////////////////////////
-  friend inline Grid_simd operator/(const Scalar_type &a, Grid_simd b) {
-    Grid_simd va;
-    vsplat(va, a);
-    return va / b;
-  }
-  friend inline Grid_simd operator/(Grid_simd b, const Scalar_type &a) {
-    Grid_simd va;
-    vsplat(va, a);
-    return b / a;
-  }
-
-  ///////////////////////
-  // Unary negation
-  ///////////////////////
-  friend inline Grid_simd operator-(const Grid_simd &r) {
-    Grid_simd ret;
-    vzero(ret);
-    ret = ret - r;
-    return ret;
-  }
-  // *=,+=,-= operators
-  inline Grid_simd &operator*=(const Grid_simd &r) {
-    *this = (*this) * r;
-    return *this;
-    // return (*this)*r; ?
-  }
-  inline Grid_simd &operator+=(const Grid_simd &r) {
-    *this = *this + r;
-    return *this;
-  }
-  inline Grid_simd &operator-=(const Grid_simd &r) {
-    *this = *this - r;
-    return *this;
-  }
-
-  ///////////////////////////////////////
-  // Not all functions are supported
-  // through SIMD and must breakout to
-  // scalar type and back again. This
-  // provides support
-  ///////////////////////////////////////
-
-  template <class functor>
-  friend inline Grid_simd SimdApply(const functor &func, const Grid_simd &v) {
-    Grid_simd ret;
-    Grid_simd::conv_t conv;
-    Grid_simd::scalar_type s;
-    
-    conv.v = v.v;
-    for (int i = 0; i < Nsimd(); i++) {
-      s = conv.s[i];
-      conv.s[i] = func(s);
-    }
-    ret.v = conv.v;
-    return ret;
-  }
-  template <class functor>
-  friend inline Grid_simd SimdApplyBinop(const functor &func,
-                                         const Grid_simd &x,
-                                         const Grid_simd &y) {
-    Grid_simd ret;
-    Grid_simd::conv_t cx;
-    Grid_simd::conv_t cy;
-    Grid_simd::scalar_type sx,sy;
-
-    cx.v = x.v;
-    cy.v = y.v;
-    for (int i = 0; i < Nsimd(); i++) {
-      sx = cx.s[i];
-      sy = cy.s[i];
-      cx.s[i] = func(sx,sy);
-    }
-    ret.v = cx.v;
-    return ret;
-  }
-  ///////////////////////
-  // Exchange 
-  // Al Ah , Bl Bh -> Al Bl Ah,Bh
-  ///////////////////////
-  friend inline void exchange(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2,int n)
-  {
-    if       (n==3) {
-      Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v);
-    } else if(n==2) {
-      Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v);
-    } else if(n==1) {
-      Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v);
-    } else if(n==0) { 
-      Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v);
-    }
-  }
-
-  ////////////////////////////////////////////////////////////////////
-  // General permute; assumes vector length is same across
-  // all subtypes; may not be a good assumption, but could
-  // add the vector width as a template param for BG/Q for example
-  ////////////////////////////////////////////////////////////////////
-  friend inline void permute0(Grid_simd &y, Grid_simd b) {
-    y.v = Optimization::Permute::Permute0(b.v);
-  }
-  friend inline void permute1(Grid_simd &y, Grid_simd b) {
-    y.v = Optimization::Permute::Permute1(b.v);
-  }
-  friend inline void permute2(Grid_simd &y, Grid_simd b) {
-    y.v = Optimization::Permute::Permute2(b.v);
-  }
-  friend inline void permute3(Grid_simd &y, Grid_simd b) {
-    y.v = Optimization::Permute::Permute3(b.v);
-  }
-  friend inline void permute(Grid_simd &y, Grid_simd b, int perm) {
-    if (perm & RotateBit) {
-      int dist = perm & 0xF;
-      y = rotate(b, dist);
-      return;
-    } 
-    else if(perm==3) permute3(y, b);
-    else if(perm==2) permute2(y, b);
-    else if(perm==1) permute1(y, b);
-    else if(perm==0) permute0(y, b);
-  }
-
-  ///////////////////////////////
-  // Getting single lanes
-  ///////////////////////////////
-  inline Scalar_type getlane(int lane) {
-    return ((Scalar_type*)&v)[lane];
-  }
-
-  inline void putlane(const Scalar_type &S, int lane){
-    ((Scalar_type*)&v)[lane] = S;
-  }
-
-
-  
-};  // end of Grid_simd class definition
-
-inline void permute(ComplexD &y,ComplexD b, int perm) {  y=b; }
-inline void permute(ComplexF &y,ComplexF b, int perm) {  y=b; }
-inline void permute(RealD &y,RealD b, int perm) {  y=b; }
-inline void permute(RealF &y,RealF b, int perm) {  y=b; }
-
-////////////////////////////////////////////////////////////////////
-// General rotate
-////////////////////////////////////////////////////////////////////
-template <class S, class V, IfNotComplex<S> = 0>
-inline Grid_simd<S, V> rotate(Grid_simd<S, V> b, int nrot) {
-  nrot = nrot % Grid_simd<S, V>::Nsimd();
-  Grid_simd<S, V> ret;
-  ret.v = Optimization::Rotate::rotate(b.v, nrot);
-  return ret;
-}
-template <class S, class V, IfComplex<S> = 0>
-inline Grid_simd<S, V> rotate(Grid_simd<S, V> b, int nrot) {
-  nrot = nrot % Grid_simd<S, V>::Nsimd();
-  Grid_simd<S, V> ret;
-  ret.v = Optimization::Rotate::rotate(b.v, 2 * nrot);
-  return ret;
-}
-template <class S, class V, IfNotComplex<S> =0> 
-inline void rotate( Grid_simd<S,V> &ret,Grid_simd<S,V> b,int nrot)
-{
-  nrot = nrot % Grid_simd<S,V>::Nsimd();
-  ret.v = Optimization::Rotate::rotate(b.v,nrot);
-}
-template <class S, class V, IfComplex<S> =0> 
-inline void rotate(Grid_simd<S,V> &ret,Grid_simd<S,V> b,int nrot)
-{
-  nrot = nrot % Grid_simd<S,V>::Nsimd();
-  ret.v = Optimization::Rotate::rotate(b.v,2*nrot);
-}
-
-template <class S, class V> 
-inline void vbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
-  S* typepun =(S*) &src;
-  vsplat(ret,typepun[lane]);
-}    
-template <class S, class V, IfComplex<S> =0> 
-inline void rbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
-  S* typepun =(S*) &src;
-  ret.v = unary<V>(real(typepun[lane]), VsplatSIMD());
-}    
-
-
-
-///////////////////////
-// Splat
-///////////////////////
-
-// this is only for the complex version
-template <class S, class V, IfComplex<S> = 0, class ABtype>
-inline void vsplat(Grid_simd<S, V> &ret, ABtype a, ABtype b) {
-  ret.v = binary<V>(a, b, VsplatSIMD());
-}
-
-// overload if complex
-template <class S, class V>
-inline void vsplat(Grid_simd<S, V> &ret, EnableIf<is_complex<S>, S> c) {
-  vsplat(ret, real(c), imag(c));
-}
-template <class S, class V>
-inline void rsplat(Grid_simd<S, V> &ret, EnableIf<is_complex<S>, S> c) {
-  vsplat(ret, real(c), real(c));
-}
-
-// if real fill with a, if complex fill with a in the real part (first function
-// above)
-template <class S, class V>
-inline void vsplat(Grid_simd<S, V> &ret, NotEnableIf<is_complex<S>, S> a) {
-  ret.v = unary<V>(a, VsplatSIMD());
-}
-//////////////////////////
-
-///////////////////////////////////////////////
-// Initialise to 1,0,i for the correct types
-///////////////////////////////////////////////
-// For complex types
-template <class S, class V, IfComplex<S> = 0>
-inline void vone(Grid_simd<S, V> &ret) {
-  vsplat(ret, S(1.0, 0.0));
-}
-template <class S, class V, IfComplex<S> = 0>
-inline void vzero(Grid_simd<S, V> &ret) {
-  vsplat(ret, S(0.0, 0.0));
-}  // use xor?
-template <class S, class V, IfComplex<S> = 0>
-inline void vcomplex_i(Grid_simd<S, V> &ret) {
-  vsplat(ret, S(0.0, 1.0));
-}
-
-template <class S, class V, IfComplex<S> = 0>
-inline void visign(Grid_simd<S, V> &ret) {
-  vsplat(ret, S(1.0, -1.0));
-}
-template <class S, class V, IfComplex<S> = 0>
-inline void vrsign(Grid_simd<S, V> &ret) {
-  vsplat(ret, S(-1.0, 1.0));
-}
-
-// if not complex overload here
-template <class S, class V, IfReal<S> = 0>
-inline void vone(Grid_simd<S, V> &ret) {
-  vsplat(ret, S(1.0));
-}
-template <class S, class V, IfReal<S> = 0>
-inline void vzero(Grid_simd<S, V> &ret) {
-  vsplat(ret, S(0.0));
-}
-
-// For integral types
-template <class S, class V, IfInteger<S> = 0>
-inline void vone(Grid_simd<S, V> &ret) {
-  vsplat(ret, 1);
-}
-template <class S, class V, IfInteger<S> = 0>
-inline void vzero(Grid_simd<S, V> &ret) {
-  vsplat(ret, 0);
-}
-template <class S, class V, IfInteger<S> = 0>
-inline void vtrue(Grid_simd<S, V> &ret) {
-  vsplat(ret, 0xFFFFFFFF);
-}
-template <class S, class V, IfInteger<S> = 0>
-inline void vfalse(Grid_simd<S, V> &ret) {
-  vsplat(ret, 0);
-}
-template <class S, class V>
-inline void zeroit(Grid_simd<S, V> &z) {
-  vzero(z);
-}
-
-///////////////////////
-// Vstream
-///////////////////////
-template <class S, class V, IfReal<S> = 0>
-inline void vstream(Grid_simd<S, V> &out, const Grid_simd<S, V> &in) {
-  binary<void>((S *)&out.v, in.v, VstreamSIMD());
-}
-template <class S, class V, IfComplex<S> = 0>
-inline void vstream(Grid_simd<S, V> &out, const Grid_simd<S, V> &in) {
-  typedef typename S::value_type T;
-  binary<void>((T *)&out.v, in.v, VstreamSIMD());
-}
-template <class S, class V, IfInteger<S> = 0>
-inline void vstream(Grid_simd<S, V> &out, const Grid_simd<S, V> &in) {
-  out = in;
-}
-
-////////////////////////////////////
-// Arithmetic operator overloads +,-,*
-////////////////////////////////////
-template <class S, class V>
-inline Grid_simd<S, V> operator+(Grid_simd<S, V> a, Grid_simd<S, V> b) {
-  Grid_simd<S, V> ret;
-  ret.v = binary<V>(a.v, b.v, SumSIMD());
-  return ret;
-};
-
-template <class S, class V>
-inline Grid_simd<S, V> operator-(Grid_simd<S, V> a, Grid_simd<S, V> b) {
-  Grid_simd<S, V> ret;
-  ret.v = binary<V>(a.v, b.v, SubSIMD());
-  return ret;
-};
-
-// Distinguish between complex types and others
-template <class S, class V, IfComplex<S> = 0>
-inline Grid_simd<S, V> real_mult(Grid_simd<S, V> a, Grid_simd<S, V> b) {
-  Grid_simd<S, V> ret;
-  ret.v = binary<V>(a.v, b.v, MultRealPartSIMD());
-  return ret;
-};
-template <class S, class V, IfComplex<S> = 0>
-inline Grid_simd<S, V> real_madd(Grid_simd<S, V> a, Grid_simd<S, V> b, Grid_simd<S,V> c) {
-  Grid_simd<S, V> ret;
-  ret.v = trinary<V>(a.v, b.v, c.v, MaddRealPartSIMD());
-  return ret;
-};
-
-
-// Distinguish between complex types and others
-template <class S, class V, IfComplex<S> = 0>
-inline Grid_simd<S, V> operator*(Grid_simd<S, V> a, Grid_simd<S, V> b) {
-  Grid_simd<S, V> ret;
-  ret.v = binary<V>(a.v, b.v, MultComplexSIMD());
-  return ret;
-};
-
-// Real/Integer types
-template <class S, class V, IfNotComplex<S> = 0>
-inline Grid_simd<S, V> operator*(Grid_simd<S, V> a, Grid_simd<S, V> b) {
-  Grid_simd<S, V> ret;
-  ret.v = binary<V>(a.v, b.v, MultSIMD());
-  return ret;
-};
-
-// Distinguish between complex types and others
-template <class S, class V, IfComplex<S> = 0>
-inline Grid_simd<S, V> operator/(Grid_simd<S, V> a, Grid_simd<S, V> b) {
-  typedef Grid_simd<S, V> simd;
-
-  simd ret;
-  simd den;
-  typename simd::conv_t conv;
-
-  ret = a * conjugate(b) ;
-  den = b * conjugate(b) ;
-
-  
-  auto real_den = toReal(den);
-
-  ret.v=binary<V>(ret.v, real_den.v, DivSIMD());
-
-  return ret;
-};
-
-// Real/Integer types
-template <class S, class V, IfNotComplex<S> = 0>
-inline Grid_simd<S, V> operator/(Grid_simd<S, V> a, Grid_simd<S, V> b) {
-  Grid_simd<S, V> ret;
-  ret.v = binary<V>(a.v, b.v, DivSIMD());
-  return ret;
-};
-
-///////////////////////
-// Conjugate
-///////////////////////
-template <class S, class V, IfComplex<S> = 0>
-inline Grid_simd<S, V> conjugate(const Grid_simd<S, V> &in) {
-  Grid_simd<S, V> ret;
-  ret.v = unary<V>(in.v, ConjSIMD());
-  return ret;
-}
-template <class S, class V, IfNotComplex<S> = 0>
-inline Grid_simd<S, V> conjugate(const Grid_simd<S, V> &in) {
-  return in;  // for real objects
-}
-// Suppress adj for integer types... // odd; why conjugate above but not adj??
-template <class S, class V, IfNotInteger<S> = 0>
-inline Grid_simd<S, V> adj(const Grid_simd<S, V> &in) {
-  return conjugate(in);
-}
-
-///////////////////////
-// timesMinusI
-///////////////////////
-template <class S, class V, IfComplex<S> = 0>
-inline void timesMinusI(Grid_simd<S, V> &ret, const Grid_simd<S, V> &in) {
-  ret.v = binary<V>(in.v, ret.v, TimesMinusISIMD());
-}
-template <class S, class V, IfComplex<S> = 0>
-inline Grid_simd<S, V> timesMinusI(const Grid_simd<S, V> &in) {
-  Grid_simd<S, V> ret;
-  timesMinusI(ret, in);
-  return ret;
-}
-template <class S, class V, IfNotComplex<S> = 0>
-inline Grid_simd<S, V> timesMinusI(const Grid_simd<S, V> &in) {
-  return in;
-}
-
-///////////////////////
-// timesI
-///////////////////////
-template <class S, class V, IfComplex<S> = 0>
-inline void timesI(Grid_simd<S, V> &ret, const Grid_simd<S, V> &in) {
-  ret.v = binary<V>(in.v, ret.v, TimesISIMD());
-}
-template <class S, class V, IfComplex<S> = 0>
-inline Grid_simd<S, V> timesI(const Grid_simd<S, V> &in) {
-  Grid_simd<S, V> ret;
-  timesI(ret, in);
-  return ret;
-}
-template <class S, class V, IfNotComplex<S> = 0>
-inline Grid_simd<S, V> timesI(const Grid_simd<S, V> &in) {
-  return in;
-}
-
-/////////////////////
-// Inner, outer
-/////////////////////
-
-template <class S, class V>
-inline Grid_simd<S, V> innerProduct(const Grid_simd<S, V> &l,
-                                    const Grid_simd<S, V> &r) {
-  return conjugate(l) * r;
-}
-template <class S, class V>
-inline Grid_simd<S, V> outerProduct(const Grid_simd<S, V> &l,
-                                    const Grid_simd<S, V> &r) {
-  return l * conjugate(r);
-}
-
-template <class S, class V>
-inline Grid_simd<S, V> trace(const Grid_simd<S, V> &arg) {
-  return arg;
-}
-
-////////////////////////////////////////////////////////////
-// copy/splat complex real parts into real;
-// insert real into complex and zero imag;
-////////////////////////////////////////////////////////////
-
-// real = toReal( complex )
-template <class S, class V, IfReal<S> = 0>
-inline Grid_simd<S, V> toReal(const Grid_simd<std::complex<S>, V> &in) {
-  typedef Grid_simd<S, V> simd;
-  simd ret;
-  typename simd::conv_t conv;
-  conv.v = in.v;  // copy the vector content (bytewise)
-  for (int i = 0; i < simd::Nsimd(); i += 2) {
-    conv.s[i + 1] = conv.s[i];  // duplicate (r,r);(r,r);(r,r); etc...
-  }
-  ret.v = conv.v;
-  return ret;
-}
-
-// complex = toComplex( real )
-template <class R, class V, IfReal<R> = 0>  // must be a real arg
-inline Grid_simd<std::complex<R>, V> toComplex(const Grid_simd<R, V> &in) {
-  typedef Grid_simd<R, V> Rsimd;
-  typedef Grid_simd<std::complex<R>, V> Csimd;
-  typename Rsimd::conv_t conv;  // address as real
-
-  conv.v = in.v;
-  for (int i = 0; i < Rsimd::Nsimd(); i += 2) {
-    assert(conv.s[i + 1] == conv.s[i]);  
-    // trap any cases where real was not duplicated
-    // indicating the SIMD grids of real and imag assignment did not correctly
-    // match
-    conv.s[i + 1] = 0.0;  // zero imaginary parts
-  }
-  Csimd ret;
-  ret.v = conv.v;
-  return ret;
-}
-
-///////////////////////////////
-// Define available types
-///////////////////////////////
-typedef Grid_simd<float, SIMD_Ftype> vRealF;
-typedef Grid_simd<double, SIMD_Dtype> vRealD;
-typedef Grid_simd<std::complex<float>, SIMD_Ftype> vComplexF;
-typedef Grid_simd<std::complex<double>, SIMD_Dtype> vComplexD;
-typedef Grid_simd<Integer, SIMD_Itype> vInteger;
-
-// Half precision; no arithmetic support
-typedef Grid_simd<uint16_t, SIMD_Htype>               vRealH;
-typedef Grid_simd<std::complex<uint16_t>, SIMD_Htype> vComplexH;
-
-inline void precisionChange(vRealF    *out,vRealD    *in,int nvec)
-{
-  assert((nvec&0x1)==0);
-  for(int m=0;m*2<nvec;m++){
-    int n=m*2;
-    out[m].v=Optimization::PrecisionChange::DtoS(in[n].v,in[n+1].v);
-  }
-}
-inline void precisionChange(vRealH    *out,vRealD    *in,int nvec)
-{
-  assert((nvec&0x3)==0);
-  for(int m=0;m*4<nvec;m++){
-    int n=m*4;
-    out[m].v=Optimization::PrecisionChange::DtoH(in[n].v,in[n+1].v,in[n+2].v,in[n+3].v);
-  }
-}
-inline void precisionChange(vRealH    *out,vRealF    *in,int nvec)
-{
-  assert((nvec&0x1)==0);
-  for(int m=0;m*2<nvec;m++){
-    int n=m*2;
-    out[m].v=Optimization::PrecisionChange::StoH(in[n].v,in[n+1].v);
-  }
-}
-inline void precisionChange(vRealD    *out,vRealF    *in,int nvec)
-{
-  assert((nvec&0x1)==0);
-  for(int m=0;m*2<nvec;m++){
-    int n=m*2;
-    Optimization::PrecisionChange::StoD(in[m].v,out[n].v,out[n+1].v);
-  }
-}
-inline void precisionChange(vRealD    *out,vRealH    *in,int nvec)
-{
-  assert((nvec&0x3)==0);
-  for(int m=0;m*4<nvec;m++){
-    int n=m*4;
-    Optimization::PrecisionChange::HtoD(in[m].v,out[n].v,out[n+1].v,out[n+2].v,out[n+3].v);
-  }
-}
-inline void precisionChange(vRealF    *out,vRealH    *in,int nvec)
-{
-  assert((nvec&0x1)==0);
-  for(int m=0;m*2<nvec;m++){
-    int n=m*2;
-    Optimization::PrecisionChange::HtoS(in[m].v,out[n].v,out[n+1].v);
-  }
-}
-inline void precisionChange(vComplexF *out,vComplexD *in,int nvec){ precisionChange((vRealF *)out,(vRealD *)in,nvec);}
-inline void precisionChange(vComplexH *out,vComplexD *in,int nvec){ precisionChange((vRealH *)out,(vRealD *)in,nvec);}
-inline void precisionChange(vComplexH *out,vComplexF *in,int nvec){ precisionChange((vRealH *)out,(vRealF *)in,nvec);}
-inline void precisionChange(vComplexD *out,vComplexF *in,int nvec){ precisionChange((vRealD *)out,(vRealF *)in,nvec);}
-inline void precisionChange(vComplexD *out,vComplexH *in,int nvec){ precisionChange((vRealD *)out,(vRealH *)in,nvec);}
-inline void precisionChange(vComplexF *out,vComplexH *in,int nvec){ precisionChange((vRealF *)out,(vRealH *)in,nvec);}
-
-// Check our vector types are of an appropriate size.
-#if defined QPX
-static_assert(2*sizeof(SIMD_Ftype) == sizeof(SIMD_Dtype), "SIMD vector lengths incorrect");
-static_assert(2*sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths incorrect");
-#else
-static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Dtype), "SIMD vector lengths incorrect");
-static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths incorrect");
-#endif
-
-/////////////////////////////////////////
-// Some traits to recognise the types
-/////////////////////////////////////////
-template <typename T>
-struct is_simd : public std::false_type {};
-template <> struct is_simd<vRealF>     : public std::true_type {};
-template <> struct is_simd<vRealD>     : public std::true_type {};
-template <> struct is_simd<vComplexF>  : public std::true_type {};
-template <> struct is_simd<vComplexD>  : public std::true_type {};
-template <> struct is_simd<vInteger>   : public std::true_type {};
-
-template <typename T> using IfSimd    = Invoke<std::enable_if<is_simd<T>::value, int> >;
-template <typename T> using IfNotSimd = Invoke<std::enable_if<!is_simd<T>::value, unsigned> >;
-}
-
-#endif
diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h
index 424b5573..e05fecc4 100644
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -327,16 +327,12 @@ class Grid_simd {
   // provides support
   ///////////////////////////////////////
 
-  //#if (__GNUC__ == 5 ) || ( ( __GNUC__ == 6 ) && __GNUC_MINOR__ < 3 )
-  //#pragma GCC push_options
-  //#pragma GCC optimize ("O0")
-  //#endif
   template <class functor>
   friend inline Grid_simd SimdApply(const functor &func, const Grid_simd &v) {
     Grid_simd ret;
     Grid_simd::conv_t conv;
     Grid_simd::scalar_type s;
-
+    
     conv.v = v.v;
     for (int i = 0; i < Nsimd(); i++) {
       s = conv.s[i];
@@ -364,11 +360,8 @@ class Grid_simd {
     ret.v = cx.v;
     return ret;
   }
-  //#if (__GNUC__ == 5 ) || ( ( __GNUC__ == 6 ) && __GNUC_MINOR__ < 3 )
-  //#pragma GCC pop_options
-  //#endif
   ///////////////////////
-  // Exchange
+  // Exchange 
   // Al Ah , Bl Bh -> Al Bl Ah,Bh
   ///////////////////////
   friend inline void exchange(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2,int n)
@@ -379,7 +372,7 @@ class Grid_simd {
       Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v);
     } else if(n==1) {
       Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v);
-    } else if(n==0) {
+    } else if(n==0) { 
       Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v);
     }
   }
@@ -406,7 +399,7 @@ class Grid_simd {
       int dist = perm & 0xF;
       y = rotate(b, dist);
       return;
-    }
+    } 
     else if(perm==3) permute3(y, b);
     else if(perm==2) permute2(y, b);
     else if(perm==1) permute1(y, b);
@@ -425,10 +418,9 @@ class Grid_simd {
   }
 
 
-
+  
 };  // end of Grid_simd class definition
 
-
 inline void permute(ComplexD &y,ComplexD b, int perm) {  y=b; }
 inline void permute(ComplexF &y,ComplexF b, int perm) {  y=b; }
 inline void permute(RealD &y,RealD b, int perm) {  y=b; }
@@ -451,29 +443,29 @@ inline Grid_simd<S, V> rotate(Grid_simd<S, V> b, int nrot) {
   ret.v = Optimization::Rotate::rotate(b.v, 2 * nrot);
   return ret;
 }
-template <class S, class V, IfNotComplex<S> =0>
+template <class S, class V, IfNotComplex<S> =0> 
 inline void rotate( Grid_simd<S,V> &ret,Grid_simd<S,V> b,int nrot)
 {
   nrot = nrot % Grid_simd<S,V>::Nsimd();
   ret.v = Optimization::Rotate::rotate(b.v,nrot);
 }
-template <class S, class V, IfComplex<S> =0>
+template <class S, class V, IfComplex<S> =0> 
 inline void rotate(Grid_simd<S,V> &ret,Grid_simd<S,V> b,int nrot)
 {
   nrot = nrot % Grid_simd<S,V>::Nsimd();
   ret.v = Optimization::Rotate::rotate(b.v,2*nrot);
 }
 
-template <class S, class V>
+template <class S, class V> 
 inline void vbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
   S* typepun =(S*) &src;
   vsplat(ret,typepun[lane]);
-}
-template <class S, class V, IfComplex<S> =0>
+}    
+template <class S, class V, IfComplex<S> =0> 
 inline void rbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
   S* typepun =(S*) &src;
   ret.v = unary<V>(real(typepun[lane]), VsplatSIMD());
-}
+}    
 
 
 
@@ -604,27 +596,13 @@ inline Grid_simd<S, V> real_mult(Grid_simd<S, V> a, Grid_simd<S, V> b) {
   ret.v = binary<V>(a.v, b.v, MultRealPartSIMD());
   return ret;
 };
-// TEST for Test_simd
-template <class S, class V, IfComplex<S> = 0>
-inline Grid_simd<S, V> real_mult(std::complex<S> a, std::complex<S> b) {
-  Grid_simd<S, V> ret;
-  //ret.v = binary<V>(a.v, b.v, MultRealPartSIMD());
-  return ret;
-};
-
 template <class S, class V, IfComplex<S> = 0>
 inline Grid_simd<S, V> real_madd(Grid_simd<S, V> a, Grid_simd<S, V> b, Grid_simd<S,V> c) {
   Grid_simd<S, V> ret;
   ret.v = trinary<V>(a.v, b.v, c.v, MaddRealPartSIMD());
   return ret;
 };
-// TEST for Test_simd
-template <class S, class V, IfComplex<S> = 0>
-inline Grid_simd<S, V> real_madd(std::complex<S> a, std::complex<S> b) {
-  Grid_simd<S, V> ret;
-  //ret.v = binary<V>(a.v, b.v, MultRealPartSIMD());
-  return ret;
-};
+
 
 // Distinguish between complex types and others
 template <class S, class V, IfComplex<S> = 0>
@@ -654,7 +632,7 @@ inline Grid_simd<S, V> operator/(Grid_simd<S, V> a, Grid_simd<S, V> b) {
   ret = a * conjugate(b) ;
   den = b * conjugate(b) ;
 
-
+  
   auto real_den = toReal(den);
 
   ret.v=binary<V>(ret.v, real_den.v, DivSIMD());
@@ -773,8 +751,8 @@ inline Grid_simd<std::complex<R>, V> toComplex(const Grid_simd<R, V> &in) {
 
   conv.v = in.v;
   for (int i = 0; i < Rsimd::Nsimd(); i += 2) {
-    assert(conv.s[i + 1] ==
-           conv.s[i]);  // trap any cases where real was not duplicated
+    assert(conv.s[i + 1] == conv.s[i]);  
+    // trap any cases where real was not duplicated
     // indicating the SIMD grids of real and imag assignment did not correctly
     // match
     conv.s[i + 1] = 0.0;  // zero imaginary parts
@@ -852,8 +830,6 @@ inline void precisionChange(vComplexD *out,vComplexF *in,int nvec){ precisionCha
 inline void precisionChange(vComplexD *out,vComplexH *in,int nvec){ precisionChange((vRealD *)out,(vRealH *)in,nvec);}
 inline void precisionChange(vComplexF *out,vComplexH *in,int nvec){ precisionChange((vRealF *)out,(vRealH *)in,nvec);}
 
-
-
 // Check our vector types are of an appropriate size.
 #if defined QPX
 static_assert(2*sizeof(SIMD_Ftype) == sizeof(SIMD_Dtype), "SIMD vector lengths incorrect");
@@ -868,21 +844,14 @@ static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths inc
 /////////////////////////////////////////
 template <typename T>
 struct is_simd : public std::false_type {};
-template <>
-struct is_simd<vRealF> : public std::true_type {};
-template <>
-struct is_simd<vRealD> : public std::true_type {};
-template <>
-struct is_simd<vComplexF> : public std::true_type {};
-template <>
-struct is_simd<vComplexD> : public std::true_type {};
-template <>
-struct is_simd<vInteger> : public std::true_type {};
+template <> struct is_simd<vRealF>     : public std::true_type {};
+template <> struct is_simd<vRealD>     : public std::true_type {};
+template <> struct is_simd<vComplexF>  : public std::true_type {};
+template <> struct is_simd<vComplexD>  : public std::true_type {};
+template <> struct is_simd<vInteger>   : public std::true_type {};
 
-template <typename T>
-using IfSimd = Invoke<std::enable_if<is_simd<T>::value, int> >;
-template <typename T>
-using IfNotSimd = Invoke<std::enable_if<!is_simd<T>::value, unsigned> >;
+template <typename T> using IfSimd    = Invoke<std::enable_if<is_simd<T>::value, int> >;
+template <typename T> using IfNotSimd = Invoke<std::enable_if<!is_simd<T>::value, unsigned> >;
 }
 
 #endif

From 0933aeefd4ad053015c4e6bf232df235c546047a Mon Sep 17 00:00:00 2001
From: Nils Meyer <nils.meyer@ur.de>
Date: Wed, 28 Jun 2017 20:22:22 +0200
Subject: [PATCH 5/8] corrected Grid_neon.h

---
 lib/simd/Grid_neon.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lib/simd/Grid_neon.h b/lib/simd/Grid_neon.h
index f3f802e7..38815389 100644
--- a/lib/simd/Grid_neon.h
+++ b/lib/simd/Grid_neon.h
@@ -41,6 +41,10 @@ Author: neo <cossu@post.kek.jp>
 //#ifndef ARM_NEON
 //#define ARM_NEON
 
+#ifndef GEN_SIMD_WIDTH
+#define GEN_SIMD_WIDTH 16u
+#endif
+
 #include "Grid_generic_types.h"
 #include <arm_neon.h>
 

From 8859a151cc844b8170729285dc2a272e4ffa4940 Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Thu, 29 Jun 2017 11:30:29 +0100
Subject: [PATCH 6/8] Small corrections to the NEON port

---
 configure.ac                  |  2 +-
 lib/qcd/smearing/WilsonFlow.h |  9 ++++-----
 lib/simd/Grid_neon.h          | 15 +++++----------
 lib/simd/Grid_vector_types.h  |  2 +-
 4 files changed, 11 insertions(+), 17 deletions(-)

diff --git a/configure.ac b/configure.ac
index a69b97e3..75cf7891 100644
--- a/configure.ac
+++ b/configure.ac
@@ -250,7 +250,7 @@ case ${ax_cv_cxx_compiler_vendor} in
         SIMD_FLAGS='';;
       NEONv8)
         AC_DEFINE([NEONV8],[1],[ARMv8 NEON])
-        SIMD_FLAGS='';;
+        SIMD_FLAGS='-march=armv8-a';;
       QPX|BGQ)
         AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q])
         SIMD_FLAGS='';;
diff --git a/lib/qcd/smearing/WilsonFlow.h b/lib/qcd/smearing/WilsonFlow.h
index 5e9f2d95..4f5c0d43 100644
--- a/lib/qcd/smearing/WilsonFlow.h
+++ b/lib/qcd/smearing/WilsonFlow.h
@@ -108,7 +108,7 @@ void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real
     if (maxTau - taus < epsilon){
         epsilon = maxTau-taus;
     }
-    std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl;
+    //std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl;
     GaugeField Z(U._grid);
     GaugeField Zprime(U._grid);
     GaugeField tmp(U._grid), Uprime(U._grid);
@@ -138,10 +138,10 @@ void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real
     // adjust integration step
     
     taus += epsilon;
-    std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl;
+    //std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl;
     
     epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.);
-    std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl;
+    //std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl;
 
 }
 
@@ -166,7 +166,6 @@ void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const {
     out = in;
     for (unsigned int step = 1; step <= Nstep; step++) {
         auto start = std::chrono::high_resolution_clock::now();
-        std::cout << GridLogMessage << "Evolution time :"<< tau(step) << std::endl;
         evolve_step(out);
         auto end = std::chrono::high_resolution_clock::now();
         std::chrono::duration<double> diff = end - start;
@@ -191,7 +190,7 @@ void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, Re
     unsigned int step = 0;
     do{
         step++;
-        std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl;
+        //std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl;
         evolve_step_adaptive(out, maxTau);
         std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
             << step << "  "
diff --git a/lib/simd/Grid_neon.h b/lib/simd/Grid_neon.h
index 38815389..d6eb9c5a 100644
--- a/lib/simd/Grid_neon.h
+++ b/lib/simd/Grid_neon.h
@@ -6,9 +6,9 @@
 
     Copyright (C) 2015
 
-Author: Nils Meyer <nils.meyer@ur.de>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: neo <cossu@post.kek.jp>
+    Author: Nils Meyer <nils.meyer@ur.de>
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+    Author: neo <cossu@post.kek.jp>
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -27,7 +27,7 @@ Author: neo <cossu@post.kek.jp>
     See the full license in the file "LICENSE" in the top level distribution directory
     *************************************************************************************/
     /*  END LEGAL */
-//----------------------------------------------------------------------
+
 /*
 
   ARMv8 NEON intrinsics layer by
@@ -37,9 +37,6 @@ Author: neo <cossu@post.kek.jp>
   SFB/TRR55
 
 */
-//----------------------------------------------------------------------
-//#ifndef ARM_NEON
-//#define ARM_NEON
 
 #ifndef GEN_SIMD_WIDTH
 #define GEN_SIMD_WIDTH 16u
@@ -606,6 +603,4 @@ namespace Optimization {
   typedef Optimization::TimesMinusI TimesMinusISIMD;
   typedef Optimization::TimesI      TimesISIMD;
 
-}
-
-//#endif // ARM_NEON
+}
\ No newline at end of file
diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h
index e05fecc4..27585547 100644
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -53,7 +53,7 @@ directory
 #if defined IMCI
 #include "Grid_imci.h"
 #endif
-#ifdef NEONv8
+#ifdef NEONV8
 #include "Grid_neon.h"
 #endif
 #if defined QPX

From bf630a6821ea8923fc9690a03f621f6d69b31f4e Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Thu, 29 Jun 2017 11:42:25 +0100
Subject: [PATCH 7/8] README file update

---
 README.md | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 9432abe1..5d168298 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@
 
 License: GPL v2.
 
-Last update Nov 2016.
+Last update June 2017.
 
 _Please do not send pull requests to the `master` branch which is reserved for releases._
 
@@ -78,14 +78,17 @@ optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a signifi
 for most programmers.
 
 The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
-Presently SSE4 (128 bit) AVX, AVX2, QPX (256 bit), IMCI, and AVX512 (512 bit) targets are supported (ARM NEON on the way).
+Presently SSE4, ARM NEON (128 bits) AVX, AVX2, QPX (256 bits), IMCI and AVX512 (512 bits) targets are supported.
 
-These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. These may be useful in themselves for other programmers.
+These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. 
 The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.
 
 MPI, OpenMP, and SIMD parallelism are present in the library.
 Please see https://arxiv.org/abs/1512.03487 for more detail.
 
+### Required libraries
+Grid requires [GMP](https://gmplib.org/), [MPFR](http://www.mpfr.org/) and optionally [HDF5](https://support.hdfgroup.org/HDF5/) and [LIME](http://usqcd-software.github.io/c-lime/) (for ILDG file format support) to be installed.
+
 ### Quick start
 First, start by cloning the repository:
 
@@ -173,7 +176,8 @@ The following options can be use with the `--enable-simd=` option to target diff
 | `AVXFMA4`   | AVX (256 bit) + FMA4                   |
 | `AVX2`      | AVX 2 (256 bit)                        |
 | `AVX512`    | AVX 512 bit                            |
-| `QPX`       | QPX (256 bit)                          |
+| `NEONv8`    | ARM NEON (128 bit)                     |
+| `QPX`       | IBM QPX (256 bit)                      |
 
 Alternatively, some CPU codenames can be directly used:
 

From 09d09d0fe5bce853e1b42115371cd935a4e29cc0 Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Thu, 29 Jun 2017 11:48:11 +0100
Subject: [PATCH 8/8] Update README.md

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 5d168298..1f0b450c 100644
--- a/README.md
+++ b/README.md
@@ -84,7 +84,7 @@ These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal
 The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.
 
 MPI, OpenMP, and SIMD parallelism are present in the library.
-Please see https://arxiv.org/abs/1512.03487 for more detail.
+Please see [this paper](https://arxiv.org/abs/1512.03487) for more detail.
 
 ### Required libraries
 Grid requires [GMP](https://gmplib.org/), [MPFR](http://www.mpfr.org/) and optionally [HDF5](https://support.hdfgroup.org/HDF5/) and [LIME](http://usqcd-software.github.io/c-lime/) (for ILDG file format support) to be installed.
@@ -176,7 +176,7 @@ The following options can be use with the `--enable-simd=` option to target diff
 | `AVXFMA4`   | AVX (256 bit) + FMA4                   |
 | `AVX2`      | AVX 2 (256 bit)                        |
 | `AVX512`    | AVX 512 bit                            |
-| `NEONv8`    | ARM NEON (128 bit)                     |
+| `NEONv8`    | [ARM NEON](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.den0024a/ch07s03.html) (128 bit)                     |
 | `QPX`       | IBM QPX (256 bit)                      |
 
 Alternatively, some CPU codenames can be directly used:
@@ -216,4 +216,4 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed. If you are w
              --with-mpfr=<path>       \
              --enable-mkl             \
              CXX=CC CC=cc
-```
\ No newline at end of file
+```