From 3d04dc33c6c64f2ed761ef62094519a81668aaee Mon Sep 17 00:00:00 2001 From: Nils Meyer Date: Tue, 13 Jun 2017 13:26:59 +0200 Subject: [PATCH 01/44] ARM neon intrinsics support --- configure.ac | 3 + lib/simd/Grid_generic_types.h | 2 +- lib/simd/Grid_neon.h | 460 +++++++++++++++++++++++++++------- lib/simd/Grid_vector_types.h | 44 ++-- 4 files changed, 396 insertions(+), 113 deletions(-) diff --git a/configure.ac b/configure.ac index 62b7545b..20f71128 100644 --- a/configure.ac +++ b/configure.ac @@ -244,6 +244,9 @@ case ${ax_cv_cxx_compiler_vendor} in [generic SIMD vector width (in bytes)]) SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)" SIMD_FLAGS='';; + NEONv8) + AC_DEFINE([NEONV8],[1],[ARMv8 NEON]) + SIMD_FLAGS='';; QPX|BGQ) AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q]) SIMD_FLAGS='';; diff --git a/lib/simd/Grid_generic_types.h b/lib/simd/Grid_generic_types.h index 642f6ffe..eac65e09 100644 --- a/lib/simd/Grid_generic_types.h +++ b/lib/simd/Grid_generic_types.h @@ -26,7 +26,7 @@ Author: Antonin Portelli See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ - +#define GEN_SIMD_WIDTH 16 static_assert(GEN_SIMD_WIDTH % 16u == 0, "SIMD vector size is not an integer multiple of 16 bytes"); //#define VECTOR_LOOPS diff --git a/lib/simd/Grid_neon.h b/lib/simd/Grid_neon.h index 7c1ad443..f3f802e7 100644 --- a/lib/simd/Grid_neon.h +++ b/lib/simd/Grid_neon.h @@ -1,11 +1,12 @@ - /************************************************************************************* +/************************************************************************************* - Grid physics library, www.github.com/paboyle/Grid + Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/simd/Grid_neon.h Copyright (C) 2015 +Author: Nils Meyer Author: Peter Boyle Author: neo @@ -27,18 +28,23 @@ Author: neo *************************************************************************************/ /* END LEGAL */ //---------------------------------------------------------------------- -/*! @file Grid_sse4.h - @brief Optimization libraries for NEON (ARM) instructions set ARMv8 +/* + + ARMv8 NEON intrinsics layer by + + Nils Meyer , + University of Regensburg, Germany + SFB/TRR55 - Experimental - Using intrinsics - DEVELOPING! 
*/ -// Time-stamp: <2015-07-10 17:45:09 neo> //---------------------------------------------------------------------- +//#ifndef ARM_NEON +//#define ARM_NEON +#include "Grid_generic_types.h" #include -// ARMv8 supports double precision - +namespace Grid { namespace Optimization { template @@ -46,16 +52,20 @@ namespace Optimization { float32x4_t f; vtype v; }; - union u128f { float32x4_t v; float f[4]; }; union u128d { float64x2_t v; - double f[4]; + double f[2]; }; - + // half precision + union u128h { + float16x8_t v; + uint16_t f[8]; + }; + struct Vsplat{ //Complex float inline float32x4_t operator()(float a, float b){ @@ -64,31 +74,31 @@ namespace Optimization { } // Real float inline float32x4_t operator()(float a){ - return vld1q_dup_f32(&a); + return vdupq_n_f32(a); } //Complex double - inline float32x4_t operator()(double a, double b){ - float tmp[4]={(float)a,(float)b,(float)a,(float)b}; - return vld1q_f32(tmp); + inline float64x2_t operator()(double a, double b){ + double tmp[2]={a,b}; + return vld1q_f64(tmp); } - //Real double - inline float32x4_t operator()(double a){ - return vld1q_dup_f32(&a); + //Real double // N:tbc + inline float64x2_t operator()(double a){ + return vdupq_n_f64(a); } - //Integer + //Integer // N:tbc inline uint32x4_t operator()(Integer a){ - return vld1q_dup_u32(&a); + return vdupq_n_u32(a); } }; struct Vstore{ - //Float + //Float inline void operator()(float32x4_t a, float* F){ vst1q_f32(F, a); } //Double - inline void operator()(float32x4_t a, double* D){ - vst1q_f32((float*)D, a); + inline void operator()(float64x2_t a, double* D){ + vst1q_f64(D, a); } //Integer inline void operator()(uint32x4_t a, Integer* I){ @@ -97,54 +107,54 @@ namespace Optimization { }; - struct Vstream{ - //Float + struct Vstream{ // N:equivalents to _mm_stream_p* in NEON? 
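+    // N: apparently not -- the ACLE NEON intrinsics expose no non-temporal
+    // store equivalent to _mm_stream_ps/_mm_stream_pd, so a generic
+    // memcpy fallback (lowered to ordinary vector stores) is used below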
+    //Float // N:generic
     inline void operator()(float * a, float32x4_t b){
-
+      memcpy(a,&b,4*sizeof(float));
     }
-    //Double
-    inline void operator()(double * a, float32x4_t b){
-
+    //Double // N:generic
+    inline void operator()(double * a, float64x2_t b){
+      memcpy(a,&b,2*sizeof(double));
     }
   };

+  // Nils: Vset untested; not used currently in Grid at all;
+  // git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b
   struct Vset{
-    // Complex float
+    // Complex float // N:ok; vld1q_f32 loads ascending, so a[0] fills lanes 0,1
     inline float32x4_t operator()(Grid::ComplexF *a){
-      float32x4_t foo;
-      return foo;
+      float tmp[4]={a[0].real(),a[0].imag(),a[1].real(),a[1].imag()};
+      return vld1q_f32(tmp);
     }
-    // Complex double
-    inline float32x4_t operator()(Grid::ComplexD *a){
-      float32x4_t foo;
-      return foo;
+    // Complex double // N:ok
+    inline float64x2_t operator()(Grid::ComplexD *a){
+      double tmp[2]={a[0].real(),a[0].imag()};
+      return vld1q_f64(tmp);
     }
-    // Real float
+    // Real float // N:ok
     inline float32x4_t operator()(float *a){
-      float32x4_t foo;
-      return foo;
+      return vld1q_f32(a);
     }
-    // Real double
-    inline float32x4_t operator()(double *a){
-      float32x4_t foo;
-      return foo;
+    // Real double // N:ok
+    inline float64x2_t operator()(double *a){
+      return vld1q_f64(a);
     }
-    // Integer
+    // Integer // N:ok; plain vld1q rather than vld1q_dup, Vset fills distinct lanes
     inline uint32x4_t operator()(Integer *a){
-      uint32x4_t foo;
-      return foo;
+      return vld1q_u32((uint32_t *)a);
     }
-
-  };
+  };

+  // N:leaving as is
   template <typename Out_type, typename In_type>
   struct Reduce{
     //Need templated class to overload output type
     //General form must generate error if compiled
-    inline Out_type operator()(In_type in){ 
+    inline Out_type operator()(In_type in){
      printf("Error, using wrong Reduce function\n");
      exit(1);
      return 0;
@@ -184,26 +194,98 @@
    }
  };

+  struct MultRealPart{
+    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
+      float32x4_t re = vtrn1q_f32(a, a);
+      return vmulq_f32(re, b);
+    }
+    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
+      float64x2_t re = vzip1q_f64(a, a);
+      return vmulq_f64(re, b);
+    }
+  };
+
+  struct MaddRealPart{
+    inline float32x4_t operator()(float32x4_t a, float32x4_t b, float32x4_t c){
+      float32x4_t re = vtrn1q_f32(a, a);
+      return vfmaq_f32(c, re, b);
+    }
+    inline float64x2_t operator()(float64x2_t a, float64x2_t b, float64x2_t c){
+      float64x2_t re = vzip1q_f64(a, a);
+      return vfmaq_f64(c, re, b);
+    }
+  };
+
+  struct Div{
+    // Real float
+    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
+      return vdivq_f32(a, b);
+    }
+    // Real double
+    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
+      return vdivq_f64(a, b);
+    }
+  };
+
   struct MultComplex{
     // Complex float
     inline float32x4_t operator()(float32x4_t a, float32x4_t b){
-      float32x4_t foo;
-      return foo;
+
+      float32x4_t r0, r1, r2, r3, r4;
+
+      // a = ar ai Ar Ai
+      // b = br bi Br Bi
+      // collect real/imag part, negate bi and Bi
+      r0 = vtrn1q_f32(b, b);      // br br Br Br
+      r1 = vnegq_f32(b);          // -br -bi -Br -Bi
+      r2 = vtrn2q_f32(b, r1);     // bi -bi Bi -Bi
+
+      // the fun part
+      r3 = vmulq_f32(r2, a);      // bi*ar -bi*ai ...
+      r4 = vrev64q_f32(r3);       // -bi*ai bi*ar ...
+
+      // fma(a,b,c) = a+b*c
+      return vfmaq_f32(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi ...
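+
+      // worked check of the lane algebra, assuming the single pair
+      // a = 1+2i, b = 3+4i:
+      //   r0 = (3,3)  r2 = (4,-4)  r3 = (4*1,-4*2) = (4,-8)  r4 = (-8,4)
+      //   fma gives (-8+3*1, 4+3*2) = (-5,10) = (1+2i)*(3+4i)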
+
+      // no fma, use mul and add
+      //float32x4_t r5;
+      //r5 = vmulq_f32(r0, a);
+      //return vaddq_f32(r4, r5);
     }
     // Complex double
     inline float64x2_t operator()(float64x2_t a, float64x2_t b){
-      float32x4_t foo;
-      return foo;
+
+      float64x2_t r0, r1, r2, r3, r4;
+
+      // b = br bi
+      // collect real/imag part, negate bi
+      r0 = vtrn1q_f64(b, b);       // br br
+      r1 = vnegq_f64(b);           // -br -bi
+      r2 = vtrn2q_f64(b, r1);      // bi -bi
+
+      // the fun part
+      r3 = vmulq_f64(r2, a);       // bi*ar -bi*ai
+      r4 = vextq_f64(r3,r3,1);     // -bi*ai bi*ar
+
+      // fma(a,b,c) = a+b*c
+      return vfmaq_f64(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi
+
+      // no fma, use mul and add
+      //float64x2_t r5;
+      //r5 = vmulq_f64(r0, a);
+      //return vaddq_f64(r4, r5);
     }
   };

   struct Mult{
     // Real float
     inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){
-      return vaddq_f32(vmulq_f32(b,c),a);
+      //return vaddq_f32(vmulq_f32(b,c),a);
+      return vfmaq_f32(a, b, c);
     }
     inline float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c){
-      return vaddq_f64(vmulq_f64(b,c),a);
+      //return vaddq_f64(vmulq_f64(b,c),a);
+      return vfmaq_f64(a, b, c);
     }
     inline float32x4_t operator()(float32x4_t a, float32x4_t b){
       return vmulq_f32(a,b);
     }
@@ -221,89 +303,275 @@
   struct Conj{
     // Complex single
     inline float32x4_t operator()(float32x4_t in){
-      return in;
+      // ar ai br bi -> ar -ai br -bi
+      float32x4_t r0, r1;
+      r0 = vnegq_f32(in);         // -ar -ai -br -bi
+      r1 = vrev64q_f32(r0);       // -ai -ar -bi -br
+      return vtrn1q_f32(in, r1);  // ar -ai br -bi
     }
     // Complex double
-    //inline float32x4_t operator()(float32x4_t in){
-    //  return 0;
-    //}
+    inline float64x2_t operator()(float64x2_t in){
+
+      float64x2_t r0, r1;
+      r0 = vextq_f64(in, in, 1);    // ai ar
+      r1 = vnegq_f64(r0);           // -ai -ar
+      return vextq_f64(r0, r1, 1);  // ar -ai
+    }
     // do not define for integer input
   };

   struct TimesMinusI{
     //Complex single
     inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
-      return in;
+      // ar ai br bi -> ai -ar bi -br
+      float32x4_t r0, r1;
+      r0 = vnegq_f32(in);         // -ar -ai -br -bi
+      r1 = vrev64q_f32(in);       // ai ar bi br
+      return vtrn1q_f32(r1, r0);  // ai -ar bi -br
     }
     //Complex double
-    //inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
-    //  return in;
-    //}
-
-
+    inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
+      // a ib -> b -ia
+      float64x2_t tmp;
+      tmp = vnegq_f64(in);
+      return vextq_f64(in, tmp, 1);
+    }
   };

   struct TimesI{
     //Complex single
     inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
-      //need shuffle
-      return in;
+      // ar ai br bi -> -ai ar -bi br
+      float32x4_t r0, r1;
+      r0 = vnegq_f32(in);         // -ar -ai -br -bi
+      r1 = vrev64q_f32(r0);       // -ai -ar -bi -br
+      return vtrn1q_f32(r1, in);  // -ai ar -bi br
+    }
    //Complex double
-    //inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
-    //  return 0;
-    //}
+    inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
+      // a ib -> -b ia
+      float64x2_t tmp;
+      tmp = vnegq_f64(in);
+      return vextq_f64(tmp, in, 1);
+    }
+  };
+
+  struct Permute{
+
+    static inline float32x4_t Permute0(float32x4_t in){ // N:ok
+      // AB CD -> CD AB
+      return vextq_f32(in, in, 2);
+    };
+    static inline float32x4_t Permute1(float32x4_t in){ // N:ok
+      // AB CD -> BA DC
+      return vrev64q_f32(in);
+    };
+    static inline float32x4_t Permute2(float32x4_t in){ // N:not used by Boyle
+      return in;
+    };
+    static inline float32x4_t Permute3(float32x4_t in){ // N:not used by Boyle
+      return in;
+    };
+
+    static inline float64x2_t Permute0(float64x2_t in){ // N:ok
+      // AB -> BA
+      return vextq_f64(in, in, 1);
+    };
+
static inline float64x2_t Permute1(float64x2_t in){ // N:not used by Boyle + return in; + }; + static inline float64x2_t Permute2(float64x2_t in){ // N:not used by Boyle + return in; + }; + static inline float64x2_t Permute3(float64x2_t in){ // N:not used by Boyle + return in; + }; + + }; + + struct Rotate{ + + static inline float32x4_t rotate(float32x4_t in,int n){ // N:ok + switch(n){ + case 0: // AB CD -> AB CD + return tRotate<0>(in); + break; + case 1: // AB CD -> BC DA + return tRotate<1>(in); + break; + case 2: // AB CD -> CD AB + return tRotate<2>(in); + break; + case 3: // AB CD -> DA BC + return tRotate<3>(in); + break; + default: assert(0); + } + } + static inline float64x2_t rotate(float64x2_t in,int n){ // N:ok + switch(n){ + case 0: // AB -> AB + return tRotate<0>(in); + break; + case 1: // AB -> BA + return tRotate<1>(in); + break; + default: assert(0); + } + } + +// working, but no restriction on n +// template static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n); }; +// template static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n); }; + +// restriction on n + template static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); }; + template static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); }; + + }; + + struct PrecisionChange { + + static inline float16x8_t StoH (const float32x4_t &a,const float32x4_t &b) { + float16x4_t h = vcvt_f16_f32(a); + return vcvt_high_f16_f32(h, b); + } + static inline void HtoS (float16x8_t h,float32x4_t &sa,float32x4_t &sb) { + sb = vcvt_high_f32_f16(h); + // there is no direct conversion from lower float32x4_t to float64x2_t + // vextq_f16 not supported by clang 3.8 / 4.0 / arm clang + //float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang + // workaround for clang + uint32x4_t h1u = reinterpret_cast(h); + float16x8_t h1 = reinterpret_cast(vextq_u32(h1u, h1u, 2)); + sa = vcvt_high_f32_f16(h1); + } + static inline float32x4_t DtoS (float64x2_t a,float64x2_t b) { + float32x2_t s = vcvt_f32_f64(a); + return vcvt_high_f32_f64(s, b); + + } + static inline void StoD (float32x4_t s,float64x2_t &a,float64x2_t &b) { + b = vcvt_high_f64_f32(s); + // there is no direct conversion from lower float32x4_t to float64x2_t + float32x4_t s1 = vextq_f32(s, s, 2); + a = vcvt_high_f64_f32(s1); + + } + static inline float16x8_t DtoH (float64x2_t a,float64x2_t b,float64x2_t c,float64x2_t d) { + float32x4_t s1 = DtoS(a, b); + float32x4_t s2 = DtoS(c, d); + return StoH(s1, s2); + } + static inline void HtoD (float16x8_t h,float64x2_t &a,float64x2_t &b,float64x2_t &c,float64x2_t &d) { + float32x4_t s1, s2; + HtoS(h, s1, s2); + StoD(s1, a, b); + StoD(s2, c, d); + } + }; + + ////////////////////////////////////////////// + // Exchange support + + struct Exchange{ + static inline void Exchange0(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){ + // in1: ABCD -> out1: ABEF + // in2: EFGH -> out2: CDGH + + // z: CDAB + float32x4_t z = vextq_f32(in1, in1, 2); + // out1: ABEF + out1 = vextq_f32(z, in2, 2); + + // z: GHEF + z = vextq_f32(in2, in2, 2); + // out2: CDGH + out2 = vextq_f32(in1, z, 2); + }; + + static inline void Exchange1(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){ + // in1: ABCD -> out1: AECG + // in2: EFGH -> out2: BFDH + out1 = vtrn1q_f32(in1, in2); + out2 = vtrn2q_f32(in1, in2); + }; + static inline void Exchange2(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){ + assert(0); + 
return; + }; + static inline void Exchange3(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){ + assert(0); + return; + }; + // double precision + static inline void Exchange0(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){ + // in1: AB -> out1: AC + // in2: CD -> out2: BD + out1 = vzip1q_f64(in1, in2); + out2 = vzip2q_f64(in1, in2); + }; + static inline void Exchange1(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){ + assert(0); + return; + }; + static inline void Exchange2(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){ + assert(0); + return; + }; + static inline void Exchange3(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){ + assert(0); + return; + }; }; ////////////////////////////////////////////// // Some Template specialization - template < typename vtype > - void permute(vtype &a, vtype b, int perm) { - }; //Complex float Reduce template<> inline Grid::ComplexF Reduce::operator()(float32x4_t in){ - return 0; + float32x4_t v1; // two complex + v1 = Optimization::Permute::Permute0(in); + v1 = vaddq_f32(v1,in); + u128f conv; conv.v=v1; + return Grid::ComplexF(conv.f[0],conv.f[1]); } //Real float Reduce template<> inline Grid::RealF Reduce::operator()(float32x4_t in){ - float32x2_t high = vget_high_f32(in); - float32x2_t low = vget_low_f32(in); - float32x2_t tmp = vadd_f32(low, high); - float32x2_t sum = vpadd_f32(tmp, tmp); - return vget_lane_f32(sum,0); + return vaddvq_f32(in); } - - + + //Complex double Reduce - template<> + template<> // N:by Boyle inline Grid::ComplexD Reduce::operator()(float64x2_t in){ - return 0; + u128d conv; conv.v = in; + return Grid::ComplexD(conv.f[0],conv.f[1]); } - + //Real double Reduce template<> inline Grid::RealD Reduce::operator()(float64x2_t in){ - float64x2_t sum = vpaddq_f64(in, in); - return vgetq_lane_f64(sum,0); + return vaddvq_f64(in); } //Integer Reduce template<> inline Integer Reduce::operator()(uint32x4_t in){ // FIXME unimplemented - printf("Reduce : Missing integer implementation -> FIX\n"); + printf("Reduce : Missing integer implementation -> FIX\n"); assert(0); } } ////////////////////////////////////////////////////////////////////////////////////// -// Here assign types -namespace Grid { +// Here assign types +// typedef Optimization::vech SIMD_Htype; // Reduced precision type + typedef float16x8_t SIMD_Htype; // Half precision type typedef float32x4_t SIMD_Ftype; // Single precision type typedef float64x2_t SIMD_Dtype; // Double precision type typedef uint32x4_t SIMD_Itype; // Integer type @@ -312,13 +580,6 @@ namespace Grid { inline void prefetch_HINT_T0(const char *ptr){}; - // Gpermute function - template < typename VectorSIMD > - inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) { - Optimization::permute(y.v,b.v,perm); - } - - // Function name aliases typedef Optimization::Vsplat VsplatSIMD; typedef Optimization::Vstore VstoreSIMD; @@ -326,16 +587,21 @@ namespace Grid { typedef Optimization::Vstream VstreamSIMD; template using ReduceSIMD = Optimization::Reduce; - + // Arithmetic operations typedef Optimization::Sum SumSIMD; typedef Optimization::Sub SubSIMD; + typedef Optimization::Div DivSIMD; typedef Optimization::Mult MultSIMD; typedef Optimization::MultComplex MultComplexSIMD; + typedef Optimization::MultRealPart MultRealPartSIMD; + typedef Optimization::MaddRealPart MaddRealPartSIMD; typedef Optimization::Conj ConjSIMD; typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesI 
TimesISIMD; } + +//#endif // ARM_NEON diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index 0048382f..424b5573 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -328,15 +328,15 @@ class Grid_simd { /////////////////////////////////////// //#if (__GNUC__ == 5 ) || ( ( __GNUC__ == 6 ) && __GNUC_MINOR__ < 3 ) - //#pragma GCC push_options - //#pragma GCC optimize ("O0") + //#pragma GCC push_options + //#pragma GCC optimize ("O0") //#endif template friend inline Grid_simd SimdApply(const functor &func, const Grid_simd &v) { Grid_simd ret; Grid_simd::conv_t conv; Grid_simd::scalar_type s; - + conv.v = v.v; for (int i = 0; i < Nsimd(); i++) { s = conv.s[i]; @@ -368,7 +368,7 @@ class Grid_simd { //#pragma GCC pop_options //#endif /////////////////////// - // Exchange + // Exchange // Al Ah , Bl Bh -> Al Bl Ah,Bh /////////////////////// friend inline void exchange(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2,int n) @@ -379,7 +379,7 @@ class Grid_simd { Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v); } else if(n==1) { Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v); - } else if(n==0) { + } else if(n==0) { Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v); } } @@ -406,7 +406,7 @@ class Grid_simd { int dist = perm & 0xF; y = rotate(b, dist); return; - } + } else if(perm==3) permute3(y, b); else if(perm==2) permute2(y, b); else if(perm==1) permute1(y, b); @@ -425,7 +425,7 @@ class Grid_simd { } - + }; // end of Grid_simd class definition @@ -451,29 +451,29 @@ inline Grid_simd rotate(Grid_simd b, int nrot) { ret.v = Optimization::Rotate::rotate(b.v, 2 * nrot); return ret; } -template =0> +template =0> inline void rotate( Grid_simd &ret,Grid_simd b,int nrot) { nrot = nrot % Grid_simd::Nsimd(); ret.v = Optimization::Rotate::rotate(b.v,nrot); } -template =0> +template =0> inline void rotate(Grid_simd &ret,Grid_simd b,int nrot) { nrot = nrot % Grid_simd::Nsimd(); ret.v = Optimization::Rotate::rotate(b.v,2*nrot); } -template +template inline void vbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ S* typepun =(S*) &src; vsplat(ret,typepun[lane]); -} -template =0> +} +template =0> inline void rbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ S* typepun =(S*) &src; ret.v = unary(real(typepun[lane]), VsplatSIMD()); -} +} @@ -604,13 +604,27 @@ inline Grid_simd real_mult(Grid_simd a, Grid_simd b) { ret.v = binary(a.v, b.v, MultRealPartSIMD()); return ret; }; +// TEST for Test_simd +template = 0> +inline Grid_simd real_mult(std::complex a, std::complex b) { + Grid_simd ret; + //ret.v = binary(a.v, b.v, MultRealPartSIMD()); + return ret; +}; + template = 0> inline Grid_simd real_madd(Grid_simd a, Grid_simd b, Grid_simd c) { Grid_simd ret; ret.v = trinary(a.v, b.v, c.v, MaddRealPartSIMD()); return ret; }; - +// TEST for Test_simd +template = 0> +inline Grid_simd real_madd(std::complex a, std::complex b) { + Grid_simd ret; + //ret.v = binary(a.v, b.v, MultRealPartSIMD()); + return ret; +}; // Distinguish between complex types and others template = 0> @@ -640,7 +654,7 @@ inline Grid_simd operator/(Grid_simd a, Grid_simd b) { ret = a * conjugate(b) ; den = b * conjugate(b) ; - + auto real_den = toReal(den); ret.v=binary(ret.v, real_den.v, DivSIMD()); From d2e8372df3c0a39b9eb2c000c7f190c670a75501 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Sat, 24 Jun 2017 23:03:39 +0100 Subject: [PATCH 02/44] SU(N) algebra fix (was not working) --- lib/qcd/utils/SUn.h | 3 +-- 1 file changed, 1 
insertion(+), 2 deletions(-) diff --git a/lib/qcd/utils/SUn.h b/lib/qcd/utils/SUn.h index 99a620bc..8f0c0a7b 100644 --- a/lib/qcd/utils/SUn.h +++ b/lib/qcd/utils/SUn.h @@ -716,8 +716,7 @@ template for (int a = 0; a < AdjointDimension; a++) { generator(a, Ta); - auto tmp = - 2.0 * (trace(timesI(Ta) * in)) * scale;// 2.0 for the normalization of the trace in the fundamental rep - pokeColour(h_out, tmp, a); + pokeColour(h_out, - 2.0 * (trace(timesI(Ta) * in)) * scale, a); } } From 0af740dc1521656ee549094fea038176791d6cac Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Sat, 24 Jun 2017 23:04:05 +0100 Subject: [PATCH 03/44] minor scalar HMC code improvement --- lib/qcd/action/scalar/ScalarImpl.h | 8 +++++--- lib/qcd/action/scalar/ScalarInteractionAction.h | 2 +- lib/qcd/hmc/HMC.h | 2 +- lib/qcd/hmc/HMCResourceManager.h | 3 ++- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/lib/qcd/action/scalar/ScalarImpl.h b/lib/qcd/action/scalar/ScalarImpl.h index 5342a1fa..174553a2 100644 --- a/lib/qcd/action/scalar/ScalarImpl.h +++ b/lib/qcd/action/scalar/ScalarImpl.h @@ -93,6 +93,8 @@ class ScalarImplTypes { class ScalarAdjMatrixImplTypes { public: typedef S Simd; + typedef QCD::SU Group; + template using iImplField = iScalar>>; template @@ -108,7 +110,7 @@ class ScalarImplTypes { typedef Field PropagatorField; static inline void generate_momenta(Field& P, GridParallelRNG& pRNG) { - QCD::SU::GaussianFundamentalLieAlgebraMatrix(pRNG, P); + Group::GaussianFundamentalLieAlgebraMatrix(pRNG, P); } static inline Field projectForce(Field& P) {return P;} @@ -122,11 +124,11 @@ class ScalarImplTypes { } static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) { - QCD::SU::LieRandomize(pRNG, U); + Group::LieRandomize(pRNG, U); } static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) { - QCD::SU::LieRandomize(pRNG, U, 0.01); + Group::LieRandomize(pRNG, U, 0.01); } static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) { diff --git a/lib/qcd/action/scalar/ScalarInteractionAction.h b/lib/qcd/action/scalar/ScalarInteractionAction.h index 5f4c630c..1ff8fd37 100644 --- a/lib/qcd/action/scalar/ScalarInteractionAction.h +++ b/lib/qcd/action/scalar/ScalarInteractionAction.h @@ -98,7 +98,7 @@ namespace Grid { permute(temp2, *temp, permute_type); action._odata[i] -= temp2*(*t_p) + (*t_p)*temp2; } else { - action._odata[i] -= *temp*(*t_p) + (*t_p)*(*temp); + action._odata[i] -= (*temp)*(*t_p) + (*t_p)*(*temp); } } else { action._odata[i] -= phiStencil.CommBuf()[SE->_offset]*(*t_p) + (*t_p)*phiStencil.CommBuf()[SE->_offset]; diff --git a/lib/qcd/hmc/HMC.h b/lib/qcd/hmc/HMC.h index ac690b60..5688bb24 100644 --- a/lib/qcd/hmc/HMC.h +++ b/lib/qcd/hmc/HMC.h @@ -76,7 +76,7 @@ struct HMCparameters: Serializable { template < class ReaderClass > void initialize(Reader &TheReader){ - std::cout << "Reading HMC\n"; + std::cout << GridLogMessage << "Reading HMC\n"; read(TheReader, "HMC", *this); } diff --git a/lib/qcd/hmc/HMCResourceManager.h b/lib/qcd/hmc/HMCResourceManager.h index 9f4c99a9..cf0000ed 100644 --- a/lib/qcd/hmc/HMCResourceManager.h +++ b/lib/qcd/hmc/HMCResourceManager.h @@ -253,6 +253,7 @@ class HMCResourceManager { template void AddObservable(Types&&... 
Args){ ObservablesList.push_back(std::unique_ptr(new T(std::forward(Args)...))); + ObservablesList.back()->print_parameters(); } std::vector* > GetObservables(){ @@ -297,4 +298,4 @@ private: } } -#endif // HMC_RESOURCE_MANAGER_H \ No newline at end of file +#endif // HMC_RESOURCE_MANAGER_H From 7d7220cbd72278050a1cfda6a083a87b85fecbca Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Tue, 27 Jun 2017 14:38:45 +0100 Subject: [PATCH 04/44] scalar: lambda/4! convention --- lib/qcd/action/scalar/ScalarInteractionAction.h | 4 ++-- tests/hmc/Test_hmc_ScalarActionNxN.cc | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/qcd/action/scalar/ScalarInteractionAction.h b/lib/qcd/action/scalar/ScalarInteractionAction.h index 1ff8fd37..ac2d4fbb 100644 --- a/lib/qcd/action/scalar/ScalarInteractionAction.h +++ b/lib/qcd/action/scalar/ScalarInteractionAction.h @@ -81,7 +81,7 @@ namespace Grid { phiStencil.HaloExchange(p, compressor); Field action(p._grid), pshift(p._grid), phisquared(p._grid); phisquared = p*p; - action = (2.0*Ndim + mass_square)*phisquared + lambda*phisquared*phisquared; + action = (2.0*Ndim + mass_square)*phisquared + lambda/24.*phisquared*phisquared; for (int mu = 0; mu < Ndim; mu++) { // pshift = Cshift(p, mu, +1); // not efficient, implement with stencils parallel_for (int i = 0; i < p._grid->oSites(); i++) { @@ -113,7 +113,7 @@ namespace Grid { virtual void deriv(const Field &p, Field &force) { assert(p._grid->Nd() == Ndim); - force = (2.0*Ndim + mass_square)*p + 2.0*lambda*p*p*p; + force = (2.0*Ndim + mass_square)*p + lambda/12.*p*p*p; // move this outside static Stencil phiStencil(p._grid, npoint, 0, directions, displacements); phiStencil.HaloExchange(p, compressor); diff --git a/tests/hmc/Test_hmc_ScalarActionNxN.cc b/tests/hmc/Test_hmc_ScalarActionNxN.cc index a7490f51..a4dad1a3 100644 --- a/tests/hmc/Test_hmc_ScalarActionNxN.cc +++ b/tests/hmc/Test_hmc_ScalarActionNxN.cc @@ -45,7 +45,7 @@ using namespace Grid; using namespace Grid::QCD; template -class MagLogger : public HmcObservable { +class MagMeas : public HmcObservable { public: typedef typename Impl::Field Field; typedef typename Impl::Simd::scalar_type Trace; @@ -72,13 +72,13 @@ private: }; template -class MagMod: public ObservableModule, NoParameters>{ - typedef ObservableModule, NoParameters> ObsBase; +class MagMod: public ObservableModule, NoParameters>{ + typedef ObservableModule, NoParameters> ObsBase; using ObsBase::ObsBase; // for constructors // acquire resource virtual void initialize(){ - this->ObservablePtr.reset(new MagLogger()); + this->ObservablePtr.reset(new MagMeas()); } public: MagMod(): ObsBase(NoParameters()){} From 15e87a460725f07dd380bd21b538b43b687a0551 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Tue, 27 Jun 2017 14:39:27 +0100 Subject: [PATCH 05/44] HDF5 IO fix --- lib/serialisation/Hdf5IO.cc | 4 +++- lib/serialisation/Hdf5IO.h | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/serialisation/Hdf5IO.cc b/lib/serialisation/Hdf5IO.cc index b9bb0b87..1fb7be0c 100644 --- a/lib/serialisation/Hdf5IO.cc +++ b/lib/serialisation/Hdf5IO.cc @@ -65,10 +65,12 @@ Hdf5Reader::Hdf5Reader(const std::string &fileName) Hdf5Type::type()); } -void Hdf5Reader::push(const std::string &s) +bool Hdf5Reader::push(const std::string &s) { group_ = group_.openGroup(s); path_.push_back(s); + + return true; } void Hdf5Reader::pop(void) diff --git a/lib/serialisation/Hdf5IO.h b/lib/serialisation/Hdf5IO.h index 2f891cd4..94ad9736 100644 --- a/lib/serialisation/Hdf5IO.h 
+++ b/lib/serialisation/Hdf5IO.h @@ -54,7 +54,7 @@ namespace Grid public: Hdf5Reader(const std::string &fileName); virtual ~Hdf5Reader(void) = default; - void push(const std::string &s); + bool push(const std::string &s); void pop(void); template void readDefault(const std::string &s, U &output); From bf729766ddd36c9ebe8c6be35d7527353aff2963 Mon Sep 17 00:00:00 2001 From: Nils Meyer Date: Tue, 27 Jun 2017 20:32:24 +0200 Subject: [PATCH 06/44] removed collision with QPX implementation --- lib/simd/Grid_generic_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/simd/Grid_generic_types.h b/lib/simd/Grid_generic_types.h index eac65e09..642f6ffe 100644 --- a/lib/simd/Grid_generic_types.h +++ b/lib/simd/Grid_generic_types.h @@ -26,7 +26,7 @@ Author: Antonin Portelli See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ -#define GEN_SIMD_WIDTH 16 + static_assert(GEN_SIMD_WIDTH % 16u == 0, "SIMD vector size is not an integer multiple of 16 bytes"); //#define VECTOR_LOOPS From e43a8b6b8ad52b88ed8e9ae1c140dc34a3da18d4 Mon Sep 17 00:00:00 2001 From: Nils Meyer Date: Tue, 27 Jun 2017 20:58:48 +0200 Subject: [PATCH 07/44] removed comments --- Grid_vector_types.h | 857 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 857 insertions(+) create mode 100644 Grid_vector_types.h diff --git a/Grid_vector_types.h b/Grid_vector_types.h new file mode 100644 index 00000000..e05fecc4 --- /dev/null +++ b/Grid_vector_types.h @@ -0,0 +1,857 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/simd/Grid_vector_type.h + +Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Guido Cossu +Author: Peter Boyle +Author: neo + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +//--------------------------------------------------------------------------- +/*! 
@file Grid_vector_types.h + @brief Defines templated class Grid_simd to deal with inner vector types +*/ +// Time-stamp: <2015-07-10 17:45:33 neo> +//--------------------------------------------------------------------------- +#ifndef GRID_VECTOR_TYPES +#define GRID_VECTOR_TYPES + +#ifdef GEN +#include "Grid_generic.h" +#endif +#ifdef SSE4 +#include "Grid_sse4.h" +#endif +#if defined(AVX1) || defined (AVXFMA) || defined(AVX2) || defined(AVXFMA4) +#include "Grid_avx.h" +#endif +#if defined AVX512 +#include "Grid_avx512.h" +#endif +#if defined IMCI +#include "Grid_imci.h" +#endif +#ifdef NEONv8 +#include "Grid_neon.h" +#endif +#if defined QPX +#include "Grid_qpx.h" +#endif + +#include "l1p.h" + +namespace Grid { + +////////////////////////////////////// +// To take the floating point type of real/complex type +////////////////////////////////////// +template +struct RealPart { + typedef T type; +}; +template +struct RealPart > { + typedef T type; +}; + +#include + +////////////////////////////////////// +// demote a vector to real type +////////////////////////////////////// +// type alias used to simplify the syntax of std::enable_if +template using Invoke = typename T::type; +template using EnableIf = Invoke >; +template using NotEnableIf = Invoke >; + +//////////////////////////////////////////////////////// +// Check for complexity with type traits +template struct is_complex : public std::false_type {}; +template <> struct is_complex > : public std::true_type {}; +template <> struct is_complex > : public std::true_type {}; + +template using IfReal = Invoke::value, int> >; +template using IfComplex = Invoke::value, int> >; +template using IfInteger = Invoke::value, int> >; +template using IfSame = Invoke::value, int> >; + +template using IfNotReal = Invoke::value, int> >; +template using IfNotComplex = Invoke::value, int> >; +template using IfNotInteger = Invoke::value, int> >; +template using IfNotSame = Invoke::value, int> >; + +//////////////////////////////////////////////////////// +// Define the operation templates functors +// general forms to allow for vsplat syntax +// need explicit declaration of types when used since +// clang cannot automatically determine the output type sometimes +template +Out trinary(Input1 src_1, Input2 src_2, Input3 src_3, Operation op) { + return op(src_1, src_2, src_3); +} + +template +Out binary(Input1 src_1, Input2 src_2, Operation op) { + return op(src_1, src_2); +} + +template +Out unary(Input src, Operation op) { + return op(src); +} +/////////////////////////////////////////////// + +/* + @brief Grid_simd class for the SIMD vector type operations + */ +template +class Grid_simd { + public: + typedef typename RealPart::type Real; + typedef Vector_type vector_type; + typedef Scalar_type scalar_type; + + typedef union conv_t_union { + Vector_type v; + Scalar_type s[sizeof(Vector_type) / sizeof(Scalar_type)]; + conv_t_union(){}; + } conv_t; + + Vector_type v; + + static inline constexpr int Nsimd(void) { + return sizeof(Vector_type) / sizeof(Scalar_type); + } + + Grid_simd &operator=(const Grid_simd &&rhs) { + v = rhs.v; + return *this; + }; + Grid_simd &operator=(const Grid_simd &rhs) { + v = rhs.v; + return *this; + }; // faster than not declaring it and leaving to the compiler + Grid_simd() = default; + Grid_simd(const Grid_simd &rhs) : v(rhs.v){}; // compiles in movaps + Grid_simd(const Grid_simd &&rhs) : v(rhs.v){}; + + ///////////////////////////// + // Constructors + ///////////////////////////// + Grid_simd &operator=(Zero &z) { + 
vzero(*this); + return (*this); + } + + // Enable if complex type + template + Grid_simd(const typename std::enable_if::value, S>::type a) { + vsplat(*this, a); + }; + + Grid_simd(const Real a) { vsplat(*this, Scalar_type(a)); }; + + /////////////////////////////////////////////// + // mac, mult, sub, add, adj + /////////////////////////////////////////////// + + // FIXME -- alias this to an inline MAC struct. + friend inline void mac(Grid_simd *__restrict__ y, + const Grid_simd *__restrict__ a, + const Grid_simd *__restrict__ x) { + *y = (*a) * (*x) + (*y); + }; + + friend inline void mult(Grid_simd *__restrict__ y, + const Grid_simd *__restrict__ l, + const Grid_simd *__restrict__ r) { + *y = (*l) * (*r); + } + + friend inline void sub(Grid_simd *__restrict__ y, + const Grid_simd *__restrict__ l, + const Grid_simd *__restrict__ r) { + *y = (*l) - (*r); + } + friend inline void add(Grid_simd *__restrict__ y, + const Grid_simd *__restrict__ l, + const Grid_simd *__restrict__ r) { + *y = (*l) + (*r); + } + friend inline void mac(Grid_simd *__restrict__ y, + const Scalar_type *__restrict__ a, + const Grid_simd *__restrict__ x) { + *y = (*a) * (*x) + (*y); + }; + friend inline void mult(Grid_simd *__restrict__ y, + const Scalar_type *__restrict__ l, + const Grid_simd *__restrict__ r) { + *y = (*l) * (*r); + } + friend inline void sub(Grid_simd *__restrict__ y, + const Scalar_type *__restrict__ l, + const Grid_simd *__restrict__ r) { + *y = (*l) - (*r); + } + friend inline void add(Grid_simd *__restrict__ y, + const Scalar_type *__restrict__ l, + const Grid_simd *__restrict__ r) { + *y = (*l) + (*r); + } + + friend inline void mac(Grid_simd *__restrict__ y, + const Grid_simd *__restrict__ a, + const Scalar_type *__restrict__ x) { + *y = (*a) * (*x) + (*y); + }; + friend inline void mult(Grid_simd *__restrict__ y, + const Grid_simd *__restrict__ l, + const Scalar_type *__restrict__ r) { + *y = (*l) * (*r); + } + friend inline void sub(Grid_simd *__restrict__ y, + const Grid_simd *__restrict__ l, + const Scalar_type *__restrict__ r) { + *y = (*l) - (*r); + } + friend inline void add(Grid_simd *__restrict__ y, + const Grid_simd *__restrict__ l, + const Scalar_type *__restrict__ r) { + *y = (*l) + (*r); + } + + //////////////////////////////////////////////////////////////////////// + // FIXME: gonna remove these load/store, get, set, prefetch + //////////////////////////////////////////////////////////////////////// + friend inline void vset(Grid_simd &ret, Scalar_type *a) { + ret.v = unary(a, VsetSIMD()); + } + + /////////////////////// + // Vstore + /////////////////////// + friend inline void vstore(const Grid_simd &ret, Scalar_type *a) { + binary(ret.v, (Real *)a, VstoreSIMD()); + } + + /////////////////////// + // Vprefetch + /////////////////////// + friend inline void vprefetch(const Grid_simd &v) { + prefetch_HINT_T0((const char *)&v.v); + } + + /////////////////////// + // Reduce + /////////////////////// + friend inline Scalar_type Reduce(const Grid_simd &in) { + return unary(in.v, ReduceSIMD()); + } + + //////////////////////////// + // operator scalar * simd + //////////////////////////// + friend inline Grid_simd operator*(const Scalar_type &a, Grid_simd b) { + Grid_simd va; + vsplat(va, a); + return va * b; + } + friend inline Grid_simd operator*(Grid_simd b, const Scalar_type &a) { + return a * b; + } + + ////////////////////////////////// + // Divides + ////////////////////////////////// + friend inline Grid_simd operator/(const Scalar_type &a, Grid_simd b) { + Grid_simd va; + 
vsplat(va, a); + return va / b; + } + friend inline Grid_simd operator/(Grid_simd b, const Scalar_type &a) { + Grid_simd va; + vsplat(va, a); + return b / a; + } + + /////////////////////// + // Unary negation + /////////////////////// + friend inline Grid_simd operator-(const Grid_simd &r) { + Grid_simd ret; + vzero(ret); + ret = ret - r; + return ret; + } + // *=,+=,-= operators + inline Grid_simd &operator*=(const Grid_simd &r) { + *this = (*this) * r; + return *this; + // return (*this)*r; ? + } + inline Grid_simd &operator+=(const Grid_simd &r) { + *this = *this + r; + return *this; + } + inline Grid_simd &operator-=(const Grid_simd &r) { + *this = *this - r; + return *this; + } + + /////////////////////////////////////// + // Not all functions are supported + // through SIMD and must breakout to + // scalar type and back again. This + // provides support + /////////////////////////////////////// + + template + friend inline Grid_simd SimdApply(const functor &func, const Grid_simd &v) { + Grid_simd ret; + Grid_simd::conv_t conv; + Grid_simd::scalar_type s; + + conv.v = v.v; + for (int i = 0; i < Nsimd(); i++) { + s = conv.s[i]; + conv.s[i] = func(s); + } + ret.v = conv.v; + return ret; + } + template + friend inline Grid_simd SimdApplyBinop(const functor &func, + const Grid_simd &x, + const Grid_simd &y) { + Grid_simd ret; + Grid_simd::conv_t cx; + Grid_simd::conv_t cy; + Grid_simd::scalar_type sx,sy; + + cx.v = x.v; + cy.v = y.v; + for (int i = 0; i < Nsimd(); i++) { + sx = cx.s[i]; + sy = cy.s[i]; + cx.s[i] = func(sx,sy); + } + ret.v = cx.v; + return ret; + } + /////////////////////// + // Exchange + // Al Ah , Bl Bh -> Al Bl Ah,Bh + /////////////////////// + friend inline void exchange(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2,int n) + { + if (n==3) { + Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v); + } else if(n==2) { + Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v); + } else if(n==1) { + Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v); + } else if(n==0) { + Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v); + } + } + + //////////////////////////////////////////////////////////////////// + // General permute; assumes vector length is same across + // all subtypes; may not be a good assumption, but could + // add the vector width as a template param for BG/Q for example + //////////////////////////////////////////////////////////////////// + friend inline void permute0(Grid_simd &y, Grid_simd b) { + y.v = Optimization::Permute::Permute0(b.v); + } + friend inline void permute1(Grid_simd &y, Grid_simd b) { + y.v = Optimization::Permute::Permute1(b.v); + } + friend inline void permute2(Grid_simd &y, Grid_simd b) { + y.v = Optimization::Permute::Permute2(b.v); + } + friend inline void permute3(Grid_simd &y, Grid_simd b) { + y.v = Optimization::Permute::Permute3(b.v); + } + friend inline void permute(Grid_simd &y, Grid_simd b, int perm) { + if (perm & RotateBit) { + int dist = perm & 0xF; + y = rotate(b, dist); + return; + } + else if(perm==3) permute3(y, b); + else if(perm==2) permute2(y, b); + else if(perm==1) permute1(y, b); + else if(perm==0) permute0(y, b); + } + + /////////////////////////////// + // Getting single lanes + /////////////////////////////// + inline Scalar_type getlane(int lane) { + return ((Scalar_type*)&v)[lane]; + } + + inline void putlane(const Scalar_type &S, int lane){ + ((Scalar_type*)&v)[lane] = S; + } + + + +}; // end of Grid_simd class definition + +inline void permute(ComplexD 
&y,ComplexD b, int perm) { y=b; } +inline void permute(ComplexF &y,ComplexF b, int perm) { y=b; } +inline void permute(RealD &y,RealD b, int perm) { y=b; } +inline void permute(RealF &y,RealF b, int perm) { y=b; } + +//////////////////////////////////////////////////////////////////// +// General rotate +//////////////////////////////////////////////////////////////////// +template = 0> +inline Grid_simd rotate(Grid_simd b, int nrot) { + nrot = nrot % Grid_simd::Nsimd(); + Grid_simd ret; + ret.v = Optimization::Rotate::rotate(b.v, nrot); + return ret; +} +template = 0> +inline Grid_simd rotate(Grid_simd b, int nrot) { + nrot = nrot % Grid_simd::Nsimd(); + Grid_simd ret; + ret.v = Optimization::Rotate::rotate(b.v, 2 * nrot); + return ret; +} +template =0> +inline void rotate( Grid_simd &ret,Grid_simd b,int nrot) +{ + nrot = nrot % Grid_simd::Nsimd(); + ret.v = Optimization::Rotate::rotate(b.v,nrot); +} +template =0> +inline void rotate(Grid_simd &ret,Grid_simd b,int nrot) +{ + nrot = nrot % Grid_simd::Nsimd(); + ret.v = Optimization::Rotate::rotate(b.v,2*nrot); +} + +template +inline void vbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ + S* typepun =(S*) &src; + vsplat(ret,typepun[lane]); +} +template =0> +inline void rbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ + S* typepun =(S*) &src; + ret.v = unary(real(typepun[lane]), VsplatSIMD()); +} + + + +/////////////////////// +// Splat +/////////////////////// + +// this is only for the complex version +template = 0, class ABtype> +inline void vsplat(Grid_simd &ret, ABtype a, ABtype b) { + ret.v = binary(a, b, VsplatSIMD()); +} + +// overload if complex +template +inline void vsplat(Grid_simd &ret, EnableIf, S> c) { + vsplat(ret, real(c), imag(c)); +} +template +inline void rsplat(Grid_simd &ret, EnableIf, S> c) { + vsplat(ret, real(c), real(c)); +} + +// if real fill with a, if complex fill with a in the real part (first function +// above) +template +inline void vsplat(Grid_simd &ret, NotEnableIf, S> a) { + ret.v = unary(a, VsplatSIMD()); +} +////////////////////////// + +/////////////////////////////////////////////// +// Initialise to 1,0,i for the correct types +/////////////////////////////////////////////// +// For complex types +template = 0> +inline void vone(Grid_simd &ret) { + vsplat(ret, S(1.0, 0.0)); +} +template = 0> +inline void vzero(Grid_simd &ret) { + vsplat(ret, S(0.0, 0.0)); +} // use xor? 
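+// minimal usage sketch (vComplexF is the typedef defined at the end of this
+// file): vComplexF z,u,j; vzero(z); vone(u); vcomplex_i(j); afterwards every
+// lane holds 0+0i, 1+0i and 0+1i respectively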
+template = 0> +inline void vcomplex_i(Grid_simd &ret) { + vsplat(ret, S(0.0, 1.0)); +} + +template = 0> +inline void visign(Grid_simd &ret) { + vsplat(ret, S(1.0, -1.0)); +} +template = 0> +inline void vrsign(Grid_simd &ret) { + vsplat(ret, S(-1.0, 1.0)); +} + +// if not complex overload here +template = 0> +inline void vone(Grid_simd &ret) { + vsplat(ret, S(1.0)); +} +template = 0> +inline void vzero(Grid_simd &ret) { + vsplat(ret, S(0.0)); +} + +// For integral types +template = 0> +inline void vone(Grid_simd &ret) { + vsplat(ret, 1); +} +template = 0> +inline void vzero(Grid_simd &ret) { + vsplat(ret, 0); +} +template = 0> +inline void vtrue(Grid_simd &ret) { + vsplat(ret, 0xFFFFFFFF); +} +template = 0> +inline void vfalse(Grid_simd &ret) { + vsplat(ret, 0); +} +template +inline void zeroit(Grid_simd &z) { + vzero(z); +} + +/////////////////////// +// Vstream +/////////////////////// +template = 0> +inline void vstream(Grid_simd &out, const Grid_simd &in) { + binary((S *)&out.v, in.v, VstreamSIMD()); +} +template = 0> +inline void vstream(Grid_simd &out, const Grid_simd &in) { + typedef typename S::value_type T; + binary((T *)&out.v, in.v, VstreamSIMD()); +} +template = 0> +inline void vstream(Grid_simd &out, const Grid_simd &in) { + out = in; +} + +//////////////////////////////////// +// Arithmetic operator overloads +,-,* +//////////////////////////////////// +template +inline Grid_simd operator+(Grid_simd a, Grid_simd b) { + Grid_simd ret; + ret.v = binary(a.v, b.v, SumSIMD()); + return ret; +}; + +template +inline Grid_simd operator-(Grid_simd a, Grid_simd b) { + Grid_simd ret; + ret.v = binary(a.v, b.v, SubSIMD()); + return ret; +}; + +// Distinguish between complex types and others +template = 0> +inline Grid_simd real_mult(Grid_simd a, Grid_simd b) { + Grid_simd ret; + ret.v = binary(a.v, b.v, MultRealPartSIMD()); + return ret; +}; +template = 0> +inline Grid_simd real_madd(Grid_simd a, Grid_simd b, Grid_simd c) { + Grid_simd ret; + ret.v = trinary(a.v, b.v, c.v, MaddRealPartSIMD()); + return ret; +}; + + +// Distinguish between complex types and others +template = 0> +inline Grid_simd operator*(Grid_simd a, Grid_simd b) { + Grid_simd ret; + ret.v = binary(a.v, b.v, MultComplexSIMD()); + return ret; +}; + +// Real/Integer types +template = 0> +inline Grid_simd operator*(Grid_simd a, Grid_simd b) { + Grid_simd ret; + ret.v = binary(a.v, b.v, MultSIMD()); + return ret; +}; + +// Distinguish between complex types and others +template = 0> +inline Grid_simd operator/(Grid_simd a, Grid_simd b) { + typedef Grid_simd simd; + + simd ret; + simd den; + typename simd::conv_t conv; + + ret = a * conjugate(b) ; + den = b * conjugate(b) ; + + + auto real_den = toReal(den); + + ret.v=binary(ret.v, real_den.v, DivSIMD()); + + return ret; +}; + +// Real/Integer types +template = 0> +inline Grid_simd operator/(Grid_simd a, Grid_simd b) { + Grid_simd ret; + ret.v = binary(a.v, b.v, DivSIMD()); + return ret; +}; + +/////////////////////// +// Conjugate +/////////////////////// +template = 0> +inline Grid_simd conjugate(const Grid_simd &in) { + Grid_simd ret; + ret.v = unary(in.v, ConjSIMD()); + return ret; +} +template = 0> +inline Grid_simd conjugate(const Grid_simd &in) { + return in; // for real objects +} +// Suppress adj for integer types... // odd; why conjugate above but not adj?? 
+template = 0> +inline Grid_simd adj(const Grid_simd &in) { + return conjugate(in); +} + +/////////////////////// +// timesMinusI +/////////////////////// +template = 0> +inline void timesMinusI(Grid_simd &ret, const Grid_simd &in) { + ret.v = binary(in.v, ret.v, TimesMinusISIMD()); +} +template = 0> +inline Grid_simd timesMinusI(const Grid_simd &in) { + Grid_simd ret; + timesMinusI(ret, in); + return ret; +} +template = 0> +inline Grid_simd timesMinusI(const Grid_simd &in) { + return in; +} + +/////////////////////// +// timesI +/////////////////////// +template = 0> +inline void timesI(Grid_simd &ret, const Grid_simd &in) { + ret.v = binary(in.v, ret.v, TimesISIMD()); +} +template = 0> +inline Grid_simd timesI(const Grid_simd &in) { + Grid_simd ret; + timesI(ret, in); + return ret; +} +template = 0> +inline Grid_simd timesI(const Grid_simd &in) { + return in; +} + +///////////////////// +// Inner, outer +///////////////////// + +template +inline Grid_simd innerProduct(const Grid_simd &l, + const Grid_simd &r) { + return conjugate(l) * r; +} +template +inline Grid_simd outerProduct(const Grid_simd &l, + const Grid_simd &r) { + return l * conjugate(r); +} + +template +inline Grid_simd trace(const Grid_simd &arg) { + return arg; +} + +//////////////////////////////////////////////////////////// +// copy/splat complex real parts into real; +// insert real into complex and zero imag; +//////////////////////////////////////////////////////////// + +// real = toReal( complex ) +template = 0> +inline Grid_simd toReal(const Grid_simd, V> &in) { + typedef Grid_simd simd; + simd ret; + typename simd::conv_t conv; + conv.v = in.v; // copy the vector content (bytewise) + for (int i = 0; i < simd::Nsimd(); i += 2) { + conv.s[i + 1] = conv.s[i]; // duplicate (r,r);(r,r);(r,r); etc... 
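+    // e.g. the complex pair (1+9i, 2+8i), viewed as reals (1,9,2,8),
+    // becomes (1,1,2,2): each real part overwrites its imaginary slot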
+ } + ret.v = conv.v; + return ret; +} + +// complex = toComplex( real ) +template = 0> // must be a real arg +inline Grid_simd, V> toComplex(const Grid_simd &in) { + typedef Grid_simd Rsimd; + typedef Grid_simd, V> Csimd; + typename Rsimd::conv_t conv; // address as real + + conv.v = in.v; + for (int i = 0; i < Rsimd::Nsimd(); i += 2) { + assert(conv.s[i + 1] == conv.s[i]); + // trap any cases where real was not duplicated + // indicating the SIMD grids of real and imag assignment did not correctly + // match + conv.s[i + 1] = 0.0; // zero imaginary parts + } + Csimd ret; + ret.v = conv.v; + return ret; +} + +/////////////////////////////// +// Define available types +/////////////////////////////// +typedef Grid_simd vRealF; +typedef Grid_simd vRealD; +typedef Grid_simd, SIMD_Ftype> vComplexF; +typedef Grid_simd, SIMD_Dtype> vComplexD; +typedef Grid_simd vInteger; + +// Half precision; no arithmetic support +typedef Grid_simd vRealH; +typedef Grid_simd, SIMD_Htype> vComplexH; + +inline void precisionChange(vRealF *out,vRealD *in,int nvec) +{ + assert((nvec&0x1)==0); + for(int m=0;m*2 +struct is_simd : public std::false_type {}; +template <> struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; + +template using IfSimd = Invoke::value, int> >; +template using IfNotSimd = Invoke::value, unsigned> >; +} + +#endif From a9c816a26883056c367ea5972bca41fc1b24b3a3 Mon Sep 17 00:00:00 2001 From: Nils Meyer Date: Tue, 27 Jun 2017 21:39:15 +0200 Subject: [PATCH 08/44] moved file to correct folder --- Grid_vector_types.h | 857 ----------------------------------- lib/simd/Grid_vector_types.h | 75 +-- 2 files changed, 22 insertions(+), 910 deletions(-) delete mode 100644 Grid_vector_types.h diff --git a/Grid_vector_types.h b/Grid_vector_types.h deleted file mode 100644 index e05fecc4..00000000 --- a/Grid_vector_types.h +++ /dev/null @@ -1,857 +0,0 @@ -/************************************************************************************* - -Grid physics library, www.github.com/paboyle/Grid - -Source file: ./lib/simd/Grid_vector_type.h - -Copyright (C) 2015 - -Author: Azusa Yamaguchi -Author: Guido Cossu -Author: Peter Boyle -Author: neo - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along -with this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - -See the full license in the file "LICENSE" in the top level distribution -directory -*************************************************************************************/ -/* END LEGAL */ -//--------------------------------------------------------------------------- -/*! 
@file Grid_vector_types.h - @brief Defines templated class Grid_simd to deal with inner vector types -*/ -// Time-stamp: <2015-07-10 17:45:33 neo> -//--------------------------------------------------------------------------- -#ifndef GRID_VECTOR_TYPES -#define GRID_VECTOR_TYPES - -#ifdef GEN -#include "Grid_generic.h" -#endif -#ifdef SSE4 -#include "Grid_sse4.h" -#endif -#if defined(AVX1) || defined (AVXFMA) || defined(AVX2) || defined(AVXFMA4) -#include "Grid_avx.h" -#endif -#if defined AVX512 -#include "Grid_avx512.h" -#endif -#if defined IMCI -#include "Grid_imci.h" -#endif -#ifdef NEONv8 -#include "Grid_neon.h" -#endif -#if defined QPX -#include "Grid_qpx.h" -#endif - -#include "l1p.h" - -namespace Grid { - -////////////////////////////////////// -// To take the floating point type of real/complex type -////////////////////////////////////// -template -struct RealPart { - typedef T type; -}; -template -struct RealPart > { - typedef T type; -}; - -#include - -////////////////////////////////////// -// demote a vector to real type -////////////////////////////////////// -// type alias used to simplify the syntax of std::enable_if -template using Invoke = typename T::type; -template using EnableIf = Invoke >; -template using NotEnableIf = Invoke >; - -//////////////////////////////////////////////////////// -// Check for complexity with type traits -template struct is_complex : public std::false_type {}; -template <> struct is_complex > : public std::true_type {}; -template <> struct is_complex > : public std::true_type {}; - -template using IfReal = Invoke::value, int> >; -template using IfComplex = Invoke::value, int> >; -template using IfInteger = Invoke::value, int> >; -template using IfSame = Invoke::value, int> >; - -template using IfNotReal = Invoke::value, int> >; -template using IfNotComplex = Invoke::value, int> >; -template using IfNotInteger = Invoke::value, int> >; -template using IfNotSame = Invoke::value, int> >; - -//////////////////////////////////////////////////////// -// Define the operation templates functors -// general forms to allow for vsplat syntax -// need explicit declaration of types when used since -// clang cannot automatically determine the output type sometimes -template -Out trinary(Input1 src_1, Input2 src_2, Input3 src_3, Operation op) { - return op(src_1, src_2, src_3); -} - -template -Out binary(Input1 src_1, Input2 src_2, Operation op) { - return op(src_1, src_2); -} - -template -Out unary(Input src, Operation op) { - return op(src); -} -/////////////////////////////////////////////// - -/* - @brief Grid_simd class for the SIMD vector type operations - */ -template -class Grid_simd { - public: - typedef typename RealPart::type Real; - typedef Vector_type vector_type; - typedef Scalar_type scalar_type; - - typedef union conv_t_union { - Vector_type v; - Scalar_type s[sizeof(Vector_type) / sizeof(Scalar_type)]; - conv_t_union(){}; - } conv_t; - - Vector_type v; - - static inline constexpr int Nsimd(void) { - return sizeof(Vector_type) / sizeof(Scalar_type); - } - - Grid_simd &operator=(const Grid_simd &&rhs) { - v = rhs.v; - return *this; - }; - Grid_simd &operator=(const Grid_simd &rhs) { - v = rhs.v; - return *this; - }; // faster than not declaring it and leaving to the compiler - Grid_simd() = default; - Grid_simd(const Grid_simd &rhs) : v(rhs.v){}; // compiles in movaps - Grid_simd(const Grid_simd &&rhs) : v(rhs.v){}; - - ///////////////////////////// - // Constructors - ///////////////////////////// - Grid_simd &operator=(Zero &z) { - 
vzero(*this); - return (*this); - } - - // Enable if complex type - template - Grid_simd(const typename std::enable_if::value, S>::type a) { - vsplat(*this, a); - }; - - Grid_simd(const Real a) { vsplat(*this, Scalar_type(a)); }; - - /////////////////////////////////////////////// - // mac, mult, sub, add, adj - /////////////////////////////////////////////// - - // FIXME -- alias this to an inline MAC struct. - friend inline void mac(Grid_simd *__restrict__ y, - const Grid_simd *__restrict__ a, - const Grid_simd *__restrict__ x) { - *y = (*a) * (*x) + (*y); - }; - - friend inline void mult(Grid_simd *__restrict__ y, - const Grid_simd *__restrict__ l, - const Grid_simd *__restrict__ r) { - *y = (*l) * (*r); - } - - friend inline void sub(Grid_simd *__restrict__ y, - const Grid_simd *__restrict__ l, - const Grid_simd *__restrict__ r) { - *y = (*l) - (*r); - } - friend inline void add(Grid_simd *__restrict__ y, - const Grid_simd *__restrict__ l, - const Grid_simd *__restrict__ r) { - *y = (*l) + (*r); - } - friend inline void mac(Grid_simd *__restrict__ y, - const Scalar_type *__restrict__ a, - const Grid_simd *__restrict__ x) { - *y = (*a) * (*x) + (*y); - }; - friend inline void mult(Grid_simd *__restrict__ y, - const Scalar_type *__restrict__ l, - const Grid_simd *__restrict__ r) { - *y = (*l) * (*r); - } - friend inline void sub(Grid_simd *__restrict__ y, - const Scalar_type *__restrict__ l, - const Grid_simd *__restrict__ r) { - *y = (*l) - (*r); - } - friend inline void add(Grid_simd *__restrict__ y, - const Scalar_type *__restrict__ l, - const Grid_simd *__restrict__ r) { - *y = (*l) + (*r); - } - - friend inline void mac(Grid_simd *__restrict__ y, - const Grid_simd *__restrict__ a, - const Scalar_type *__restrict__ x) { - *y = (*a) * (*x) + (*y); - }; - friend inline void mult(Grid_simd *__restrict__ y, - const Grid_simd *__restrict__ l, - const Scalar_type *__restrict__ r) { - *y = (*l) * (*r); - } - friend inline void sub(Grid_simd *__restrict__ y, - const Grid_simd *__restrict__ l, - const Scalar_type *__restrict__ r) { - *y = (*l) - (*r); - } - friend inline void add(Grid_simd *__restrict__ y, - const Grid_simd *__restrict__ l, - const Scalar_type *__restrict__ r) { - *y = (*l) + (*r); - } - - //////////////////////////////////////////////////////////////////////// - // FIXME: gonna remove these load/store, get, set, prefetch - //////////////////////////////////////////////////////////////////////// - friend inline void vset(Grid_simd &ret, Scalar_type *a) { - ret.v = unary(a, VsetSIMD()); - } - - /////////////////////// - // Vstore - /////////////////////// - friend inline void vstore(const Grid_simd &ret, Scalar_type *a) { - binary(ret.v, (Real *)a, VstoreSIMD()); - } - - /////////////////////// - // Vprefetch - /////////////////////// - friend inline void vprefetch(const Grid_simd &v) { - prefetch_HINT_T0((const char *)&v.v); - } - - /////////////////////// - // Reduce - /////////////////////// - friend inline Scalar_type Reduce(const Grid_simd &in) { - return unary(in.v, ReduceSIMD()); - } - - //////////////////////////// - // operator scalar * simd - //////////////////////////// - friend inline Grid_simd operator*(const Scalar_type &a, Grid_simd b) { - Grid_simd va; - vsplat(va, a); - return va * b; - } - friend inline Grid_simd operator*(Grid_simd b, const Scalar_type &a) { - return a * b; - } - - ////////////////////////////////// - // Divides - ////////////////////////////////// - friend inline Grid_simd operator/(const Scalar_type &a, Grid_simd b) { - Grid_simd va; - 
vsplat(va, a); - return va / b; - } - friend inline Grid_simd operator/(Grid_simd b, const Scalar_type &a) { - Grid_simd va; - vsplat(va, a); - return b / a; - } - - /////////////////////// - // Unary negation - /////////////////////// - friend inline Grid_simd operator-(const Grid_simd &r) { - Grid_simd ret; - vzero(ret); - ret = ret - r; - return ret; - } - // *=,+=,-= operators - inline Grid_simd &operator*=(const Grid_simd &r) { - *this = (*this) * r; - return *this; - // return (*this)*r; ? - } - inline Grid_simd &operator+=(const Grid_simd &r) { - *this = *this + r; - return *this; - } - inline Grid_simd &operator-=(const Grid_simd &r) { - *this = *this - r; - return *this; - } - - /////////////////////////////////////// - // Not all functions are supported - // through SIMD and must breakout to - // scalar type and back again. This - // provides support - /////////////////////////////////////// - - template - friend inline Grid_simd SimdApply(const functor &func, const Grid_simd &v) { - Grid_simd ret; - Grid_simd::conv_t conv; - Grid_simd::scalar_type s; - - conv.v = v.v; - for (int i = 0; i < Nsimd(); i++) { - s = conv.s[i]; - conv.s[i] = func(s); - } - ret.v = conv.v; - return ret; - } - template - friend inline Grid_simd SimdApplyBinop(const functor &func, - const Grid_simd &x, - const Grid_simd &y) { - Grid_simd ret; - Grid_simd::conv_t cx; - Grid_simd::conv_t cy; - Grid_simd::scalar_type sx,sy; - - cx.v = x.v; - cy.v = y.v; - for (int i = 0; i < Nsimd(); i++) { - sx = cx.s[i]; - sy = cy.s[i]; - cx.s[i] = func(sx,sy); - } - ret.v = cx.v; - return ret; - } - /////////////////////// - // Exchange - // Al Ah , Bl Bh -> Al Bl Ah,Bh - /////////////////////// - friend inline void exchange(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2,int n) - { - if (n==3) { - Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v); - } else if(n==2) { - Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v); - } else if(n==1) { - Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v); - } else if(n==0) { - Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v); - } - } - - //////////////////////////////////////////////////////////////////// - // General permute; assumes vector length is same across - // all subtypes; may not be a good assumption, but could - // add the vector width as a template param for BG/Q for example - //////////////////////////////////////////////////////////////////// - friend inline void permute0(Grid_simd &y, Grid_simd b) { - y.v = Optimization::Permute::Permute0(b.v); - } - friend inline void permute1(Grid_simd &y, Grid_simd b) { - y.v = Optimization::Permute::Permute1(b.v); - } - friend inline void permute2(Grid_simd &y, Grid_simd b) { - y.v = Optimization::Permute::Permute2(b.v); - } - friend inline void permute3(Grid_simd &y, Grid_simd b) { - y.v = Optimization::Permute::Permute3(b.v); - } - friend inline void permute(Grid_simd &y, Grid_simd b, int perm) { - if (perm & RotateBit) { - int dist = perm & 0xF; - y = rotate(b, dist); - return; - } - else if(perm==3) permute3(y, b); - else if(perm==2) permute2(y, b); - else if(perm==1) permute1(y, b); - else if(perm==0) permute0(y, b); - } - - /////////////////////////////// - // Getting single lanes - /////////////////////////////// - inline Scalar_type getlane(int lane) { - return ((Scalar_type*)&v)[lane]; - } - - inline void putlane(const Scalar_type &S, int lane){ - ((Scalar_type*)&v)[lane] = S; - } - - - -}; // end of Grid_simd class definition - -inline void permute(ComplexD 
&y,ComplexD b, int perm) { y=b; } -inline void permute(ComplexF &y,ComplexF b, int perm) { y=b; } -inline void permute(RealD &y,RealD b, int perm) { y=b; } -inline void permute(RealF &y,RealF b, int perm) { y=b; } - -//////////////////////////////////////////////////////////////////// -// General rotate -//////////////////////////////////////////////////////////////////// -template = 0> -inline Grid_simd rotate(Grid_simd b, int nrot) { - nrot = nrot % Grid_simd::Nsimd(); - Grid_simd ret; - ret.v = Optimization::Rotate::rotate(b.v, nrot); - return ret; -} -template = 0> -inline Grid_simd rotate(Grid_simd b, int nrot) { - nrot = nrot % Grid_simd::Nsimd(); - Grid_simd ret; - ret.v = Optimization::Rotate::rotate(b.v, 2 * nrot); - return ret; -} -template =0> -inline void rotate( Grid_simd &ret,Grid_simd b,int nrot) -{ - nrot = nrot % Grid_simd::Nsimd(); - ret.v = Optimization::Rotate::rotate(b.v,nrot); -} -template =0> -inline void rotate(Grid_simd &ret,Grid_simd b,int nrot) -{ - nrot = nrot % Grid_simd::Nsimd(); - ret.v = Optimization::Rotate::rotate(b.v,2*nrot); -} - -template -inline void vbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ - S* typepun =(S*) &src; - vsplat(ret,typepun[lane]); -} -template =0> -inline void rbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ - S* typepun =(S*) &src; - ret.v = unary(real(typepun[lane]), VsplatSIMD()); -} - - - -/////////////////////// -// Splat -/////////////////////// - -// this is only for the complex version -template = 0, class ABtype> -inline void vsplat(Grid_simd &ret, ABtype a, ABtype b) { - ret.v = binary(a, b, VsplatSIMD()); -} - -// overload if complex -template -inline void vsplat(Grid_simd &ret, EnableIf, S> c) { - vsplat(ret, real(c), imag(c)); -} -template -inline void rsplat(Grid_simd &ret, EnableIf, S> c) { - vsplat(ret, real(c), real(c)); -} - -// if real fill with a, if complex fill with a in the real part (first function -// above) -template -inline void vsplat(Grid_simd &ret, NotEnableIf, S> a) { - ret.v = unary(a, VsplatSIMD()); -} -////////////////////////// - -/////////////////////////////////////////////// -// Initialise to 1,0,i for the correct types -/////////////////////////////////////////////// -// For complex types -template = 0> -inline void vone(Grid_simd &ret) { - vsplat(ret, S(1.0, 0.0)); -} -template = 0> -inline void vzero(Grid_simd &ret) { - vsplat(ret, S(0.0, 0.0)); -} // use xor? 
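The initialise helpers above (`vsplat`, and the `vone`/`vzero` pair just defined) pick the complex or real fill entirely at compile time: the extra `IfComplex<S> = 0` / `IfNotComplex<S> = 0` template parameters remove the wrong overload via SFINAE, so there is no runtime branch. A minimal self-contained sketch of that dispatch pattern (the names here are illustrative, not Grid's):

```cpp
#include <complex>
#include <type_traits>

template <typename T> struct is_cplx : std::false_type {};
template <typename T> struct is_cplx<std::complex<T>> : std::true_type {};

// Complex scalars: fill with 1 + 0i.
template <class S, typename std::enable_if<is_cplx<S>::value, int>::type = 0>
S unit() { return S(1.0, 0.0); }

// Real scalars: fill with plain 1.
template <class S, typename std::enable_if<!is_cplx<S>::value, int>::type = 0>
S unit() { return S(1.0); }

int main() {
  auto c = unit<std::complex<double>>(); // complex overload selected
  auto r = unit<double>();               // real overload selected
  return (c.real() == 1.0 && r == 1.0) ? 0 : 1;
}
```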
-template = 0> -inline void vcomplex_i(Grid_simd &ret) { - vsplat(ret, S(0.0, 1.0)); -} - -template = 0> -inline void visign(Grid_simd &ret) { - vsplat(ret, S(1.0, -1.0)); -} -template = 0> -inline void vrsign(Grid_simd &ret) { - vsplat(ret, S(-1.0, 1.0)); -} - -// if not complex overload here -template = 0> -inline void vone(Grid_simd &ret) { - vsplat(ret, S(1.0)); -} -template = 0> -inline void vzero(Grid_simd &ret) { - vsplat(ret, S(0.0)); -} - -// For integral types -template = 0> -inline void vone(Grid_simd &ret) { - vsplat(ret, 1); -} -template = 0> -inline void vzero(Grid_simd &ret) { - vsplat(ret, 0); -} -template = 0> -inline void vtrue(Grid_simd &ret) { - vsplat(ret, 0xFFFFFFFF); -} -template = 0> -inline void vfalse(Grid_simd &ret) { - vsplat(ret, 0); -} -template -inline void zeroit(Grid_simd &z) { - vzero(z); -} - -/////////////////////// -// Vstream -/////////////////////// -template = 0> -inline void vstream(Grid_simd &out, const Grid_simd &in) { - binary((S *)&out.v, in.v, VstreamSIMD()); -} -template = 0> -inline void vstream(Grid_simd &out, const Grid_simd &in) { - typedef typename S::value_type T; - binary((T *)&out.v, in.v, VstreamSIMD()); -} -template = 0> -inline void vstream(Grid_simd &out, const Grid_simd &in) { - out = in; -} - -//////////////////////////////////// -// Arithmetic operator overloads +,-,* -//////////////////////////////////// -template -inline Grid_simd operator+(Grid_simd a, Grid_simd b) { - Grid_simd ret; - ret.v = binary(a.v, b.v, SumSIMD()); - return ret; -}; - -template -inline Grid_simd operator-(Grid_simd a, Grid_simd b) { - Grid_simd ret; - ret.v = binary(a.v, b.v, SubSIMD()); - return ret; -}; - -// Distinguish between complex types and others -template = 0> -inline Grid_simd real_mult(Grid_simd a, Grid_simd b) { - Grid_simd ret; - ret.v = binary(a.v, b.v, MultRealPartSIMD()); - return ret; -}; -template = 0> -inline Grid_simd real_madd(Grid_simd a, Grid_simd b, Grid_simd c) { - Grid_simd ret; - ret.v = trinary(a.v, b.v, c.v, MaddRealPartSIMD()); - return ret; -}; - - -// Distinguish between complex types and others -template = 0> -inline Grid_simd operator*(Grid_simd a, Grid_simd b) { - Grid_simd ret; - ret.v = binary(a.v, b.v, MultComplexSIMD()); - return ret; -}; - -// Real/Integer types -template = 0> -inline Grid_simd operator*(Grid_simd a, Grid_simd b) { - Grid_simd ret; - ret.v = binary(a.v, b.v, MultSIMD()); - return ret; -}; - -// Distinguish between complex types and others -template = 0> -inline Grid_simd operator/(Grid_simd a, Grid_simd b) { - typedef Grid_simd simd; - - simd ret; - simd den; - typename simd::conv_t conv; - - ret = a * conjugate(b) ; - den = b * conjugate(b) ; - - - auto real_den = toReal(den); - - ret.v=binary(ret.v, real_den.v, DivSIMD()); - - return ret; -}; - -// Real/Integer types -template = 0> -inline Grid_simd operator/(Grid_simd a, Grid_simd b) { - Grid_simd ret; - ret.v = binary(a.v, b.v, DivSIMD()); - return ret; -}; - -/////////////////////// -// Conjugate -/////////////////////// -template = 0> -inline Grid_simd conjugate(const Grid_simd &in) { - Grid_simd ret; - ret.v = unary(in.v, ConjSIMD()); - return ret; -} -template = 0> -inline Grid_simd conjugate(const Grid_simd &in) { - return in; // for real objects -} -// Suppress adj for integer types... // odd; why conjugate above but not adj?? 
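The complex `operator/` defined above never divides complex lanes directly: it multiplies top and bottom by `conjugate(b)`, so the denominator `b*conjugate(b)` is purely real and the final step is an elementwise real divide (`DivSIMD` applied against `toReal(den)`). The same identity in scalar form:

```cpp
#include <complex>

// a/b == a*conj(b) / (b*conj(b)); the denominator |b|^2 is real, so only a
// plain elementwise divide is needed at the end.
std::complex<double> divide(std::complex<double> a, std::complex<double> b) {
  std::complex<double> num = a * std::conj(b);
  double den = std::norm(b); // |b|^2
  return {num.real() / den, num.imag() / den};
}
```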
-template = 0> -inline Grid_simd adj(const Grid_simd &in) { - return conjugate(in); -} - -/////////////////////// -// timesMinusI -/////////////////////// -template = 0> -inline void timesMinusI(Grid_simd &ret, const Grid_simd &in) { - ret.v = binary(in.v, ret.v, TimesMinusISIMD()); -} -template = 0> -inline Grid_simd timesMinusI(const Grid_simd &in) { - Grid_simd ret; - timesMinusI(ret, in); - return ret; -} -template = 0> -inline Grid_simd timesMinusI(const Grid_simd &in) { - return in; -} - -/////////////////////// -// timesI -/////////////////////// -template = 0> -inline void timesI(Grid_simd &ret, const Grid_simd &in) { - ret.v = binary(in.v, ret.v, TimesISIMD()); -} -template = 0> -inline Grid_simd timesI(const Grid_simd &in) { - Grid_simd ret; - timesI(ret, in); - return ret; -} -template = 0> -inline Grid_simd timesI(const Grid_simd &in) { - return in; -} - -///////////////////// -// Inner, outer -///////////////////// - -template -inline Grid_simd innerProduct(const Grid_simd &l, - const Grid_simd &r) { - return conjugate(l) * r; -} -template -inline Grid_simd outerProduct(const Grid_simd &l, - const Grid_simd &r) { - return l * conjugate(r); -} - -template -inline Grid_simd trace(const Grid_simd &arg) { - return arg; -} - -//////////////////////////////////////////////////////////// -// copy/splat complex real parts into real; -// insert real into complex and zero imag; -//////////////////////////////////////////////////////////// - -// real = toReal( complex ) -template = 0> -inline Grid_simd toReal(const Grid_simd, V> &in) { - typedef Grid_simd simd; - simd ret; - typename simd::conv_t conv; - conv.v = in.v; // copy the vector content (bytewise) - for (int i = 0; i < simd::Nsimd(); i += 2) { - conv.s[i + 1] = conv.s[i]; // duplicate (r,r);(r,r);(r,r); etc... 
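// layout reminder: one complex lane occupies two adjacent real lanes as
// (re,im); copying s[i] over s[i+1] turns each pair into (re,re), the
// duplicated-real layout that toComplex below checks with its assert
// before zeroing the odd (imaginary) slots again.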
- } - ret.v = conv.v; - return ret; -} - -// complex = toComplex( real ) -template = 0> // must be a real arg -inline Grid_simd, V> toComplex(const Grid_simd &in) { - typedef Grid_simd Rsimd; - typedef Grid_simd, V> Csimd; - typename Rsimd::conv_t conv; // address as real - - conv.v = in.v; - for (int i = 0; i < Rsimd::Nsimd(); i += 2) { - assert(conv.s[i + 1] == conv.s[i]); - // trap any cases where real was not duplicated - // indicating the SIMD grids of real and imag assignment did not correctly - // match - conv.s[i + 1] = 0.0; // zero imaginary parts - } - Csimd ret; - ret.v = conv.v; - return ret; -} - -/////////////////////////////// -// Define available types -/////////////////////////////// -typedef Grid_simd vRealF; -typedef Grid_simd vRealD; -typedef Grid_simd, SIMD_Ftype> vComplexF; -typedef Grid_simd, SIMD_Dtype> vComplexD; -typedef Grid_simd vInteger; - -// Half precision; no arithmetic support -typedef Grid_simd vRealH; -typedef Grid_simd, SIMD_Htype> vComplexH; - -inline void precisionChange(vRealF *out,vRealD *in,int nvec) -{ - assert((nvec&0x1)==0); - for(int m=0;m*2 -struct is_simd : public std::false_type {}; -template <> struct is_simd : public std::true_type {}; -template <> struct is_simd : public std::true_type {}; -template <> struct is_simd : public std::true_type {}; -template <> struct is_simd : public std::true_type {}; -template <> struct is_simd : public std::true_type {}; - -template using IfSimd = Invoke::value, int> >; -template using IfNotSimd = Invoke::value, unsigned> >; -} - -#endif diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index 424b5573..e05fecc4 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -327,16 +327,12 @@ class Grid_simd { // provides support /////////////////////////////////////// - //#if (__GNUC__ == 5 ) || ( ( __GNUC__ == 6 ) && __GNUC_MINOR__ < 3 ) - //#pragma GCC push_options - //#pragma GCC optimize ("O0") - //#endif template friend inline Grid_simd SimdApply(const functor &func, const Grid_simd &v) { Grid_simd ret; Grid_simd::conv_t conv; Grid_simd::scalar_type s; - + conv.v = v.v; for (int i = 0; i < Nsimd(); i++) { s = conv.s[i]; @@ -364,11 +360,8 @@ class Grid_simd { ret.v = cx.v; return ret; } - //#if (__GNUC__ == 5 ) || ( ( __GNUC__ == 6 ) && __GNUC_MINOR__ < 3 ) - //#pragma GCC pop_options - //#endif /////////////////////// - // Exchange + // Exchange // Al Ah , Bl Bh -> Al Bl Ah,Bh /////////////////////// friend inline void exchange(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2,int n) @@ -379,7 +372,7 @@ class Grid_simd { Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v); } else if(n==1) { Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v); - } else if(n==0) { + } else if(n==0) { Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v); } } @@ -406,7 +399,7 @@ class Grid_simd { int dist = perm & 0xF; y = rotate(b, dist); return; - } + } else if(perm==3) permute3(y, b); else if(perm==2) permute2(y, b); else if(perm==1) permute1(y, b); @@ -425,10 +418,9 @@ class Grid_simd { } - + }; // end of Grid_simd class definition - inline void permute(ComplexD &y,ComplexD b, int perm) { y=b; } inline void permute(ComplexF &y,ComplexF b, int perm) { y=b; } inline void permute(RealD &y,RealD b, int perm) { y=b; } @@ -451,29 +443,29 @@ inline Grid_simd rotate(Grid_simd b, int nrot) { ret.v = Optimization::Rotate::rotate(b.v, 2 * nrot); return ret; } -template =0> +template =0> inline void rotate( Grid_simd &ret,Grid_simd b,int nrot) { nrot = 
nrot % Grid_simd::Nsimd(); ret.v = Optimization::Rotate::rotate(b.v,nrot); } -template =0> +template =0> inline void rotate(Grid_simd &ret,Grid_simd b,int nrot) { nrot = nrot % Grid_simd::Nsimd(); ret.v = Optimization::Rotate::rotate(b.v,2*nrot); } -template +template inline void vbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ S* typepun =(S*) &src; vsplat(ret,typepun[lane]); -} -template =0> +} +template =0> inline void rbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ S* typepun =(S*) &src; ret.v = unary(real(typepun[lane]), VsplatSIMD()); -} +} @@ -604,27 +596,13 @@ inline Grid_simd real_mult(Grid_simd a, Grid_simd b) { ret.v = binary(a.v, b.v, MultRealPartSIMD()); return ret; }; -// TEST for Test_simd -template = 0> -inline Grid_simd real_mult(std::complex a, std::complex b) { - Grid_simd ret; - //ret.v = binary(a.v, b.v, MultRealPartSIMD()); - return ret; -}; - template = 0> inline Grid_simd real_madd(Grid_simd a, Grid_simd b, Grid_simd c) { Grid_simd ret; ret.v = trinary(a.v, b.v, c.v, MaddRealPartSIMD()); return ret; }; -// TEST for Test_simd -template = 0> -inline Grid_simd real_madd(std::complex a, std::complex b) { - Grid_simd ret; - //ret.v = binary(a.v, b.v, MultRealPartSIMD()); - return ret; -}; + // Distinguish between complex types and others template = 0> @@ -654,7 +632,7 @@ inline Grid_simd operator/(Grid_simd a, Grid_simd b) { ret = a * conjugate(b) ; den = b * conjugate(b) ; - + auto real_den = toReal(den); ret.v=binary(ret.v, real_den.v, DivSIMD()); @@ -773,8 +751,8 @@ inline Grid_simd, V> toComplex(const Grid_simd &in) { conv.v = in.v; for (int i = 0; i < Rsimd::Nsimd(); i += 2) { - assert(conv.s[i + 1] == - conv.s[i]); // trap any cases where real was not duplicated + assert(conv.s[i + 1] == conv.s[i]); + // trap any cases where real was not duplicated // indicating the SIMD grids of real and imag assignment did not correctly // match conv.s[i + 1] = 0.0; // zero imaginary parts @@ -852,8 +830,6 @@ inline void precisionChange(vComplexD *out,vComplexF *in,int nvec){ precisionCha inline void precisionChange(vComplexD *out,vComplexH *in,int nvec){ precisionChange((vRealD *)out,(vRealH *)in,nvec);} inline void precisionChange(vComplexF *out,vComplexH *in,int nvec){ precisionChange((vRealF *)out,(vRealH *)in,nvec);} - - // Check our vector types are of an appropriate size. 
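`precisionChange` above works a full SIMD register at a time, and at fixed register width a single-precision vector carries twice the lanes of a double-precision one -- which is why the double-precision vector count must be even (`assert((nvec&0x1)==0)`): two `vRealD` fill one `vRealF`. A plain-scalar sketch of the bookkeeping; the static asserts that follow express the same width invariant at compile time:

```cpp
#include <cassert>
#include <cstddef>

// nvec double-precision vectors, each of dlanes lanes, narrow into nvec/2
// single-precision vectors of the same byte width.
void to_single(float *out, const double *in, std::size_t nvec,
               std::size_t dlanes) {
  assert((nvec & 0x1) == 0);            // doubles are consumed in pairs
  std::size_t scalars = nvec * dlanes;  // total lane count is unchanged
  for (std::size_t i = 0; i < scalars; ++i)
    out[i] = static_cast<float>(in[i]); // narrow lane by lane
}
```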
#if defined QPX static_assert(2*sizeof(SIMD_Ftype) == sizeof(SIMD_Dtype), "SIMD vector lengths incorrect"); @@ -868,21 +844,14 @@ static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths inc ///////////////////////////////////////// template struct is_simd : public std::false_type {}; -template <> -struct is_simd : public std::true_type {}; -template <> -struct is_simd : public std::true_type {}; -template <> -struct is_simd : public std::true_type {}; -template <> -struct is_simd : public std::true_type {}; -template <> -struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; -template -using IfSimd = Invoke::value, int> >; -template -using IfNotSimd = Invoke::value, unsigned> >; +template using IfSimd = Invoke::value, int> >; +template using IfNotSimd = Invoke::value, unsigned> >; } #endif From 07de925127e15fe7b43e31a9e9f3f2298f5f4261 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Wed, 28 Jun 2017 12:45:44 +0100 Subject: [PATCH 09/44] minor scalar action fixes --- lib/qcd/action/scalar/ScalarImpl.h | 4 ++-- lib/qcd/action/scalar/ScalarInteractionAction.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/qcd/action/scalar/ScalarImpl.h b/lib/qcd/action/scalar/ScalarImpl.h index 174553a2..f85ab840 100644 --- a/lib/qcd/action/scalar/ScalarImpl.h +++ b/lib/qcd/action/scalar/ScalarImpl.h @@ -124,11 +124,11 @@ class ScalarImplTypes { } static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) { - Group::LieRandomize(pRNG, U); + Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U); } static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) { - Group::LieRandomize(pRNG, U, 0.01); + Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U, 0.01); } static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) { diff --git a/lib/qcd/action/scalar/ScalarInteractionAction.h b/lib/qcd/action/scalar/ScalarInteractionAction.h index ac2d4fbb..4d189352 100644 --- a/lib/qcd/action/scalar/ScalarInteractionAction.h +++ b/lib/qcd/action/scalar/ScalarInteractionAction.h @@ -81,7 +81,7 @@ namespace Grid { phiStencil.HaloExchange(p, compressor); Field action(p._grid), pshift(p._grid), phisquared(p._grid); phisquared = p*p; - action = (2.0*Ndim + mass_square)*phisquared + lambda/24.*phisquared*phisquared; + action = (2.0*Ndim + mass_square)*phisquared - lambda/24.*phisquared*phisquared; for (int mu = 0; mu < Ndim; mu++) { // pshift = Cshift(p, mu, +1); // not efficient, implement with stencils parallel_for (int i = 0; i < p._grid->oSites(); i++) { @@ -113,7 +113,7 @@ namespace Grid { virtual void deriv(const Field &p, Field &force) { assert(p._grid->Nd() == Ndim); - force = (2.0*Ndim + mass_square)*p + lambda/12.*p*p*p; + force = (2.0*Ndim + mass_square)*p - lambda/12.*p*p*p; // move this outside static Stencil phiStencil(p._grid, npoint, 0, directions, displacements); phiStencil.HaloExchange(p, compressor); From 08e04b96761a03c703899a7ee6ca3f42dddcf2d2 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 28 Jun 2017 15:30:06 +0100 Subject: [PATCH 10/44] Better benchmarks --- benchmarks/Benchmark_memory_bandwidth.cc | 44 ++++++++++---------- benchmarks/Benchmark_su3.cc | 52 ++++++++++++------------ 2 files changed, 48 insertions(+), 48 deletions(-) diff --git 
a/benchmarks/Benchmark_memory_bandwidth.cc b/benchmarks/Benchmark_memory_bandwidth.cc index 1aa088f8..1136dfe0 100644 --- a/benchmarks/Benchmark_memory_bandwidth.cc +++ b/benchmarks/Benchmark_memory_bandwidth.cc @@ -55,9 +55,9 @@ int main (int argc, char ** argv) std::cout< latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; @@ -65,11 +65,11 @@ int main (int argc, char ** argv) uint64_t Nloop=NLOOP; - // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9}); + GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); - LatticeVec z(&Grid); //random(pRNG,z); - LatticeVec x(&Grid); //random(pRNG,x); - LatticeVec y(&Grid); //random(pRNG,y); + LatticeVec z(&Grid); random(pRNG,z); + LatticeVec x(&Grid); random(pRNG,x); + LatticeVec y(&Grid); random(pRNG,y); double a=2.0; @@ -94,17 +94,17 @@ int main (int argc, char ** argv) std::cout< latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; GridCartesian Grid(latt_size,simd_layout,mpi_layout); - // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9}); + GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); - LatticeVec z(&Grid); //random(pRNG,z); - LatticeVec x(&Grid); //random(pRNG,x); - LatticeVec y(&Grid); //random(pRNG,y); + LatticeVec z(&Grid); random(pRNG,z); + LatticeVec x(&Grid); random(pRNG,x); + LatticeVec y(&Grid); random(pRNG,y); double a=2.0; uint64_t Nloop=NLOOP; @@ -129,7 +129,7 @@ int main (int argc, char ** argv) std::cout< latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); @@ -138,11 +138,11 @@ int main (int argc, char ** argv) GridCartesian Grid(latt_size,simd_layout,mpi_layout); - // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9}); + GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); - LatticeVec z(&Grid); //random(pRNG,z); - LatticeVec x(&Grid); //random(pRNG,x); - LatticeVec y(&Grid); //random(pRNG,y); + LatticeVec z(&Grid); random(pRNG,z); + LatticeVec x(&Grid); random(pRNG,x); + LatticeVec y(&Grid); random(pRNG,y); RealD a=2.0; @@ -166,17 +166,17 @@ int main (int argc, char ** argv) std::cout< latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; uint64_t Nloop=NLOOP; GridCartesian Grid(latt_size,simd_layout,mpi_layout); - // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9}); - LatticeVec z(&Grid); //random(pRNG,z); - LatticeVec x(&Grid); //random(pRNG,x); - LatticeVec y(&Grid); //random(pRNG,y); + GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); + LatticeVec z(&Grid); random(pRNG,z); + LatticeVec x(&Grid); random(pRNG,x); + LatticeVec y(&Grid); random(pRNG,y); RealD a=2.0; Real nn; double start=usecond(); diff --git a/benchmarks/Benchmark_su3.cc b/benchmarks/Benchmark_su3.cc index 3d7f9bc9..035af2d9 100644 --- a/benchmarks/Benchmark_su3.cc +++ b/benchmarks/Benchmark_su3.cc @@ -37,12 +37,12 @@ int main (int argc, char ** argv) Grid_init(&argc,&argv); #define LMAX (64) - int Nloop=20; + int64_t Nloop=20; std::vector simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); std::vector mpi_layout = GridDefaultMpi(); - int threads = GridThread::GetThreads(); + int64_t threads = 
GridThread::GetThreads(); std::cout< latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); - int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; + int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; GridCartesian Grid(latt_size,simd_layout,mpi_layout); - // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9}); + GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); - LatticeColourMatrix z(&Grid);// random(pRNG,z); - LatticeColourMatrix x(&Grid);// random(pRNG,x); - LatticeColourMatrix y(&Grid);// random(pRNG,y); + LatticeColourMatrix z(&Grid); random(pRNG,z); + LatticeColourMatrix x(&Grid); random(pRNG,x); + LatticeColourMatrix y(&Grid); random(pRNG,y); double start=usecond(); - for(int i=0;i latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); - int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; + int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; GridCartesian Grid(latt_size,simd_layout,mpi_layout); - // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9}); + GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); - LatticeColourMatrix z(&Grid); //random(pRNG,z); - LatticeColourMatrix x(&Grid); //random(pRNG,x); - LatticeColourMatrix y(&Grid); //random(pRNG,y); + LatticeColourMatrix z(&Grid); random(pRNG,z); + LatticeColourMatrix x(&Grid); random(pRNG,x); + LatticeColourMatrix y(&Grid); random(pRNG,y); double start=usecond(); - for(int i=0;i latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); - int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; + int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; GridCartesian Grid(latt_size,simd_layout,mpi_layout); - // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9}); + GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); - LatticeColourMatrix z(&Grid); //random(pRNG,z); - LatticeColourMatrix x(&Grid); //random(pRNG,x); - LatticeColourMatrix y(&Grid); //random(pRNG,y); + LatticeColourMatrix z(&Grid); random(pRNG,z); + LatticeColourMatrix x(&Grid); random(pRNG,x); + LatticeColourMatrix y(&Grid); random(pRNG,y); double start=usecond(); - for(int i=0;i latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); - int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; + int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; GridCartesian Grid(latt_size,simd_layout,mpi_layout); - // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9}); + GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); - LatticeColourMatrix z(&Grid); //random(pRNG,z); - LatticeColourMatrix x(&Grid); //random(pRNG,x); - LatticeColourMatrix y(&Grid); //random(pRNG,y); + LatticeColourMatrix z(&Grid); random(pRNG,z); + LatticeColourMatrix x(&Grid); random(pRNG,x); + LatticeColourMatrix y(&Grid); random(pRNG,y); double start=usecond(); - for(int i=0;i Date: Wed, 28 Jun 2017 20:22:22 +0200 Subject: [PATCH 11/44] corrected Grid_neon.h --- lib/simd/Grid_neon.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/simd/Grid_neon.h b/lib/simd/Grid_neon.h index f3f802e7..38815389 100644 --- a/lib/simd/Grid_neon.h +++ b/lib/simd/Grid_neon.h @@ -41,6 +41,10 @@ Author: neo //#ifndef ARM_NEON //#define ARM_NEON +#ifndef GEN_SIMD_WIDTH +#define GEN_SIMD_WIDTH 16u 
+#endif + #include "Grid_generic_types.h" #include From 8859a151cc844b8170729285dc2a272e4ffa4940 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Thu, 29 Jun 2017 11:30:29 +0100 Subject: [PATCH 12/44] Small corrections to the NEON port --- configure.ac | 2 +- lib/qcd/smearing/WilsonFlow.h | 9 ++++----- lib/simd/Grid_neon.h | 15 +++++---------- lib/simd/Grid_vector_types.h | 2 +- 4 files changed, 11 insertions(+), 17 deletions(-) diff --git a/configure.ac b/configure.ac index a69b97e3..75cf7891 100644 --- a/configure.ac +++ b/configure.ac @@ -250,7 +250,7 @@ case ${ax_cv_cxx_compiler_vendor} in SIMD_FLAGS='';; NEONv8) AC_DEFINE([NEONV8],[1],[ARMv8 NEON]) - SIMD_FLAGS='';; + SIMD_FLAGS='-march=armv8-a';; QPX|BGQ) AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q]) SIMD_FLAGS='';; diff --git a/lib/qcd/smearing/WilsonFlow.h b/lib/qcd/smearing/WilsonFlow.h index 5e9f2d95..4f5c0d43 100644 --- a/lib/qcd/smearing/WilsonFlow.h +++ b/lib/qcd/smearing/WilsonFlow.h @@ -108,7 +108,7 @@ void WilsonFlow::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real if (maxTau - taus < epsilon){ epsilon = maxTau-taus; } - std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl; + //std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl; GaugeField Z(U._grid); GaugeField Zprime(U._grid); GaugeField tmp(U._grid), Uprime(U._grid); @@ -138,10 +138,10 @@ void WilsonFlow::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real // adjust integration step taus += epsilon; - std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl; + //std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl; epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.); - std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl; + //std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl; } @@ -166,7 +166,6 @@ void WilsonFlow::smear(GaugeField& out, const GaugeField& in) const { out = in; for (unsigned int step = 1; step <= Nstep; step++) { auto start = std::chrono::high_resolution_clock::now(); - std::cout << GridLogMessage << "Evolution time :"<< tau(step) << std::endl; evolve_step(out); auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration diff = end - start; @@ -191,7 +190,7 @@ void WilsonFlow::smear_adaptive(GaugeField& out, const GaugeField& in, Re unsigned int step = 0; do{ step++; - std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl; + //std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl; evolve_step_adaptive(out, maxTau); std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : " << step << " " diff --git a/lib/simd/Grid_neon.h b/lib/simd/Grid_neon.h index 38815389..d6eb9c5a 100644 --- a/lib/simd/Grid_neon.h +++ b/lib/simd/Grid_neon.h @@ -6,9 +6,9 @@ Copyright (C) 2015 -Author: Nils Meyer -Author: Peter Boyle -Author: neo + Author: Nils Meyer + Author: Peter Boyle + Author: neo This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -27,7 +27,7 @@ Author: neo See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ -//---------------------------------------------------------------------- + /* ARMv8 NEON intrinsics layer by @@ -37,9 +37,6 @@ Author: neo SFB/TRR55 */ 
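Stepping back to the WilsonFlow hunk earlier in this patch: the logging is silenced but the adaptive step control is kept. The rule visible there -- `epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.)` -- is a standard embedded-integrator controller: rescale the step by the cube root of tolerance over measured error, with a 0.95 safety factor. As a standalone sketch:

```cpp
#include <cmath>

// Next step size from the current one and a local error estimate `diff`;
// the 1/3 exponent reflects third-order scaling of the local error.
double next_epsilon(double epsilon, double diff, double tol = 1e-4) {
  return epsilon * 0.95 * std::pow(tol / diff, 1.0 / 3.0);
}
```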
-//---------------------------------------------------------------------- -//#ifndef ARM_NEON -//#define ARM_NEON #ifndef GEN_SIMD_WIDTH #define GEN_SIMD_WIDTH 16u @@ -606,6 +603,4 @@ namespace Optimization { typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesI TimesISIMD; -} - -//#endif // ARM_NEON +} \ No newline at end of file diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index e05fecc4..27585547 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -53,7 +53,7 @@ directory #if defined IMCI #include "Grid_imci.h" #endif -#ifdef NEONv8 +#ifdef NEONV8 #include "Grid_neon.h" #endif #if defined QPX From bf630a6821ea8923fc9690a03f621f6d69b31f4e Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Thu, 29 Jun 2017 11:42:25 +0100 Subject: [PATCH 13/44] README file update --- README.md | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 9432abe1..5d168298 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ License: GPL v2. -Last update Nov 2016. +Last update June 2017. _Please do not send pull requests to the `master` branch which is reserved for releases._ @@ -78,14 +78,17 @@ optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a signifi for most programmers. The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture. -Presently SSE4 (128 bit) AVX, AVX2, QPX (256 bit), IMCI, and AVX512 (512 bit) targets are supported (ARM NEON on the way). +Presently SSE4, ARM NEON (128 bits) AVX, AVX2, QPX (256 bits), IMCI and AVX512 (512 bits) targets are supported. -These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. These may be useful in themselves for other programmers. +These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`. MPI, OpenMP, and SIMD parallelism are present in the library. Please see https://arxiv.org/abs/1512.03487 for more detail. +### Required libraries +Grid requires [GMP](https://gmplib.org/), [MPFR](http://www.mpfr.org/) and optionally [HDF5](https://support.hdfgroup.org/HDF5/) and [LIME](http://usqcd-software.github.io/c-lime/) (for ILDG file format support) to be installed. + ### Quick start First, start by cloning the repository: @@ -173,7 +176,8 @@ The following options can be use with the `--enable-simd=` option to target diff | `AVXFMA4` | AVX (256 bit) + FMA4 | | `AVX2` | AVX 2 (256 bit) | | `AVX512` | AVX 512 bit | -| `QPX` | QPX (256 bit) | +| `NEONv8` | ARM NEON (128 bit) | +| `QPX` | IBM QPX (256 bit) | Alternatively, some CPU codenames can be directly used: From 09d09d0fe5bce853e1b42115371cd935a4e29cc0 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Thu, 29 Jun 2017 11:48:11 +0100 Subject: [PATCH 14/44] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 5d168298..1f0b450c 100644 --- a/README.md +++ b/README.md @@ -84,7 +84,7 @@ These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`. MPI, OpenMP, and SIMD parallelism are present in the library. -Please see https://arxiv.org/abs/1512.03487 for more detail. +Please see [this paper](https://arxiv.org/abs/1512.03487) for more detail. 
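Those lane counts are queryable in code: `Nsimd()` (defined in `Grid_vector_types.h` as `sizeof(vector)/sizeof(scalar)`) reports how many scalars one SIMD word carries. A quick check program -- the umbrella header name and the `Grid_finalize` call are assumptions here:

```cpp
#include <Grid.h> // assumed top-level header
#include <cstdio>

int main(int argc, char **argv) {
  Grid::Grid_init(&argc, &argv);
  // e.g. --enable-simd=AVX (256 bit): vRealF -> 8, vRealD -> 4, vComplexD -> 2
  std::printf("vRealF    lanes: %d\n", Grid::vRealF::Nsimd());
  std::printf("vRealD    lanes: %d\n", Grid::vRealD::Nsimd());
  std::printf("vComplexD lanes: %d\n", Grid::vComplexD::Nsimd());
  Grid::Grid_finalize();
  return 0;
}
```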
### Required libraries Grid requires [GMP](https://gmplib.org/), [MPFR](http://www.mpfr.org/) and optionally [HDF5](https://support.hdfgroup.org/HDF5/) and [LIME](http://usqcd-software.github.io/c-lime/) (for ILDG file format support) to be installed. @@ -176,7 +176,7 @@ The following options can be use with the `--enable-simd=` option to target diff | `AVXFMA4` | AVX (256 bit) + FMA4 | | `AVX2` | AVX 2 (256 bit) | | `AVX512` | AVX 512 bit | -| `NEONv8` | ARM NEON (128 bit) | +| `NEONv8` | [ARM NEON](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.den0024a/ch07s03.html) (128 bit) | | `QPX` | IBM QPX (256 bit) | Alternatively, some CPU codenames can be directly used: @@ -216,4 +216,4 @@ where `` is the UNIX prefix where GMP and MPFR are installed. If you are w --with-mpfr= \ --enable-mkl \ CXX=CC CC=cc -``` \ No newline at end of file +``` From ac1f1838bc9c143a3e2091e75d3f68e4455d0231 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 30 Jun 2017 10:15:32 +0100 Subject: [PATCH 15/44] KNL only --- lib/perfmon/PerfCount.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/perfmon/PerfCount.cc b/lib/perfmon/PerfCount.cc index 4778295a..c6f92b9f 100644 --- a/lib/perfmon/PerfCount.cc +++ b/lib/perfmon/PerfCount.cc @@ -40,7 +40,7 @@ const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::Performan { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES , "CPUCYCLES.........." , INSTRUCTIONS}, { PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS , "INSTRUCTIONS......." , CPUCYCLES }, // 4 -#ifdef AVX512 +#ifdef KNL { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES }, { PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS }, { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS }, From 2d3737a133b6f1208849cd8580badba4ff152a4d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 30 Jun 2017 10:15:59 +0100 Subject: [PATCH 16/44] O3, KNL --- configure.ac | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index f7284d48..8175e8b0 100644 --- a/configure.ac +++ b/configure.ac @@ -27,7 +27,7 @@ AX_GXX_VERSION AC_DEFINE_UNQUOTED([GXX_VERSION],["$GXX_VERSION"], [version of g++ that will compile the code]) -CXXFLAGS="-g $CXXFLAGS" +CXXFLAGS="-O3 $CXXFLAGS" ############### Checks for typedefs, structures, and compiler characteristics @@ -241,6 +241,7 @@ case ${ax_cv_cxx_compiler_vendor} in SIMD_FLAGS='';; KNL) AC_DEFINE([AVX512],[1],[AVX512 intrinsics]) + AC_DEFINE([KNL],[1],[Knights landing processor]) SIMD_FLAGS='-march=knl';; GEN) AC_DEFINE([GEN],[1],[generic vector code]) @@ -276,6 +277,7 @@ case ${ax_cv_cxx_compiler_vendor} in SIMD_FLAGS='';; KNL) AC_DEFINE([AVX512],[1],[AVX512 intrinsics for Knights Landing]) + AC_DEFINE([KNL],[1],[Knights landing processor]) SIMD_FLAGS='-xmic-avx512';; GEN) AC_DEFINE([GEN],[1],[generic vector code]) From 694b305cab39e1b7870ca57107521679486c611a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 30 Jun 2017 10:16:13 +0100 Subject: [PATCH 17/44] Update to reporting --- benchmarks/Benchmark_dwf.cc | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index a071c050..d50cc3a0 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -165,7 +165,7 @@ int main (int argc, char ** argv) std::cout << GridLogMessage<< "*****************************************************************" <Barrier(); Dw.ZeroCounters(); @@ 
-302,6 +302,7 @@ int main (int argc, char ** argv) std::cout<< "sD ERR \n " << err < Date: Fri, 30 Jun 2017 10:16:35 +0100 Subject: [PATCH 18/44] Switch off counters by default --- benchmarks/Benchmark_dwf.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index d50cc3a0..7814ec7d 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -383,7 +383,7 @@ int main (int argc, char ** argv) assert(error<1.0e-4); } - if(1){ + if(0){ std::cout << "Single cache warm call to sDw.Dhop " < Date: Fri, 30 Jun 2017 10:23:51 +0100 Subject: [PATCH 19/44] Interleave code path; not enabled --- lib/stencil/Lebesgue.cc | 25 ++++++++++++++++++++++++- lib/stencil/Lebesgue.h | 2 ++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/lib/stencil/Lebesgue.cc b/lib/stencil/Lebesgue.cc index 4551878c..0c644fc1 100644 --- a/lib/stencil/Lebesgue.cc +++ b/lib/stencil/Lebesgue.cc @@ -51,8 +51,31 @@ LebesgueOrder::LebesgueOrder(GridBase *_grid) if ( Block[0]==0) ZGraph(); else if ( Block[1]==0) NoBlocking(); else CartesianBlocking(); -} + if (0) { + std::cout << "Thread Interleaving"< reorder = _LebesgueReorder; + std::vector throrder; + int vol = _LebesgueReorder.size(); + int threads = GridThread::GetThreads(); + int blockbits=3; + int blocklen = 8; + int msk = 0x7; + + for(int t=0;t> blockbits) % threads == t ) { + throrder.push_back(reorder[ss]); + } + } + } + _LebesgueReorder = throrder; +} void LebesgueOrder::NoBlocking(void) { std::cout< & xi, std::vector &dims); + void ThreadInterleave(void); + private: std::vector _LebesgueReorder; From f20eceb6cd6469c496e07e01055a08c0e0e4f7c8 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 30 Jun 2017 10:48:27 +0100 Subject: [PATCH 20/44] First touch once per page in a threaded loop --- lib/allocator/AlignedAllocator.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/lib/allocator/AlignedAllocator.h b/lib/allocator/AlignedAllocator.h index 6e85ab27..54090024 100644 --- a/lib/allocator/AlignedAllocator.h +++ b/lib/allocator/AlignedAllocator.h @@ -98,7 +98,12 @@ public: #else if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(128,bytes); #endif - + // First touch optimise in threaded loop + uint8_t *cp = (uint8_t *)ptr; +#pragma omp parallel for + for(size_type n=0;n Date: Fri, 30 Jun 2017 10:49:08 +0100 Subject: [PATCH 21/44] Guard first touch --- lib/allocator/AlignedAllocator.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/allocator/AlignedAllocator.h b/lib/allocator/AlignedAllocator.h index 54090024..4513ce26 100644 --- a/lib/allocator/AlignedAllocator.h +++ b/lib/allocator/AlignedAllocator.h @@ -100,7 +100,9 @@ public: #endif // First touch optimise in threaded loop uint8_t *cp = (uint8_t *)ptr; +#ifdef GRID_OMP #pragma omp parallel for +#endif for(size_type n=0;n Date: Fri, 30 Jun 2017 10:53:22 +0100 Subject: [PATCH 22/44] Best option for Xeon cache blocking set --- lib/stencil/Lebesgue.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/stencil/Lebesgue.cc b/lib/stencil/Lebesgue.cc index 0c644fc1..2880e4b6 100644 --- a/lib/stencil/Lebesgue.cc +++ b/lib/stencil/Lebesgue.cc @@ -32,8 +32,11 @@ Author: paboyle namespace Grid { int LebesgueOrder::UseLebesgueOrder; +#ifdef KNL std::vector LebesgueOrder::Block({8,2,2,2}); - +#else +std::vector LebesgueOrder::Block({2,2,2,2}); +#endif LebesgueOrder::IndexInteger LebesgueOrder::alignup(IndexInteger n){ n--; // 1000 0011 --> 1000 0010 n |= n >> 1; // 1000 
0010 | 0100 0001 = 1100 0011 From f3b0a92e71af2577afb68c3021b1f9a8467f3e8e Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 1 Jul 2017 09:48:00 +0100 Subject: [PATCH 23/44] Update README.md --- README.md | 101 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 94 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 1f0b450c..072f7404 100644 --- a/README.md +++ b/README.md @@ -158,7 +158,6 @@ The following options can be use with the `--enable-comms=` option to target dif | `none` | no communications | | `mpi[-auto]` | MPI communications | | `mpi3[-auto]` | MPI communications using MPI 3 shared memory | -| `mpi3l[-auto]` | MPI communications using MPI 3 shared memory and leader model | | `shmem ` | Cray SHMEM communications | For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the informations from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan though a list of default names). The `-auto` suffix is not supported by the Cray environment wrapper scripts. Use the standard versions instead. @@ -199,21 +198,109 @@ The following configuration is recommended for the Intel Knights Landing platfor ``` bash ../configure --enable-precision=double\ --enable-simd=KNL \ - --enable-comms=mpi-auto \ - --with-gmp= \ - --with-mpfr= \ + --enable-comms=mpi-auto \ --enable-mkl \ CXX=icpc MPICXX=mpiicpc ``` +The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library. -where `` is the UNIX prefix where GMP and MPFR are installed. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: +If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: ``` bash ../configure --enable-precision=double\ --enable-simd=KNL \ --enable-comms=mpi \ - --with-gmp= \ - --with-mpfr= \ --enable-mkl \ CXX=CC CC=cc ``` + +If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed: +``` --with-gmp= \ + --with-mpfr= \ +``` +where `` is the UNIX prefix where GMP and MPFR are installed. + +Knight's Landing with Intel Omnipath adapters with two adapters per node +presently performs better with use of more than one rank per node, using shared memory +for interior communication. This is the mpi3 communications implementation. +We recommend four ranks per node for best performance, but optimum is local volume dependent. + +``` bash +../configure --enable-precision=double\ + --enable-simd=KNL \ + --enable-comms=mpi3 \ + --enable-mkl \ + CXX=mpiicpc +``` + +### Build setup for Intel Haswell Xeon platform + +The following configuration is recommended for the Intel Knights Landing platform: + +``` bash +../configure --enable-precision=double\ + --enable-simd=AVX2 \ + --enable-comms=mpi3 \ + --enable-mkl \ + CXX=mpiicpc +``` +The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library. + +If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed: +``` --with-gmp= \ + --with-mpfr= \ +``` +where `` is the UNIX prefix where GMP and MPFR are installed. + +If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: + +``` bash +../configure --enable-precision=double\ + --enable-simd=AVX2 \ + --enable-comms=mpi3 \ + --enable-mkl \ + CXX=CC CC=cc +``` +Since Dual socket nodes are commonplace, we recommend MPI-3 as the default with the use of +one rank per socket. 
If using the Intel MPI library, threads should be pinned to NUMA domains using +``` + export I_MPI_PIN=1 +``` +This is the default. + +### Build setup for Intel Skylake Xeon platform + +The following configuration is recommended for the Intel Knights Landing platform: + +``` bash +../configure --enable-precision=double\ + --enable-simd=AVX512 \ + --enable-comms=mpi3 \ + --enable-mkl \ + CXX=mpiicpc +``` +The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library. + +If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed: +``` --with-gmp= \ + --with-mpfr= \ +``` +where `` is the UNIX prefix where GMP and MPFR are installed. + +If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: + +``` bash +../configure --enable-precision=double\ + --enable-simd=AVX512 \ + --enable-comms=mpi3 \ + --enable-mkl \ + CXX=CC CC=cc +``` +Since Dual socket nodes are commonplace, we recommend MPI-3 as the default with the use of +one rank per socket. If using the Intel MPI library, threads should be pinned to NUMA domains using +``` + export I_MPI_PIN=1 +``` +This is the default. + + From e18929eaa0c8e6de539abf2c2ef259ea0816ea7e Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 1 Jul 2017 09:53:15 +0100 Subject: [PATCH 24/44] Update README.md --- README.md | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 072f7404..f3645b3a 100644 --- a/README.md +++ b/README.md @@ -215,7 +215,8 @@ If you are working on a Cray machine that does not use the `mpiicpc` wrapper, pl ``` If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed: -``` --with-gmp= \ +``` bash + --with-gmp= \ --with-mpfr= \ ``` where `` is the UNIX prefix where GMP and MPFR are installed. @@ -228,26 +229,27 @@ We recommend four ranks per node for best performance, but optimum is local volu ``` bash ../configure --enable-precision=double\ --enable-simd=KNL \ - --enable-comms=mpi3 \ + --enable-comms=mpi3-auto \ --enable-mkl \ - CXX=mpiicpc + CC=icpc MPICXX=mpiicpc ``` ### Build setup for Intel Haswell Xeon platform -The following configuration is recommended for the Intel Knights Landing platform: +The following configuration is recommended for the Intel Haswell platform: ``` bash ../configure --enable-precision=double\ --enable-simd=AVX2 \ - --enable-comms=mpi3 \ + --enable-comms=mpi3-auto \ --enable-mkl \ - CXX=mpiicpc + CXX=icpc MPICXX=mpiicpc ``` The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library. If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed: -``` --with-gmp= \ +``` bash + --with-gmp= \ --with-mpfr= \ ``` where `` is the UNIX prefix where GMP and MPFR are installed. @@ -270,7 +272,7 @@ This is the default. ### Build setup for Intel Skylake Xeon platform -The following configuration is recommended for the Intel Knights Landing platform: +The following configuration is recommended for the Intel Skylake platform: ``` bash ../configure --enable-precision=double\ @@ -282,7 +284,8 @@ The following configuration is recommended for the Intel Knights Landing platfor The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library. If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed: -``` --with-gmp= \ +``` bash + --with-gmp= \ --with-mpfr= \ ``` where `` is the UNIX prefix where GMP and MPFR are installed. 
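Pinning matters because of Linux first-touch page placement, which patches 20/21 in this series exploit directly in the aligned allocator: each freshly mapped page is written once from a parallel loop, so it faults in on -- and stays local to -- the thread that will use it. A standalone sketch of the idiom (page size and stride are assumptions):

```cpp
#include <cstddef>
#include <cstdint>

// Touch one byte per 4 KiB page from an OpenMP loop; with first-touch NUMA
// policy each page is placed on the node of the thread that touches it.
void first_touch(void *ptr, std::size_t bytes) {
  uint8_t *cp = static_cast<uint8_t *>(ptr);
#ifdef _OPENMP
#pragma omp parallel for
#endif
  for (std::ptrdiff_t n = 0; n < (std::ptrdiff_t)bytes; n += 4096)
    cp[n] = 0;
}
```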
@@ -298,7 +301,7 @@ If you are working on a Cray machine that does not use the `mpiicpc` wrapper, pl ``` Since Dual socket nodes are commonplace, we recommend MPI-3 as the default with the use of one rank per socket. If using the Intel MPI library, threads should be pinned to NUMA domains using -``` +``` bash export I_MPI_PIN=1 ``` This is the default. From 251a97fe1be59f28686e1d07f8576c7d9f815517 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 1 Jul 2017 09:55:36 +0100 Subject: [PATCH 25/44] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f3645b3a..f9fd7ab5 100644 --- a/README.md +++ b/README.md @@ -301,7 +301,7 @@ If you are working on a Cray machine that does not use the `mpiicpc` wrapper, pl ``` Since Dual socket nodes are commonplace, we recommend MPI-3 as the default with the use of one rank per socket. If using the Intel MPI library, threads should be pinned to NUMA domains using -``` bash +``` export I_MPI_PIN=1 ``` This is the default. From 1354b46338bfaaa338e4e3ad7430e8b8fe087057 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 1 Jul 2017 10:04:32 +0100 Subject: [PATCH 26/44] Update README.md --- README.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/README.md b/README.md index f9fd7ab5..8f0babd9 100644 --- a/README.md +++ b/README.md @@ -306,4 +306,20 @@ one rank per socket. If using the Intel MPI library, threads should be pinned to ``` This is the default. +### Build setup for laptops, other compilers, non-cluster builds + +Many versions of g++ and clang++ work with Grid, and involve merely replacing CXX (and MPICXX), +and omit the enable-mkl flag. + +Single node builds are enabled with +``` + --enable-comms=none +``` + +FFTW support that is not in the default search path may then enabled with +``` + --with-fftw= +``` + +BLAS will not be compiled in by default, and Lanczos will default to Eigen diagonalisation. From 3d09e3e9e0c3b24e1646db3083aba01537bcf88a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 1 Jul 2017 10:05:46 +0100 Subject: [PATCH 27/44] Update README.md --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index 8f0babd9..8f7a3d42 100644 --- a/README.md +++ b/README.md @@ -306,6 +306,14 @@ one rank per socket. If using the Intel MPI library, threads should be pinned to ``` This is the default. +### Build setup for BlueGene/Q + +To be written... + +### Build setup for ARM Neon + +To be written.. + ### Build setup for laptops, other compilers, non-cluster builds Many versions of g++ and clang++ work with Grid, and involve merely replacing CXX (and MPICXX), From 37263fd9b181f1190ff201203da6ac6a431e045d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 1 Jul 2017 10:06:24 +0100 Subject: [PATCH 28/44] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8f7a3d42..afb751f5 100644 --- a/README.md +++ b/README.md @@ -312,7 +312,7 @@ To be written... ### Build setup for ARM Neon -To be written.. +To be written... 
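One more NUMA/threading device from earlier in this series (patch 19): the Lebesgue-order code gained a thread-interleave path that deals blocks of 8 sites round-robin to threads and then rewrites the site list thread-contiguously. A self-contained sketch of that reordering:

```cpp
#include <cstddef>
#include <vector>

// Deal blocks of 8 sites (blockbits = 3) round-robin over `threads`, then
// store each thread's sites contiguously, as LebesgueOrder::ThreadInterleave does.
std::vector<int> interleave(const std::vector<int> &reorder, int threads) {
  const int blockbits = 3;
  std::vector<int> out;
  out.reserve(reorder.size());
  for (int t = 0; t < threads; ++t)
    for (std::size_t ss = 0; ss < reorder.size(); ++ss)
      if (static_cast<int>((ss >> blockbits) % threads) == t)
        out.push_back(reorder[ss]);
  return out;
}
```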
### Build setup for laptops, other compilers, non-cluster builds From b68ad0cc0bf6ab479199020fd6b976229c0cb047 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 1 Jul 2017 10:20:07 +0100 Subject: [PATCH 29/44] Update README.md --- README.md | 74 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 43 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index afb751f5..a786bc6c 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,37 @@ Last update June 2017. _Please do not send pull requests to the `master` branch which is reserved for releases._ + + +### Description +This library provides data parallel C++ container classes with internal memory layout +that is transformed to map efficiently to SIMD architectures. CSHIFT facilities +are provided, similar to HPF and cmfortran, and user control is given over the mapping of +array indices to both MPI tasks and SIMD processing elements. + +* Identically shaped arrays then be processed with perfect data parallelisation. +* Such identically shaped arrays are called conformable arrays. + +The transformation is based on the observation that Cartesian array processing involves +identical processing to be performed on different regions of the Cartesian array. + +The library will both geometrically decompose into MPI tasks and across SIMD lanes. +Local vector loops are parallelised with OpenMP pragmas. + +Data parallel array operations can then be specified with a SINGLE data parallel paradigm, but +optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a significant simplification +for most programmers. + +The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture. +Presently SSE4, ARM NEON (128 bits) AVX, AVX2, QPX (256 bits), IMCI and AVX512 (512 bits) targets are supported. + +These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. +The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`. + +MPI, OpenMP, and SIMD parallelism are present in the library. +Please see [this paper](https://arxiv.org/abs/1512.03487) for more detail. + + ### Compilers Intel ICPC v16.0.3 and later @@ -56,38 +87,19 @@ When you file an issue, please go though the following checklist: 6. Attach the output of `make V=1`. 7. Describe the issue and any previous attempt to solve it. If relevant, show how to reproduce the issue using a minimal working example. - - -### Description -This library provides data parallel C++ container classes with internal memory layout -that is transformed to map efficiently to SIMD architectures. CSHIFT facilities -are provided, similar to HPF and cmfortran, and user control is given over the mapping of -array indices to both MPI tasks and SIMD processing elements. - -* Identically shaped arrays then be processed with perfect data parallelisation. -* Such identically shaped arrays are called conformable arrays. - -The transformation is based on the observation that Cartesian array processing involves -identical processing to be performed on different regions of the Cartesian array. - -The library will both geometrically decompose into MPI tasks and across SIMD lanes. -Local vector loops are parallelised with OpenMP pragmas. - -Data parallel array operations can then be specified with a SINGLE data parallel paradigm, but -optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a significant simplification -for most programmers. 
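What that single data-parallel paradigm looks like in user code, pieced together from the benchmark sources in this series (lattice size chosen arbitrarily; the header name and QCD namespace are assumptions):

```cpp
#include <Grid.h> // assumed umbrella header
using namespace Grid;
using namespace Grid::QCD;

int main(int argc, char **argv) {
  Grid_init(&argc, &argv);
  std::vector<int> simd_layout = GridDefaultSimd(Nd, vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  std::vector<int> latt_size({16, 16, 16, 16});
  GridCartesian Grid(latt_size, simd_layout, mpi_layout);

  GridParallelRNG pRNG(&Grid);
  pRNG.SeedFixedIntegers(std::vector<int>({45, 12, 81, 9}));

  LatticeColourMatrix x(&Grid), y(&Grid), z(&Grid);
  random(pRNG, x);
  random(pRNG, y);
  z = x * y; // one statement: decomposed over MPI, OpenMP and SIMD lanes
  return 0;
}
```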
- -The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture. -Presently SSE4, ARM NEON (128 bits) AVX, AVX2, QPX (256 bits), IMCI and AVX512 (512 bits) targets are supported. - -These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. -The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`. - -MPI, OpenMP, and SIMD parallelism are present in the library. -Please see [this paper](https://arxiv.org/abs/1512.03487) for more detail. - ### Required libraries -Grid requires [GMP](https://gmplib.org/), [MPFR](http://www.mpfr.org/) and optionally [HDF5](https://support.hdfgroup.org/HDF5/) and [LIME](http://usqcd-software.github.io/c-lime/) (for ILDG file format support) to be installed. +Grid requires: +[GMP](https://gmplib.org/), +[MPFR](http://www.mpfr.org/) + +Bootstrapping grid downloads and uses for internal dense matrix (non-QCD operations) the Eigen library. + +Grid optionally uses: +[HDF5](https://support.hdfgroup.org/HDF5/) +[LIME](http://usqcd-software.github.io/c-lime/) (for ILDG file format support) +[FFTW](http://www.fftw.org) (Either generic or via the Intel MKL library) +[LAPACK]( either generic or Intel MKL library) + ### Quick start First, start by cloning the repository: From 7b0237b0819d6981767a0189f7550546a58a8683 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 1 Jul 2017 10:24:41 +0100 Subject: [PATCH 30/44] Update README.md --- README.md | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a786bc6c..3572be26 100644 --- a/README.md +++ b/README.md @@ -89,16 +89,22 @@ When you file an issue, please go though the following checklist: ### Required libraries Grid requires: + [GMP](https://gmplib.org/), + [MPFR](http://www.mpfr.org/) Bootstrapping grid downloads and uses for internal dense matrix (non-QCD operations) the Eigen library. Grid optionally uses: + [HDF5](https://support.hdfgroup.org/HDF5/) -[LIME](http://usqcd-software.github.io/c-lime/) (for ILDG file format support) -[FFTW](http://www.fftw.org) (Either generic or via the Intel MKL library) -[LAPACK]( either generic or Intel MKL library) + +[LIME](http://usqcd-software.github.io/c-lime/) for ILDG and SciDAC file format support. + +[FFTW](http://www.fftw.org) either generic version or via the Intel MKL library. + +LAPACK either generic version or Intel MKL library. 
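A small correctness theme from the benchmark patches in this series (10 and 31): lattice volumes and byte counts get widened to `int64_t`/`double`, because products such as `lat^4` or `3*vol*Nvec*sizeof(Real)` overflow 32-bit `int` long before memory runs out. The safe pattern, sketched:

```cpp
#include <cstddef>
#include <cstdint>

// Promote before multiplying: the first factor fixes the result type.
int64_t volume(int lat) {
  return (int64_t)lat * lat * lat * lat; // 256^4 = 4294967296 > INT_MAX
}

double traffic_bytes(int64_t vol, int nvec, std::size_t scalar_bytes) {
  return 3.0 * vol * nvec * scalar_bytes; // leading 3.0 forces double math
}
```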
### Quick start From 40e119c61cac619b7fa1874e5fa7ccdc1dcb77cb Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 8 Jul 2017 22:27:11 -0400 Subject: [PATCH 31/44] NUMA improvements worth preserving from AMD EPYC tests --- benchmarks/Benchmark_memory_bandwidth.cc | 48 ++++++++++++------------ lib/allocator/AlignedAllocator.h | 3 +- lib/communicator/Communicator_mpi3.cc | 20 +++++++++- 3 files changed, 45 insertions(+), 26 deletions(-) diff --git a/benchmarks/Benchmark_memory_bandwidth.cc b/benchmarks/Benchmark_memory_bandwidth.cc index 1136dfe0..848f271d 100644 --- a/benchmarks/Benchmark_memory_bandwidth.cc +++ b/benchmarks/Benchmark_memory_bandwidth.cc @@ -60,16 +60,16 @@ int main (int argc, char ** argv) for(int lat=8;lat<=lmax;lat+=8){ std::vector latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); - int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; + int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; GridCartesian Grid(latt_size,simd_layout,mpi_layout); uint64_t Nloop=NLOOP; - GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); + // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); - LatticeVec z(&Grid); random(pRNG,z); - LatticeVec x(&Grid); random(pRNG,x); - LatticeVec y(&Grid); random(pRNG,y); + LatticeVec z(&Grid);// random(pRNG,z); + LatticeVec x(&Grid);// random(pRNG,x); + LatticeVec y(&Grid);// random(pRNG,y); double a=2.0; @@ -83,7 +83,7 @@ int main (int argc, char ** argv) double time = (stop-start)/Nloop*1000; double flops=vol*Nvec*2;// mul,add - double bytes=3*vol*Nvec*sizeof(Real); + double bytes=3.0*vol*Nvec*sizeof(Real); std::cout< latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); - int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; + int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; GridCartesian Grid(latt_size,simd_layout,mpi_layout); - GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); + // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); - LatticeVec z(&Grid); random(pRNG,z); - LatticeVec x(&Grid); random(pRNG,x); - LatticeVec y(&Grid); random(pRNG,y); + LatticeVec z(&Grid);// random(pRNG,z); + LatticeVec x(&Grid);// random(pRNG,x); + LatticeVec y(&Grid);// random(pRNG,y); double a=2.0; uint64_t Nloop=NLOOP; @@ -119,7 +119,7 @@ int main (int argc, char ** argv) double time = (stop-start)/Nloop*1000; double flops=vol*Nvec*2;// mul,add - double bytes=3*vol*Nvec*sizeof(Real); + double bytes=3.0*vol*Nvec*sizeof(Real); std::cout< latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); - int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; + int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; uint64_t Nloop=NLOOP; GridCartesian Grid(latt_size,simd_layout,mpi_layout); - GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); + // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); - LatticeVec z(&Grid); random(pRNG,z); - LatticeVec x(&Grid); random(pRNG,x); - LatticeVec y(&Grid); random(pRNG,y); + LatticeVec z(&Grid);// random(pRNG,z); + LatticeVec x(&Grid);// random(pRNG,x); + LatticeVec y(&Grid);// random(pRNG,y); RealD a=2.0; @@ -154,7 +154,7 @@ int main (int argc, char ** argv) double stop=usecond(); double time = (stop-start)/Nloop*1000; - double bytes=2*vol*Nvec*sizeof(Real); + double bytes=2.0*vol*Nvec*sizeof(Real); 
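// (Editorial note, placed inline because we are mid-hunk: the int -> int64_t
// change to vol, together with the 3 -> 3.0 and 2 -> 2.0 literals in the
// bytes formulas, moves these products into 64-bit / double arithmetic, so
// large lattice volumes no longer overflow the 32-bit int intermediates of
// the old expressions.)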
double flops=vol*Nvec*1;// mul std::cout< latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); - int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; + int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; uint64_t Nloop=NLOOP; GridCartesian Grid(latt_size,simd_layout,mpi_layout); - GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); - LatticeVec z(&Grid); random(pRNG,z); - LatticeVec x(&Grid); random(pRNG,x); - LatticeVec y(&Grid); random(pRNG,y); + // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); + LatticeVec z(&Grid);// random(pRNG,z); + LatticeVec x(&Grid);// random(pRNG,x); + LatticeVec y(&Grid);// random(pRNG,y); RealD a=2.0; Real nn; double start=usecond(); @@ -187,7 +187,7 @@ int main (int argc, char ** argv) double stop=usecond(); double time = (stop-start)/Nloop*1000; - double bytes=vol*Nvec*sizeof(Real); + double bytes=1.0*vol*Nvec*sizeof(Real); double flops=vol*Nvec*2;// mul,add std::cout< #include #include #include -//#include +#include +#include #ifndef SHM_HUGETLB #define SHM_HUGETLB 04000 #endif @@ -214,6 +215,23 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); } assert(((uint64_t)ptr&0x3F)==0); + + int status; + int flags=MPOL_MF_MOVE; +#ifdef KNL + int nodes=1; // numa domain == MCDRAM + // Find out if in SNC2,SNC4 mode ? +#else + int nodes=r; // numa domain == MPI ID +#endif + unsigned long count=1; + for(uint64_t page=0;page Date: Sun, 9 Jul 2017 00:11:54 +0100 Subject: [PATCH 32/44] Update README.md --- README.md | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/README.md b/README.md index 3572be26..e0a9bb14 100644 --- a/README.md +++ b/README.md @@ -324,6 +324,60 @@ one rank per socket. If using the Intel MPI library, threads should be pinned to ``` This is the default. +### Build setup for AMD EPYC / RYZEN + +The AMD EPYC is a multichip module comprising 32 cores spread over four distinct chips each with 8 cores. +So, even with a single socket node there is a quad-chip module. Dual socket nodes with 64 cores total +are common. Each chip within the module exposes a separate NUMA domain. +There are four NUMA domains per socket and we recommend one MPI rank per NUMA domain. +MPI-3 is recommended with the use of four ranks per socket, +and 8 threads per rank. + +The following configuration is recommended for the AMD EPYC platform. + +``` bash +../configure --enable-precision=double\ + --enable-simd=AVX2 \ + --enable-comms=mpi3 \ + CXX=mpicxx +``` + +If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed: +``` bash + --with-gmp= \ + --with-mpfr= \ +``` +where `` is the UNIX prefix where GMP and MPFR are installed. + +Using MPICH and g++ v4.9.2, best performance can be obtained using explicit GOMP_CPU_AFFINITY flags for each MPI rank. +This can be done by invoking MPI on a wrapper script omp_bind.sh to handle this. 
+
+It is recommended to run 8 MPI ranks on a single dual-socket AMD EPYC, with 8 threads per rank using MPI3 and
+shared memory to communicate within this node:
+
+mpirun -np 8 ./omp_bind.sh ./Benchmark_dwf --mpi 2.2.2.1 --dslash-unroll --threads 8 --grid 16.16.16.16 --cacheblocking 4.4.4.4
+
+Where omp_bind.sh does the following:
+```
+#!/bin/bash
+
+numanode=` expr $PMI_RANK % 8 `
+basecore=`expr $numanode \* 16`
+core0=`expr $basecore + 0 `
+core1=`expr $basecore + 2 `
+core2=`expr $basecore + 4 `
+core3=`expr $basecore + 6 `
+core4=`expr $basecore + 8 `
+core5=`expr $basecore + 10 `
+core6=`expr $basecore + 12 `
+core7=`expr $basecore + 14 `
+
+export GOMP_CPU_AFFINITY="$core0 $core1 $core2 $core3 $core4 $core5 $core6 $core7"
+echo GOMP_CPU_AFFINITY $GOMP_CPU_AFFINITY
+
+$@
+```
+
 ### Build setup for BlueGene/Q
 
 To be written...

From dc6f078246b006ad1b3e61c513273b73f8f0da81 Mon Sep 17 00:00:00 2001
From: azusayamaguchi
Date: Tue, 11 Jul 2017 14:15:08 +0100
Subject: [PATCH 33/44] fixed the header file for mpi3

---
 configure.ac                          |  8 +++++++-
 lib/communicator/Communicator_mpi3.cc | 18 +++++++++++-------
 2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/configure.ac b/configure.ac
index 8c43d67a..dc6754da 100644
--- a/configure.ac
+++ b/configure.ac
@@ -51,6 +51,7 @@ AC_CHECK_HEADERS(malloc/malloc.h)
 AC_CHECK_HEADERS(malloc.h)
 AC_CHECK_HEADERS(endian.h)
 AC_CHECK_HEADERS(execinfo.h)
+AC_CHECK_HEADERS(numaif.h)
 AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
 AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])
@@ -186,9 +187,14 @@ Info at: http://usqcd.jlab.org/usqcd-docs/c-lime/)])
 AC_SEARCH_LIBS([crc32], [z],
                [AC_DEFINE([HAVE_ZLIB], [1], [Define to 1 if you have the `LIBZ' library])]
-               [have_zlib=true],
+               [have_zlib=true] [LIBS="${LIBS} -lz"],
                [AC_MSG_ERROR(zlib library was not found in your system.)])
 
+AC_SEARCH_LIBS([move_pages], [numa],
+               [AC_DEFINE([HAVE_LIBNUMA], [1], [Define to 1 if you have the `LIBNUMA' library])]
+               [have_libnuma=true] [LIBS="${LIBS} -lnuma"],
+               [AC_MSG_WARN(libnuma library was not found in your system.
Some optimisations will not apply)]) + AC_SEARCH_LIBS([H5Fopen], [hdf5_cpp], [AC_DEFINE([HAVE_HDF5], [1], [Define to 1 if you have the `HDF5' library])] [have_hdf5=true] diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc index f5646d44..4192300b 100644 --- a/lib/communicator/Communicator_mpi3.cc +++ b/lib/communicator/Communicator_mpi3.cc @@ -38,7 +38,9 @@ Author: Peter Boyle #include #include #include +#ifdef HAVE_NUMAIF_H #include +#endif #ifndef SHM_HUGETLB #define SHM_HUGETLB 04000 #endif @@ -216,6 +218,8 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); } assert(((uint64_t)ptr&0x3F)==0); + // Try to force numa domain on the shm segment if we have numaif.h +#ifdef HAVE_NUMAIF_H int status; int flags=MPOL_MF_MOVE; #ifdef KNL @@ -225,13 +229,13 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { int nodes=r; // numa domain == MPI ID #endif unsigned long count=1; - for(uint64_t page=0;page Date: Wed, 12 Jul 2017 15:01:48 +0100 Subject: [PATCH 34/44] For test/solver Fixed --- lib/lattice/Lattice_reduction.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/lattice/Lattice_reduction.h b/lib/lattice/Lattice_reduction.h index c5b20f3c..38982891 100644 --- a/lib/lattice/Lattice_reduction.h +++ b/lib/lattice/Lattice_reduction.h @@ -540,7 +540,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice for(int i=0;i Date: Fri, 14 Jul 2017 22:52:16 +0100 Subject: [PATCH 35/44] Update README.md --- README.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/README.md b/README.md index e0a9bb14..ea20d0ec 100644 --- a/README.md +++ b/README.md @@ -324,6 +324,17 @@ one rank per socket. If using the Intel MPI library, threads should be pinned to ``` This is the default. +** Expected Skylake Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping): ** + +mpirun -n 2 benchmarks/Benchmark_dwf --grid 16.16.16.16 --mpi 2.1.1.1 --cacheblocking 2.2.2.2 --dslash-asm --shm 1024 --threads 18 +Average mflops/s per call per node (full): ** 498739 ** 4d vec +Average mflops/s per call per node (full): ** 457786 ** 4d vec, fp16 comms +Average mflops/s per call per node (full): ** 572645 ** 5d vec +Average mflops/s per call per node (full): ** 721206 ** 5d vec, red black +Average mflops/s per call per node (full): ** 634542 ** 4d vec, red black + + + ### Build setup for AMD EPYC / RYZEN The AMD EPYC is a multichip module comprising 32 cores spread over four distinct chips each with 8 cores. @@ -378,6 +389,17 @@ echo GOMP_CUP_AFFINITY $GOMP_CPU_AFFINITY $@ ``` +Performance: + +** Expected EPYC 7601 Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping): ** + +mpirun -np 8 ./omp_bind.sh ./Benchmark_dwf --threads 8 --mpi 2.2.2.1 --dslash-unroll --grid 16.16.16.16 --cacheblocking 4.4.4.4 +Average mflops/s per call per node (full): **420235** 4d vec +Average mflops/s per call per node (full): **437617** 4d vec, fp16 comms +Average mflops/s per call per node (full): **522988** 5d vec +Average mflops/s per call per node (full): **588984** 5d vec, red black +Average mflops/s per call per node (full): **508423** 4d vec, red black + ### Build setup for BlueGene/Q To be written... 
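Editorial aside on patches 31 and 33 above: in both Communicator_mpi3.cc hunks the body of the page loop has been lost to an extraction artifact (an unescaped `<` in `page<size` swallowed everything up to the next `>`). The sketch below reconstructs what the guarded NUMA placement amounts to, using the declarations still visible in the hunks plus the standard libnuma `move_pages(2)` interface; the helper function, the 4096-byte stride, and the error report are illustrative assumptions, not recovered text.

```cpp
#include <cstdint>
#include <cstdio>
#ifdef HAVE_NUMAIF_H
#include <numaif.h>   // MPOL_MF_MOVE, move_pages(); detected by the new configure check
#endif

// Hypothetical helper: try to force the numa domain of a freshly
// mmap'ed shared-memory segment belonging to MPI rank r.
static void force_numa_domain(void *ptr, uint64_t size, int r)
{
#ifdef HAVE_NUMAIF_H
  int status;
  int flags = MPOL_MF_MOVE;
#ifdef KNL
  int nodes = 1;        // numa domain == MCDRAM
#else
  int nodes = r;        // numa domain == MPI ID
#endif
  unsigned long count = 1;
  for (uint64_t page = 0; page < size; page += 4096) {
    void *pages = (void *)((uint64_t)ptr + page);
    int ierr = move_pages(0, count, &pages, &nodes, &status, flags);
    if (ierr && (page == 0)) perror("numa relocate command failed");
  }
#endif
}
```

On EPYC this pins each rank's shared-memory window to its own NUMA domain, which is the substance of the "NUMA improvements worth preserving" in patch 31; patch 33 merely makes the whole block conditional on numaif.h being found.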
From 169f4b2711f0131f1909738c2b631ced3e47c9e1 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 14 Jul 2017 22:56:06 +0100 Subject: [PATCH 36/44] Update README.md --- README.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index ea20d0ec..124c7bfa 100644 --- a/README.md +++ b/README.md @@ -327,11 +327,11 @@ This is the default. ** Expected Skylake Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping): ** mpirun -n 2 benchmarks/Benchmark_dwf --grid 16.16.16.16 --mpi 2.1.1.1 --cacheblocking 2.2.2.2 --dslash-asm --shm 1024 --threads 18 -Average mflops/s per call per node (full): ** 498739 ** 4d vec -Average mflops/s per call per node (full): ** 457786 ** 4d vec, fp16 comms -Average mflops/s per call per node (full): ** 572645 ** 5d vec -Average mflops/s per call per node (full): ** 721206 ** 5d vec, red black -Average mflops/s per call per node (full): ** 634542 ** 4d vec, red black +- Average mflops/s per call per node (full): 498739 : 4d vec +- Average mflops/s per call per node (full): 457786 : 4d vec, fp16 comms +- Average mflops/s per call per node (full): 572645 : 5d vec +- Average mflops/s per call per node (full): 721206 : 5d vec, red black +- Average mflops/s per call per node (full): 634542 : 4d vec, red black @@ -391,14 +391,14 @@ $@ Performance: -** Expected EPYC 7601 Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping): ** +### Expected EPYC 7601 Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping): ** mpirun -np 8 ./omp_bind.sh ./Benchmark_dwf --threads 8 --mpi 2.2.2.1 --dslash-unroll --grid 16.16.16.16 --cacheblocking 4.4.4.4 -Average mflops/s per call per node (full): **420235** 4d vec -Average mflops/s per call per node (full): **437617** 4d vec, fp16 comms -Average mflops/s per call per node (full): **522988** 5d vec -Average mflops/s per call per node (full): **588984** 5d vec, red black -Average mflops/s per call per node (full): **508423** 4d vec, red black +- Average mflops/s per call per node (full): 420235 : 4d vec +- Average mflops/s per call per node (full): 437617 : 4d vec, fp16 comms +- Average mflops/s per call per node (full): 522988 : 5d vec +- Average mflops/s per call per node (full): 588984 : 5d vec, red black +- Average mflops/s per call per node (full): 508423 : 4d vec, red black ### Build setup for BlueGene/Q From f038c6babe1ec5cd3772c4bcb892d19709dc96f5 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 14 Jul 2017 22:59:16 +0100 Subject: [PATCH 37/44] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 124c7bfa..a185063e 100644 --- a/README.md +++ b/README.md @@ -324,7 +324,7 @@ one rank per socket. If using the Intel MPI library, threads should be pinned to ``` This is the default. 
-** Expected Skylake Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping): ** +#### Expected Skylake Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping): mpirun -n 2 benchmarks/Benchmark_dwf --grid 16.16.16.16 --mpi 2.1.1.1 --cacheblocking 2.2.2.2 --dslash-asm --shm 1024 --threads 18 - Average mflops/s per call per node (full): 498739 : 4d vec @@ -391,7 +391,7 @@ $@ Performance: -### Expected EPYC 7601 Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping): ** +#### Expected AMD EPYC 7601 dual socket (single prec, single node 32+32 cores) performance using NUMA MPI mapping): mpirun -np 8 ./omp_bind.sh ./Benchmark_dwf --threads 8 --mpi 2.2.2.1 --dslash-unroll --grid 16.16.16.16 --cacheblocking 4.4.4.4 - Average mflops/s per call per node (full): 420235 : 4d vec From fe4912880d3ceaf96023e5074682cc4ee43cb871 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 17 Jul 2017 09:53:07 +0100 Subject: [PATCH 38/44] Update README.md --- README.md | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index a185063e..1e0988f3 100644 --- a/README.md +++ b/README.md @@ -327,12 +327,8 @@ This is the default. #### Expected Skylake Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping): mpirun -n 2 benchmarks/Benchmark_dwf --grid 16.16.16.16 --mpi 2.1.1.1 --cacheblocking 2.2.2.2 --dslash-asm --shm 1024 --threads 18 -- Average mflops/s per call per node (full): 498739 : 4d vec -- Average mflops/s per call per node (full): 457786 : 4d vec, fp16 comms -- Average mflops/s per call per node (full): 572645 : 5d vec -- Average mflops/s per call per node (full): 721206 : 5d vec, red black -- Average mflops/s per call per node (full): 634542 : 4d vec, red black +TBA ### Build setup for AMD EPYC / RYZEN @@ -394,11 +390,8 @@ Performance: #### Expected AMD EPYC 7601 dual socket (single prec, single node 32+32 cores) performance using NUMA MPI mapping): mpirun -np 8 ./omp_bind.sh ./Benchmark_dwf --threads 8 --mpi 2.2.2.1 --dslash-unroll --grid 16.16.16.16 --cacheblocking 4.4.4.4 -- Average mflops/s per call per node (full): 420235 : 4d vec -- Average mflops/s per call per node (full): 437617 : 4d vec, fp16 comms -- Average mflops/s per call per node (full): 522988 : 5d vec -- Average mflops/s per call per node (full): 588984 : 5d vec, red black -- Average mflops/s per call per node (full): 508423 : 4d vec, red black + +TBA ### Build setup for BlueGene/Q From 0f214ad427c2f903bc5effeb453f5bed27034cc5 Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Fri, 21 Jul 2017 11:13:51 -0400 Subject: [PATCH 39/44] Moved FourierAcceleratedGaugeFixer into Grid::QCD namespace and removed 'using namespace' directives from header --- lib/qcd/utils/GaugeFix.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lib/qcd/utils/GaugeFix.h b/lib/qcd/utils/GaugeFix.h index 4ff216e4..f2ea1aa2 100644 --- a/lib/qcd/utils/GaugeFix.h +++ b/lib/qcd/utils/GaugeFix.h @@ -26,12 +26,12 @@ Author: Peter Boyle /* END LEGAL */ //#include -using namespace Grid; -using namespace Grid::QCD; +namespace Grid { +namespace QCD { template class FourierAcceleratedGaugeFixer : public Gimpl { - public: + public: INHERIT_GIMPL_TYPES(Gimpl); typedef typename Gimpl::GaugeLinkField GaugeMat; @@ -186,3 +186,5 @@ class FourierAcceleratedGaugeFixer : public Gimpl { } }; +} +} From 56967818626452a318c058684b9594adca4f7fa4 Mon Sep 
17 00:00:00 2001 From: Guido Cossu Date: Wed, 26 Jul 2017 12:07:34 +0100 Subject: [PATCH 40/44] Debug error in Tensor mult --- lib/tensors/Tensor_arith_mul.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/tensors/Tensor_arith_mul.h b/lib/tensors/Tensor_arith_mul.h index c24853b7..a474db9c 100644 --- a/lib/tensors/Tensor_arith_mul.h +++ b/lib/tensors/Tensor_arith_mul.h @@ -98,7 +98,9 @@ template strong_inline void mult(iVector * __restrict__ ret, const iVector * __restrict__ rhs, const iScalar * __restrict__ lhs){ - mult(ret,lhs,rhs); + for(int c1=0;c1_internal[c1],&rhs->_internal[c1],&lhs->_internal); + } } From 237cfd11ab493e1ea8ffaf24fc1da5171b8b929a Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Wed, 26 Jul 2017 12:08:51 +0100 Subject: [PATCH 41/44] Solving the spurious O2 flags --- configure.ac | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index dc6754da..a028fb0a 100644 --- a/configure.ac +++ b/configure.ac @@ -13,6 +13,10 @@ m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) ################ Get git info #AC_REVISION([m4_esyscmd_s([./scripts/configure.commit])]) +################ Set flags +# do not move! +CXXFLAGS="-O3 $CXXFLAGS" + ############### Checks for programs AC_PROG_CXX AC_PROG_RANLIB @@ -27,7 +31,6 @@ AX_GXX_VERSION AC_DEFINE_UNQUOTED([GXX_VERSION],["$GXX_VERSION"], [version of g++ that will compile the code]) -CXXFLAGS="-O3 $CXXFLAGS" ############### Checks for typedefs, structures, and compiler characteristics From 7abc5613bde6fb4e704145b0f2a4c8fa19090944 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Wed, 26 Jul 2017 16:21:17 +0100 Subject: [PATCH 42/44] Added smearing to the topological charge observable --- lib/qcd/modules/ObservableModules.h | 15 ++--- lib/qcd/observables/topological_charge.h | 70 +++++++++++++++++++++--- tests/hmc/Test_hmc_WilsonGauge.cc | 5 +- 3 files changed, 73 insertions(+), 17 deletions(-) diff --git a/lib/qcd/modules/ObservableModules.h b/lib/qcd/modules/ObservableModules.h index 579fc1ec..24511617 100644 --- a/lib/qcd/modules/ObservableModules.h +++ b/lib/qcd/modules/ObservableModules.h @@ -84,8 +84,6 @@ class PlaquetteMod: public ObservableModule, NoParameters> typedef ObservableModule, NoParameters> ObsBase; using ObsBase::ObsBase; // for constructors - - // acquire resource virtual void initialize(){ this->ObservablePtr.reset(new PlaquetteLogger()); @@ -94,23 +92,22 @@ class PlaquetteMod: public ObservableModule, NoParameters> PlaquetteMod(): ObsBase(NoParameters()){} }; + template < class Impl > -class TopologicalChargeMod: public ObservableModule, NoParameters>{ - typedef ObservableModule, NoParameters> ObsBase; +class TopologicalChargeMod: public ObservableModule, TopologyObsParameters>{ + typedef ObservableModule, TopologyObsParameters> ObsBase; using ObsBase::ObsBase; // for constructors - - // acquire resource virtual void initialize(){ - this->ObservablePtr.reset(new TopologicalCharge()); + this->ObservablePtr.reset(new TopologicalCharge(this->Par_)); } public: - TopologicalChargeMod(): ObsBase(NoParameters()){} + TopologicalChargeMod(TopologyObsParameters Par): ObsBase(Par){} + TopologicalChargeMod(): ObsBase(){} }; - }// QCD temporarily here diff --git a/lib/qcd/observables/topological_charge.h b/lib/qcd/observables/topological_charge.h index 5d09c420..c2c419fb 100644 --- a/lib/qcd/observables/topological_charge.h +++ b/lib/qcd/observables/topological_charge.h @@ -33,9 +33,45 @@ directory namespace Grid { namespace QCD { +struct 
TopologySmearingParameters : Serializable { + GRID_SERIALIZABLE_CLASS_MEMBERS(TopologySmearingParameters, + int, steps, + float, step_size, + int, meas_interval, + float, maxTau); + + TopologySmearingParameters(int s = 0, float ss = 0.0f, int mi = 0, float mT = 0.0f): + steps(s), step_size(ss), meas_interval(mi), maxTau(mT){} + + template < class ReaderClass > + TopologySmearingParameters(Reader& Reader){ + read(Reader, "Smearing", *this); + } +}; + + + +struct TopologyObsParameters : Serializable { + GRID_SERIALIZABLE_CLASS_MEMBERS(TopologyObsParameters, + int, interval, + bool, do_smearing, + TopologySmearingParameters, Smearing); + + TopologyObsParameters(int interval = 1, bool smearing = false): + interval(interval), Smearing(smearing){} + + template + TopologyObsParameters(Reader& Reader){ + read(Reader, "TopologyMeasurement", *this); + } +}; + + // this is only defined for a gauge theory template class TopologicalCharge : public HmcObservable { + TopologyObsParameters Pars; + public: // here forces the Impl to be of gauge fields // if not the compiler will complain @@ -44,20 +80,40 @@ class TopologicalCharge : public HmcObservable { // necessary for HmcObservable compatibility typedef typename Impl::Field Field; + TopologicalCharge(int interval = 1, bool do_smearing = false): + Pars(interval, do_smearing){} + + TopologicalCharge(TopologyObsParameters P):Pars(P){ + std::cout << GridLogDebug << "Creating TopologicalCharge " << std::endl; + } + void TrajectoryComplete(int traj, Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) { - Real q = WilsonLoops::TopologicalCharge(U); + if (traj%Pars.interval == 0){ + // Smearing + Field Usmear = U; + int def_prec = std::cout.precision(); + + if (Pars.do_smearing){ + // using wilson flow by default here + std::cout << "1. 
" << Pars.Smearing.step_size << std::endl; + WilsonFlow WF(Pars.Smearing.steps, Pars.Smearing.step_size, Pars.Smearing.meas_interval); + WF.smear_adaptive(Usmear, U, Pars.Smearing.maxTau); + Real T0 = WF.energyDensityPlaquette(Usmear); + std::cout << GridLogMessage << std::setprecision(std::numeric_limits::digits10 + 1) + << "T0 : [ " << traj << " ] "<< T0 << std::endl; + } - int def_prec = std::cout.precision(); + Real q = WilsonLoops::TopologicalCharge(Usmear); + std::cout << GridLogMessage + << std::setprecision(std::numeric_limits::digits10 + 1) + << "Topological Charge: [ " << traj << " ] "<< q << std::endl; - std::cout << GridLogMessage - << std::setprecision(std::numeric_limits::digits10 + 1) - << "Topological Charge: [ " << traj << " ] "<< q << std::endl; - - std::cout.precision(def_prec); + std::cout.precision(def_prec); + } } }; diff --git a/tests/hmc/Test_hmc_WilsonGauge.cc b/tests/hmc/Test_hmc_WilsonGauge.cc index b2d5fb02..4cf6d923 100644 --- a/tests/hmc/Test_hmc_WilsonGauge.cc +++ b/tests/hmc/Test_hmc_WilsonGauge.cc @@ -66,7 +66,10 @@ int main(int argc, char **argv) { typedef PlaquetteMod PlaqObs; typedef TopologicalChargeMod QObs; TheHMC.Resources.AddObservable(); - TheHMC.Resources.AddObservable(); + TopologyObsParameters TopParams; + TopParams.interval = 1; + TopParams.do_smearing = false; + TheHMC.Resources.AddObservable(TopParams); ////////////////////////////////////////////// ///////////////////////////////////////////////////////////// From c0485d799d915637fdc455dfa900ee9786f7cd69 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Wed, 26 Jul 2017 16:26:04 +0100 Subject: [PATCH 43/44] Explicit parameter declaration in the WilsonGauge test --- lib/qcd/observables/topological_charge.h | 1 - tests/hmc/Test_hmc_WilsonGauge.cc | 8 ++++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/lib/qcd/observables/topological_charge.h b/lib/qcd/observables/topological_charge.h index c2c419fb..5af8d77b 100644 --- a/lib/qcd/observables/topological_charge.h +++ b/lib/qcd/observables/topological_charge.h @@ -99,7 +99,6 @@ class TopologicalCharge : public HmcObservable { if (Pars.do_smearing){ // using wilson flow by default here - std::cout << "1. 
" << Pars.Smearing.step_size << std::endl; WilsonFlow WF(Pars.Smearing.steps, Pars.Smearing.step_size, Pars.Smearing.meas_interval); WF.smear_adaptive(Usmear, U, Pars.Smearing.maxTau); Real T0 = WF.energyDensityPlaquette(Usmear); diff --git a/tests/hmc/Test_hmc_WilsonGauge.cc b/tests/hmc/Test_hmc_WilsonGauge.cc index 4cf6d923..05bf81a2 100644 --- a/tests/hmc/Test_hmc_WilsonGauge.cc +++ b/tests/hmc/Test_hmc_WilsonGauge.cc @@ -67,8 +67,12 @@ int main(int argc, char **argv) { typedef TopologicalChargeMod QObs; TheHMC.Resources.AddObservable(); TopologyObsParameters TopParams; - TopParams.interval = 1; - TopParams.do_smearing = false; + TopParams.interval = 5; + TopParams.do_smearing = true; + TopParams.Smearing.steps = 200; + TopParams.Smearing.step_size = 0.01; + TopParams.Smearing.meas_interval = 50; + TopParams.Smearing.maxTau = 2.0; TheHMC.Resources.AddObservable(TopParams); ////////////////////////////////////////////// From c7036f671754710c41de00cb0fa90a6e35104467 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Thu, 27 Jul 2017 11:15:09 +0100 Subject: [PATCH 44/44] Adding checks for libm and libstdc++ --- configure.ac | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/configure.ac b/configure.ac index a028fb0a..bf078b13 100644 --- a/configure.ac +++ b/configure.ac @@ -58,6 +58,10 @@ AC_CHECK_HEADERS(numaif.h) AC_CHECK_DECLS([ntohll],[], [], [[#include ]]) AC_CHECK_DECLS([be64toh],[], [], [[#include ]]) +############## Standard libraries +AC_CHECK_LIB([m],[cos]) +AC_CHECK_LIB([stdc++],[abort]) + ############### GMP and MPFR AC_ARG_WITH([gmp], [AS_HELP_STRING([--with-gmp=prefix],