Merge branch 'develop' into feature/bgq-asm

2025-11-03 13:34:33 +00:00 · 2017-03-13 11:10:10 +00:00
parent b64e004555 dfefc70b57
commit 4ed10a3d06
5 changed files with 250 additions and 97 deletions
--- a/lib/simd/Grid_generic.h
+++ b/lib/simd/Grid_generic.h
@@ -5,8 +5,10 @@
    Source file: ./lib/simd/Grid_generic.h

    Copyright (C) 2015
+    Copyright (C) 2017

 Author: Antonin Portelli <antonin.portelli@me.com>
+        Andrew Lawson    <andrew.lawson1991@gmail.com>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -26,51 +28,10 @@ Author: Antonin Portelli <antonin.portelli@me.com>
    *************************************************************************************/
    /*  END LEGAL */

-static_assert(GEN_SIMD_WIDTH % 16u == 0, "SIMD vector size is not an integer multiple of 16 bytes");
-
-//#define VECTOR_LOOPS
-
-// playing with compiler pragmas
-#ifdef VECTOR_LOOPS
-#ifdef __clang__
-#define VECTOR_FOR(i, w, inc)\
-_Pragma("clang loop unroll(full) vectorize(enable) interleave(enable) vectorize_width(w)")\
-for (unsigned int i = 0; i < w; i += inc)
-#elif defined __INTEL_COMPILER
-#define VECTOR_FOR(i, w, inc)\
-_Pragma("simd vectorlength(w*8)")\
-for (unsigned int i = 0; i < w; i += inc)
-#else
-#define VECTOR_FOR(i, w, inc)\
-for (unsigned int i = 0; i < w; i += inc)
-#endif
-#else
-#define VECTOR_FOR(i, w, inc)\
-for (unsigned int i = 0; i < w; i += inc)
-#endif
+#include "Grid_generic_types.h"

 namespace Grid {
 namespace Optimization {
-
-  // type traits giving the number of elements for each vector type
-  template <typename T> struct W;
-  template <> struct W<double> {
-    constexpr static unsigned int c = GEN_SIMD_WIDTH/16u;
-    constexpr static unsigned int r = GEN_SIMD_WIDTH/8u;
-  };
-  template <> struct W<float> {
-    constexpr static unsigned int c = GEN_SIMD_WIDTH/8u;
-    constexpr static unsigned int r = GEN_SIMD_WIDTH/4u;
-  };
-  
-  // SIMD vector types
-  template <typename T>
-  struct vec {
-    alignas(GEN_SIMD_WIDTH) T v[W<T>::r];
-  };
-  
-  typedef vec<float>   vecf;
-  typedef vec<double>  vecd;
  
  struct Vsplat{
    // Complex
@@ -99,11 +60,6 @@ namespace Optimization {
      
      return out;
    }
-    
-    // Integer
-    inline int operator()(Integer a){
-      return a;
-    }
  };

  struct Vstore{
@@ -112,11 +68,6 @@ namespace Optimization {
    inline void operator()(vec<T> a, T *D){
      *((vec<T> *)D) = a;
    }
-    //Integer
-    inline void operator()(int a, Integer *I){
-      *I = a;
-    }
-
  };

  struct Vstream{
@@ -151,11 +102,6 @@ namespace Optimization {
      
      return out;
    }
-
-    // Integer
-    inline int operator()(Integer *a){
-      return *a;
-    }
  };

  /////////////////////////////////////////////////////
@@ -174,11 +120,6 @@ namespace Optimization {
      
      return out;
    }
-    
-    //I nteger
-    inline int operator()(int a, int b){
-      return a + b;
-    }
  };

  struct Sub{
@@ -194,11 +135,6 @@ namespace Optimization {
      
      return out;
    }
-    
-    //Integer
-    inline int operator()(int a, int b){
-      return a-b;
-    }
  };

  struct Mult{
@@ -214,11 +150,6 @@ namespace Optimization {
      
      return out;
    }
-    
-    // Integer
-    inline int operator()(int a, int b){
-      return a*b;
-    }
  };
  
  #define cmul(a, b, c, i)\
@@ -232,13 +163,26 @@ namespace Optimization {
      
      VECTOR_FOR(i, W<T>::c, 1)
      {
-         out.v[2*i]   = a[2*i]*b[2*i];
-         out.v[2*i+1] = a[2*i]*b[2*i+1];
+         out.v[2*i]   = a.v[2*i]*b.v[2*i];
+         out.v[2*i+1] = a.v[2*i]*b.v[2*i+1];
      }      
      return out;
-    };
+    }
  };

+  struct MaddRealPart{ // functor: for each element pair (2i, 2i+1), scales b by a's even-index entry and adds c
+    template <typename T>
+    inline vec<T> operator()(vec<T> a, vec<T> b, vec<T> c){
+      vec<T> out;
+      
+      VECTOR_FOR(i, W<T>::c, 1) // one iteration per pair; W<T>::c is half of W<T>::r for float and double
+      {
+         out.v[2*i]   = a.v[2*i]*b.v[2*i] + c.v[2*i]; // even slot (presumably the real part -- confirm layout)
+         out.v[2*i+1] = a.v[2*i]*b.v[2*i+1] + c.v[2*i+1]; // odd slot reuses the a.v[2*i] multiplier; a.v[2*i+1] is never read
+      }      
+      return out;
+    }
+  };
  
  struct MultComplex{
    // Complex
@@ -369,6 +313,11 @@ namespace Optimization {
  }
  
  struct Rotate{
+      
+    template <int n, typename T> static inline vec<T> tRotate(vec<T> in){ // variant taking the rotation count as a template argument
+      return rotate(in, n); // forwards n to the runtime-count rotate() declared just below in this struct
+    }
+    
    template <typename T>
    static inline vec<T> rotate(vec<T> in, int n){
      vec<T> out;
@@ -442,8 +391,12 @@ namespace Optimization {

  //Integer Reduce
  template<>
-  inline Integer Reduce<Integer, int>::operator()(int in){
-    return in;
+  inline Integer Reduce<Integer, veci>::operator()(veci in){ // specialization taking the integer vector type, returning one Integer
+    Integer a = 0;
+    
+    acc(in.v, a, 0, 1, W<Integer>::r); // acc is defined outside this hunk; presumably accumulates the W<Integer>::r lanes of in.v into a -- TODO confirm
+    
+    return a;
  }
 }

@@ -452,7 +405,7 @@ namespace Optimization {

  typedef Optimization::vecf SIMD_Ftype; // Single precision type
  typedef Optimization::vecd SIMD_Dtype; // Double precision type
-  typedef int SIMD_Itype; // Integer type
+  typedef Optimization::veci SIMD_Itype; // Integer type

  // prefetch utilities
  inline void v_prefetch0(int size, const char *ptr){};
@@ -472,6 +425,7 @@ namespace Optimization {
  typedef Optimization::Mult        MultSIMD;
  typedef Optimization::MultComplex MultComplexSIMD;
  typedef Optimization::MultRealPart MultRealPartSIMD;
+  typedef Optimization::MaddRealPart MaddRealPartSIMD;
  typedef Optimization::Conj        ConjSIMD;
  typedef Optimization::TimesMinusI TimesMinusISIMD;
  typedef Optimization::TimesI      TimesISIMD;
--- a/lib/simd/Grid_generic_types.h
+++ b/lib/simd/Grid_generic_types.h
@@ -0,0 +1,80 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/simd/Grid_generic_types.h
+
+    Copyright (C) 2017
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+        Andrew Lawson    <andrew.lawson1991@gmail.com>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+
+static_assert(GEN_SIMD_WIDTH % 16u == 0, "SIMD vector size is not an integer multiple of 16 bytes");
+
+//#define VECTOR_LOOPS
+
+// playing with compiler pragmas -- NOTE(review): the macro argument w appears inside the _Pragma string literals below, where macro parameters are not substituted, so the pragmas receive the literal text w; verify before enabling VECTOR_LOOPS
+#ifdef VECTOR_LOOPS
+#ifdef __clang__
+#define VECTOR_FOR(i, w, inc)\
+_Pragma("clang loop unroll(full) vectorize(enable) interleave(enable) vectorize_width(w)")\
+for (unsigned int i = 0; i < w; i += inc)
+#elif defined __INTEL_COMPILER
+#define VECTOR_FOR(i, w, inc)\
+_Pragma("simd vectorlength(w*8)")\
+for (unsigned int i = 0; i < w; i += inc)
+#else
+#define VECTOR_FOR(i, w, inc)\
+for (unsigned int i = 0; i < w; i += inc)
+#endif
+#else
+#define VECTOR_FOR(i, w, inc)\
+for (unsigned int i = 0; i < w; i += inc)
+#endif
+
+namespace Grid { // NOTE(review): this newly added header has no include guard or #pragma once -- confirm it is only ever included once per translation unit
+namespace Optimization {
+
+  // type traits giving the number of elements for each vector type: r sizes the v[] array of vec<T>, c is half of r (used as the bound of the pair-wise loops)
+  template <typename T> struct W;
+  template <> struct W<double> {
+    constexpr static unsigned int c = GEN_SIMD_WIDTH/16u;
+    constexpr static unsigned int r = GEN_SIMD_WIDTH/8u;
+  };
+  template <> struct W<float> {
+    constexpr static unsigned int c = GEN_SIMD_WIDTH/8u;
+    constexpr static unsigned int r = GEN_SIMD_WIDTH/4u;
+  };
+  template <> struct W<Integer> { // only r is provided for Integer; there is no c member
+    constexpr static unsigned int r = GEN_SIMD_WIDTH/4u; // NOTE(review): hard-codes 4 bytes per element -- confirm sizeof(Integer) == 4
+  };
+  
+  // SIMD vector types
+  template <typename T>
+  struct vec {
+    alignas(GEN_SIMD_WIDTH) T v[W<T>::r]; // W<T>::r elements, with the array aligned to GEN_SIMD_WIDTH
+  };
+
+  typedef vec<float>   vecf;
+  typedef vec<double>  vecd;
+  typedef vec<Integer> veci; // integer vector; requires the W<Integer> specialization above
+  
+}}
--- a/lib/simd/Grid_qpx.h
+++ b/lib/simd/Grid_qpx.h
@@ -5,8 +5,10 @@
 Source file: ./lib/simd/Grid_qpx.h
 
 Copyright (C) 2016
+ Copyright (C) 2017
 
 Author: Antonin Portelli <antonin.portelli@me.com>
+         Andrew Lawson    <andrew.lawson1991@gmail.com>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -25,6 +27,11 @@
 See the full license in the file "LICENSE" in the top level distribution directory
 ******************************************************************************/

+#ifndef GEN_SIMD_WIDTH
+#define GEN_SIMD_WIDTH 32u
+#endif
+#include "Grid_generic_types.h" // Definitions for simulated integer SIMD.
+
 namespace Grid {
 namespace Optimization {
  typedef struct 
@@ -62,8 +69,15 @@ namespace Optimization {
      return (vector4double){a, a, a, a};
    }
    //Integer
-    inline int operator()(Integer a){
-      return a;
+    inline veci operator()(Integer a){ // copies the scalar a into every lane of a veci
+      veci out;
+      
+      VECTOR_FOR(i, W<Integer>::r, 1) // visits all W<Integer>::r lanes
+      {
+        out.v[i] = a;
+      }
+      
+      return out;
    }
  };
  
@@ -88,10 +102,10 @@ namespace Optimization {
    inline void operator()(vector4double a, double *d){
      vec_st(a, 0, d);
    }
+
    //Integer
-    // PAB: fixme -- is this right ; just looks like scalar not vector
-    inline void operator()(int a, Integer *i){
-      i[0] = a;
+    inline void operator()(veci a, Integer *i){ // writes the whole veci a to memory starting at i
+      *((veci *)i) = a; // struct assignment through a cast pointer; NOTE(review): veci is declared alignas(GEN_SIMD_WIDTH), so i presumably must be aligned accordingly -- confirm callers
    }
  };
  
@@ -143,11 +157,13 @@ namespace Optimization {
      return vec_ld(0, a);
    }
    // Integer
-    inline int operator()(Integer *a){
-      return a[0];
-    }
-    
-    
+    inline veci operator()(Integer *a){ // reads a whole veci from memory starting at a
+      veci out;
+      
+      out = *((veci *)a); // struct copy through a cast pointer; NOTE(review): presumably requires a to be aligned for veci -- confirm callers
+      
+      return out;
+    }    
  };
  
  template <typename Out_type, typename In_type>
@@ -217,8 +233,15 @@ namespace Optimization {
    FLOAT_WRAP_2(operator(), inline)

    //Integer
-    inline int operator()(int a, int b){
-      return a + b;
+    inline veci operator()(veci a, veci b){ // lane-by-lane sum of two integer vectors
+      veci out;
+      
+      VECTOR_FOR(i, W<Integer>::r, 1)
+      {
+        out.v[i] = a.v[i] + b.v[i];
+      }
+      
+      return out;
    }
  };
  
@@ -232,8 +255,15 @@ namespace Optimization {
    FLOAT_WRAP_2(operator(), inline)

    //Integer
-    inline int operator()(int a, int b){
-      return a - b;
+    inline veci operator()(veci a, veci b){ // lane-by-lane difference a - b of two integer vectors
+      veci out;
+      
+      VECTOR_FOR(i, W<Integer>::r, 1)
+      {
+        out.v[i] = a.v[i] - b.v[i];
+      }
+      
+      return out;
    }
  };
  
@@ -272,8 +302,15 @@ namespace Optimization {
    FLOAT_WRAP_2(operator(), inline)

    // Integer
-    inline int operator()(int a, int b){
-      return a*b;
+    inline veci operator()(veci a, veci b){ // lane-by-lane product of two integer vectors
+      veci out;
+      
+      VECTOR_FOR(i, W<Integer>::r, 1)
+      {
+        out.v[i] = a.v[i]*b.v[i];
+      }
+      
+      return out;
    }
  };

@@ -287,8 +324,15 @@ namespace Optimization {
    FLOAT_WRAP_2(operator(), inline)

    // Integer
-    inline int operator()(int a, int b){
-      return a/b;
+    inline veci operator()(veci a, veci b){ // lane-by-lane integer quotient a / b
+      veci out;
+      
+      VECTOR_FOR(i, W<Integer>::r, 1)
+      {
+        out.v[i] = a.v[i]/b.v[i]; // NOTE(review): no guard against b.v[i] == 0 in any lane
+      }
+      
+      return out;
    }
  };

@@ -457,7 +501,7 @@ namespace Optimization {
 // Here assign types
 typedef Optimization::vector4float SIMD_Ftype;  // Single precision type
 typedef vector4double              SIMD_Dtype; // Double precision type
-typedef int                        SIMD_Itype; // Integer type
+typedef Optimization::veci         SIMD_Itype; // Integer type

 // prefetch utilities
 inline void v_prefetch0(int size, const char *ptr){};
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -758,6 +758,15 @@ typedef Grid_simd<std::complex<float>, SIMD_Ftype> vComplexF;
 typedef Grid_simd<std::complex<double>, SIMD_Dtype> vComplexD;
 typedef Grid_simd<Integer, SIMD_Itype> vInteger;

+// Check our vector types are of an appropriate size.
+#if defined QPX // QPX build: the single-precision type must be exactly half the size of the double and integer types
+static_assert(2*sizeof(SIMD_Ftype) == sizeof(SIMD_Dtype), "SIMD vector lengths incorrect");
+static_assert(2*sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths incorrect");
+#else // every other build: all three SIMD types must have the same size
+static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Dtype), "SIMD vector lengths incorrect");
+static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths incorrect");
+#endif
+
 /////////////////////////////////////////
 // Some traits to recognise the types
 /////////////////////////////////////////
--- a/tests/Test_simd.cc
+++ b/tests/Test_simd.cc
@@ -178,6 +178,65 @@ void Tester(const functor &func)
  assert(ok==0);
 }

+template<class functor>
+void IntTester(const functor &func) // runs func on vInteger operands and on the matching scalars, then asserts the lanes agree
+{
+  typedef Integer  scal;
+  typedef vInteger vec;
+  GridSerialRNG          sRNG; // NOTE(review): sRNG is seeded on the next line but not otherwise used in this function
+  sRNG.SeedRandomDevice();
+
+  int Nsimd = vec::Nsimd();
+
+  std::vector<scal> input1(Nsimd);
+  std::vector<scal> input2(Nsimd);
+  std::vector<scal> result(Nsimd);
+  std::vector<scal> reference(Nsimd);
+
+  std::vector<vec,alignedAllocator<vec> > buf(3); // storage for the three vector operands referenced below
+  vec & v_input1 = buf[0];
+  vec & v_input2 = buf[1];
+  vec & v_result = buf[2];
+
+
+  for(int i=0;i<Nsimd;i++){ // fixed, lane-dependent inputs (no random data)
+    input1[i] = (i + 1) * 30; // input1[i] > input2[i] in every lane
+    input2[i] = (i + 1) * 20;
+    result[i] = (i + 1) * 10;
+  }
+
+  merge<vec,scal>(v_input1,input1); // presumably packs the per-lane scalars into the vector object -- confirm against merge's definition
+  merge<vec,scal>(v_input2,input2);
+  merge<vec,scal>(v_result,result);
+
+  func(v_result,v_input1,v_input2); // vector path under test
+
+  for(int i=0;i<Nsimd;i++) { // scalar path: the same functor applied one lane at a time
+    func(reference[i],input1[i],input2[i]);
+  }
+
+  extract<vec,scal>(v_result,result); // presumably unpacks the vector result into per-lane scalars -- confirm against extract's definition
+
+  std::cout << GridLogMessage << " " << func.name() << std::endl;
+
+  std::cout << GridLogDebug << v_input1 << std::endl;
+  std::cout << GridLogDebug << v_input2 << std::endl;
+  std::cout << GridLogDebug << v_result << std::endl;
+
+  int ok=0; // counts mismatching lanes; despite the name, 0 means success
+  for(int i=0;i<Nsimd;i++){
+    if ( reference[i]-result[i] != 0){ // any nonzero difference between scalar and vector results is a failure
+      std::cout<<GridLogMessage<< "*****" << std::endl;
+      std::cout<<GridLogMessage<< "["<<i<<"] "<< reference[i]-result[i] << " " <<reference[i]<< " " << result[i]<<std::endl;
+      ok++;
+    }
+  }
+  if ( ok==0 ) {
+    std::cout<<GridLogMessage << " OK!" <<std::endl;
+  }
+  assert(ok==0); // fails the test if any lane differed
+}
+

 template<class reduced,class scal, class vec,class functor > 
 void ReductionTester(const functor &func)
@@ -611,6 +670,13 @@ int main (int argc, char ** argv)
  for(int r=0;r<vComplexD::Nsimd();r++){
    PermTester<ComplexD,vComplexD>(funcRotate(r));
  }
+  
+  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+  std::cout<<GridLogMessage << "Testing vInteger                   "<<  std::endl;
+  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+  IntTester(funcPlus());
+  IntTester(funcMinus());
+  IntTester(funcTimes());

  Grid_finalize();
 }