first commit for QPX intrinsics

2025-12-22 05:34:30 +00:00 · 2016-08-23 14:41:44 +01:00
parent 88be3b39bb
commit 4d11a6f5f2
2 changed files with 198 additions and 183 deletions
--- a/configure.ac
+++ b/configure.ac
@@ -125,11 +125,14 @@ case ${ax_cv_cxx_compiler_vendor} in
        AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
        SIMD_FLAGS='-mavx512f -mavx512pf -mavx512er -mavx512cd';;
      IMCI|KNC)
-        AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner])
+        AC_DEFINE([IMCI],[1],[IMCI intrinsics for Knights Corner])
        SIMD_FLAGS='';;
      GEN)
        AC_DEFINE([GENERIC_VEC],[1],[generic vector code])
        SIMD_FLAGS='';;
      QPX|BGQ)
        AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q])
        SIMD_FLAGS='';;
      *)
        AC_MSG_ERROR(["SIMD option ${ac_SIMD} not supported by the GCC/Clang compiler"]);;
    esac;;
--- a/lib/simd/Grid_qpx.h
+++ b/lib/simd/Grid_qpx.h
@@ -1,300 +1,312 @@
-    /*************************************************************************************
+/*******************************************************************************
-
+ 
-    Grid physics library, www.github.com/paboyle/Grid 
+ Grid physics library, www.github.com/paboyle/Grid
-
+ 
-    Source file: ./lib/simd/Grid_qpx.h
+ Source file: ./lib/simd/Grid_qpx.h
-
+ 
-    Copyright (C) 2015
+ Copyright (C) 2016
-
+ 
-Author: neo <cossu@post.kek.jp>
+ Author: Antonin Portelli <antonin.portelli@me.com>
-
+ 
-    This program is free software; you can redistribute it and/or modify
+ This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
+ it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
+ the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+ (at your option) any later version.
-
+ 
-    This program is distributed in the hope that it will be useful,
+ This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+ GNU General Public License for more details.
-
+ 
-    You should have received a copy of the GNU General Public License along
+ You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
+ with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
+ 
-    See the full license in the file "LICENSE" in the top level distribution directory
+ See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
+ ******************************************************************************/
    /*  END LEGAL */
 //----------------------------------------------------------------------
 /*! @file Grid_qpx.h
  @brief Optimization library for the QPX instruction set on BG/Q,
  implemented using intrinsics
 */
 // Time-stamp: <2015-05-27 11:30:21 neo>
 //----------------------------------------------------------------------
 // NOTE: a lot of the functions below are undefined
 namespace Grid {
 namespace Optimization {
  inline std::ostream & operator<<(std::ostream& stream, const vector4double a)
  {
    stream << "{"<<vec_extract(a,0)<<","<<vec_extract(a,1)<<","<<vec_extract(a,2)<<","<<vec_extract(a,3)<<"}";
    return stream;
  };
  struct Vsplat{
    //Complex float
-    inline float operator()(float a, float b){
+    inline vector4double operator()(float a, float b){
-      return {a,b,a,b};
+      return (vector4double){(double)a, (double)b, (double)a, (double)b};
    }
    // Real float
-    inline float operator()(float a){
+    inline vector4double operator()(float a){
-      return {a,a,a,a};
+      return (vector4double){(double)a, (double)a, (double)a, (double)a};
    }
    //Complex double
    inline vector4double operator()(double a, double b){
-      return {a,b,a,b};
+      return (vector4double){a, b, a, b};
    }
    //Real double
    inline vector4double operator()(double a){
-      return {a,a,a,a};
+      return (vector4double){a, a, a, a};
    }
    //Integer
    inline int operator()(Integer a){
-#error
+      return a;
    }
  };
-
+  
  struct Vstore{
-    //Float 
+    //Float
-    inline void operator()(float a, float* F){
+    inline void operator()(vector4double a, float *f){
-      assert(0);
+      vec_st(a, 0, f);
    }
    //Double
-    inline void operator()(vector4double a, double* D){
+    inline void operator()(vector4double a, double *d){
-      assert(0);
+      vec_st(a, 0, d);
    }
    //Integer
-    inline void operator()(int a, Integer* I){
+    inline void operator()(int a, Integer *i){
-      assert(0);
+      i[0] = a;
    }
  };
-
+  
  struct Vstream{
    //Float
-    inline void operator()(float * a, float b){
+    inline void operator()(float *f, vector4double a){
-      assert(0);
+      vec_st(a, 0, f);
    }
    //Double
-    inline void operator()(double * a, vector4double b){
+    inline void operator()(double *d, vector4double a){
-      assert(0);
+      vec_st(a, 0, d);
    }
  };
-
+  
  struct Vset{
-    // Complex float 
+    // Complex float
-    inline float operator()(Grid::ComplexF *a){
+    inline vector4double operator()(Grid::ComplexF *a){
-      return {a[0].real(),a[0].imag(),a[1].real(),a[1].imag(),a[2].real(),a[2].imag(),a[3].real(),a[3].imag()};
+      return vec_ld(0, (float *)a);
    }
-    // Complex double 
+    // Complex double
    inline vector4double operator()(Grid::ComplexD *a){
-      return {a[0].real(),a[0].imag(),a[1].real(),a[1].imag(),a[2].real(),a[2].imag(),a[3].real(),a[3].imag()};
+      return vec_ld(0, (double *)a);
    }
-    // Real float 
+    // Real float
-    inline float operator()(float *a){
+    inline vector4double operator()(float *a){
-      return {a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7]};
+      return vec_ld(0, a);
    }
    // Real double
    inline vector4double operator()(double *a){
-      return {a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7]};
+      return vec_ld(0, a);
    }
    // Integer
    inline int operator()(Integer *a){
-#error
+      return a[0];
    }
-
+    
-
+    
  };
-
+  
  template <typename Out_type, typename In_type>
-    struct Reduce{
+  struct Reduce{
-      //Need templated class to overload output type
+    //Need templated class to overload output type
-      //General form must generate error if compiled
+    //General form must generate error if compiled
-      inline Out_type operator()(In_type in){
+    inline Out_type operator()(In_type in){
-	printf("Error, using wrong Reduce function\n");
+      printf("Error, using wrong Reduce function\n");
-	exit(1);
+      exit(1);
-	return 0;
+      return 0;
-      }
+    }
-    };
+  };
-
+  
  /////////////////////////////////////////////////////
  // Arithmetic operations
  /////////////////////////////////////////////////////
  struct Sum{
    //Complex/Real float
    inline float operator()(float a, float b){
 #error
    }
    //Complex/Real double
    inline vector4double operator()(vector4double a, vector4double b){
-      return vec_add(a,b);
+      return vec_add(a, b);
    }
    //Integer
    inline int operator()(int a, int b){
-#error
+      return a + b;
    }
  };
-
+  
  struct Sub{
    //Complex/Real float
    inline float operator()(float a, float b){
 #error
    }
    //Complex/Real double
    inline vector4double operator()(vector4double a, vector4double b){
-#error
+      return vec_sub(a, b);
    }
    //Integer
-    inline floati operator()(int a, int b){
+    inline int operator()(int a, int b){
-#error
+      return a - b;
    }
  };
-
+  
  struct MultComplex{
    // Complex float
    inline float operator()(float a, float b){
 #error
    }
    // Complex double
    inline vector4double operator()(vector4double a, vector4double b){
-#error
+      return vec_xxnpmadd(a, b, vec_xmul(b, a));
    }
  };
-
+  
  struct Mult{
    // Real float
    inline float operator()(float a, float b){
 #error
    }
    // Real double
    inline vector4double operator()(vector4double a, vector4double b){
-#error
+      return vec_mul(a, b);
    }
    // Integer
    inline int operator()(int a, int b){
-#error
+      return a*b;
    }
  };
-
+  
  struct Conj{
    // Complex single
    inline float operator()(float in){
      assert(0);
    }
    // Complex double
-    inline vector4double operator()(vector4double in){
+    inline vector4double operator()(vector4double v){
-      assert(0);
+      return vec_mul(v, (vector4double){1., -1., 1., -1.});
    }
    // do not define for integer input
  };
-
+  
  struct TimesMinusI{
    //Complex single
    inline float operator()(float in, float ret){
      assert(0);
    }
    //Complex double
-    inline vector4double operator()(vector4double in, vector4double ret){
+    inline vector4double operator()(vector4double v, vector4double ret){
-      assert(0);
+      return vec_xxcpnmadd(v, (vector4double){1., 1., 1., 1.},
                               (vector4double){0., 0., 0., 0.});
    }
  };
-
+  
  struct TimesI{
    //Complex single
    inline float operator()(float in, float ret){
    }
    //Complex double
-    inline vector4double operator()(vector4double in, vector4double ret){
+    inline vector4double operator()(vector4double v, vector4double ret){
-  
+      return vec_xxcpnmadd(v, (vector4double){-1., -1., -1., -1.},
                              (vector4double){0., 0., 0., 0.});
    }
  };
-
+  struct Permute{
    static inline vector4double Permute0(vector4double v){ //0123 -> 2301
      return vec_perm(v, v, vec_gpci(02301));
    };
    static inline vector4double Permute1(vector4double v){ //0123 -> 1032
      return vec_perm(v, v, vec_gpci(01032));
    };
    static inline vector4double Permute2(vector4double v){
      return v;
    };
    static inline vector4double Permute3(vector4double v){
      return v;
    };
  };
-
+  struct Rotate{
-
+    static inline vector4double rotate(vector4double v, int n){
-  //////////////////////////////////////////////
+      switch(n){
-  // Some Template specialization
+        case 0:
          return v;
          break;
        case 1:
          return vec_perm(v, v, vec_gpci(01230));
          break;
        case 2:
          return vec_perm(v, v, vec_gpci(02301));
          break;
        case 3:
          return vec_perm(v, v, vec_gpci(03012));
          break;
        default: assert(0);
      }
    }
  };
  //Complex float Reduce
  template<>
-    inline Grid::ComplexF Reduce<Grid::ComplexF, float>::operator()(float in){
+  inline Grid::ComplexF
-    assert(0);
+  Reduce<Grid::ComplexF, vector4double>::operator()(vector4double v) { //2 complex
    vector4double v1,v2;
    v1 = Optimization::Permute::Permute0(v);
    v1 = vec_add(v1, v);
    return Grid::ComplexF((float)vec_extract(v1, 0), (float)vec_extract(v1, 1));
  }
  //Real float Reduce
  template<>
-    inline Grid::RealF Reduce<Grid::RealF, float>::operator()(float in){
+  inline Grid::RealF
-    assert(0);
+  Reduce<Grid::RealF, vector4double>::operator()(vector4double v){ //4 floats
    vector4double v1,v2;
    v1 = Optimization::Permute::Permute0(v);
    v1 = vec_add(v1, v);
    v2 = Optimization::Permute::Permute1(v1);
    v1 = vec_add(v1, v2);
    return (float)vec_extract(v1, 0);
  }
  //Complex double Reduce
  template<>
-    inline Grid::ComplexD Reduce<Grid::ComplexD, vector4double>::operator()(vector4double in){
+  inline Grid::ComplexD
-    assert(0);
+  Reduce<Grid::ComplexD, vector4double>::operator()(vector4double v){ //2 complex
    vector4double v1;
    v1 = Optimization::Permute::Permute0(v);
    v1 = vec_add(v1, v);
    return Grid::ComplexD(vec_extract(v1, 0), vec_extract(v1, 1));
  }
  //Real double Reduce
  template<>
-    inline Grid::RealD Reduce<Grid::RealD, vector4double>::operator()(vector4double in){
+  inline Grid::RealD
-    assert(0);
+  Reduce<Grid::RealD, vector4double>::operator()(vector4double v){ //4 doubles
-  }
+    vector4double v1,v2;
    v1 = Optimization::Permute::Permute0(v);
    v1 = vec_add(v1, v);
    v2 = Optimization::Permute::Permute1(v1);
    v1 = vec_add(v1, v2);
    return vec_extract(v1, 0);
  }
  //Integer Reduce
  template<>
-    inline Integer Reduce<Integer, floati>::operator()(float in){
+  inline Integer Reduce<Integer, int>::operator()(int in){
    // FIXME unimplemented
    printf("Reduce : Missing integer implementation -> FIX\n");
    assert(0);
  }
 }
-//////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
-// Here assign types 
+// Here assign types
 namespace Grid {
  typedef float SIMD_Ftype  __attribute__ ((vector_size (16)));         // Single precision type
  typedef vector4double SIMD_Dtype; // Double precision type
  typedef int SIMD_Itype;           // Integer type
-  inline void v_prefetch0(int size, const char *ptr){};
+typedef vector4double SIMD_Ftype;  // Single precision type
 typedef vector4double SIMD_Dtype; // Double precision type
 typedef int SIMD_Itype; // Integer type
-  // Function name aliases
+// prefetch utilities
-  typedef Optimization::Vsplat   VsplatSIMD;
+inline void v_prefetch0(int size, const char *ptr){};
-  typedef Optimization::Vstore   VstoreSIMD;
+inline void prefetch_HINT_T0(const char *ptr){};
  typedef Optimization::Vset     VsetSIMD;
  typedef Optimization::Vstream  VstreamSIMD;
  template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
-  // Arithmetic operations
+// Function name aliases
-  typedef Optimization::Sum         SumSIMD;
+typedef Optimization::Vsplat   VsplatSIMD;
-  typedef Optimization::Sub         SubSIMD;
+typedef Optimization::Vstore   VstoreSIMD;
-  typedef Optimization::Mult        MultSIMD;
+typedef Optimization::Vset     VsetSIMD;
-  typedef Optimization::MultComplex MultComplexSIMD;
+typedef Optimization::Vstream  VstreamSIMD;
-  typedef Optimization::Conj        ConjSIMD;
+template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
  typedef Optimization::TimesMinusI TimesMinusISIMD;
  typedef Optimization::TimesI      TimesISIMD;
 // Arithmetic operations
 typedef Optimization::Sum         SumSIMD;
 typedef Optimization::Sub         SubSIMD;
 typedef Optimization::Mult        MultSIMD;
 typedef Optimization::MultComplex MultComplexSIMD;
 typedef Optimization::Conj        ConjSIMD;
 typedef Optimization::TimesMinusI TimesMinusISIMD;
 typedef Optimization::TimesI      TimesISIMD;
 }