first commit for QPX intrinsics

2025-11-04 05:54:32 +00:00 · 2016-08-23 14:41:44 +01:00
parent 88be3b39bb
commit 4d11a6f5f2
2 changed files with 198 additions and 183 deletions
--- a/configure.ac
+++ b/configure.ac
@@ -125,11 +125,14 @@ case ${ax_cv_cxx_compiler_vendor} in
        AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
        SIMD_FLAGS='-mavx512f -mavx512pf -mavx512er -mavx512cd';;
      IMCI|KNC)
-        AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner])
+        AC_DEFINE([IMCI],[1],[IMCI intrinsics for Knights Corner])
        SIMD_FLAGS='';;
      GEN)
        AC_DEFINE([GENERIC_VEC],[1],[generic vector code])
        SIMD_FLAGS='';;
+      QPX|BGQ)
+        AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q])
+        SIMD_FLAGS='';;
      *)
        AC_MSG_ERROR(["SIMD option ${ac_SIMD} not supported by the GCC/Clang compiler"]);;
    esac;;
--- a/lib/simd/Grid_qpx.h
+++ b/lib/simd/Grid_qpx.h
@@ -1,300 +1,312 @@
-    /*************************************************************************************
+/*******************************************************************************
 
-    Grid physics library, www.github.com/paboyle/Grid 
+ Grid physics library, www.github.com/paboyle/Grid
 
-    Source file: ./lib/simd/Grid_qpx.h
+ Source file: ./lib/simd/Grid_qpx.h
 
-    Copyright (C) 2015
+ Copyright (C) 2016
 
-Author: neo <cossu@post.kek.jp>
+ Author: Antonin Portelli <antonin.portelli@me.com>
 
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
 
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
 
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-//----------------------------------------------------------------------
-/*! @file Grid_qpx.h
-  @brief Optimization libraries for QPX instructions set for BG/Q
-
-  Using intrinsics
-*/
-// Time-stamp: <2015-05-27 11:30:21 neo>
-//----------------------------------------------------------------------
-
-// lot of undefined functions
+ See the full license in the file "LICENSE" in the top level distribution directory
+ ******************************************************************************/

+namespace Grid {
 namespace Optimization {
+  // Write the four lanes of a to stream as {x0,x1,x2,x3}; returns stream.
+  inline std::ostream & operator<<(std::ostream& stream, const vector4double a)
+  {
+    return stream << "{" << vec_extract(a, 0) << "," << vec_extract(a, 1) << "," << vec_extract(a, 2) << "," << vec_extract(a, 3) << "}";
+  }
  
  struct Vsplat{
    //Complex float
-    inline float operator()(float a, float b){
-      return {a,b,a,b};
+    inline vector4double operator()(float a, float b){
+      return (vector4double){(double)a, (double)b, (double)a, (double)b};
    }
    // Real float
-    inline float operator()(float a){
-      return {a,a,a,a};
+    inline vector4double operator()(float a){
+      return (vector4double){(double)a, (double)a, (double)a, (double)a};
    }
    //Complex double
    inline vector4double operator()(double a, double b){
-      return {a,b,a,b};
+      return (vector4double){a, b, a, b};
    }
    //Real double
    inline vector4double operator()(double a){
-      return {a,a,a,a};
+      return (vector4double){a, a, a, a};
    }
    //Integer
    inline int operator()(Integer a){
-#error
+      return a;
    }
  };
  
  struct Vstore{
    //Float
-    inline void operator()(float a, float* F){
-      assert(0);
+    inline void operator()(vector4double a, float *f){
+      vec_st(a, 0, f);
    }
    //Double
-    inline void operator()(vector4double a, double* D){
-      assert(0);
+    inline void operator()(vector4double a, double *d){
+      vec_st(a, 0, d);
    }
    //Integer
-    inline void operator()(int a, Integer* I){
-      assert(0);
+    inline void operator()(int a, Integer *i){
+      i[0] = a;
    }
-
  };
  
-
  struct Vstream{
    //Float
-    inline void operator()(float * a, float b){
-      assert(0);
+    inline void operator()(float *f, vector4double a){
+      vec_st(a, 0, f);
    }
    //Double
-    inline void operator()(double * a, vector4double b){
-      assert(0);
+    inline void operator()(double *d, vector4double a){
+      vec_st(a, 0, d);
    }

-
  };
  
-
-
  struct Vset{
    // Complex float
-    inline float operator()(Grid::ComplexF *a){
-      return {a[0].real(),a[0].imag(),a[1].real(),a[1].imag(),a[2].real(),a[2].imag(),a[3].real(),a[3].imag()};
+    inline vector4double operator()(Grid::ComplexF *a){
+      return vec_ld(0, (float *)a);
    }
    // Complex double
    inline vector4double operator()(Grid::ComplexD *a){
-      return {a[0].real(),a[0].imag(),a[1].real(),a[1].imag(),a[2].real(),a[2].imag(),a[3].real(),a[3].imag()};
+      return vec_ld(0, (double *)a);
    }
    // Real float
-    inline float operator()(float *a){
-      return {a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7]};
+    inline vector4double operator()(float *a){
+      return vec_ld(0, a);
    }
    // Real double
    inline vector4double operator()(double *a){
-      return {a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7]};
+      return vec_ld(0, a);
    }
    // Integer
    inline int operator()(Integer *a){
-#error
+      return a[0];
    }
    
    
  };
  
  template <typename Out_type, typename In_type>
-    struct Reduce{
-      //Need templated class to overload output type
-      //General form must generate error if compiled
-      inline Out_type operator()(In_type in){
-	printf("Error, using wrong Reduce function\n");
-	exit(1);
-	return 0;
-      }
-    };
-
-
- 
+  struct Reduce{
+    //Need templated class to overload output type
+    //General form must generate error if compiled
+    inline Out_type operator()(In_type in){
+      printf("Error, using wrong Reduce function\n");
+      exit(1);
+      return 0;
+    }
+  };
  
  /////////////////////////////////////////////////////
  // Arithmetic operations
  /////////////////////////////////////////////////////
  struct Sum{
-    //Complex/Real float
-    inline float operator()(float a, float b){
-#error
-    }
    //Complex/Real double
    inline vector4double operator()(vector4double a, vector4double b){
-      return vec_add(a,b);
+      return vec_add(a, b);
    }
    //Integer
    inline int operator()(int a, int b){
-#error
+      return a + b;
    }
  };
  
  struct Sub{
-    //Complex/Real float
-    inline float operator()(float a, float b){
-#error
-    }
    //Complex/Real double
    inline vector4double operator()(vector4double a, vector4double b){
-#error
+      return vec_sub(a, b);
    }
    //Integer
-    inline floati operator()(int a, int b){
-#error
+    inline int operator()(int a, int b){
+      return a - b;
    }
  };
  
-
  struct MultComplex{
-    // Complex float
-    inline float operator()(float a, float b){
-#error
-    }
    // Complex double
    inline vector4double operator()(vector4double a, vector4double b){
-#error
+      return vec_xxnpmadd(a, b, vec_xmul(b, a));
    }
  };
  
  struct Mult{
-    // Real float
-    inline float operator()(float a, float b){
-#error
-    }
    // Real double
    inline vector4double operator()(vector4double a, vector4double b){
-#error
+      return vec_mul(a, b);
    }
    // Integer
    inline int operator()(int a, int b){
-#error
+      return a*b;
    }
  };
  
-
  struct Conj{
-    // Complex single
-    inline float operator()(float in){
-      assert(0);
-    }
    // Complex double
-    inline vector4double operator()(vector4double in){
-      assert(0);
+    inline vector4double operator()(vector4double v){
+      return vec_mul(v, (vector4double){1., -1., 1., -1.});
    }
-    // do not define for integer input
  };
  
  struct TimesMinusI{
-    //Complex single
-    inline float operator()(float in, float ret){
-      assert(0);
-    }
    //Complex double
-    inline vector4double operator()(vector4double in, vector4double ret){
-      assert(0);
+    inline vector4double operator()(vector4double v, vector4double ret){
+      return vec_xxcpnmadd(v, (vector4double){1., 1., 1., 1.},
+                               (vector4double){0., 0., 0., 0.});
    }
-
-
  };
  
  struct TimesI{
-    //Complex single
-    inline float operator()(float in, float ret){
-  
-    }
    //Complex double
-    inline vector4double operator()(vector4double in, vector4double ret){
-  
+    inline vector4double operator()(vector4double v, vector4double ret){
+      return vec_xxcpnmadd(v, (vector4double){-1., -1., -1., -1.},
+                              (vector4double){0., 0., 0., 0.});
    }
-
-
  };

+  struct Permute{
+    // Exchange the two halves of the vector: lanes 0123 -> 2301.
+    static inline vector4double Permute0(vector4double v){
+      return vec_perm(v, v, vec_gpci(02301));
+    }
+    // Exchange neighbouring lanes pairwise: lanes 0123 -> 1032.
+    static inline vector4double Permute1(vector4double v){
+      return vec_perm(v, v, vec_gpci(01032));
+    }
+    // Permute2 and Permute3 perform no lane shuffling: both hand the
+    // input vector back unchanged.
+    static inline vector4double Permute2(vector4double v){ return v; }
+    static inline vector4double Permute3(vector4double v){ return v; }
+  };
  
-  
-
-
-  //////////////////////////////////////////////
-  // Some Template specialization
+  struct Rotate{
+    // Cyclically rotate the four lanes of v by n positions, n in 0..3:
+    // result lane i is taken from source lane (i + n) mod 4.
+    static inline vector4double rotate(vector4double v, int n){
+      switch(n){
+        case 0:
+          return v;
+        case 1: // 0123 -> 1230
+          return vec_perm(v, v, vec_gpci(01230));
+        case 2: // 0123 -> 2301
+          return vec_perm(v, v, vec_gpci(02301));
+        case 3: // 0123 -> 3012
+          return vec_perm(v, v, vec_gpci(03012));
+        default:
+          assert(0); // n outside 0..3 is a caller bug
+          return v;  // keeps the return defined when NDEBUG strips the assert
+      }
+    }
+  };
  
  //Complex float Reduce
  template<>
-    inline Grid::ComplexF Reduce<Grid::ComplexF, float>::operator()(float in){
-    assert(0);
+  inline Grid::ComplexF
+  Reduce<Grid::ComplexF, vector4double>::operator()(vector4double v) { //2 complex
+    vector4double v1; // holds v plus its half-swapped copy
+    
+    v1 = Optimization::Permute::Permute0(v); // lanes 0123 -> 2301
+    v1 = vec_add(v1, v);                     // lane 0 = v0+v2, lane 1 = v1+v3
+    
+    return Grid::ComplexF((float)vec_extract(v1, 0), (float)vec_extract(v1, 1));
  }
  //Real float Reduce
  template<>
-    inline Grid::RealF Reduce<Grid::RealF, float>::operator()(float in){
-    assert(0);
+  inline Grid::RealF
+  Reduce<Grid::RealF, vector4double>::operator()(vector4double v){ //4 floats
+    vector4double v1,v2;
+    
+    v1 = Optimization::Permute::Permute0(v);
+    v1 = vec_add(v1, v);
+    v2 = Optimization::Permute::Permute1(v1);
+    v1 = vec_add(v1, v2);
+    
+    return (float)vec_extract(v1, 0);
  }
  
  
  //Complex double Reduce
  template<>
-    inline Grid::ComplexD Reduce<Grid::ComplexD, vector4double>::operator()(vector4double in){
-    assert(0);
+  inline Grid::ComplexD
+  Reduce<Grid::ComplexD, vector4double>::operator()(vector4double v){ //2 complex
+    vector4double v1;
+    
+    v1 = Optimization::Permute::Permute0(v);
+    v1 = vec_add(v1, v);
+    
+    return Grid::ComplexD(vec_extract(v1, 0), vec_extract(v1, 1));
  }
  
  //Real double Reduce
  template<>
-    inline Grid::RealD Reduce<Grid::RealD, vector4double>::operator()(vector4double in){
-    assert(0);
+  inline Grid::RealD
+  Reduce<Grid::RealD, vector4double>::operator()(vector4double v){ //4 doubles
+    vector4double v1,v2;
+    
+    v1 = Optimization::Permute::Permute0(v);
+    v1 = vec_add(v1, v);
+    v2 = Optimization::Permute::Permute1(v1);
+    v1 = vec_add(v1, v2);
+
+    return vec_extract(v1, 0);
  }
  
  //Integer Reduce
  template<>
-    inline Integer Reduce<Integer, floati>::operator()(float in){
+  inline Integer Reduce<Integer, int>::operator()(int in){
+    // FIXME unimplemented
+    printf("Reduce : Missing integer implementation -> FIX\n");
    assert(0);
  }
-  
-  
 }

-//////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 // Here assign types
-namespace Grid {
-  typedef float SIMD_Ftype  __attribute__ ((vector_size (16)));         // Single precision type
-  typedef vector4double SIMD_Dtype; // Double precision type
-  typedef int SIMD_Itype;           // Integer type

-  inline void v_prefetch0(int size, const char *ptr){};
+typedef vector4double SIMD_Ftype;  // Single precision type
+typedef vector4double SIMD_Dtype; // Double precision type
+typedef int SIMD_Itype; // Integer type

-  // Function name aliases
-  typedef Optimization::Vsplat   VsplatSIMD;
-  typedef Optimization::Vstore   VstoreSIMD;
-  typedef Optimization::Vset     VsetSIMD;
-  typedef Optimization::Vstream  VstreamSIMD;
-  template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
+// prefetch utilities
+inline void v_prefetch0(int size, const char *ptr){};
+inline void prefetch_HINT_T0(const char *ptr){};


-  // Arithmetic operations
-  typedef Optimization::Sum         SumSIMD;
-  typedef Optimization::Sub         SubSIMD;
-  typedef Optimization::Mult        MultSIMD;
-  typedef Optimization::MultComplex MultComplexSIMD;
-  typedef Optimization::Conj        ConjSIMD;
-  typedef Optimization::TimesMinusI TimesMinusISIMD;
-  typedef Optimization::TimesI      TimesISIMD;
+// Function name aliases
+typedef Optimization::Vsplat   VsplatSIMD;
+typedef Optimization::Vstore   VstoreSIMD;
+typedef Optimization::Vset     VsetSIMD;
+typedef Optimization::Vstream  VstreamSIMD;
+template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
+
+// Arithmetic operations
+typedef Optimization::Sum         SumSIMD;
+typedef Optimization::Sub         SubSIMD;
+typedef Optimization::Mult        MultSIMD;
+typedef Optimization::MultComplex MultComplexSIMD;
+typedef Optimization::Conj        ConjSIMD;
+typedef Optimization::TimesMinusI TimesMinusISIMD;
+typedef Optimization::TimesI      TimesISIMD;
  
 }