QPX single precision implementation

2025-12-21 13:14:29 +00:00 · 2016-09-19 18:09:12 +01:00
parent 2e74520821
commit 0724f7af75
3 changed files with 132 additions and 24 deletions
--- a/lib/Stencil.h
+++ b/lib/Stencil.h
@@ -265,7 +265,7 @@
 	 //	 _mm_prefetch((char *)&_entries[ent],_MM_HINT_T0);
       }
       inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) {
-	 _mm_prefetch((char *)&_entries[ent+1],_MM_HINT_T0);
+	 //_mm_prefetch((char *)&_entries[ent+1],_MM_HINT_T0);
 	 local = _entries[ent]._is_local;
 	 perm  = _entries[ent]._permute;
 	 if (perm)  ptype = _permute_type[point]; 
--- a/lib/simd/Grid_qpx.h
+++ b/lib/simd/Grid_qpx.h
@@ -27,20 +27,31 @@

 namespace Grid {
 namespace Optimization {
+  typedef struct // plain aggregate of four floats; typedef'd as SIMD_Ftype further down in this file
+  {
+    float v0,v1,v2,v3; // NOTE(review): no explicit alignment is requested, yet this struct's address is cast to float* and handed to vec_ld/vec_st in this file - confirm their alignment requirements are met
+  } vector4float;
+
  inline std::ostream & operator<<(std::ostream& stream, const vector4double a)
  {
    stream << "{"<<vec_extract(a,0)<<","<<vec_extract(a,1)<<","<<vec_extract(a,2)<<","<<vec_extract(a,3)<<"}";
    return stream;
  };
+
+  inline std::ostream & operator<<(std::ostream& stream, const vector4float a) // prints the four members as {v0,v1,v2,v3}
+  {
+    stream << "{"<< a.v0 <<","<< a.v1 <<","<< a.v2 <<","<< a.v3 <<"}";
+    return stream; // hand the stream back so insertions can be chained
+  };
  
  struct Vsplat{
    //Complex float
-    inline vector4double operator()(float a, float b){
-      return (vector4double){(double)a, (double)b, (double)a, (double)b};
+    inline vector4float operator()(float a, float b){
+      return (vector4float){a, b, a, b};
    }
    // Real float
-    inline vector4double operator()(float a){
-      return (vector4double){(double)a, (double)a, (double)a, (double)a};
+    inline vector4float operator()(float a){
+      return (vector4float){a, a, a, a};
    }
    //Complex double
    inline vector4double operator()(double a, double b){
@@ -61,6 +72,18 @@ namespace Optimization {
    inline void operator()(vector4double a, float *f){
      vec_st(a, 0, f);
    }
+
+    inline void operator()(vector4double a, vector4float &f){ // f is the output, hence the reference
+      vec_st(a, 0, (float *)(&f)); // store a through f's address viewed as float* (presumably narrowing each lane to single precision - confirm against the QPX vec_st documentation)
+    }
+
+    inline void operator()(vector4float a, float *f){ // copy the four members of a to f[0..3]; f must have room for four floats
+      f[0] = a.v0;
+      f[1] = a.v1;
+      f[2] = a.v2;
+      f[3] = a.v3;
+    }
+
    //Double
    inline void operator()(vector4double a, double *d){
      vec_st(a, 0, d);
@@ -76,6 +99,18 @@ namespace Optimization {
    inline void operator()(float *f, vector4double a){
      vec_st(a, 0, f);
    }
+
+    inline void operator()(vector4float &f, vector4double a){ // f is the destination and must be a reference: taken by value, the vec_st below only wrote into a local copy and the caller never saw the result
+      vec_st(a, 0, (float *)(&f)); // store a through f's address viewed as float*, as the (vector4double, vector4float &) store overload in this file does
+    }
+
+    inline void operator()(float *f, vector4float a){ // copy the four members of a to f[0..3] with plain scalar assignments
+      f[0] = a.v0;
+      f[1] = a.v1;
+      f[2] = a.v2;
+      f[3] = a.v3;
+    }
+
    //Double
    inline void operator()(double *d, vector4double a){
      vec_st(a, 0, d);
@@ -85,17 +120,23 @@ namespace Optimization {
  
  struct Vset{
    // Complex float
-    inline vector4double operator()(Grid::ComplexF *a){
-      return vec_ld(0, (float *)a);
+    inline vector4float operator()(Grid::ComplexF *a){
+      return (vector4float){a[0].real(), a[0].imag(), a[1].real(), a[1].imag()};
    }
    // Complex double
    inline vector4double operator()(Grid::ComplexD *a){
      return vec_ld(0, (double *)a);
    }
+
    // Real float
-    inline vector4double operator()(float *a){
-      return vec_ld(0, a);
+    inline vector4float operator()(float *a){
+      return (vector4float){a[0], a[1], a[2], a[3]};
    }
+
+    inline vector4double operator()(vector4float a){ // build a vector4double from a four-float struct
+      return vec_ld(0, (float *)(&a)); // loads through the address of the by-value parameter a, viewed as float*
+    }
+
    // Real double
    inline vector4double operator()(double *a){
      return vec_ld(0, a);
@@ -122,11 +163,42 @@ namespace Optimization {
  /////////////////////////////////////////////////////
  // Arithmetic operations
  /////////////////////////////////////////////////////
+  #define FLOAT_WRAP_2(fn, pref)\
+  pref vector4float fn(vector4float a, vector4float b) /* emits a two-operand vector4float overload of fn; pref supplies its specifiers (e.g. inline) */\
+  {\
+    vector4double ad, bd, rd;\
+    vector4float  r;\
+    \
+    ad = Vset()(a); /* convert both operands to vector4double */\
+    bd = Vset()(b);\
+    rd = fn(ad, bd); /* arguments are vector4double, so this calls the existing double overload of fn */\
+    Vstore()(rd, r); /* write the double result back into a vector4float */\
+    \
+    return r;\
+  }
+
+  #define FLOAT_WRAP_1(fn, pref)\
+  pref vector4float fn(vector4float a) /* emits a one-operand vector4float overload of fn; pref supplies its specifiers (e.g. static inline) */\
+  {\
+    vector4double ad, rd;\
+    vector4float  r;\
+    \
+    ad = Vset()(a); /* convert the operand to vector4double */\
+    rd = fn(ad); /* argument is vector4double, so this calls the existing double overload of fn */\
+    Vstore()(rd, r); /* write the double result back into a vector4float */\
+    \
+    return r;\
+  }
+
  struct Sum{
    //Complex/Real double
    inline vector4double operator()(vector4double a, vector4double b){
      return vec_add(a, b);
    }
+
+    //Complex/Real float
+    FLOAT_WRAP_2(operator(), inline)
+
    //Integer
    inline int operator()(int a, int b){
      return a + b;
@@ -138,6 +210,10 @@ namespace Optimization {
    inline vector4double operator()(vector4double a, vector4double b){
      return vec_sub(a, b);
    }
+
+    //Complex/Real float
+    FLOAT_WRAP_2(operator(), inline)
+
    //Integer
    inline int operator()(int a, int b){
      return a - b;
@@ -149,6 +225,9 @@ namespace Optimization {
    inline vector4double operator()(vector4double a, vector4double b){
      return vec_xxnpmadd(a, b, vec_xmul(b, a));
    }
+
+    // Complex float
+    FLOAT_WRAP_2(operator(), inline)
  };
  
  struct Mult{
@@ -156,6 +235,10 @@ namespace Optimization {
    inline vector4double operator()(vector4double a, vector4double b){
      return vec_mul(a, b);
    }
+
+    // Real float
+    FLOAT_WRAP_2(operator(), inline)
+
    // Integer
    inline int operator()(int a, int b){
      return a*b;
@@ -167,6 +250,9 @@ namespace Optimization {
    inline vector4double operator()(vector4double v){
      return vec_mul(v, (vector4double){1., -1., 1., -1.});
    }
+
+    // Complex float
+    FLOAT_WRAP_1(operator(), inline)
  };
  
  struct TimesMinusI{
@@ -175,6 +261,9 @@ namespace Optimization {
      return vec_xxcpnmadd(v, (vector4double){1., 1., 1., 1.},
                               (vector4double){0., 0., 0., 0.});
    }
+
+    // Complex float
+    FLOAT_WRAP_2(operator(), inline)
  };
  
  struct TimesI{
@@ -183,9 +272,13 @@ namespace Optimization {
      return vec_xxcpnmadd(v, (vector4double){-1., -1., -1., -1.},
                              (vector4double){0., 0., 0., 0.});
    }
+
+    // Complex float
+    FLOAT_WRAP_2(operator(), inline)
  };

  struct Permute{
+    //Complex double
    static inline vector4double Permute0(vector4double v){ //0123 -> 2301
      return vec_perm(v, v, vec_gpci(02301));
    };
@@ -198,6 +291,12 @@ namespace Optimization {
    static inline vector4double Permute3(vector4double v){
      return v;
    };
+
+    // Complex float
+    FLOAT_WRAP_1(Permute0, static inline)
+    FLOAT_WRAP_1(Permute1, static inline)
+    FLOAT_WRAP_1(Permute2, static inline)
+    FLOAT_WRAP_1(Permute3, static inline)
  };
  
  struct Rotate{
@@ -218,31 +317,42 @@ namespace Optimization {
        default: assert(0);
      }
    }
+
+    static inline vector4float rotate(vector4float v, int n){ // vector4float overload: same convert / call / convert-back pattern as the FLOAT_WRAP macros, with the extra int argument n
+      vector4double vd, rd;
+      vector4float  r;
+
+      vd = Vset()(v); // convert the operand to vector4double
+      rd = rotate(vd, n); // vd is a vector4double, so this selects the double overload rather than recursing
+      Vstore()(rd, r); // write the double result back into a vector4float
+
+      return r;
+    }
  };
  
  //Complex float Reduce
  template<>
  inline Grid::ComplexF
-  Reduce<Grid::ComplexF, vector4double>::operator()(vector4double v) { //2 complex
-    vector4double v1,v2;
+  Reduce<Grid::ComplexF, vector4float>::operator()(vector4float v) { //2 complex
+    vector4float v1; // single temporary; the previously declared v2 was never used
     
     v1 = Optimization::Permute::Permute0(v);
-    v1 = vec_add(v1, v);
+    v1 = Optimization::Sum()(v1, v); // Permute0 maps 0123 -> 2301, so members 0 and 1 now hold v0+v2 and v1+v3
     
-    return Grid::ComplexF((float)vec_extract(v1, 0), (float)vec_extract(v1, 1));
+    return Grid::ComplexF(v1.v0, v1.v1); // real and imaginary parts of the sum of the two packed complex values
  }
  //Real float Reduce
  template<>
  inline Grid::RealF
-  Reduce<Grid::RealF, vector4double>::operator()(vector4double v){ //4 floats
-    vector4double v1,v2;
+  Reduce<Grid::RealF, vector4float>::operator()(vector4float v){ //4 floats
+    vector4float v1,v2; // v1 accumulates partial sums, v2 holds the second permutation
     
     v1 = Optimization::Permute::Permute0(v);
-    v1 = vec_add(v1, v);
+    v1 = Optimization::Sum()(v1, v); // Permute0 maps 0123 -> 2301, so member 0 now holds v0+v2
     v2 = Optimization::Permute::Permute1(v1);
-    v1 = vec_add(v1, v2);
+    v1 = Optimization::Sum()(v1, v2); // second fold; Permute1's lane pattern is not visible in this hunk - presumably it brings the other pair sum into member 0, confirm
     
-    return (float)vec_extract(v1, 0);
+    return v1.v0; // member 0 is returned as the reduced value
  }
  
  
@@ -283,10 +393,9 @@ namespace Optimization {

 ////////////////////////////////////////////////////////////////////////////////
 // Here assign types
-
-typedef vector4double SIMD_Ftype;  // Single precision type
-typedef vector4double SIMD_Dtype; // Double precision type
-typedef int SIMD_Itype; // Integer type
+typedef Optimization::vector4float SIMD_Ftype;  // Single precision type
+typedef vector4double              SIMD_Dtype; // Double precision type
+typedef int                        SIMD_Itype; // Integer type

 // prefetch utilities
 inline void v_prefetch0(int size, const char *ptr){};
--- a/tests/Test_simd.cc
+++ b/tests/Test_simd.cc
@@ -157,10 +157,9 @@ void Tester(const functor &func)
  std::cout << GridLogMessage << " " << func.name() << std::endl;

  std::cout << GridLogDebug << v_input1 << std::endl;
+  std::cout << GridLogDebug << v_input2 << std::endl;
  std::cout << GridLogDebug << v_result << std::endl;

-
-
  int ok=0;
  for(int i=0;i<Nsimd;i++){
    if ( abs(reference[i]-result[i])>1.0e-7){