NAMESPACE and format

2025-07-12 19:27:05 +01:00 · 2018-01-12 18:27:22 +00:00
parent 00c49d4c17
commit d8ff895e74
1 changed files with 533 additions and 541 deletions
--- a/lib/simd/Grid_sse4.h
+++ b/lib/simd/Grid_sse4.h
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@ -25,8 +25,8 @@ Author: neo <cossu@post.kek.jp>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 //----------------------------------------------------------------------
 /*! @file Grid_sse4.h
  @brief Optimization libraries for SSE4 instructions set
@ -38,25 +38,25 @@ Author: neo <cossu@post.kek.jp>

 #include <pmmintrin.h>

-namespace Grid {
-namespace Optimization {
+NAMESPACE_BEGIN(Grid);
+NAMESPACE_BEGIN(Optimization);

-  template<class vtype>
-  union uconv {
+template<class vtype>
+union uconv {
  __m128 f;
  vtype v;
-  };
+};

-  union u128f {
+union u128f {
  __m128 v;
  float f[4];
-  };
-  union u128d {
+};
+union u128d {
  __m128d v;
  double f[2];
-  };
+};
  
-  struct Vsplat{
+struct Vsplat{
  //Complex float
  inline __m128 operator()(float a, float b){
    return _mm_set_ps(b,a,b,a);
@ -77,9 +77,9 @@ namespace Optimization {
  inline __m128i operator()(Integer a){
    return _mm_set1_epi32(a);
  }
-  };
+};

-  struct Vstore{
+struct Vstore{
  //Float 
  inline void operator()(__m128 a, float* F){
    _mm_store_ps(F,a);
@ -93,9 +93,9 @@ namespace Optimization {
    _mm_store_si128((__m128i *)I,a);
  }

-  };
+};

-  struct Vstream{
+struct Vstream{
  //Float
  inline void operator()(float * a, __m128 b){
    _mm_stream_ps(a,b);
@ -106,9 +106,9 @@ namespace Optimization {
  }


-  };
+};

-  struct Vset{
+struct Vset{
  // Complex float 
  inline __m128 operator()(Grid::ComplexF *a){
    return _mm_set_ps(a[1].imag(), a[1].real(),a[0].imag(),a[0].real());
@ -131,10 +131,10 @@ namespace Optimization {
  }


-  };
+};

-  template <typename Out_type, typename In_type>
-  struct Reduce{
+template <typename Out_type, typename In_type>
+struct Reduce{
  //Need templated class to overload output type
  //General form must generate error if compiled
  inline Out_type operator()(In_type in){
@ -142,12 +142,12 @@ namespace Optimization {
    exit(1);
    return 0;
  }
-  };
+};

-  /////////////////////////////////////////////////////
-  // Arithmetic operations
-  /////////////////////////////////////////////////////
-  struct Sum{
+/////////////////////////////////////////////////////
+// Arithmetic operations
+/////////////////////////////////////////////////////
+struct Sum{
  //Complex/Real float
  inline __m128 operator()(__m128 a, __m128 b){
    return _mm_add_ps(a,b);
@ -160,9 +160,9 @@ namespace Optimization {
  inline __m128i operator()(__m128i a, __m128i b){
    return _mm_add_epi32(a,b);
  }
-  };
+};

-  struct Sub{
+struct Sub{
  //Complex/Real float
  inline __m128 operator()(__m128 a, __m128 b){
    return _mm_sub_ps(a,b);
@ -175,9 +175,9 @@ namespace Optimization {
  inline __m128i operator()(__m128i a, __m128i b){
    return _mm_sub_epi32(a,b);
  }
-  };
+};

-  struct MultRealPart{
+struct MultRealPart{
  inline __m128 operator()(__m128 a, __m128 b){
    __m128 ymm0;
    ymm0  = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
@ -188,8 +188,8 @@ namespace Optimization {
    ymm0 = _mm_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
    return _mm_mul_pd(ymm0,b);      // ymm0 <- ar bi, ar br
  }
-  };
-  struct MaddRealPart{
+};
+struct MaddRealPart{
  inline __m128 operator()(__m128 a, __m128 b, __m128 c){
    __m128 ymm0 =  _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
    return _mm_add_ps(_mm_mul_ps( ymm0, b),c);                         
@ -198,9 +198,9 @@ namespace Optimization {
    __m128d ymm0 = _mm_shuffle_pd( a, a, 0x0 );
    return _mm_add_pd(_mm_mul_pd( ymm0, b),c);                         
  }
-  };
+};

-  struct MultComplex{
+struct MultComplex{
  // Complex float
  inline __m128 operator()(__m128 a, __m128 b){
    __m128 ymm0,ymm1,ymm2;
@ -221,9 +221,9 @@ namespace Optimization {
    ymm1 = _mm_mul_pd(ymm1,ymm2);     // ymm1 <- br ai, ai bi
    return _mm_addsub_pd(ymm0,ymm1);  
  }
-  };
+};

-  struct Mult{
+struct Mult{

  inline void mac(__m128 &a, __m128 b, __m128 c){
    a= _mm_add_ps(_mm_mul_ps(b,c),a);
@ -245,9 +245,9 @@ namespace Optimization {
  inline __m128i operator()(__m128i a, __m128i b){
    return _mm_mullo_epi32(a,b);
  }
-  };
+};

-  struct Div{
+struct Div{
  // Real float
  inline __m128 operator()(__m128 a, __m128 b){
    return _mm_div_ps(a,b);
@ -256,10 +256,10 @@ namespace Optimization {
  inline __m128d operator()(__m128d a, __m128d b){
    return _mm_div_pd(a,b);
  }
-  };
+};


-  struct Conj{
+struct Conj{
  // Complex single
  inline __m128 operator()(__m128 in){
    return _mm_xor_ps(_mm_addsub_ps(_mm_setzero_ps(),in), _mm_set1_ps(-0.f));
@ -269,9 +269,9 @@ namespace Optimization {
    return _mm_xor_pd(_mm_addsub_pd(_mm_setzero_pd(),in), _mm_set1_pd(-0.f));//untested
  }
  // do not define for integer input
-  };
+};

-  struct TimesMinusI{
+struct TimesMinusI{
  //Complex single
  inline __m128 operator()(__m128 in, __m128 ret){
    __m128 tmp =_mm_addsub_ps(_mm_setzero_ps(),in); // r,-i
@ -284,9 +284,9 @@ namespace Optimization {
  }


-  };
+};

-  struct TimesI{
+struct TimesI{
  //Complex single
  inline __m128 operator()(__m128 in, __m128 ret){
    __m128 tmp =_mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
@ -297,9 +297,9 @@ namespace Optimization {
    __m128d tmp = _mm_shuffle_pd(in,in,0x1);
    return _mm_addsub_pd(_mm_setzero_pd(),tmp); // r,-i
  }
-  };
+};

-  struct Permute{
+struct Permute{

  static inline __m128 Permute0(__m128 in){
    return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); //AB CD -> CD AB
@ -326,26 +326,25 @@ namespace Optimization {
  static inline __m128d Permute3(__m128d in){
    return in;
  };
-  };
-
+};
  
 #define _my_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16)
 #define _my_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16)

 #ifdef SFW_FP16

-  struct Grid_half {
+struct Grid_half {
  Grid_half(){}
  Grid_half(uint16_t raw) : x(raw) {}
  uint16_t x;
-  };
-  union FP32 {
+};
+union FP32 {
  unsigned int u;
  float f;
-  };
+};

-  // PAB - Lifted and adapted from Eigen, which is GPL V2
-  inline float sfw_half_to_float(Grid_half h) {
+// PAB - Lifted and adapted from Eigen, which is GPL V2
+inline float sfw_half_to_float(Grid_half h) {
  const FP32 magic = { 113 << 23 };
  const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
  FP32 o;
@ -361,8 +360,8 @@ namespace Optimization {
  }
  o.u |= (h.x & 0x8000) << 16;    // sign bit
  return o.f;
-  }
-  inline Grid_half sfw_float_to_half(float ff) {
+}
+inline Grid_half sfw_float_to_half(float ff) {
  FP32 f; f.f = ff;
  const FP32 f32infty = { 255 << 23 };
  const FP32 f16max = { (127 + 16) << 23 };
@ -400,8 +399,8 @@ namespace Optimization {
  } 
  o.x |= static_cast<unsigned short>(sign >> 16);
  return o;
-  }
-  static inline __m128i Grid_mm_cvtps_ph(__m128 f,int discard) {
+}
+static inline __m128i Grid_mm_cvtps_ph(__m128 f,int discard) {
  __m128i ret=(__m128i)_mm_setzero_ps();
  float *fp = (float *)&f;
  Grid_half *hp = (Grid_half *)&ret;
@ -410,8 +409,8 @@ namespace Optimization {
  hp[2] = sfw_float_to_half(fp[2]);
  hp[3] = sfw_float_to_half(fp[3]);
  return ret;
-  }
-  static inline __m128 Grid_mm_cvtph_ps(__m128i h,int discard) {
+}
+static inline __m128 Grid_mm_cvtph_ps(__m128i h,int discard) {
  __m128 ret=_mm_setzero_ps();
  float *fp = (float *)&ret;
  Grid_half  *hp = (Grid_half *)&h;
@ -420,12 +419,12 @@ namespace Optimization {
  fp[2] = sfw_half_to_float(hp[2]);
  fp[3] = sfw_half_to_float(hp[3]);
  return ret;
-  }
+}
 #else 
 #define Grid_mm_cvtps_ph _mm_cvtps_ph
 #define Grid_mm_cvtph_ps _mm_cvtph_ps
 #endif
-  struct PrecisionChange {
+struct PrecisionChange {
  static inline __m128i StoH (__m128 a,__m128 b) {
    __m128i ha = Grid_mm_cvtps_ph(a,0);
    __m128i hb = Grid_mm_cvtps_ph(b,0);
@ -460,9 +459,9 @@ namespace Optimization {
    StoD(sa,a,b);
    StoD(sb,c,d);
  }
-  };
+};

-  struct Exchange{
+struct Exchange{
  // 3210 ordering
  static inline void Exchange0(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
    out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
@ -499,9 +498,9 @@ namespace Optimization {
    assert(0);
    return;
  };
-  };
+};

-  struct Rotate{
+struct Rotate{

  static inline __m128 rotate(__m128 in,int n){ 
    switch(n){
@ -523,23 +522,23 @@ namespace Optimization {
  template<int n> static inline __m128  tRotate(__m128  in){ return (__m128)_my_alignr_epi32((__m128i)in,(__m128i)in,n); };
  template<int n> static inline __m128d tRotate(__m128d in){ return (__m128d)_my_alignr_epi64((__m128i)in,(__m128i)in,n); };

-  };
-  //////////////////////////////////////////////
-  // Some Template specialization
+};
+//////////////////////////////////////////////
+// Some Template specialization


-  //Complex float Reduce
-  template<>
-  inline Grid::ComplexF Reduce<Grid::ComplexF, __m128>::operator()(__m128 in){
+//Complex float Reduce
+template<>
+inline Grid::ComplexF Reduce<Grid::ComplexF, __m128>::operator()(__m128 in){
  __m128 v1; // two complex
  v1= Optimization::Permute::Permute0(in); 
  v1= _mm_add_ps(v1,in);
  u128f conv;    conv.v=v1;
  return Grid::ComplexF(conv.f[0],conv.f[1]);
-  }
-  //Real float Reduce
-  template<>
-  inline Grid::RealF Reduce<Grid::RealF, __m128>::operator()(__m128 in){
+}
+//Real float Reduce
+template<>
+inline Grid::RealF Reduce<Grid::RealF, __m128>::operator()(__m128 in){
  __m128 v1,v2; // quad single
  v1= Optimization::Permute::Permute0(in); 
  v1= _mm_add_ps(v1,in);
@ -547,71 +546,64 @@ namespace Optimization {
  v1 = _mm_add_ps(v1,v2);
  u128f conv; conv.v=v1;
  return conv.f[0];
-  }
+}
  
-  
-  //Complex double Reduce
-  template<>
-  inline Grid::ComplexD Reduce<Grid::ComplexD, __m128d>::operator()(__m128d in){
+//Complex double Reduce
+template<>
+inline Grid::ComplexD Reduce<Grid::ComplexD, __m128d>::operator()(__m128d in){
  u128d conv; conv.v = in;
  return Grid::ComplexD(conv.f[0],conv.f[1]);
-  }
+}
  
-  //Real double Reduce
-  template<>
-  inline Grid::RealD Reduce<Grid::RealD, __m128d>::operator()(__m128d in){
+//Real double Reduce
+template<>
+inline Grid::RealD Reduce<Grid::RealD, __m128d>::operator()(__m128d in){
  __m128d v1;
  v1 = Optimization::Permute::Permute0(in); 
  v1 = _mm_add_pd(v1,in);
  u128d conv; conv.v = v1;
  return conv.f[0];
-  }
+}

-  //Integer Reduce
-  template<>
-  inline Integer Reduce<Integer, __m128i>::operator()(__m128i in){
+//Integer Reduce
+template<>
+inline Integer Reduce<Integer, __m128i>::operator()(__m128i in){
  __m128i v1 = _mm_hadd_epi32(in, in);
  __m128i v2 = _mm_hadd_epi32(v1, v1);
  return _mm_cvtsi128_si32(v2);
-  }
 }
-
-
+NAMESPACE_END(Optimization);

 //////////////////////////////////////////////////////////////////////////////////////
 // Here assign types 
+typedef __m128i SIMD_Htype;  // Half precision type (16-bit values carried in an integer vector)
+typedef __m128  SIMD_Ftype;  // Single precision type
+typedef __m128d SIMD_Dtype; // Double precision type
+typedef __m128i SIMD_Itype; // Integer type

-  typedef __m128i SIMD_Htype;  // Single precision type
-  typedef __m128  SIMD_Ftype;  // Single precision type
-  typedef __m128d SIMD_Dtype; // Double precision type
-  typedef __m128i SIMD_Itype; // Integer type
-
-  // prefetch utilities
-  inline void v_prefetch0(int size, const char *ptr){};
-  inline void prefetch_HINT_T0(const char *ptr){
+// prefetch utilities
+inline void v_prefetch0(int size, const char *ptr){};
+inline void prefetch_HINT_T0(const char *ptr){
  _mm_prefetch(ptr,_MM_HINT_T0);
-  }
-
-  // Function name aliases
-  typedef Optimization::Vsplat   VsplatSIMD;
-  typedef Optimization::Vstore   VstoreSIMD;
-  typedef Optimization::Vset     VsetSIMD;
-  typedef Optimization::Vstream  VstreamSIMD;
-  template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
-
- 
-
-
-  // Arithmetic operations
-  typedef Optimization::Sum         SumSIMD;
-  typedef Optimization::Sub         SubSIMD;
-  typedef Optimization::Div         DivSIMD;
-  typedef Optimization::Mult        MultSIMD;
-  typedef Optimization::MultComplex MultComplexSIMD;
-  typedef Optimization::MultRealPart MultRealPartSIMD;
-  typedef Optimization::MaddRealPart MaddRealPartSIMD;
-  typedef Optimization::Conj        ConjSIMD;
-  typedef Optimization::TimesMinusI TimesMinusISIMD;
-  typedef Optimization::TimesI      TimesISIMD;
-
 }
+
+// Function name aliases
+typedef Optimization::Vsplat   VsplatSIMD;
+typedef Optimization::Vstore   VstoreSIMD;
+typedef Optimization::Vset     VsetSIMD;
+typedef Optimization::Vstream  VstreamSIMD;
+template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
+
+// Arithmetic operations
+typedef Optimization::Sum         SumSIMD;
+typedef Optimization::Sub         SubSIMD;
+typedef Optimization::Div         DivSIMD;
+typedef Optimization::Mult        MultSIMD;
+typedef Optimization::MultComplex MultComplexSIMD;
+typedef Optimization::MultRealPart MultRealPartSIMD;
+typedef Optimization::MaddRealPart MaddRealPartSIMD;
+typedef Optimization::Conj        ConjSIMD;
+typedef Optimization::TimesMinusI TimesMinusISIMD;
+typedef Optimization::TimesI      TimesISIMD;
+
+NAMESPACE_END(Grid);