NAMESPACE and format

2025-09-19 01:31:04 +01:00 · 2018-01-12 18:27:22 +00:00
parent 00c49d4c17
commit d8ff895e74
1 changed files with 533 additions and 541 deletions
--- a/lib/simd/Grid_sse4.h
+++ b/lib/simd/Grid_sse4.h
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
@@ -25,8 +25,8 @@ Author: neo <cossu@post.kek.jp>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
+*************************************************************************************/
-    /*  END LEGAL */
+/*  END LEGAL */
 //----------------------------------------------------------------------
 /*! @file Grid_sse4.h
  @brief Optimization libraries for the SSE4 instruction set
@@ -38,25 +38,25 @@ Author: neo <cossu@post.kek.jp>
 #include <pmmintrin.h>
-namespace Grid {
+NAMESPACE_BEGIN(Grid);
-namespace Optimization {
+NAMESPACE_BEGIN(Optimization);
-  template<class vtype>
+template<class vtype>
-  union uconv {
+union uconv {
  __m128 f;
  vtype v;
-  };
+};
-  union u128f {
+union u128f {
  __m128 v;
  float f[4];
-  };
+};
-  union u128d {
+union u128d {
  __m128d v;
  double f[2];
-  };
+};
-  struct Vsplat{
+struct Vsplat{
  //Complex float
  inline __m128 operator()(float a, float b){
    return _mm_set_ps(b,a,b,a);
@@ -77,9 +77,9 @@ namespace Optimization {
  inline __m128i operator()(Integer a){
    return _mm_set1_epi32(a);
  }
-  };
+};
-  struct Vstore{
+struct Vstore{
  //Float 
  inline void operator()(__m128 a, float* F){
    _mm_store_ps(F,a);
@@ -93,9 +93,9 @@ namespace Optimization {
    _mm_store_si128((__m128i *)I,a);
  }
-  };
+};
-  struct Vstream{
+struct Vstream{
  //Float
  inline void operator()(float * a, __m128 b){
    _mm_stream_ps(a,b);
@@ -106,9 +106,9 @@ namespace Optimization {
  }
-  };
+};
-  struct Vset{
+struct Vset{
  // Complex float 
  inline __m128 operator()(Grid::ComplexF *a){
    return _mm_set_ps(a[1].imag(), a[1].real(),a[0].imag(),a[0].real());
@@ -131,10 +131,10 @@ namespace Optimization {
  }
-  };
+};
-  template <typename Out_type, typename In_type>
+template <typename Out_type, typename In_type>
-  struct Reduce{
+struct Reduce{
  //Need templated class to overload output type
  //General form must generate error if compiled
  inline Out_type operator()(In_type in){
@@ -142,12 +142,12 @@ namespace Optimization {
    exit(1);
    return 0;
  }
-  };
+};
-  /////////////////////////////////////////////////////
+/////////////////////////////////////////////////////
-  // Arithmetic operations
+// Arithmetic operations
-  /////////////////////////////////////////////////////
+/////////////////////////////////////////////////////
-  struct Sum{
+struct Sum{
  //Complex/Real float
  inline __m128 operator()(__m128 a, __m128 b){
    return _mm_add_ps(a,b);
@@ -160,9 +160,9 @@ namespace Optimization {
  inline __m128i operator()(__m128i a, __m128i b){
    return _mm_add_epi32(a,b);
  }
-  };
+};
-  struct Sub{
+struct Sub{
  //Complex/Real float
  inline __m128 operator()(__m128 a, __m128 b){
    return _mm_sub_ps(a,b);
@@ -175,9 +175,9 @@ namespace Optimization {
  inline __m128i operator()(__m128i a, __m128i b){
    return _mm_sub_epi32(a,b);
  }
-  };
+};
-  struct MultRealPart{
+struct MultRealPart{
  inline __m128 operator()(__m128 a, __m128 b){
    __m128 ymm0;
    ymm0  = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
@@ -188,8 +188,8 @@ namespace Optimization {
    ymm0 = _mm_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
    return _mm_mul_pd(ymm0,b);      // ymm0 <- ar bi, ar br
  }
-  };
+};
-  struct MaddRealPart{
+struct MaddRealPart{
  inline __m128 operator()(__m128 a, __m128 b, __m128 c){
    __m128 ymm0 =  _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
    return _mm_add_ps(_mm_mul_ps( ymm0, b),c);                         
@@ -198,9 +198,9 @@ namespace Optimization {
    __m128d ymm0 = _mm_shuffle_pd( a, a, 0x0 );
    return _mm_add_pd(_mm_mul_pd( ymm0, b),c);                         
  }
-  };
+};
-  struct MultComplex{
+struct MultComplex{
  // Complex float
  inline __m128 operator()(__m128 a, __m128 b){
    __m128 ymm0,ymm1,ymm2;
@@ -221,9 +221,9 @@ namespace Optimization {
    ymm1 = _mm_mul_pd(ymm1,ymm2);     // ymm1 <- br ai, ai bi
    return _mm_addsub_pd(ymm0,ymm1);  
  }
-  };
+};
-  struct Mult{
+struct Mult{
  inline void mac(__m128 &a, __m128 b, __m128 c){
    a= _mm_add_ps(_mm_mul_ps(b,c),a);
@@ -245,9 +245,9 @@ namespace Optimization {
  inline __m128i operator()(__m128i a, __m128i b){
    return _mm_mullo_epi32(a,b);
  }
-  };
+};
-  struct Div{
+struct Div{
  // Real float
  inline __m128 operator()(__m128 a, __m128 b){
    return _mm_div_ps(a,b);
@@ -256,10 +256,10 @@ namespace Optimization {
  inline __m128d operator()(__m128d a, __m128d b){
    return _mm_div_pd(a,b);
  }
-  };
+};
-  struct Conj{
+struct Conj{
  // Complex single
  inline __m128 operator()(__m128 in){
    return _mm_xor_ps(_mm_addsub_ps(_mm_setzero_ps(),in), _mm_set1_ps(-0.f));
@@ -269,9 +269,9 @@ namespace Optimization {
    return _mm_xor_pd(_mm_addsub_pd(_mm_setzero_pd(),in), _mm_set1_pd(-0.f));//untested
  }
  // do not define for integer input
-  };
+};
-  struct TimesMinusI{
+struct TimesMinusI{
  //Complex single
  inline __m128 operator()(__m128 in, __m128 ret){
    __m128 tmp =_mm_addsub_ps(_mm_setzero_ps(),in); // r,-i
@@ -284,9 +284,9 @@ namespace Optimization {
  }
-  };
+};
-  struct TimesI{
+struct TimesI{
  //Complex single
  inline __m128 operator()(__m128 in, __m128 ret){
    __m128 tmp =_mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
@@ -297,9 +297,9 @@ namespace Optimization {
    __m128d tmp = _mm_shuffle_pd(in,in,0x1);
    return _mm_addsub_pd(_mm_setzero_pd(),tmp); // r,-i
  }
-  };
+};
-  struct Permute{
+struct Permute{
  static inline __m128 Permute0(__m128 in){
    return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); //AB CD -> CD AB
@@ -326,26 +326,25 @@ namespace Optimization {
  static inline __m128d Permute3(__m128d in){
    return in;
  };
-  };
+};
 #define _my_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16)
 #define _my_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16)
 #ifdef SFW_FP16
-  struct Grid_half {
+struct Grid_half {
  Grid_half(){}
  Grid_half(uint16_t raw) : x(raw) {}
  uint16_t x;
-  };
+};
-  union FP32 {
+union FP32 {
  unsigned int u;
  float f;
-  };
+};
-  // PAB - Lifted and adapted from Eigen, which is GPL V2
+// PAB - Lifted and adapted from Eigen, which is GPL V2
-  inline float sfw_half_to_float(Grid_half h) {
+inline float sfw_half_to_float(Grid_half h) {
  const FP32 magic = { 113 << 23 };
  const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
  FP32 o;
@@ -361,8 +360,8 @@ namespace Optimization {
  }
  o.u |= (h.x & 0x8000) << 16;    // sign bit
  return o.f;
-  }
+}
-  inline Grid_half sfw_float_to_half(float ff) {
+inline Grid_half sfw_float_to_half(float ff) {
  FP32 f; f.f = ff;
  const FP32 f32infty = { 255 << 23 };
  const FP32 f16max = { (127 + 16) << 23 };
@@ -400,8 +399,8 @@ namespace Optimization {
  } 
  o.x |= static_cast<unsigned short>(sign >> 16);
  return o;
-  }
+}
-  static inline __m128i Grid_mm_cvtps_ph(__m128 f,int discard) {
+static inline __m128i Grid_mm_cvtps_ph(__m128 f,int discard) {
  __m128i ret=(__m128i)_mm_setzero_ps();
  float *fp = (float *)&f;
  Grid_half *hp = (Grid_half *)&ret;
@@ -410,8 +409,8 @@ namespace Optimization {
  hp[2] = sfw_float_to_half(fp[2]);
  hp[3] = sfw_float_to_half(fp[3]);
  return ret;
-  }
+}
-  static inline __m128 Grid_mm_cvtph_ps(__m128i h,int discard) {
+static inline __m128 Grid_mm_cvtph_ps(__m128i h,int discard) {
  __m128 ret=_mm_setzero_ps();
  float *fp = (float *)&ret;
  Grid_half  *hp = (Grid_half *)&h;
@@ -420,12 +419,12 @@ namespace Optimization {
  fp[2] = sfw_half_to_float(hp[2]);
  fp[3] = sfw_half_to_float(hp[3]);
  return ret;
-  }
+}
 #else 
 #define Grid_mm_cvtps_ph _mm_cvtps_ph
 #define Grid_mm_cvtph_ps _mm_cvtph_ps
 #endif
-  struct PrecisionChange {
+struct PrecisionChange {
  static inline __m128i StoH (__m128 a,__m128 b) {
    __m128i ha = Grid_mm_cvtps_ph(a,0);
    __m128i hb = Grid_mm_cvtps_ph(b,0);
@@ -460,9 +459,9 @@ namespace Optimization {
    StoD(sa,a,b);
    StoD(sb,c,d);
  }
-  };
+};
-  struct Exchange{
+struct Exchange{
  // 3210 ordering
  static inline void Exchange0(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
    out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
@@ -499,9 +498,9 @@ namespace Optimization {
    assert(0);
    return;
  };
-  };
+};
-  struct Rotate{
+struct Rotate{
  static inline __m128 rotate(__m128 in,int n){ 
    switch(n){
@@ -523,23 +522,23 @@ namespace Optimization {
  template<int n> static inline __m128  tRotate(__m128  in){ return (__m128)_my_alignr_epi32((__m128i)in,(__m128i)in,n); };
  template<int n> static inline __m128d tRotate(__m128d in){ return (__m128d)_my_alignr_epi64((__m128i)in,(__m128i)in,n); };
-  };
+};
-  //////////////////////////////////////////////
+//////////////////////////////////////////////
-  // Some Template specialization
+// Some Template specialization
-  //Complex float Reduce
+//Complex float Reduce
-  template<>
+template<>
-  inline Grid::ComplexF Reduce<Grid::ComplexF, __m128>::operator()(__m128 in){
+inline Grid::ComplexF Reduce<Grid::ComplexF, __m128>::operator()(__m128 in){
  __m128 v1; // two complex
  v1= Optimization::Permute::Permute0(in); 
  v1= _mm_add_ps(v1,in);
  u128f conv;    conv.v=v1;
  return Grid::ComplexF(conv.f[0],conv.f[1]);
-  }
+}
-  //Real float Reduce
+//Real float Reduce
-  template<>
+template<>
-  inline Grid::RealF Reduce<Grid::RealF, __m128>::operator()(__m128 in){
+inline Grid::RealF Reduce<Grid::RealF, __m128>::operator()(__m128 in){
  __m128 v1,v2; // quad single
  v1= Optimization::Permute::Permute0(in); 
  v1= _mm_add_ps(v1,in);
@@ -547,71 +546,64 @@ namespace Optimization {
  v1 = _mm_add_ps(v1,v2);
  u128f conv; conv.v=v1;
  return conv.f[0];
-  }
+}
-  
+//Complex double Reduce
-  //Complex double Reduce
+template<>
-  template<>
+inline Grid::ComplexD Reduce<Grid::ComplexD, __m128d>::operator()(__m128d in){
  inline Grid::ComplexD Reduce<Grid::ComplexD, __m128d>::operator()(__m128d in){
  u128d conv; conv.v = in;
  return Grid::ComplexD(conv.f[0],conv.f[1]);
-  }
+}
-  //Real double Reduce
+//Real double Reduce
-  template<>
+template<>
-  inline Grid::RealD Reduce<Grid::RealD, __m128d>::operator()(__m128d in){
+inline Grid::RealD Reduce<Grid::RealD, __m128d>::operator()(__m128d in){
  __m128d v1;
  v1 = Optimization::Permute::Permute0(in); 
  v1 = _mm_add_pd(v1,in);
  u128d conv; conv.v = v1;
  return conv.f[0];
-  }
+}
-  //Integer Reduce
+//Integer Reduce
-  template<>
+template<>
-  inline Integer Reduce<Integer, __m128i>::operator()(__m128i in){
+inline Integer Reduce<Integer, __m128i>::operator()(__m128i in){
  __m128i v1 = _mm_hadd_epi32(in, in);
  __m128i v2 = _mm_hadd_epi32(v1, v1);
  return _mm_cvtsi128_si32(v2);
  }
 }
-
+NAMESPACE_END(Optimization);
 //////////////////////////////////////////////////////////////////////////////////////
 // Here assign types 
 typedef __m128i SIMD_Htype;  // Half precision type
 typedef __m128  SIMD_Ftype;  // Single precision type
 typedef __m128d SIMD_Dtype; // Double precision type
 typedef __m128i SIMD_Itype; // Integer type
-  typedef __m128i SIMD_Htype;  // Single precision type
+// prefetch utilities
-  typedef __m128  SIMD_Ftype;  // Single precision type
+inline void v_prefetch0(int size, const char *ptr){};
-  typedef __m128d SIMD_Dtype; // Double precision type
+inline void prefetch_HINT_T0(const char *ptr){
  typedef __m128i SIMD_Itype; // Integer type
  // prefetch utilities
  inline void v_prefetch0(int size, const char *ptr){};
  inline void prefetch_HINT_T0(const char *ptr){
  _mm_prefetch(ptr,_MM_HINT_T0);
  }
  // Function name aliases
  typedef Optimization::Vsplat   VsplatSIMD;
  typedef Optimization::Vstore   VstoreSIMD;
  typedef Optimization::Vset     VsetSIMD;
  typedef Optimization::Vstream  VstreamSIMD;
  template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
  // Arithmetic operations
  typedef Optimization::Sum         SumSIMD;
  typedef Optimization::Sub         SubSIMD;
  typedef Optimization::Div         DivSIMD;
  typedef Optimization::Mult        MultSIMD;
  typedef Optimization::MultComplex MultComplexSIMD;
  typedef Optimization::MultRealPart MultRealPartSIMD;
  typedef Optimization::MaddRealPart MaddRealPartSIMD;
  typedef Optimization::Conj        ConjSIMD;
  typedef Optimization::TimesMinusI TimesMinusISIMD;
  typedef Optimization::TimesI      TimesISIMD;
 }
 // Function name aliases
 typedef Optimization::Vsplat   VsplatSIMD;
 typedef Optimization::Vstore   VstoreSIMD;
 typedef Optimization::Vset     VsetSIMD;
 typedef Optimization::Vstream  VstreamSIMD;
 template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
 // Arithmetic operations
 typedef Optimization::Sum         SumSIMD;
 typedef Optimization::Sub         SubSIMD;
 typedef Optimization::Div         DivSIMD;
 typedef Optimization::Mult        MultSIMD;
 typedef Optimization::MultComplex MultComplexSIMD;
 typedef Optimization::MultRealPart MultRealPartSIMD;
 typedef Optimization::MaddRealPart MaddRealPartSIMD;
 typedef Optimization::Conj        ConjSIMD;
 typedef Optimization::TimesMinusI TimesMinusISIMD;
 typedef Optimization::TimesI      TimesISIMD;
 NAMESPACE_END(Grid);