Merge branch 'develop' into release/v0.6.0

2026-02-16 11:50:53 +00:00 · 2016-11-04 16:08:07 +00:00
parent c067051d5f 8af8b047fd
commit f7b60004f3
41 changed files with 1661 additions and 895 deletions
--- a/lib/simd/Grid_avx.h
+++ b/lib/simd/Grid_avx.h
@@ -365,6 +365,18 @@ namespace Optimization {
    }
  };

+  struct Div {  // lane-wise quotient functor; one overload per AVX register type
+    // Packed single precision: num / den
+    inline __m256 operator()(__m256 num, __m256 den) {
+      return _mm256_div_ps(num, den);
+    }
+    // Packed double precision: num / den
+    inline __m256d operator()(__m256d num, __m256d den) {
+      return _mm256_div_pd(num, den);
+    }
+  };
+
+
  struct Conj{
    // Complex single
    inline __m256 operator()(__m256 in){
@@ -437,14 +449,13 @@ namespace Optimization {

  };

-#if defined (AVX2) || defined (AVXFMA4) 
-#define _mm256_alignr_epi32(ret,a,b,n) ret=(__m256) _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*4)%16)
-#define _mm256_alignr_epi64(ret,a,b,n) ret=(__m256d) _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*8)%16)
+#if defined (AVX2)  // ret = a:b byte-aligned by n elements; Intel documents _mm256_alignr_epi8 as acting per 128-bit lane, presumably why the byte count is taken mod 16
+#define _mm256_alignr_epi32_grid(ret,a,b,n) (ret)=(__m256)  _mm256_alignr_epi8((__m256i)(a),(__m256i)(b),((n)*4)%16)  /* args parenthesised so expressions expand safely */
+#define _mm256_alignr_epi64_grid(ret,a,b,n) (ret)=(__m256d) _mm256_alignr_epi8((__m256i)(a),(__m256i)(b),((n)*8)%16)  /* args parenthesised so expressions expand safely */
 #endif

-#if defined (AVX1) || defined (AVXFMA)
-
-#define _mm256_alignr_epi32(ret,a,b,n) {	\
+#if defined (AVX1) || defined (AVXFMA)  
+#define _mm256_alignr_epi32_grid(ret,a,b,n) {	\
    __m128 aa, bb;				\
 						\
    aa  = _mm256_extractf128_ps(a,1);		\
@@ -458,7 +469,7 @@ namespace Optimization {
    ret = _mm256_insertf128_ps(ret,aa,0);	\
  }

-#define _mm256_alignr_epi64(ret,a,b,n) {	\
+#define _mm256_alignr_epi64_grid(ret,a,b,n) {	\
    __m128d aa, bb;				\
 						\
    aa  = _mm256_extractf128_pd(a,1);		\
@@ -474,19 +485,6 @@ namespace Optimization {

 #endif

-    inline std::ostream & operator << (std::ostream& stream, const __m256 a)
-    {
-      const float *p=(const float *)&a;
-      stream<< "{"<<p[0]<<","<<p[1]<<","<<p[2]<<","<<p[3]<<","<<p[4]<<","<<p[5]<<","<<p[6]<<","<<p[7]<<"}";
-      return stream;
-    };
-    inline std::ostream & operator<< (std::ostream& stream, const __m256d a)
-    {
-      const double *p=(const double *)&a;
-      stream<< "{"<<p[0]<<","<<p[1]<<","<<p[2]<<","<<p[3]<<"}";
-      return stream;
-    };
-
  struct Rotate{

    static inline __m256 rotate(__m256 in,int n){ 
@@ -518,11 +516,10 @@ namespace Optimization {
      __m256 tmp = Permute::Permute0(in);
      __m256 ret;
      if ( n > 3 ) { 
-	_mm256_alignr_epi32(ret,in,tmp,n);  
+	_mm256_alignr_epi32_grid(ret,in,tmp,n);  
      } else {
-        _mm256_alignr_epi32(ret,tmp,in,n);          
+        _mm256_alignr_epi32_grid(ret,tmp,in,n);          
      }
-      //      std::cout << " align epi32 n=" <<n<<" in "<<tmp<<in<<" -> "<< ret <<std::endl;
      return ret;
    };

@@ -531,18 +528,15 @@ namespace Optimization {
      __m256d tmp = Permute::Permute0(in);
      __m256d ret;
      if ( n > 1 ) {
-	_mm256_alignr_epi64(ret,in,tmp,n);          
+	_mm256_alignr_epi64_grid(ret,in,tmp,n);          
      } else {
-        _mm256_alignr_epi64(ret,tmp,in,n);          
+        _mm256_alignr_epi64_grid(ret,tmp,in,n);          
      }
-      //      std::cout << " align epi64 n=" <<n<<" in "<<tmp<<in<<" -> "<< ret <<std::endl;
      return ret;
    };

  };

-
-
  //Complex float Reduce
  template<>
    inline Grid::ComplexF Reduce<Grid::ComplexF, __m256>::operator()(__m256 in){
@@ -631,6 +625,7 @@ namespace Optimization {
  // Arithmetic operations
  typedef Optimization::Sum         SumSIMD;
  typedef Optimization::Sub         SubSIMD;
+  typedef Optimization::Div         DivSIMD;
  typedef Optimization::Mult        MultSIMD;
  typedef Optimization::MultComplex MultComplexSIMD;
  typedef Optimization::Conj        ConjSIMD;
--- a/lib/simd/Grid_avx512.h
+++ b/lib/simd/Grid_avx512.h
@@ -240,6 +240,17 @@ namespace Optimization {
    }
  };

+  struct Div {  // lane-wise quotient functor; one overload per AVX-512 register type
+    // Packed single precision: num / den
+    inline __m512 operator()(__m512 num, __m512 den) {
+      return _mm512_div_ps(num, den);
+    }
+    // Packed double precision: num / den
+    inline __m512d operator()(__m512d num, __m512d den) {
+      return _mm512_div_pd(num, den);
+    }
+  };
+

  struct Conj{
    // Complex single
@@ -497,6 +508,7 @@ namespace Optimization {
  typedef Optimization::Sum         SumSIMD;
  typedef Optimization::Sub         SubSIMD;
  typedef Optimization::Mult        MultSIMD;
+  typedef Optimization::Div         DivSIMD;
  typedef Optimization::MultComplex MultComplexSIMD;
  typedef Optimization::Conj        ConjSIMD;
  typedef Optimization::TimesMinusI TimesMinusISIMD;
--- a/lib/simd/Grid_imci.h
+++ b/lib/simd/Grid_imci.h
@@ -244,6 +244,17 @@ namespace Optimization {
    }
  };

+  struct Div {  // lane-wise quotient functor; one overload per 512-bit register type
+    // Packed single precision: num / den
+    inline __m512 operator()(__m512 num, __m512 den) {
+      return _mm512_div_ps(num, den);
+    }
+    // Packed double precision: num / den
+    inline __m512d operator()(__m512d num, __m512d den) {
+      return _mm512_div_pd(num, den);
+    }
+  };
+

  struct Conj{
    // Complex single
@@ -437,6 +448,7 @@ namespace Optimization {
  // Arithmetic operations
  typedef Optimization::Sum         SumSIMD;
  typedef Optimization::Sub         SubSIMD;
+  typedef Optimization::Div         DivSIMD;
  typedef Optimization::Mult        MultSIMD;
  typedef Optimization::MultComplex MultComplexSIMD;
  typedef Optimization::Conj        ConjSIMD;
--- a/lib/simd/Grid_sse4.h
+++ b/lib/simd/Grid_sse4.h
@@ -224,6 +224,18 @@ namespace Optimization {
    }
  };

+  struct Div {  // lane-wise quotient functor; one overload per SSE register type
+    // Packed single precision: num / den
+    inline __m128 operator()(__m128 num, __m128 den) {
+      return _mm_div_ps(num, den);
+    }
+    // Packed double precision: num / den
+    inline __m128d operator()(__m128d num, __m128d den) {
+      return _mm_div_pd(num, den);
+    }
+  };
+
+
  struct Conj{
    // Complex single
    inline __m128 operator()(__m128 in){
@@ -372,6 +384,8 @@ namespace Optimization {
  }
 }

+
+
 //////////////////////////////////////////////////////////////////////////////////////
 // Here assign types 

@@ -398,6 +412,7 @@ namespace Optimization {
  // Arithmetic operations
  typedef Optimization::Sum         SumSIMD;
  typedef Optimization::Sub         SubSIMD;
+  typedef Optimization::Div         DivSIMD;
  typedef Optimization::Mult        MultSIMD;
  typedef Optimization::MultComplex MultComplexSIMD;
  typedef Optimization::Conj        ConjSIMD;
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -77,38 +77,24 @@ struct RealPart<std::complex<T> > {
 //////////////////////////////////////
 // demote a vector to real type
 //////////////////////////////////////
-
 // type alias used to simplify the syntax of std::enable_if
-template <typename T>
-using Invoke = typename T::type;
-template <typename Condition, typename ReturnType>
-using EnableIf = Invoke<std::enable_if<Condition::value, ReturnType> >;
-template <typename Condition, typename ReturnType>
-using NotEnableIf = Invoke<std::enable_if<!Condition::value, ReturnType> >;
+template <typename T> using Invoke = typename T::type;  // names a metafunction's nested ::type
+template <typename Condition, typename ReturnType> using EnableIf = typename std::enable_if<Condition::value, ReturnType>::type;  // ReturnType, available only when Condition::value is true
+template <typename Condition, typename ReturnType> using NotEnableIf = typename std::enable_if<!Condition::value, ReturnType>::type;  // ReturnType, available only when Condition::value is false

 ////////////////////////////////////////////////////////
 // Check for complexity with type traits
-template <typename T>
-struct is_complex : public std::false_type {};
-template <>
-struct is_complex<std::complex<double> > : public std::true_type {};
-template <>
-struct is_complex<std::complex<float> > : public std::true_type {};
+template <typename T> struct is_complex : std::false_type {};  // primary template: T is not a recognised complex type
+template <> struct is_complex<std::complex<float> > : std::true_type {};  // single-precision complex
+template <> struct is_complex<std::complex<double> > : std::true_type {};  // double-precision complex

-template <typename T>
-using IfReal = Invoke<std::enable_if<std::is_floating_point<T>::value, int> >;
-template <typename T>
-using IfComplex = Invoke<std::enable_if<is_complex<T>::value, int> >;
-template <typename T>
-using IfInteger = Invoke<std::enable_if<std::is_integral<T>::value, int> >;
+template <typename T> using IfReal       = typename std::enable_if<std::is_floating_point<T>::value, int>::type;  // int only when T is a floating-point type
+template <typename T> using IfComplex    = typename std::enable_if<is_complex<T>::value, int>::type;  // int only when is_complex<T> is true
+template <typename T> using IfInteger    = typename std::enable_if<std::is_integral<T>::value, int>::type;  // int only when T is an integral type

-template <typename T>
-using IfNotReal =
-    Invoke<std::enable_if<!std::is_floating_point<T>::value, int> >;
-template <typename T>
-using IfNotComplex = Invoke<std::enable_if<!is_complex<T>::value, int> >;
-template <typename T>
-using IfNotInteger = Invoke<std::enable_if<!std::is_integral<T>::value, int> >;
+template <typename T> using IfNotReal    = typename std::enable_if<!std::is_floating_point<T>::value, int>::type;  // int only when T is not a floating-point type
+template <typename T> using IfNotComplex = typename std::enable_if<!is_complex<T>::value, int>::type;  // int only when is_complex<T> is false
+template <typename T> using IfNotInteger = typename std::enable_if<!std::is_integral<T>::value, int>::type;  // int only when T is not an integral type

 ////////////////////////////////////////////////////////
 // Define the operation templates functors
@@ -285,6 +271,20 @@ class Grid_simd {
    return a * b;
  }

+  //////////////////////////////////
+  // Divides: the scalar operand is passed through vsplat() into a Grid_simd, then the vector/vector operator/ does the work
+  //////////////////////////////////
+  friend inline Grid_simd operator/(const Scalar_type &a, Grid_simd b) {
+    Grid_simd va;
+    vsplat(va, a);
+    return va / b;  // scalar / vector
+  }
+  friend inline Grid_simd operator/(Grid_simd b, const Scalar_type &a) {
+    Grid_simd va;
+    vsplat(va, a);
+    return b / va;  // vector / scalar; must divide by va: `b / a` would pick this same overload again and recurse without end
+  }
+
  ///////////////////////
  // Unary negation
  ///////////////////////
@@ -428,7 +428,6 @@ inline void rotate(Grid_simd<S,V> &ret,Grid_simd<S,V> b,int nrot)
  ret.v = Optimization::Rotate::rotate(b.v,2*nrot);
 }

-
 template <class S, class V> 
 inline void vbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
  S* typepun =(S*) &src;
@@ -512,7 +511,6 @@ template <class S, class V, IfInteger<S> = 0>
 inline void vfalse(Grid_simd<S, V> &ret) {
  vsplat(ret, 0);
 }
-
 template <class S, class V>
 inline void zeroit(Grid_simd<S, V> &z) {
  vzero(z);
@@ -530,7 +528,6 @@ inline void vstream(Grid_simd<S, V> &out, const Grid_simd<S, V> &in) {
  typedef typename S::value_type T;
  binary<void>((T *)&out.v, in.v, VstreamSIMD());
 }
-
 template <class S, class V, IfInteger<S> = 0>
 inline void vstream(Grid_simd<S, V> &out, const Grid_simd<S, V> &in) {
  out = in;
@@ -569,6 +566,34 @@ inline Grid_simd<S, V> operator*(Grid_simd<S, V> a, Grid_simd<S, V> b) {
  return ret;
 };

+// Complex quotient a/b, evaluated as (a * conj(b)) / (b * conj(b)).
+template <class S, class V, IfComplex<S> = 0>
+inline Grid_simd<S, V> operator/(Grid_simd<S, V> a, Grid_simd<S, V> b) {
+  typedef Grid_simd<S, V> simd;
+
+  // Multiplying through by conj(b) turns the denominator into b*conj(b) = |b|^2,
+  // which has no imaginary part, so a real lane-wise divide finishes the job.
+  simd ret = a * conjugate(b);
+  simd den = b * conjugate(b);
+
+  // NOTE(review): presumably toReal() yields |b|^2 in both lanes of each
+  // complex pair; if the imaginary lanes kept den's zero, the divide below
+  // would give inf/NaN there -- confirm against toReal()'s definition.
+  auto real_den = toReal(den);
+
+  ret.v = binary<V>(ret.v, real_den.v, DivSIMD());
+
+  return ret;
+}
+
+// Non-complex element types: lane-wise quotient via DivSIMD. NOTE(review): the Div functors in this change only overload float/double registers -- confirm integer use.
+template <class S, class V, IfNotComplex<S> = 0>
+inline Grid_simd<S, V> operator/(Grid_simd<S, V> a, Grid_simd<S, V> b) {
+  Grid_simd<S, V> quotient;
+  quotient.v = binary<V>(a.v, b.v, DivSIMD());
+  return quotient;
+}
+
 ///////////////////////
 // Conjugate
 ///////////////////////
@@ -582,7 +607,6 @@ template <class S, class V, IfNotComplex<S> = 0>
 inline Grid_simd<S, V> conjugate(const Grid_simd<S, V> &in) {
  return in;  // for real objects
 }
-
 // Suppress adj for integer types... // odd; why conjugate above but not adj??
 template <class S, class V, IfNotInteger<S> = 0>
 inline Grid_simd<S, V> adj(const Grid_simd<S, V> &in) {
@@ -596,14 +620,12 @@ template <class S, class V, IfComplex<S> = 0>
 inline void timesMinusI(Grid_simd<S, V> &ret, const Grid_simd<S, V> &in) {
  ret.v = binary<V>(in.v, ret.v, TimesMinusISIMD());
 }
-
 template <class S, class V, IfComplex<S> = 0>
 inline Grid_simd<S, V> timesMinusI(const Grid_simd<S, V> &in) {
  Grid_simd<S, V> ret;
  timesMinusI(ret, in);
  return ret;
 }
-
 template <class S, class V, IfNotComplex<S> = 0>
 inline Grid_simd<S, V> timesMinusI(const Grid_simd<S, V> &in) {
  return in;
@@ -616,14 +638,12 @@ template <class S, class V, IfComplex<S> = 0>
 inline void timesI(Grid_simd<S, V> &ret, const Grid_simd<S, V> &in) {
  ret.v = binary<V>(in.v, ret.v, TimesISIMD());
 }
-
 template <class S, class V, IfComplex<S> = 0>
 inline Grid_simd<S, V> timesI(const Grid_simd<S, V> &in) {
  Grid_simd<S, V> ret;
  timesI(ret, in);
  return ret;
 }
-
 template <class S, class V, IfNotComplex<S> = 0>
 inline Grid_simd<S, V> timesI(const Grid_simd<S, V> &in) {
  return in;