Merge branch 'develop' into feature/feynman-rules

# Conflicts: # lib/Threads.h # lib/qcd/action/fermion/WilsonFermion.cc # lib/qcd/action/fermion/WilsonFermion.h # lib/qcd/utils/SUn.h # lib/simd/Grid_avx.h # lib/simd/Intel512common.h
2026-01-05 17:39:34 +00:00 · 2016-10-19 18:35:18 +01:00
parent a123dcd7e9 7af9b87318
commit 997fd882ff
84 changed files with 6162 additions and 2851 deletions
--- a/lib/simd/Grid_avx512.h
+++ b/lib/simd/Grid_avx512.h
@@ -370,7 +370,67 @@ namespace Optimization {

  //////////////////////////////////////////////
  // Some Template specialization
+
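+  // Hack for CLANG until _mm512_reduce_add_ps etc. are implemented in GCC and Clang releases.
+  // The permute-based fallback below is compiled only when GNU_CLANG_COMPILER is defined;
+  // the #undef on the next line currently disables it.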
+#undef GNU_CLANG_COMPILER 
+#ifdef GNU_CLANG_COMPILER
+  // Complex float Reduce.
+  // Fallback for compilers lacking the _mm512_reduce_* intrinsics: folds the
+  // 512-bit register onto itself with three permute-and-add rounds, then
+  // returns floats 0 and 1 of the result as (real, imag).
+  // NOTE(review): assumes Permute0..Permute2 swap successively smaller groups
+  // of lanes so that the leading complex element ends up holding the full sum
+  // -- confirm against Optimization::Permute.
+  template<>
+    inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
+    // Round 1: add the input to its Permute0 image.
+    __m512 sum = _mm512_add_ps(Optimization::Permute::Permute0(in), in);
+    // Rounds 2 and 3: keep folding the running sum onto its permuted copy.
+    sum = _mm512_add_ps(sum, Optimization::Permute::Permute1(sum));
+    sum = _mm512_add_ps(sum, Optimization::Permute::Permute2(sum));
+    // Read the scalar lanes back out through the u512f helper.
+    u512f lanes;
+    lanes.v = sum;
+    return Grid::ComplexF(lanes.f[0], lanes.f[1]);
+  }
  
+  // Real float Reduce.
+  // Fallback for compilers lacking the _mm512_reduce_* intrinsics: folds the
+  // 512-bit register (16 floats) onto itself with four permute-and-add rounds
+  // and returns float 0 of the result.
+  // NOTE(review): assumes Permute0..Permute3 swap successively smaller groups
+  // of lanes so that lane 0 ends up holding the full sum -- confirm against
+  // Optimization::Permute.
+  template<>
+    inline Grid::RealF Reduce<Grid::RealF, __m512>::operator()(__m512 in){
+    // Round 1: add the input to its Permute0 image.
+    __m512 sum = _mm512_add_ps(Optimization::Permute::Permute0(in), in);
+    // Rounds 2-4: keep folding the running sum onto its permuted copy.
+    sum = _mm512_add_ps(sum, Optimization::Permute::Permute1(sum));
+    sum = _mm512_add_ps(sum, Optimization::Permute::Permute2(sum));
+    sum = _mm512_add_ps(sum, Optimization::Permute::Permute3(sum));
+    // Read the scalar lane back out through the u512f helper.
+    u512f lanes;
+    lanes.v = sum;
+    return lanes.f[0];
+  }
+  
+  
+  // Complex double Reduce.
+  // Fallback for compilers lacking the _mm512_reduce_* intrinsics: folds the
+  // 512-bit register (8 doubles) onto itself with two permute-and-add rounds,
+  // then returns doubles 0 and 1 of the result as (real, imag).
+  // NOTE(review): assumes Permute0/Permute1 swap successively smaller groups
+  // of lanes so that the leading complex element ends up holding the full sum
+  // -- confirm against Optimization::Permute.
+  template<>
+    inline Grid::ComplexD Reduce<Grid::ComplexD, __m512d>::operator()(__m512d in){
+    __m512d v1,v2;
+    v1 = Optimization::Permute::Permute0(in); // avx 512; quad complex double
+    v1 = _mm512_add_pd(v1,in);
+    // The second round must permute and accumulate the running sum v1, as the
+    // sibling reductions do; permuting `in` again would throw away round one.
+    v2 = Optimization::Permute::Permute1(v1);
+    v1 = _mm512_add_pd(v1,v2);
+    u512d conv; conv.v = v1;
+    return Grid::ComplexD(conv.f[0],conv.f[1]);
+  }
+  
+  // Real double Reduce.
+  // Fallback for compilers lacking the _mm512_reduce_* intrinsics: folds the
+  // 512-bit register (8 doubles) onto itself with three permute-and-add rounds
+  // and returns double 0 of the result.
+  // NOTE(review): assumes Permute0..Permute2 swap successively smaller groups
+  // of lanes so that lane 0 ends up holding the full sum -- confirm against
+  // Optimization::Permute.
+  template<>
+    inline Grid::RealD Reduce<Grid::RealD, __m512d>::operator()(__m512d in){
+    // Round 1: add the input to its Permute0 image.
+    __m512d sum = _mm512_add_pd(Optimization::Permute::Permute0(in), in);
+    // Rounds 2 and 3: keep folding the running sum onto its permuted copy.
+    sum = _mm512_add_pd(sum, Optimization::Permute::Permute1(sum));
+    sum = _mm512_add_pd(sum, Optimization::Permute::Permute2(sum));
+    // Read the scalar lane back out through the u512d helper.
+    u512d lanes;
+    lanes.v = sum;
+    return lanes.f[0];
+  }
+#else
  //Complex float Reduce
  template<>
  inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
@@ -382,7 +442,6 @@ namespace Optimization {
    return _mm512_reduce_add_ps(in);
  }
  
-  
  //Complex double Reduce
  template<>
  inline Grid::ComplexD Reduce<Grid::ComplexD, __m512d>::operator()(__m512d in){
@@ -402,6 +461,7 @@ namespace Optimization {
    printf("Reduce : Missing integer implementation -> FIX\n");
    assert(0);
  }
+#endif
  
  
 }