Merge branch 'develop' of github.com:fionnoh/Grid into feature/A2A_current_insertion

2025-08-06 06:27:12 +01:00 · 2019-08-07 12:11:40 +01:00
parent 67690df3bd bca36d9bc3
commit d566637cec
17 changed files with 110 additions and 192 deletions
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.cc
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.cc
@@ -26,7 +26,7 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {
 namespace QCD {
--- a/Grid/qcd/action/fermion/StaggeredKernelsAsm.cc
+++ b/Grid/qcd/action/fermion/StaggeredKernelsAsm.cc
@@ -26,11 +26,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>

 #ifdef AVX512
-#include <simd/Intel512common.h>
-#include <simd/Intel512avx.h>
+#include <Grid/simd/Intel512common.h>
+#include <Grid/simd/Intel512avx.h>
 #endif

 // Interleave operations from two directions
@@ -679,7 +679,7 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
  gauge3 =(uint64_t)&UU._odata[sU]( T ); 
  
  // This is the single precision 5th direction vectorised kernel
-#include <simd/Intel512single.h>
+#include <Grid/simd/Intel512single.h>
 template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
 								    DoubledGaugeField &U, DoubledGaugeField &UUU,
 								    SiteSpinor *buf, int LLs, int sU, 
@@ -732,7 +732,7 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
   
 }

-#include <simd/Intel512double.h>
+#include <Grid/simd/Intel512double.h>
 template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
 								    DoubledGaugeField &U, DoubledGaugeField &UUU,
 								    SiteSpinor *buf, int LLs, int sU, 
@@ -816,7 +816,7 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl

  // This is the single precision 5th direction vectorised kernel

-#include <simd/Intel512single.h>
+#include <Grid/simd/Intel512single.h>
 template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
 							       DoubledGaugeField &U, DoubledGaugeField &UUU,
 							       SiteSpinor *buf, int LLs, int sU, 
@@ -884,7 +884,7 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
 #endif
 }

-#include <simd/Intel512double.h>
+#include <Grid/simd/Intel512double.h>
 template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
 							       DoubledGaugeField &U, DoubledGaugeField &UUU,
 							       SiteSpinor *buf, int LLs, int sU, 
--- a/Grid/qcd/action/fermion/StaggeredKernelsHand.cc
+++ b/Grid/qcd/action/fermion/StaggeredKernelsHand.cc
@@ -26,7 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>


 #define LOAD_CHI(b)		\
--- a/Grid/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/Grid/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -81,8 +81,8 @@ WilsonKernels<Impl >::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,Doubl
  assert(0);
 }

-#include <qcd/action/fermion/WilsonKernelsAsmAvx512.h>
-#include <qcd/action/fermion/WilsonKernelsAsmQPX.h>
+#include <Grid/qcd/action/fermion/WilsonKernelsAsmAvx512.h>
+#include <Grid/qcd/action/fermion/WilsonKernelsAsmQPX.h>

 #define INSTANTIATE_ASM(A)\
 template void WilsonKernels<A>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
--- a/Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h
+++ b/Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h
@@ -66,6 +66,7 @@ namespace QCD{
      FermionField Phi; // the pseudofermion field for this trajectory

    public:
+
      ExactOneFlavourRatioPseudoFermionAction(AbstractEOFAFermion<Impl>& _Lop, 
 					      AbstractEOFAFermion<Impl>& _Rop,
 					      OperatorFunction<FermionField>& HeatbathCG, 
--- a/Grid/simd/Grid_avx512.h
+++ b/Grid/simd/Grid_avx512.h
@@ -485,83 +485,6 @@ namespace Optimization {
  // Some Template specialization

  // Hack for CLANG until mm512_reduce_add_ps etc... are implemented in GCC and Clang releases
-#ifndef __INTEL_COMPILER
-#warning "Slow reduction due to incomplete reduce intrinsics"
-  //Complex float Reduce
-  template<>
-    inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
-    __m512 v1,v2;
-    v1=Optimization::Permute::Permute0(in); // avx 512; quad complex single
-    v1= _mm512_add_ps(v1,in);
-    v2=Optimization::Permute::Permute1(v1); 
-    v1 = _mm512_add_ps(v1,v2);
-    v2=Optimization::Permute::Permute2(v1); 
-    v1 = _mm512_add_ps(v1,v2);
-    u512f conv; conv.v = v1;
-    return Grid::ComplexF(conv.f[0],conv.f[1]);
-  }
-  
-  //Real float Reduce
-  template<>
-    inline Grid::RealF Reduce<Grid::RealF, __m512>::operator()(__m512 in){
-    __m512 v1,v2;
-    v1 = Optimization::Permute::Permute0(in); // avx 512; octo-double
-    v1 = _mm512_add_ps(v1,in);
-    v2 = Optimization::Permute::Permute1(v1); 
-    v1 = _mm512_add_ps(v1,v2);
-    v2 = Optimization::Permute::Permute2(v1); 
-    v1 = _mm512_add_ps(v1,v2);
-    v2 = Optimization::Permute::Permute3(v1); 
-    v1 = _mm512_add_ps(v1,v2);
-    u512f conv; conv.v=v1;
-    return conv.f[0];
-  }
-  
-  
-  //Complex double Reduce
-  template<>
-    inline Grid::ComplexD Reduce<Grid::ComplexD, __m512d>::operator()(__m512d in){
-    __m512d v1;
-    v1 = Optimization::Permute::Permute0(in); // sse 128; paired complex single
-    v1 = _mm512_add_pd(v1,in);
-    v1 = Optimization::Permute::Permute1(in); // sse 128; paired complex single
-    v1 = _mm512_add_pd(v1,in);
-    u512d conv; conv.v = v1;
-    return Grid::ComplexD(conv.f[0],conv.f[1]);
-  }
-  
-  //Real double Reduce
-  template<>
-    inline Grid::RealD Reduce<Grid::RealD, __m512d>::operator()(__m512d in){
-    __m512d v1,v2;
-    v1 = Optimization::Permute::Permute0(in); // avx 512; quad double
-    v1 = _mm512_add_pd(v1,in);
-      v2 = Optimization::Permute::Permute1(v1); 
-      v1 = _mm512_add_pd(v1,v2);
-      v2 = Optimization::Permute::Permute2(v1); 
-      v1 = _mm512_add_pd(v1,v2);
-     u512d conv; conv.v = v1;
-     return conv.f[0];
-  }
-  
-  //Integer Reduce
-  template<>
-  inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
-    // No full vector reduce, use AVX to add upper and lower halves of register
-    // and perform AVX reduction.
-    __m256i v1, v2, v3;
-    __m128i u1, u2, ret;
-    v1  = _mm512_castsi512_si256(in);       // upper half
-    v2  = _mm512_extracti32x8_epi32(in, 1); // lower half
-    v3  = _mm256_add_epi32(v1, v2);
-    v1  = _mm256_hadd_epi32(v3, v3);
-    v2  = _mm256_hadd_epi32(v1, v1);
-    u1  = _mm256_castsi256_si128(v2);        // upper half
-    u2  = _mm256_extracti128_si256(v2, 1);  // lower half
-    ret = _mm_add_epi32(u1, u2);
-    return _mm_cvtsi128_si32(ret);
-  }
-#else
  //Complex float Reduce
  template<>
  inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
@@ -590,8 +513,6 @@ namespace Optimization {
  inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
    return _mm512_reduce_add_epi32(in);
  }
-#endif
-  
  
 }