From e1042aef77f70f2c4ab44d9066fd4c6d093f6e50 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Fri, 28 Oct 2016 17:20:04 +0100 Subject: [PATCH] First version of the double prec for testing purposes It does not compile single and double version at the same time --- lib/qcd/action/fermion/WilsonKernelsAsm.cc | 81 ++++++++++++++++++- lib/qcd/action/fermion/WilsonKernelsAsmBody.h | 4 +- lib/simd/Grid_avx512.h | 6 -- 3 files changed, 80 insertions(+), 11 deletions(-) diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc index 74862400..2fc9b035 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc +++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc @@ -53,12 +53,13 @@ WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo, } #if defined(AVX512) - +#include + +#if defined(GRID_DEFAULT_PRECISION_SINGLE) /////////////////////////////////////////////////////////// // If we are AVX512 specialise the single precision routine /////////////////////////////////////////////////////////// - -#include + #include static Vector signs; @@ -78,6 +79,7 @@ static Vector signs; #define MAYBEPERM(A,perm) if (perm) { A ; } #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf) #define FX(A) WILSONASM_ ##A +#define COMPLEX_TYPE vComplexF #undef KERNEL_DAG template<> void @@ -113,8 +115,79 @@ template<> void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +#undef COMPLEX_TYPE + +#endif //Single precision + +#if defined(GRID_DEFAULT_PRECISION_DOUBLE) +//temporarily separating the two sections +//for debug in isolation +//can be unified + + /////////////////////////////////////////////////////////// + // If we are AVX512 specialise the double precision routine + /////////////////////////////////////////////////////////// + +#include + +static Vector signs; + + int setupSigns(void ){ + Vector 
bother(2); + signs = bother; + vrsign(signs[0]); + visign(signs[1]); + return 1; + } + static int signInit = setupSigns(); + +#define label(A) ilabel(A) +#define ilabel(A) ".globl\n" #A ":\n" + +#define MAYBEPERM(A,perm) if (perm) { A ; } +#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf) +#define FX(A) WILSONASM_ ##A +#define COMPLEX_TYPE vComplexD + +#undef KERNEL_DAG +template<> void +WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +#define KERNEL_DAG +template<> void +WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include -#endif +#undef VMOVIDUP +#undef VMOVRDUP +#undef MAYBEPERM +#undef MULT_2SPIN +#undef FX +#define FX(A) DWFASM_ ## A +#define MAYBEPERM(A,B) +#define VMOVIDUP(A,B,C) VBCASTIDUPf(A,B,C) +#define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C) +#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf) + +#undef KERNEL_DAG +template<> void +WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +#define KERNEL_DAG +template<> void +WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +#undef COMPLEX_TYPE +#endif //Double precision + +#endif //AVX512 #define INSTANTIATE_ASM(A)\ template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\ diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h index 12579d8c..72e13754 100644 --- 
a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h +++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h @@ -5,7 +5,9 @@ const uint64_t plocal =(uint64_t) & in._odata[0]; // vComplexF isigns[2] = { signs[0], signs[1] }; - vComplexF *isigns = &signs[0]; + //COMPLEX_TYPE is vComplexF or vComplexD depending + //on the chosen precision + COMPLEX_TYPE *isigns = &signs[0]; MASK_REGS; int nmax=U._grid->oSites(); diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h index 62789462..136c940e 100644 --- a/lib/simd/Grid_avx512.h +++ b/lib/simd/Grid_avx512.h @@ -371,14 +371,8 @@ namespace Optimization { // Some Template specialization // Hack for CLANG until mm512_reduce_add_ps etc... are implemented in GCC and Clang releases -<<<<<<< HEAD -#define GNU_CLANG_COMPILER -#ifdef GNU_CLANG_COMPILER -======= - #ifndef __INTEL_COMPILER #warning "Slow reduction due to incomplete reduce intrinsics" ->>>>>>> develop //Complex float Reduce template<> inline Grid::ComplexF Reduce::operator()(__m512 in){