1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-04 19:25:56 +01:00

Fixing AVX Z-mobius

This commit is contained in:
paboyle 2016-12-18 02:05:11 +00:00
parent 87be03006a
commit 3e6945cd65
2 changed files with 49 additions and 29 deletions

View File

@ -33,9 +33,29 @@ namespace Grid {
namespace QCD {
template<typename T> struct switcheroo { static int iscomplex() { return 0; } };
template<> struct switcheroo<ComplexD> { static int iscomplex() { return 1; } };
template<> struct switcheroo<ComplexF> { static int iscomplex() { return 1; } };
template<typename T> struct switcheroo {
static inline int iscomplex() { return 0; }
template<class vec>
static inline vec mult(vec a, vec b) {
return real_mult(a,b);
}
};
template<> struct switcheroo<ComplexD> {
static inline int iscomplex() { return 1; }
template<class vec>
static inline vec mult(vec a, vec b) {
return a*b;
}
};
template<> struct switcheroo<ComplexF> {
static inline int iscomplex() { return 1; }
template<class vec>
static inline vec mult(vec a, vec b) {
return a*b;
}
};
template<class Impl>

View File

@ -161,18 +161,18 @@ PARALLEL_FOR_LOOP
}
// Can force these to real arithmetic and save 2x.
Simd p_00 = real_mult(d[v]()()(), phi[ss+v]()(0)(0)) + real_mult(l[v]()()(),hm_00);
Simd p_01 = real_mult(d[v]()()(), phi[ss+v]()(0)(1)) + real_mult(l[v]()()(),hm_01);
Simd p_02 = real_mult(d[v]()()(), phi[ss+v]()(0)(2)) + real_mult(l[v]()()(),hm_02);
Simd p_10 = real_mult(d[v]()()(), phi[ss+v]()(1)(0)) + real_mult(l[v]()()(),hm_10);
Simd p_11 = real_mult(d[v]()()(), phi[ss+v]()(1)(1)) + real_mult(l[v]()()(),hm_11);
Simd p_12 = real_mult(d[v]()()(), phi[ss+v]()(1)(2)) + real_mult(l[v]()()(),hm_12);
Simd p_20 = real_mult(d[v]()()(), phi[ss+v]()(2)(0)) + real_mult(u[v]()()(),hp_00);
Simd p_21 = real_mult(d[v]()()(), phi[ss+v]()(2)(1)) + real_mult(u[v]()()(),hp_01);
Simd p_22 = real_mult(d[v]()()(), phi[ss+v]()(2)(2)) + real_mult(u[v]()()(),hp_02);
Simd p_30 = real_mult(d[v]()()(), phi[ss+v]()(3)(0)) + real_mult(u[v]()()(),hp_10);
Simd p_31 = real_mult(d[v]()()(), phi[ss+v]()(3)(1)) + real_mult(u[v]()()(),hp_11);
Simd p_32 = real_mult(d[v]()()(), phi[ss+v]()(3)(2)) + real_mult(u[v]()()(),hp_12);
Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00);
Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01);
Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02);
Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10);
Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11);
Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12);
Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00);
Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01);
Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02);
Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10);
Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11);
Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12);
vstream(chi[ss+v]()(0)(0),p_00);
vstream(chi[ss+v]()(0)(1),p_01);
@ -299,19 +299,19 @@ PARALLEL_FOR_LOOP
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
}
Simd p_00 = real_mult(d[v]()()(), phi[ss+v]()(0)(0)) + real_mult(u[v]()()(),hp_00);
Simd p_01 = real_mult(d[v]()()(), phi[ss+v]()(0)(1)) + real_mult(u[v]()()(),hp_01);
Simd p_02 = real_mult(d[v]()()(), phi[ss+v]()(0)(2)) + real_mult(u[v]()()(),hp_02);
Simd p_10 = real_mult(d[v]()()(), phi[ss+v]()(1)(0)) + real_mult(u[v]()()(),hp_10);
Simd p_11 = real_mult(d[v]()()(), phi[ss+v]()(1)(1)) + real_mult(u[v]()()(),hp_11);
Simd p_12 = real_mult(d[v]()()(), phi[ss+v]()(1)(2)) + real_mult(u[v]()()(),hp_12);
Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00);
Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01);
Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02);
Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10);
Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11);
Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12);
Simd p_20 = real_mult(d[v]()()(), phi[ss+v]()(2)(0)) + real_mult(l[v]()()(),hm_00);
Simd p_21 = real_mult(d[v]()()(), phi[ss+v]()(2)(1)) + real_mult(l[v]()()(),hm_01);
Simd p_22 = real_mult(d[v]()()(), phi[ss+v]()(2)(2)) + real_mult(l[v]()()(),hm_02);
Simd p_30 = real_mult(d[v]()()(), phi[ss+v]()(3)(0)) + real_mult(l[v]()()(),hm_10);
Simd p_31 = real_mult(d[v]()()(), phi[ss+v]()(3)(1)) + real_mult(l[v]()()(),hm_11);
Simd p_32 = real_mult(d[v]()()(), phi[ss+v]()(3)(2)) + real_mult(l[v]()()(),hm_12);
Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00);
Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01);
Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02);
Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10);
Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11);
Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12);
vstream(chi[ss+v]()(0)(0),p_00);
vstream(chi[ss+v]()(0)(1),p_01);
@ -544,13 +544,13 @@ void CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi, FermionFi
for(int co=0;co<Nc;co++){
vbroadcast(BcastM()(sp )(co),psi[lex]()(sp+2)(co),l);
}}
if ( s2==0 && l==0) {
for(int sp=0;sp<2;sp++){
for(int co=0;co<Nc;co++){
SiteChiP()(sp)(co)=SiteChiP()(sp)(co)+ Matp[LLs*s+s1]()()()*BcastP()(sp)(co);
SiteChiM()(sp)(co)=SiteChiM()(sp)(co)+ Matm[LLs*s+s1]()()()*BcastM()(sp)(co);
}}
}
}}
{