diff --git a/lib/qcd/action/fermion/MobiusEOFAFermionvec.cc b/lib/qcd/action/fermion/MobiusEOFAFermionvec.cc index c4eaf0f3..03808b02 100644 --- a/lib/qcd/action/fermion/MobiusEOFAFermionvec.cc +++ b/lib/qcd/action/fermion/MobiusEOFAFermionvec.cc @@ -28,66 +28,65 @@ with this program; if not, write to the Free Software Foundation, Inc., See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ -/* END LEGAL */ + /* END LEGAL */ #include #include -namespace Grid { -namespace QCD { +NAMESPACE_BEGIN(Grid); - /* - * Dense matrix versions of routines - */ - template - void MobiusEOFAFermion::MooeeInv(const FermionField& psi, FermionField& chi) - { - this->MooeeInternal(psi, chi, DaggerNo, InverseYes); - } +/* + * Dense matrix versions of routines + */ +template +void MobiusEOFAFermion::MooeeInv(const FermionField& psi, FermionField& chi) +{ + this->MooeeInternal(psi, chi, DaggerNo, InverseYes); +} - template - void MobiusEOFAFermion::MooeeInv_shift(const FermionField& psi, FermionField& chi) - { - this->MooeeInternal(psi, chi, DaggerNo, InverseYes); - } +template +void MobiusEOFAFermion::MooeeInv_shift(const FermionField& psi, FermionField& chi) +{ + this->MooeeInternal(psi, chi, DaggerNo, InverseYes); +} - template - void MobiusEOFAFermion::MooeeInvDag(const FermionField& psi, FermionField& chi) - { - this->MooeeInternal(psi, chi, DaggerYes, InverseYes); - } +template +void MobiusEOFAFermion::MooeeInvDag(const FermionField& psi, FermionField& chi) +{ + this->MooeeInternal(psi, chi, DaggerYes, InverseYes); +} - template - void MobiusEOFAFermion::MooeeInvDag_shift(const FermionField& psi, FermionField& chi) - { - this->MooeeInternal(psi, chi, DaggerYes, InverseYes); - } +template +void MobiusEOFAFermion::MooeeInvDag_shift(const FermionField& psi, FermionField& chi) +{ + this->MooeeInternal(psi, chi, DaggerYes, InverseYes); +} - template - void MobiusEOFAFermion::M5D(const FermionField& psi, const FermionField& phi, - FermionField& chi, std::vector& lower, std::vector& diag, std::vector& upper) - { - GridBase* grid = psi._grid; - int Ls = this->Ls; - int LLs = grid->_rdimensions[0]; - const int nsimd = Simd::Nsimd(); +template +void MobiusEOFAFermion::M5D(const FermionField& psi, const FermionField& phi, + FermionField& chi, std::vector& lower, std::vector& diag, std::vector& upper) +{ + GridBase* grid = psi._grid; + int Ls = this->Ls; + int LLs = grid->_rdimensions[0]; + const int nsimd = Simd::Nsimd(); - Vector> u(LLs); - Vector> l(LLs); - Vector> d(LLs); + Vector> u(LLs); + Vector> l(LLs); + Vector> d(LLs); - assert(Ls/LLs == nsimd); - assert(phi.checkerboard == psi.checkerboard); + assert(Ls/LLs == nsimd); + assert(phi.checkerboard == psi.checkerboard); - chi.checkerboard = psi.checkerboard; + chi.checkerboard = psi.checkerboard; - // just directly address via type pun - typedef typename Simd::scalar_type scalar_type; - scalar_type* u_p = (scalar_type*) &u[0]; - scalar_type* l_p = (scalar_type*) &l[0]; - scalar_type* d_p = (scalar_type*) &d[0]; + // just directly address via type pun + typedef typename Simd::scalar_type scalar_type; + scalar_type* u_p = (scalar_type*) &u[0]; + scalar_type* l_p = (scalar_type*) &l[0]; + scalar_type* d_p = (scalar_type*) &d[0]; - for(int o=0; oM5Dcalls++; - this->M5Dtime -= usecond(); + this->M5Dcalls++; + this->M5Dtime -= usecond(); - assert(Nc == 3); + assert(Nc == 3); - parallel_for(int ss=0; ssoSites(); ss+=LLs){ // adds LLs + parallel_for(int ss=0; ssoSites(); ss+=LLs){ // adds LLs - #if 0 +#if 0 - alignas(64) SiteHalfSpinor hp; - alignas(64) SiteHalfSpinor hm; - alignas(64) SiteSpinor fp; - alignas(64) SiteSpinor fm; + alignas(64) SiteHalfSpinor hp; + alignas(64) SiteHalfSpinor hm; + alignas(64) SiteSpinor fp; + alignas(64) SiteSpinor fm; - for(int v=0; v= v){ rotate(hm, hm, nsimd-1); } + if (vp <= v){ rotate(hp, hp, 1); } + if (vm >= v){ rotate(hm, hm, nsimd-1); } - hp = 0.5*hp; - hm = 0.5*hm; + hp = 0.5*hp; + hm = 0.5*hm; - spRecon5m(fp, hp); - spRecon5p(fm, hm); + spRecon5m(fp, hp); + spRecon5p(fm, hm); - chi[ss+v] = d[v]*phi[ss+v]; - chi[ss+v] = chi[ss+v] + u[v]*fp; - chi[ss+v] = chi[ss+v] + l[v]*fm; + chi[ss+v] = d[v]*phi[ss+v]; + chi[ss+v] = chi[ss+v] + u[v]*fp; + chi[ss+v] = chi[ss+v] + l[v]*fm; - } - - #else - - for(int v=0; v(hp_00.v); - hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v); - hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v); - hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v); - hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v); - hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v); - } - - if(vm >= v){ - hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v); - hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v); - hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v); - hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v); - hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v); - hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v); - } - - // Can force these to real arithmetic and save 2x. - Simd p_00 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo::mult(l[v]()()(), hm_00); - Simd p_01 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo::mult(l[v]()()(), hm_01); - Simd p_02 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo::mult(l[v]()()(), hm_02); - Simd p_10 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo::mult(l[v]()()(), hm_10); - Simd p_11 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo::mult(l[v]()()(), hm_11); - Simd p_12 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo::mult(l[v]()()(), hm_12); - Simd p_20 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo::mult(u[v]()()(), hp_00); - Simd p_21 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo::mult(u[v]()()(), hp_01); - Simd p_22 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo::mult(u[v]()()(), hp_02); - Simd p_30 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo::mult(u[v]()()(), hp_10); - Simd p_31 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo::mult(u[v]()()(), hp_11); - Simd p_32 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo::mult(u[v]()()(), hp_12); - - vstream(chi[ss+v]()(0)(0), p_00); - vstream(chi[ss+v]()(0)(1), p_01); - vstream(chi[ss+v]()(0)(2), p_02); - vstream(chi[ss+v]()(1)(0), p_10); - vstream(chi[ss+v]()(1)(1), p_11); - vstream(chi[ss+v]()(1)(2), p_12); - vstream(chi[ss+v]()(2)(0), p_20); - vstream(chi[ss+v]()(2)(1), p_21); - vstream(chi[ss+v]()(2)(2), p_22); - vstream(chi[ss+v]()(3)(0), p_30); - vstream(chi[ss+v]()(3)(1), p_31); - vstream(chi[ss+v]()(3)(2), p_32); - } - - #endif } - this->M5Dtime += usecond(); - } +#else - template - void MobiusEOFAFermion::M5D_shift(const FermionField& psi, const FermionField& phi, - FermionField& chi, std::vector& lower, std::vector& diag, std::vector& upper, - std::vector& shift_coeffs) - { - #if 0 + for(int v=0; vM5D(psi, phi, chi, lower, diag, upper); + vprefetch(psi[ss+v+LLs]); - // FIXME: possible gain from vectorizing shift operation as well? - Coeff_t one(1.0); - int Ls = this->Ls; - for(int s=0; spm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, s, Ls-1); } - else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, s, 0); } + int vp = (v == LLs-1) ? 0 : v+1; + int vm = (v == 0) ? LLs-1 : v-1; + + Simd hp_00 = psi[ss+vp]()(2)(0); + Simd hp_01 = psi[ss+vp]()(2)(1); + Simd hp_02 = psi[ss+vp]()(2)(2); + Simd hp_10 = psi[ss+vp]()(3)(0); + Simd hp_11 = psi[ss+vp]()(3)(1); + Simd hp_12 = psi[ss+vp]()(3)(2); + + Simd hm_00 = psi[ss+vm]()(0)(0); + Simd hm_01 = psi[ss+vm]()(0)(1); + Simd hm_02 = psi[ss+vm]()(0)(2); + Simd hm_10 = psi[ss+vm]()(1)(0); + Simd hm_11 = psi[ss+vm]()(1)(1); + Simd hm_12 = psi[ss+vm]()(1)(2); + + if(vp <= v){ + hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v); + hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v); + hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v); + hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v); + hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v); + hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v); } - #else - - GridBase* grid = psi._grid; - int Ls = this->Ls; - int LLs = grid->_rdimensions[0]; - const int nsimd = Simd::Nsimd(); - - Vector> u(LLs); - Vector> l(LLs); - Vector> d(LLs); - Vector> s(LLs); - - assert(Ls/LLs == nsimd); - assert(phi.checkerboard == psi.checkerboard); - - chi.checkerboard = psi.checkerboard; - - // just directly address via type pun - typedef typename Simd::scalar_type scalar_type; - scalar_type* u_p = (scalar_type*) &u[0]; - scalar_type* l_p = (scalar_type*) &l[0]; - scalar_type* d_p = (scalar_type*) &d[0]; - scalar_type* s_p = (scalar_type*) &s[0]; - - for(int o=0; oM5Dcalls++; - this->M5Dtime -= usecond(); - - assert(Nc == 3); - - parallel_for(int ss=0; ssoSites(); ss+=LLs){ // adds LLs - - int vs = (this->pm == 1) ? LLs-1 : 0; - Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(2)(0) : psi[ss+vs]()(0)(0); - Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(2)(1) : psi[ss+vs]()(0)(1); - Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(2)(2) : psi[ss+vs]()(0)(2); - Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(3)(0) : psi[ss+vs]()(1)(0); - Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(3)(1) : psi[ss+vs]()(1)(1); - Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(3)(2) : psi[ss+vs]()(1)(2); - - for(int v=0; v(hp_00.v); - hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v); - hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v); - hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v); - hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v); - hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v); - } - - if(this->pm == 1 && vs <= v){ - hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v); - hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v); - hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v); - hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v); - hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v); - hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v); - } - - if(vm >= v){ - hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v); - hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v); - hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v); - hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v); - hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v); - hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v); - } - - if(this->pm == -1 && vs >= v){ - hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v); - hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v); - hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v); - hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v); - hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v); - hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v); - } - - // Can force these to real arithmetic and save 2x. - Simd p_00 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo::mult(l[v]()()(), hm_00) - : switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo::mult(l[v]()()(), hm_00) - + switcheroo::mult(s[v]()()(), hs_00); - Simd p_01 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo::mult(l[v]()()(), hm_01) - : switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo::mult(l[v]()()(), hm_01) - + switcheroo::mult(s[v]()()(), hs_01); - Simd p_02 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo::mult(l[v]()()(), hm_02) - : switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo::mult(l[v]()()(), hm_02) - + switcheroo::mult(s[v]()()(), hs_02); - Simd p_10 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo::mult(l[v]()()(), hm_10) - : switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo::mult(l[v]()()(), hm_10) - + switcheroo::mult(s[v]()()(), hs_10); - Simd p_11 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo::mult(l[v]()()(), hm_11) - : switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo::mult(l[v]()()(), hm_11) - + switcheroo::mult(s[v]()()(), hs_11); - Simd p_12 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo::mult(l[v]()()(), hm_12) - : switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo::mult(l[v]()()(), hm_12) - + switcheroo::mult(s[v]()()(), hs_12); - Simd p_20 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo::mult(u[v]()()(), hp_00) - + switcheroo::mult(s[v]()()(), hs_00) - : switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo::mult(u[v]()()(), hp_00); - Simd p_21 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo::mult(u[v]()()(), hp_01) - + switcheroo::mult(s[v]()()(), hs_01) - : switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo::mult(u[v]()()(), hp_01); - Simd p_22 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo::mult(u[v]()()(), hp_02) - + switcheroo::mult(s[v]()()(), hs_02) - : switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo::mult(u[v]()()(), hp_02); - Simd p_30 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo::mult(u[v]()()(), hp_10) - + switcheroo::mult(s[v]()()(), hs_10) - : switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo::mult(u[v]()()(), hp_10); - Simd p_31 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo::mult(u[v]()()(), hp_11) - + switcheroo::mult(s[v]()()(), hs_11) - : switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo::mult(u[v]()()(), hp_11); - Simd p_32 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo::mult(u[v]()()(), hp_12) - + switcheroo::mult(s[v]()()(), hs_12) - : switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo::mult(u[v]()()(), hp_12); - - vstream(chi[ss+v]()(0)(0), p_00); - vstream(chi[ss+v]()(0)(1), p_01); - vstream(chi[ss+v]()(0)(2), p_02); - vstream(chi[ss+v]()(1)(0), p_10); - vstream(chi[ss+v]()(1)(1), p_11); - vstream(chi[ss+v]()(1)(2), p_12); - vstream(chi[ss+v]()(2)(0), p_20); - vstream(chi[ss+v]()(2)(1), p_21); - vstream(chi[ss+v]()(2)(2), p_22); - vstream(chi[ss+v]()(3)(0), p_30); - vstream(chi[ss+v]()(3)(1), p_31); - vstream(chi[ss+v]()(3)(2), p_32); - } + if(vm >= v){ + hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v); + hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v); + hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v); + hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v); + hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v); + hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v); } - this->M5Dtime += usecond(); + // Can force these to real arithmetic and save 2x. + Simd p_00 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo::mult(l[v]()()(), hm_00); + Simd p_01 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo::mult(l[v]()()(), hm_01); + Simd p_02 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo::mult(l[v]()()(), hm_02); + Simd p_10 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo::mult(l[v]()()(), hm_10); + Simd p_11 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo::mult(l[v]()()(), hm_11); + Simd p_12 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo::mult(l[v]()()(), hm_12); + Simd p_20 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo::mult(u[v]()()(), hp_00); + Simd p_21 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo::mult(u[v]()()(), hp_01); + Simd p_22 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo::mult(u[v]()()(), hp_02); + Simd p_30 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo::mult(u[v]()()(), hp_10); + Simd p_31 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo::mult(u[v]()()(), hp_11); + Simd p_32 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo::mult(u[v]()()(), hp_12); - #endif + vstream(chi[ss+v]()(0)(0), p_00); + vstream(chi[ss+v]()(0)(1), p_01); + vstream(chi[ss+v]()(0)(2), p_02); + vstream(chi[ss+v]()(1)(0), p_10); + vstream(chi[ss+v]()(1)(1), p_11); + vstream(chi[ss+v]()(1)(2), p_12); + vstream(chi[ss+v]()(2)(0), p_20); + vstream(chi[ss+v]()(2)(1), p_21); + vstream(chi[ss+v]()(2)(2), p_22); + vstream(chi[ss+v]()(3)(0), p_30); + vstream(chi[ss+v]()(3)(1), p_31); + vstream(chi[ss+v]()(3)(2), p_32); + } + +#endif } - template - void MobiusEOFAFermion::M5Ddag(const FermionField& psi, const FermionField& phi, - FermionField& chi, std::vector& lower, std::vector& diag, std::vector& upper) - { - GridBase* grid = psi._grid; - int Ls = this->Ls; - int LLs = grid->_rdimensions[0]; - int nsimd = Simd::Nsimd(); + this->M5Dtime += usecond(); +} - Vector> u(LLs); - Vector> l(LLs); - Vector> d(LLs); +template +void MobiusEOFAFermion::M5D_shift(const FermionField& psi, const FermionField& phi, + FermionField& chi, std::vector& lower, std::vector& diag, std::vector& upper, + std::vector& shift_coeffs) +{ +#if 0 - assert(Ls/LLs == nsimd); - assert(phi.checkerboard == psi.checkerboard); + this->M5D(psi, phi, chi, lower, diag, upper); - chi.checkerboard = psi.checkerboard; + // FIXME: possible gain from vectorizing shift operation as well? + Coeff_t one(1.0); + int Ls = this->Ls; + for(int s=0; spm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, s, Ls-1); } + else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, s, 0); } + } - // just directly address via type pun - typedef typename Simd::scalar_type scalar_type; - scalar_type* u_p = (scalar_type*) &u[0]; - scalar_type* l_p = (scalar_type*) &l[0]; - scalar_type* d_p = (scalar_type*) &d[0]; +#else - for(int o=0; oLs; + int LLs = grid->_rdimensions[0]; + const int nsimd = Simd::Nsimd(); + + Vector> u(LLs); + Vector> l(LLs); + Vector> d(LLs); + Vector> s(LLs); + + assert(Ls/LLs == nsimd); + assert(phi.checkerboard == psi.checkerboard); + + chi.checkerboard = psi.checkerboard; + + // just directly address via type pun + typedef typename Simd::scalar_type scalar_type; + scalar_type* u_p = (scalar_type*) &u[0]; + scalar_type* l_p = (scalar_type*) &l[0]; + scalar_type* d_p = (scalar_type*) &d[0]; + scalar_type* s_p = (scalar_type*) &s[0]; + + for(int o=0; oM5Dcalls++; + this->M5Dtime -= usecond(); + + assert(Nc == 3); + + parallel_for(int ss=0; ssoSites(); ss+=LLs){ // adds LLs + + int vs = (this->pm == 1) ? LLs-1 : 0; + Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(2)(0) : psi[ss+vs]()(0)(0); + Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(2)(1) : psi[ss+vs]()(0)(1); + Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(2)(2) : psi[ss+vs]()(0)(2); + Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(3)(0) : psi[ss+vs]()(1)(0); + Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(3)(1) : psi[ss+vs]()(1)(1); + Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(3)(2) : psi[ss+vs]()(1)(2); + + for(int v=0; v(hp_00.v); + hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v); + hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v); + hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v); + hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v); + hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v); + } + + if(this->pm == 1 && vs <= v){ + hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v); + hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v); + hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v); + hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v); + hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v); + hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v); + } + + if(vm >= v){ + hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v); + hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v); + hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v); + hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v); + hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v); + hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v); + } + + if(this->pm == -1 && vs >= v){ + hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v); + hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v); + hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v); + hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v); + hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v); + hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v); + } + + // Can force these to real arithmetic and save 2x. + Simd p_00 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo::mult(l[v]()()(), hm_00) + : switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo::mult(l[v]()()(), hm_00) + + switcheroo::mult(s[v]()()(), hs_00); + Simd p_01 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo::mult(l[v]()()(), hm_01) + : switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo::mult(l[v]()()(), hm_01) + + switcheroo::mult(s[v]()()(), hs_01); + Simd p_02 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo::mult(l[v]()()(), hm_02) + : switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo::mult(l[v]()()(), hm_02) + + switcheroo::mult(s[v]()()(), hs_02); + Simd p_10 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo::mult(l[v]()()(), hm_10) + : switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo::mult(l[v]()()(), hm_10) + + switcheroo::mult(s[v]()()(), hs_10); + Simd p_11 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo::mult(l[v]()()(), hm_11) + : switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo::mult(l[v]()()(), hm_11) + + switcheroo::mult(s[v]()()(), hs_11); + Simd p_12 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo::mult(l[v]()()(), hm_12) + : switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo::mult(l[v]()()(), hm_12) + + switcheroo::mult(s[v]()()(), hs_12); + Simd p_20 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo::mult(u[v]()()(), hp_00) + + switcheroo::mult(s[v]()()(), hs_00) + : switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo::mult(u[v]()()(), hp_00); + Simd p_21 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo::mult(u[v]()()(), hp_01) + + switcheroo::mult(s[v]()()(), hs_01) + : switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo::mult(u[v]()()(), hp_01); + Simd p_22 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo::mult(u[v]()()(), hp_02) + + switcheroo::mult(s[v]()()(), hs_02) + : switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo::mult(u[v]()()(), hp_02); + Simd p_30 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo::mult(u[v]()()(), hp_10) + + switcheroo::mult(s[v]()()(), hs_10) + : switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo::mult(u[v]()()(), hp_10); + Simd p_31 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo::mult(u[v]()()(), hp_11) + + switcheroo::mult(s[v]()()(), hs_11) + : switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo::mult(u[v]()()(), hp_11); + Simd p_32 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo::mult(u[v]()()(), hp_12) + + switcheroo::mult(s[v]()()(), hs_12) + : switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo::mult(u[v]()()(), hp_12); + + vstream(chi[ss+v]()(0)(0), p_00); + vstream(chi[ss+v]()(0)(1), p_01); + vstream(chi[ss+v]()(0)(2), p_02); + vstream(chi[ss+v]()(1)(0), p_10); + vstream(chi[ss+v]()(1)(1), p_11); + vstream(chi[ss+v]()(1)(2), p_12); + vstream(chi[ss+v]()(2)(0), p_20); + vstream(chi[ss+v]()(2)(1), p_21); + vstream(chi[ss+v]()(2)(2), p_22); + vstream(chi[ss+v]()(3)(0), p_30); + vstream(chi[ss+v]()(3)(1), p_31); + vstream(chi[ss+v]()(3)(2), p_32); + } + } + + this->M5Dtime += usecond(); + +#endif +} + +template +void MobiusEOFAFermion::M5Ddag(const FermionField& psi, const FermionField& phi, + FermionField& chi, std::vector& lower, std::vector& diag, std::vector& upper) +{ + GridBase* grid = psi._grid; + int Ls = this->Ls; + int LLs = grid->_rdimensions[0]; + int nsimd = Simd::Nsimd(); + + Vector> u(LLs); + Vector> l(LLs); + Vector> d(LLs); + + assert(Ls/LLs == nsimd); + assert(phi.checkerboard == psi.checkerboard); + + chi.checkerboard = psi.checkerboard; + + // just directly address via type pun + typedef typename Simd::scalar_type scalar_type; + scalar_type* u_p = (scalar_type*) &u[0]; + scalar_type* l_p = (scalar_type*) &l[0]; + scalar_type* d_p = (scalar_type*) &d[0]; + + for(int o=0; oM5Dcalls++; - this->M5Dtime -= usecond(); + this->M5Dcalls++; + this->M5Dtime -= usecond(); - parallel_for(int ss=0; ssoSites(); ss+=LLs){ // adds LLs + parallel_for(int ss=0; ssoSites(); ss+=LLs){ // adds LLs - #if 0 +#if 0 - alignas(64) SiteHalfSpinor hp; - alignas(64) SiteHalfSpinor hm; - alignas(64) SiteSpinor fp; - alignas(64) SiteSpinor fm; + alignas(64) SiteHalfSpinor hp; + alignas(64) SiteHalfSpinor hm; + alignas(64) SiteSpinor fp; + alignas(64) SiteSpinor fm; - for(int v=0; v= v){ rotate(hm, hm, nsimd-1); } + if(vp <= v){ rotate(hp, hp, 1); } + if(vm >= v){ rotate(hm, hm, nsimd-1); } - hp = hp*0.5; - hm = hm*0.5; - spRecon5p(fp, hp); - spRecon5m(fm, hm); + hp = hp*0.5; + hm = hm*0.5; + spRecon5p(fp, hp); + spRecon5m(fm, hm); - chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp; - chi[ss+v] = chi[ss+v] +l[v]*fm; - - } - - #else - - for(int v=0; v(hp_00.v); - hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v); - hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v); - hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v); - hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v); - hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v); - } - - if(vm >= v){ - hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v); - hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v); - hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v); - hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v); - hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v); - hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v); - } - - Simd p_00 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo::mult(u[v]()()(), hp_00); - Simd p_01 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo::mult(u[v]()()(), hp_01); - Simd p_02 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo::mult(u[v]()()(), hp_02); - Simd p_10 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo::mult(u[v]()()(), hp_10); - Simd p_11 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo::mult(u[v]()()(), hp_11); - Simd p_12 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo::mult(u[v]()()(), hp_12); - Simd p_20 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo::mult(l[v]()()(), hm_00); - Simd p_21 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo::mult(l[v]()()(), hm_01); - Simd p_22 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo::mult(l[v]()()(), hm_02); - Simd p_30 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo::mult(l[v]()()(), hm_10); - Simd p_31 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo::mult(l[v]()()(), hm_11); - Simd p_32 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo::mult(l[v]()()(), hm_12); - - vstream(chi[ss+v]()(0)(0), p_00); - vstream(chi[ss+v]()(0)(1), p_01); - vstream(chi[ss+v]()(0)(2), p_02); - vstream(chi[ss+v]()(1)(0), p_10); - vstream(chi[ss+v]()(1)(1), p_11); - vstream(chi[ss+v]()(1)(2), p_12); - vstream(chi[ss+v]()(2)(0), p_20); - vstream(chi[ss+v]()(2)(1), p_21); - vstream(chi[ss+v]()(2)(2), p_22); - vstream(chi[ss+v]()(3)(0), p_30); - vstream(chi[ss+v]()(3)(1), p_31); - vstream(chi[ss+v]()(3)(2), p_32); - - } - - #endif + chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp; + chi[ss+v] = chi[ss+v] +l[v]*fm; } - this->M5Dtime += usecond(); - } +#else - template - void MobiusEOFAFermion::M5Ddag_shift(const FermionField& psi, const FermionField& phi, - FermionField& chi, std::vector& lower, std::vector& diag, std::vector& upper, - std::vector& shift_coeffs) - { - #if 0 + for(int v=0; vM5Ddag(psi, phi, chi, lower, diag, upper); + vprefetch(psi[ss+v+LLs]); - // FIXME: possible gain from vectorizing shift operation as well? - Coeff_t one(1.0); - int Ls = this->Ls; - for(int s=0; spm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, Ls-1, s); } - else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, 0, s); } + int vp = (v == LLs-1) ? 0 : v+1; + int vm = (v == 0 ) ? LLs-1 : v-1; + + Simd hp_00 = psi[ss+vp]()(0)(0); + Simd hp_01 = psi[ss+vp]()(0)(1); + Simd hp_02 = psi[ss+vp]()(0)(2); + Simd hp_10 = psi[ss+vp]()(1)(0); + Simd hp_11 = psi[ss+vp]()(1)(1); + Simd hp_12 = psi[ss+vp]()(1)(2); + + Simd hm_00 = psi[ss+vm]()(2)(0); + Simd hm_01 = psi[ss+vm]()(2)(1); + Simd hm_02 = psi[ss+vm]()(2)(2); + Simd hm_10 = psi[ss+vm]()(3)(0); + Simd hm_11 = psi[ss+vm]()(3)(1); + Simd hm_12 = psi[ss+vm]()(3)(2); + + if (vp <= v){ + hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v); + hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v); + hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v); + hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v); + hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v); + hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v); } - #else - - GridBase* grid = psi._grid; - int Ls = this->Ls; - int LLs = grid->_rdimensions[0]; - int nsimd = Simd::Nsimd(); - - Vector> u(LLs); - Vector> l(LLs); - Vector> d(LLs); - Vector> s(LLs); - - assert(Ls/LLs == nsimd); - assert(phi.checkerboard == psi.checkerboard); - - chi.checkerboard = psi.checkerboard; - - // just directly address via type pun - typedef typename Simd::scalar_type scalar_type; - scalar_type* u_p = (scalar_type*) &u[0]; - scalar_type* l_p = (scalar_type*) &l[0]; - scalar_type* d_p = (scalar_type*) &d[0]; - scalar_type* s_p = (scalar_type*) &s[0]; - - for(int o=0; oM5Dcalls++; - this->M5Dtime -= usecond(); - - parallel_for(int ss=0; ssoSites(); ss+=LLs){ // adds LLs - - int vs = (this->pm == 1) ? LLs-1 : 0; - Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(0)(0) : psi[ss+vs]()(2)(0); - Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(0)(1) : psi[ss+vs]()(2)(1); - Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(0)(2) : psi[ss+vs]()(2)(2); - Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(1)(0) : psi[ss+vs]()(3)(0); - Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(1)(1) : psi[ss+vs]()(3)(1); - Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(1)(2) : psi[ss+vs]()(3)(2); - - for(int v=0; v(hp_00.v); - hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v); - hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v); - hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v); - hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v); - hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v); - } - - if(this->pm == 1 && vs <= v){ - hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v); - hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v); - hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v); - hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v); - hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v); - hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v); - } - - if(vm >= v){ - hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v); - hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v); - hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v); - hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v); - hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v); - hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v); - } - - if(this->pm == -1 && vs >= v){ - hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v); - hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v); - hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v); - hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v); - hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v); - hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v); - } - - Simd p_00 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo::mult(u[v]()()(), hp_00) - + switcheroo::mult(s[v]()()(), hs_00) - : switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo::mult(u[v]()()(), hp_00); - Simd p_01 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo::mult(u[v]()()(), hp_01) - + switcheroo::mult(s[v]()()(), hs_01) - : switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo::mult(u[v]()()(), hp_01); - Simd p_02 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo::mult(u[v]()()(), hp_02) - + switcheroo::mult(s[v]()()(), hs_02) - : switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo::mult(u[v]()()(), hp_02); - Simd p_10 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo::mult(u[v]()()(), hp_10) - + switcheroo::mult(s[v]()()(), hs_10) - : switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo::mult(u[v]()()(), hp_10); - Simd p_11 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo::mult(u[v]()()(), hp_11) - + switcheroo::mult(s[v]()()(), hs_11) - : switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo::mult(u[v]()()(), hp_11); - Simd p_12 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo::mult(u[v]()()(), hp_12) - + switcheroo::mult(s[v]()()(), hs_12) - : switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo::mult(u[v]()()(), hp_12); - Simd p_20 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo::mult(l[v]()()(), hm_00) - : switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo::mult(l[v]()()(), hm_00) - + switcheroo::mult(s[v]()()(), hs_00); - Simd p_21 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo::mult(l[v]()()(), hm_01) - : switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo::mult(l[v]()()(), hm_01) - + switcheroo::mult(s[v]()()(), hs_01); - Simd p_22 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo::mult(l[v]()()(), hm_02) - : switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo::mult(l[v]()()(), hm_02) - + switcheroo::mult(s[v]()()(), hs_02); - Simd p_30 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo::mult(l[v]()()(), hm_10) - : switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo::mult(l[v]()()(), hm_10) - + switcheroo::mult(s[v]()()(), hs_10); - Simd p_31 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo::mult(l[v]()()(), hm_11) - : switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo::mult(l[v]()()(), hm_11) - + switcheroo::mult(s[v]()()(), hs_11); - Simd p_32 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo::mult(l[v]()()(), hm_12) - : switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo::mult(l[v]()()(), hm_12) - + switcheroo::mult(s[v]()()(), hs_12); - - vstream(chi[ss+v]()(0)(0), p_00); - vstream(chi[ss+v]()(0)(1), p_01); - vstream(chi[ss+v]()(0)(2), p_02); - vstream(chi[ss+v]()(1)(0), p_10); - vstream(chi[ss+v]()(1)(1), p_11); - vstream(chi[ss+v]()(1)(2), p_12); - vstream(chi[ss+v]()(2)(0), p_20); - vstream(chi[ss+v]()(2)(1), p_21); - vstream(chi[ss+v]()(2)(2), p_22); - vstream(chi[ss+v]()(3)(0), p_30); - vstream(chi[ss+v]()(3)(1), p_31); - vstream(chi[ss+v]()(3)(2), p_32); - - } - + if(vm >= v){ + hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v); + hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v); + hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v); + hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v); + hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v); + hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v); } - this->M5Dtime += usecond(); + Simd p_00 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo::mult(u[v]()()(), hp_00); + Simd p_01 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo::mult(u[v]()()(), hp_01); + Simd p_02 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo::mult(u[v]()()(), hp_02); + Simd p_10 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo::mult(u[v]()()(), hp_10); + Simd p_11 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo::mult(u[v]()()(), hp_11); + Simd p_12 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo::mult(u[v]()()(), hp_12); + Simd p_20 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo::mult(l[v]()()(), hm_00); + Simd p_21 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo::mult(l[v]()()(), hm_01); + Simd p_22 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo::mult(l[v]()()(), hm_02); + Simd p_30 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo::mult(l[v]()()(), hm_10); + Simd p_31 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo::mult(l[v]()()(), hm_11); + Simd p_32 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo::mult(l[v]()()(), hm_12); + + vstream(chi[ss+v]()(0)(0), p_00); + vstream(chi[ss+v]()(0)(1), p_01); + vstream(chi[ss+v]()(0)(2), p_02); + vstream(chi[ss+v]()(1)(0), p_10); + vstream(chi[ss+v]()(1)(1), p_11); + vstream(chi[ss+v]()(1)(2), p_12); + vstream(chi[ss+v]()(2)(0), p_20); + vstream(chi[ss+v]()(2)(1), p_21); + vstream(chi[ss+v]()(2)(2), p_22); + vstream(chi[ss+v]()(3)(0), p_30); + vstream(chi[ss+v]()(3)(1), p_31); + vstream(chi[ss+v]()(3)(2), p_32); + + } + +#endif - #endif } - #ifdef AVX512 - #include - #include - #include - #endif + this->M5Dtime += usecond(); +} - template - void MobiusEOFAFermion::MooeeInternalAsm(const FermionField& psi, FermionField& chi, - int LLs, int site, Vector >& Matp, Vector >& Matm) +template +void MobiusEOFAFermion::M5Ddag_shift(const FermionField& psi, const FermionField& phi, + FermionField& chi, std::vector& lower, std::vector& diag, std::vector& upper, + std::vector& shift_coeffs) +{ +#if 0 + + this->M5Ddag(psi, phi, chi, lower, diag, upper); + + // FIXME: possible gain from vectorizing shift operation as well? + Coeff_t one(1.0); + int Ls = this->Ls; + for(int s=0; spm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, Ls-1, s); } + else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, 0, s); } + } + +#else + + GridBase* grid = psi._grid; + int Ls = this->Ls; + int LLs = grid->_rdimensions[0]; + int nsimd = Simd::Nsimd(); + + Vector> u(LLs); + Vector> l(LLs); + Vector> d(LLs); + Vector> s(LLs); + + assert(Ls/LLs == nsimd); + assert(phi.checkerboard == psi.checkerboard); + + chi.checkerboard = psi.checkerboard; + + // just directly address via type pun + typedef typename Simd::scalar_type scalar_type; + scalar_type* u_p = (scalar_type*) &u[0]; + scalar_type* l_p = (scalar_type*) &l[0]; + scalar_type* d_p = (scalar_type*) &d[0]; + scalar_type* s_p = (scalar_type*) &s[0]; + + for(int o=0; oM5Dcalls++; + this->M5Dtime -= usecond(); + + parallel_for(int ss=0; ssoSites(); ss+=LLs){ // adds LLs + + int vs = (this->pm == 1) ? LLs-1 : 0; + Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(0)(0) : psi[ss+vs]()(2)(0); + Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(0)(1) : psi[ss+vs]()(2)(1); + Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(0)(2) : psi[ss+vs]()(2)(2); + Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(1)(0) : psi[ss+vs]()(3)(0); + Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(1)(1) : psi[ss+vs]()(3)(1); + Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(1)(2) : psi[ss+vs]()(3)(2); + + for(int v=0; v(hp_00.v); + hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v); + hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v); + hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v); + hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v); + hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v); + } + + if(this->pm == 1 && vs <= v){ + hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v); + hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v); + hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v); + hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v); + hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v); + hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v); + } + + if(vm >= v){ + hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v); + hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v); + hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v); + hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v); + hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v); + hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v); + } + + if(this->pm == -1 && vs >= v){ + hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v); + hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v); + hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v); + hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v); + hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v); + hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v); + } + + Simd p_00 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo::mult(u[v]()()(), hp_00) + + switcheroo::mult(s[v]()()(), hs_00) + : switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo::mult(u[v]()()(), hp_00); + Simd p_01 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo::mult(u[v]()()(), hp_01) + + switcheroo::mult(s[v]()()(), hs_01) + : switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo::mult(u[v]()()(), hp_01); + Simd p_02 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo::mult(u[v]()()(), hp_02) + + switcheroo::mult(s[v]()()(), hs_02) + : switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo::mult(u[v]()()(), hp_02); + Simd p_10 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo::mult(u[v]()()(), hp_10) + + switcheroo::mult(s[v]()()(), hs_10) + : switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo::mult(u[v]()()(), hp_10); + Simd p_11 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo::mult(u[v]()()(), hp_11) + + switcheroo::mult(s[v]()()(), hs_11) + : switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo::mult(u[v]()()(), hp_11); + Simd p_12 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo::mult(u[v]()()(), hp_12) + + switcheroo::mult(s[v]()()(), hs_12) + : switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo::mult(u[v]()()(), hp_12); + Simd p_20 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo::mult(l[v]()()(), hm_00) + : switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo::mult(l[v]()()(), hm_00) + + switcheroo::mult(s[v]()()(), hs_00); + Simd p_21 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo::mult(l[v]()()(), hm_01) + : switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo::mult(l[v]()()(), hm_01) + + switcheroo::mult(s[v]()()(), hs_01); + Simd p_22 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo::mult(l[v]()()(), hm_02) + : switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo::mult(l[v]()()(), hm_02) + + switcheroo::mult(s[v]()()(), hs_02); + Simd p_30 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo::mult(l[v]()()(), hm_10) + : switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo::mult(l[v]()()(), hm_10) + + switcheroo::mult(s[v]()()(), hs_10); + Simd p_31 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo::mult(l[v]()()(), hm_11) + : switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo::mult(l[v]()()(), hm_11) + + switcheroo::mult(s[v]()()(), hs_11); + Simd p_32 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo::mult(l[v]()()(), hm_12) + : switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo::mult(l[v]()()(), hm_12) + + switcheroo::mult(s[v]()()(), hs_12); + + vstream(chi[ss+v]()(0)(0), p_00); + vstream(chi[ss+v]()(0)(1), p_01); + vstream(chi[ss+v]()(0)(2), p_02); + vstream(chi[ss+v]()(1)(0), p_10); + vstream(chi[ss+v]()(1)(1), p_11); + vstream(chi[ss+v]()(1)(2), p_12); + vstream(chi[ss+v]()(2)(0), p_20); + vstream(chi[ss+v]()(2)(1), p_21); + vstream(chi[ss+v]()(2)(2), p_22); + vstream(chi[ss+v]()(3)(0), p_30); + vstream(chi[ss+v]()(3)(1), p_31); + vstream(chi[ss+v]()(3)(2), p_32); + + } + + } + + this->M5Dtime += usecond(); + +#endif +} + +#ifdef AVX512 +#include +#include +#include +#endif + +template +void MobiusEOFAFermion::MooeeInternalAsm(const FermionField& psi, FermionField& chi, + int LLs, int site, Vector >& Matp, Vector >& Matm) +{ +#ifndef AVX512 { - #ifndef AVX512 - { - SiteHalfSpinor BcastP; - SiteHalfSpinor BcastM; - SiteHalfSpinor SiteChiP; - SiteHalfSpinor SiteChiM; + SiteHalfSpinor BcastP; + SiteHalfSpinor BcastM; + SiteHalfSpinor SiteChiP; + SiteHalfSpinor SiteChiM; - // Ls*Ls * 2 * 12 * vol flops - for(int s1=0; s1); - - for(int s1=0; s1 - void MobiusEOFAFermion::MooeeInternalZAsm(const FermionField& psi, FermionField& chi, - int LLs, int site, Vector >& Matp, Vector >& Matm) + } + } +#else { - std::cout << "Error: zMobius not implemented for EOFA" << std::endl; - exit(-1); - }; + // pointers + // MASK_REGS; +#define Chi_00 %%zmm1 +#define Chi_01 %%zmm2 +#define Chi_02 %%zmm3 +#define Chi_10 %%zmm4 +#define Chi_11 %%zmm5 +#define Chi_12 %%zmm6 +#define Chi_20 %%zmm7 +#define Chi_21 %%zmm8 +#define Chi_22 %%zmm9 +#define Chi_30 %%zmm10 +#define Chi_31 %%zmm11 +#define Chi_32 %%zmm12 - template - void MobiusEOFAFermion::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv) - { - int Ls = this->Ls; - int LLs = psi._grid->_rdimensions[0]; - int vol = psi._grid->oSites()/LLs; +#define BCAST0 %%zmm13 +#define BCAST1 %%zmm14 +#define BCAST2 %%zmm15 +#define BCAST3 %%zmm16 +#define BCAST4 %%zmm17 +#define BCAST5 %%zmm18 +#define BCAST6 %%zmm19 +#define BCAST7 %%zmm20 +#define BCAST8 %%zmm21 +#define BCAST9 %%zmm22 +#define BCAST10 %%zmm23 +#define BCAST11 %%zmm24 - chi.checkerboard = psi.checkerboard; + int incr = LLs*LLs*sizeof(iSinglet); - Vector> Matp; - Vector> Matm; - Vector>* _Matp; - Vector>* _Matm; + for(int s1=0; s1MatpInvDag; - _Matm = &this->MatmInvDag; - } + for(int s2=0; s2MatpInv; - _Matm = &this->MatmInv; - } + int lex = s2 + LLs*site; + uint64_t a0 = (uint64_t) &Matp[LLs*s2+s1]; // should be cacheable + uint64_t a1 = (uint64_t) &Matm[LLs*s2+s1]; + uint64_t a2 = (uint64_t) &psi[lex]; - if(!inv){ - MooeeInternalCompute(dag, inv, Matp, Matm); - _Matp = &Matp; - _Matm = &Matm; - } + for(int l=0; lsize() == Ls*LLs); + if((s2+l)==0) { + asm( + VPREFETCH1(0,%2) VPREFETCH1(0,%1) + VPREFETCH1(12,%2) VPREFETCH1(13,%2) + VPREFETCH1(14,%2) VPREFETCH1(15,%2) + VBCASTCDUP(0,%2,BCAST0) + VBCASTCDUP(1,%2,BCAST1) + VBCASTCDUP(2,%2,BCAST2) + VBCASTCDUP(3,%2,BCAST3) + VBCASTCDUP(4,%2,BCAST4) VMULMEM(0,%0,BCAST0,Chi_00) + VBCASTCDUP(5,%2,BCAST5) VMULMEM(0,%0,BCAST1,Chi_01) + VBCASTCDUP(6,%2,BCAST6) VMULMEM(0,%0,BCAST2,Chi_02) + VBCASTCDUP(7,%2,BCAST7) VMULMEM(0,%0,BCAST3,Chi_10) + VBCASTCDUP(8,%2,BCAST8) VMULMEM(0,%0,BCAST4,Chi_11) + VBCASTCDUP(9,%2,BCAST9) VMULMEM(0,%0,BCAST5,Chi_12) + VBCASTCDUP(10,%2,BCAST10) VMULMEM(0,%1,BCAST6,Chi_20) + VBCASTCDUP(11,%2,BCAST11) VMULMEM(0,%1,BCAST7,Chi_21) + VMULMEM(0,%1,BCAST8,Chi_22) + VMULMEM(0,%1,BCAST9,Chi_30) + VMULMEM(0,%1,BCAST10,Chi_31) + VMULMEM(0,%1,BCAST11,Chi_32) + : : "r" (a0), "r" (a1), "r" (a2) ); + } else { + asm( + VBCASTCDUP(0,%2,BCAST0) VMADDMEM(0,%0,BCAST0,Chi_00) + VBCASTCDUP(1,%2,BCAST1) VMADDMEM(0,%0,BCAST1,Chi_01) + VBCASTCDUP(2,%2,BCAST2) VMADDMEM(0,%0,BCAST2,Chi_02) + VBCASTCDUP(3,%2,BCAST3) VMADDMEM(0,%0,BCAST3,Chi_10) + VBCASTCDUP(4,%2,BCAST4) VMADDMEM(0,%0,BCAST4,Chi_11) + VBCASTCDUP(5,%2,BCAST5) VMADDMEM(0,%0,BCAST5,Chi_12) + VBCASTCDUP(6,%2,BCAST6) VMADDMEM(0,%1,BCAST6,Chi_20) + VBCASTCDUP(7,%2,BCAST7) VMADDMEM(0,%1,BCAST7,Chi_21) + VBCASTCDUP(8,%2,BCAST8) VMADDMEM(0,%1,BCAST8,Chi_22) + VBCASTCDUP(9,%2,BCAST9) VMADDMEM(0,%1,BCAST9,Chi_30) + VBCASTCDUP(10,%2,BCAST10) VMADDMEM(0,%1,BCAST10,Chi_31) + VBCASTCDUP(11,%2,BCAST11) VMADDMEM(0,%1,BCAST11,Chi_32) + : : "r" (a0), "r" (a1), "r" (a2) ); + } - this->MooeeInvCalls++; - this->MooeeInvTime -= usecond(); - - if(switcheroo::iscomplex()){ - parallel_for(auto site=0; siteMooeeInvTime += usecond(); } - #ifdef MOBIUS_EOFA_DPERP_VEC +#undef Chi_00 +#undef Chi_01 +#undef Chi_02 +#undef Chi_10 +#undef Chi_11 +#undef Chi_12 +#undef Chi_20 +#undef Chi_21 +#undef Chi_22 +#undef Chi_30 +#undef Chi_31 +#undef Chi_32 - INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplD); - INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplF); - INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplD); - INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplF); +#undef BCAST0 +#undef BCAST1 +#undef BCAST2 +#undef BCAST3 +#undef BCAST4 +#undef BCAST5 +#undef BCAST6 +#undef BCAST7 +#undef BCAST8 +#undef BCAST9 +#undef BCAST10 +#undef BCAST11 - INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplDF); - INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplFH); - INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplDF); - INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplFH); +#endif +}; - template void MobiusEOFAFermion::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv); - template void MobiusEOFAFermion::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv); - template void MobiusEOFAFermion::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv); - template void MobiusEOFAFermion::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv); +// Z-mobius version +template +void MobiusEOFAFermion::MooeeInternalZAsm(const FermionField& psi, FermionField& chi, + int LLs, int site, Vector >& Matp, Vector >& Matm) +{ + std::cout << "Error: zMobius not implemented for EOFA" << std::endl; + exit(-1); +}; - template void MobiusEOFAFermion::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv); - template void MobiusEOFAFermion::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv); - template void MobiusEOFAFermion::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv); - template void MobiusEOFAFermion::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv); +template +void MobiusEOFAFermion::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv) +{ + int Ls = this->Ls; + int LLs = psi._grid->_rdimensions[0]; + int vol = psi._grid->oSites()/LLs; - #endif + chi.checkerboard = psi.checkerboard; -}} + Vector> Matp; + Vector> Matm; + Vector>* _Matp; + Vector>* _Matm; + + // MooeeInternalCompute(dag,inv,Matp,Matm); + if(inv && dag){ + _Matp = &this->MatpInvDag; + _Matm = &this->MatmInvDag; + } + + if(inv && (!dag)){ + _Matp = &this->MatpInv; + _Matm = &this->MatmInv; + } + + if(!inv){ + MooeeInternalCompute(dag, inv, Matp, Matm); + _Matp = &Matp; + _Matm = &Matm; + } + + assert(_Matp->size() == Ls*LLs); + + this->MooeeInvCalls++; + this->MooeeInvTime -= usecond(); + + if(switcheroo::iscomplex()){ + parallel_for(auto site=0; siteMooeeInvTime += usecond(); +} + +#ifdef MOBIUS_EOFA_DPERP_VEC + +INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplD); +INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplF); +INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplD); +INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplF); + +INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplDF); +INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplFH); +INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplDF); +INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplFH); + +template void MobiusEOFAFermion::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv); +template void MobiusEOFAFermion::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv); +template void MobiusEOFAFermion::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv); +template void MobiusEOFAFermion::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv); + +template void MobiusEOFAFermion::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv); +template void MobiusEOFAFermion::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv); +template void MobiusEOFAFermion::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv); +template void MobiusEOFAFermion::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv); + +#endif + +NAMESPACE_END(Grid);