/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/qcd/action/fermion/MobiusEOFAFermionvec.cc Copyright (C) 2017 Author: Peter Boyle Author: Peter Boyle Author: Peter Boyle Author: paboyle Author: David Murphy This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ /* NOTE(review): this copy of the file appears damaged. Text that would sit between angle brackets (template parameter lists, include targets, loop bounds and the statements following them) is missing, and line breaks have been collapsed. Restore the file from the upstream repository before building or changing the code. The comments added below describe only what is still visible. */ #include #include namespace Grid { namespace QCD { /* * Dense matrix versions of routines */ /* MooeeInv: forwards psi and chi to MooeeInternal with the flags DaggerNo and InverseYes. */ template void MobiusEOFAFermion::MooeeInv(const FermionField& psi, FermionField& chi) { this->MooeeInternal(psi, chi, DaggerNo, InverseYes); } /* MooeeInv_shift: passes exactly the same arguments to MooeeInternal as MooeeInv (DaggerNo, InverseYes). NOTE(review): nothing in this wrapper distinguishes the shifted case; presumably the matrices used by MooeeInternal already account for the shift - confirm. */ template void MobiusEOFAFermion::MooeeInv_shift(const FermionField& psi, FermionField& chi) { this->MooeeInternal(psi, chi, DaggerNo, InverseYes); } /* MooeeInvDag: forwards psi and chi to MooeeInternal with the flags DaggerYes and InverseYes. */ template void MobiusEOFAFermion::MooeeInvDag(const FermionField& psi, FermionField& chi) { this->MooeeInternal(psi, chi, DaggerYes, InverseYes); } /* MooeeInvDag_shift: passes exactly the same arguments to MooeeInternal as MooeeInvDag (DaggerYes, InverseYes). NOTE(review): as with MooeeInv_shift, the shift is not visible here - confirm it is handled elsewhere. */ template void MobiusEOFAFermion::MooeeInvDag_shift(const FermionField& psi, FermionField& chi) { this->MooeeInternal(psi, chi, DaggerYes, InverseYes); } /* M5D: writes chi from phi and psi using the coefficient vectors lower, diag and upper. Local vectors u, l, d of length LLs are filled through the scalar pointers u_p, l_p, d_p. The function asserts Ls/LLs == Simd::Nsimd(), equal checkerboards of phi and psi, and Nc == 3, and copies the checkerboard of psi to chi. In the active preprocessor branch each output element is d[v] times the phi element plus either l[v] times an hm temporary (spin indices 0 and 1) or u[v] times an hp temporary (spin indices 2 and 3), stored with vstream. The site loop is bracketed by M5Dcalls and M5Dtime bookkeeping. NOTE(review): the coefficient setup loop and the initialisation of vp, vm and the hp, hm temporaries are not visible in this copy. */ template void MobiusEOFAFermion::M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, std::vector& lower, 
std::vector& diag, std::vector& upper) { GridBase* grid = psi._grid; int Ls = this->Ls; int LLs = grid->_rdimensions[0]; const int nsimd = Simd::Nsimd(); Vector> u(LLs); Vector> l(LLs); Vector> d(LLs); assert(Ls/LLs == nsimd); assert(phi.checkerboard == psi.checkerboard); chi.checkerboard = psi.checkerboard; // just directly address via type pun typedef typename Simd::scalar_type scalar_type; scalar_type* u_p = (scalar_type*) &u[0]; scalar_type* l_p = (scalar_type*) &l[0]; scalar_type* d_p = (scalar_type*) &d[0]; for(int o=0; oM5Dcalls++; this->M5Dtime -= usecond(); assert(Nc == 3); parallel_for(int ss=0; ssoSites(); ss+=LLs){ // adds LLs #if 0 alignas(64) SiteHalfSpinor hp; alignas(64) SiteHalfSpinor hm; alignas(64) SiteSpinor fp; alignas(64) SiteSpinor fm; for(int v=0; v= v){ rotate(hm, hm, nsimd-1); } hp = 0.5*hp; hm = 0.5*hm; spRecon5m(fp, hp); spRecon5p(fm, hm); chi[ss+v] = d[v]*phi[ss+v]; chi[ss+v] = chi[ss+v] + u[v]*fp; chi[ss+v] = chi[ss+v] + l[v]*fm; } #else for(int v=0; v(hp_00.v); hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v); hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v); hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v); hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v); hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v); } if(vm >= v){ hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v); hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v); hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v); hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v); hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v); hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v); } // Can force these to real arithmetic and save 2x. 
Simd p_00 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo::mult(l[v]()()(), hm_00); Simd p_01 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo::mult(l[v]()()(), hm_01); Simd p_02 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo::mult(l[v]()()(), hm_02); Simd p_10 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo::mult(l[v]()()(), hm_10); Simd p_11 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo::mult(l[v]()()(), hm_11); Simd p_12 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo::mult(l[v]()()(), hm_12); Simd p_20 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo::mult(u[v]()()(), hp_00); Simd p_21 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo::mult(u[v]()()(), hp_01); Simd p_22 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo::mult(u[v]()()(), hp_02); Simd p_30 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo::mult(u[v]()()(), hp_10); Simd p_31 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo::mult(u[v]()()(), hp_11); Simd p_32 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo::mult(u[v]()()(), hp_12); vstream(chi[ss+v]()(0)(0), p_00); vstream(chi[ss+v]()(0)(1), p_01); vstream(chi[ss+v]()(0)(2), p_02); vstream(chi[ss+v]()(1)(0), p_10); vstream(chi[ss+v]()(1)(1), p_11); vstream(chi[ss+v]()(1)(2), p_12); vstream(chi[ss+v]()(2)(0), p_20); vstream(chi[ss+v]()(2)(1), p_21); vstream(chi[ss+v]()(2)(2), p_22); vstream(chi[ss+v]()(3)(0), p_30); vstream(chi[ss+v]()(3)(1), p_31); vstream(chi[ss+v]()(3)(2), p_32); } #endif } this->M5Dtime += usecond(); } /* M5D_shift: same update as M5D with one extra term, s[v] times an hs temporary, where s is a fourth local coefficient vector addressed through s_p. The hs temporaries are loaded once per site group from psi at slice vs, which is LLs-1 when pm == 1 and 0 otherwise; they take spin indices 2 and 3 when pm == 1 and spin indices 0 and 1 otherwise. The extra term is added to output spin indices 2 and 3 when pm == 1, and to output spin indices 0 and 1 otherwise. The disabled preprocessor branch is an alternative that calls M5D and then applies axpby_ssp_pplus or axpby_ssp_pminus per slice. NOTE(review): the loop that fills u, l, d, s (presumably from lower, diag, upper and shift_coeffs) and the head of the inner v loop are not visible in this copy. */ template void MobiusEOFAFermion::M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, std::vector& lower, std::vector& diag, std::vector& upper, std::vector& shift_coeffs) { #if 0 this->M5D(psi, phi, chi, lower, diag, upper); // FIXME: possible gain from vectorizing shift operation as well? 
Coeff_t one(1.0); int Ls = this->Ls; for(int s=0; spm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, s, Ls-1); } else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, s, 0); } } #else GridBase* grid = psi._grid; int Ls = this->Ls; int LLs = grid->_rdimensions[0]; const int nsimd = Simd::Nsimd(); Vector> u(LLs); Vector> l(LLs); Vector> d(LLs); Vector> s(LLs); assert(Ls/LLs == nsimd); assert(phi.checkerboard == psi.checkerboard); chi.checkerboard = psi.checkerboard; // just directly address via type pun typedef typename Simd::scalar_type scalar_type; scalar_type* u_p = (scalar_type*) &u[0]; scalar_type* l_p = (scalar_type*) &l[0]; scalar_type* d_p = (scalar_type*) &d[0]; scalar_type* s_p = (scalar_type*) &s[0]; for(int o=0; oM5Dcalls++; this->M5Dtime -= usecond(); assert(Nc == 3); parallel_for(int ss=0; ssoSites(); ss+=LLs){ // adds LLs int vs = (this->pm == 1) ? LLs-1 : 0; Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(2)(0) : psi[ss+vs]()(0)(0); Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(2)(1) : psi[ss+vs]()(0)(1); Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(2)(2) : psi[ss+vs]()(0)(2); Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(3)(0) : psi[ss+vs]()(1)(0); Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(3)(1) : psi[ss+vs]()(1)(1); Simd hs_12 = (this->pm == 1) ? 
psi[ss+vs]()(3)(2) : psi[ss+vs]()(1)(2); for(int v=0; v(hp_00.v); hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v); hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v); hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v); hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v); hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v); } if(this->pm == 1 && vs <= v){ hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v); hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v); hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v); hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v); hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v); hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v); } if(vm >= v){ hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v); hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v); hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v); hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v); hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v); hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v); } if(this->pm == -1 && vs >= v){ hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v); hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v); hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v); hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v); hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v); hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v); } // Can force these to real arithmetic and save 2x. Simd p_00 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo::mult(l[v]()()(), hm_00) : switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo::mult(l[v]()()(), hm_00) + switcheroo::mult(s[v]()()(), hs_00); Simd p_01 = (this->pm == 1) ? 
switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo::mult(l[v]()()(), hm_01) : switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo::mult(l[v]()()(), hm_01) + switcheroo::mult(s[v]()()(), hs_01); Simd p_02 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo::mult(l[v]()()(), hm_02) : switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo::mult(l[v]()()(), hm_02) + switcheroo::mult(s[v]()()(), hs_02); Simd p_10 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo::mult(l[v]()()(), hm_10) : switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo::mult(l[v]()()(), hm_10) + switcheroo::mult(s[v]()()(), hs_10); Simd p_11 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo::mult(l[v]()()(), hm_11) : switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo::mult(l[v]()()(), hm_11) + switcheroo::mult(s[v]()()(), hs_11); Simd p_12 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo::mult(l[v]()()(), hm_12) : switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo::mult(l[v]()()(), hm_12) + switcheroo::mult(s[v]()()(), hs_12); Simd p_20 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo::mult(u[v]()()(), hp_00) + switcheroo::mult(s[v]()()(), hs_00) : switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo::mult(u[v]()()(), hp_00); Simd p_21 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo::mult(u[v]()()(), hp_01) + switcheroo::mult(s[v]()()(), hs_01) : switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo::mult(u[v]()()(), hp_01); Simd p_22 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo::mult(u[v]()()(), hp_02) + switcheroo::mult(s[v]()()(), hs_02) : switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo::mult(u[v]()()(), hp_02); Simd p_30 = (this->pm == 1) ? 
switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo::mult(u[v]()()(), hp_10) + switcheroo::mult(s[v]()()(), hs_10) : switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo::mult(u[v]()()(), hp_10); Simd p_31 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo::mult(u[v]()()(), hp_11) + switcheroo::mult(s[v]()()(), hs_11) : switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo::mult(u[v]()()(), hp_11); Simd p_32 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo::mult(u[v]()()(), hp_12) + switcheroo::mult(s[v]()()(), hs_12) : switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo::mult(u[v]()()(), hp_12); vstream(chi[ss+v]()(0)(0), p_00); vstream(chi[ss+v]()(0)(1), p_01); vstream(chi[ss+v]()(0)(2), p_02); vstream(chi[ss+v]()(1)(0), p_10); vstream(chi[ss+v]()(1)(1), p_11); vstream(chi[ss+v]()(1)(2), p_12); vstream(chi[ss+v]()(2)(0), p_20); vstream(chi[ss+v]()(2)(1), p_21); vstream(chi[ss+v]()(2)(2), p_22); vstream(chi[ss+v]()(3)(0), p_30); vstream(chi[ss+v]()(3)(1), p_31); vstream(chi[ss+v]()(3)(2), p_32); } } this->M5Dtime += usecond(); #endif } /* M5Ddag: counterpart of M5D with the coefficient roles swapped between spin halves. In the active preprocessor branch output spin indices 0 and 1 receive d[v] times phi plus u[v] times an hp temporary, and output spin indices 2 and 3 receive d[v] times phi plus l[v] times an hm temporary. It makes the same Ls/LLs and checkerboard assertions as M5D but contains no Nc == 3 assertion, and it updates the same M5Dcalls and M5Dtime counters. NOTE(review): the coefficient setup loop and the head of the inner v loop are not visible in this copy. */ template void MobiusEOFAFermion::M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, std::vector& lower, std::vector& diag, std::vector& upper) { GridBase* grid = psi._grid; int Ls = this->Ls; int LLs = grid->_rdimensions[0]; int nsimd = Simd::Nsimd(); Vector> u(LLs); Vector> l(LLs); Vector> d(LLs); assert(Ls/LLs == nsimd); assert(phi.checkerboard == psi.checkerboard); chi.checkerboard = psi.checkerboard; // just directly address via type pun typedef typename Simd::scalar_type scalar_type; scalar_type* u_p = (scalar_type*) &u[0]; scalar_type* l_p = (scalar_type*) &l[0]; scalar_type* d_p = (scalar_type*) &d[0]; for(int o=0; oM5Dcalls++; this->M5Dtime -= usecond(); parallel_for(int ss=0; ssoSites(); ss+=LLs){ // adds LLs #if 0 alignas(64) SiteHalfSpinor hp; alignas(64) SiteHalfSpinor hm; alignas(64) SiteSpinor fp; 
alignas(64) SiteSpinor fm; for(int v=0; v= v){ rotate(hm, hm, nsimd-1); } hp = hp*0.5; hm = hm*0.5; spRecon5p(fp, hp); spRecon5m(fm, hm); chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp; chi[ss+v] = chi[ss+v] +l[v]*fm; } #else for(int v=0; v(hp_00.v); hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v); hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v); hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v); hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v); hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v); } if(vm >= v){ hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v); hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v); hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v); hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v); hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v); hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v); } Simd p_00 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo::mult(u[v]()()(), hp_00); Simd p_01 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo::mult(u[v]()()(), hp_01); Simd p_02 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo::mult(u[v]()()(), hp_02); Simd p_10 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo::mult(u[v]()()(), hp_10); Simd p_11 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo::mult(u[v]()()(), hp_11); Simd p_12 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo::mult(u[v]()()(), hp_12); Simd p_20 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo::mult(l[v]()()(), hm_00); Simd p_21 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo::mult(l[v]()()(), hm_01); Simd p_22 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo::mult(l[v]()()(), hm_02); Simd p_30 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo::mult(l[v]()()(), hm_10); Simd p_31 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(1)) + 
switcheroo::mult(l[v]()()(), hm_11); Simd p_32 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo::mult(l[v]()()(), hm_12); vstream(chi[ss+v]()(0)(0), p_00); vstream(chi[ss+v]()(0)(1), p_01); vstream(chi[ss+v]()(0)(2), p_02); vstream(chi[ss+v]()(1)(0), p_10); vstream(chi[ss+v]()(1)(1), p_11); vstream(chi[ss+v]()(1)(2), p_12); vstream(chi[ss+v]()(2)(0), p_20); vstream(chi[ss+v]()(2)(1), p_21); vstream(chi[ss+v]()(2)(2), p_22); vstream(chi[ss+v]()(3)(0), p_30); vstream(chi[ss+v]()(3)(1), p_31); vstream(chi[ss+v]()(3)(2), p_32); } #endif } this->M5Dtime += usecond(); } /* M5Ddag_shift: same update as M5Ddag with one extra term, s[v] times an hs temporary. The hs temporaries are loaded once per site group from psi at slice vs, which is LLs-1 when pm == 1 and 0 otherwise; they take spin indices 0 and 1 when pm == 1 and spin indices 2 and 3 otherwise. The extra term is added to output spin indices 0 and 1 when pm == 1, and to output spin indices 2 and 3 otherwise. The disabled preprocessor branch is an alternative that calls M5Ddag and then applies axpby_ssp_pplus or axpby_ssp_pminus per slice. NOTE(review): the loop that fills u, l, d, s and the head of the inner v loop are not visible in this copy. */ template void MobiusEOFAFermion::M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, std::vector& lower, std::vector& diag, std::vector& upper, std::vector& shift_coeffs) { #if 0 this->M5Ddag(psi, phi, chi, lower, diag, upper); // FIXME: possible gain from vectorizing shift operation as well? Coeff_t one(1.0); int Ls = this->Ls; for(int s=0; spm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, Ls-1, s); } else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, 0, s); } } #else GridBase* grid = psi._grid; int Ls = this->Ls; int LLs = grid->_rdimensions[0]; int nsimd = Simd::Nsimd(); Vector> u(LLs); Vector> l(LLs); Vector> d(LLs); Vector> s(LLs); assert(Ls/LLs == nsimd); assert(phi.checkerboard == psi.checkerboard); chi.checkerboard = psi.checkerboard; // just directly address via type pun typedef typename Simd::scalar_type scalar_type; scalar_type* u_p = (scalar_type*) &u[0]; scalar_type* l_p = (scalar_type*) &l[0]; scalar_type* d_p = (scalar_type*) &d[0]; scalar_type* s_p = (scalar_type*) &s[0]; for(int o=0; oM5Dcalls++; this->M5Dtime -= usecond(); parallel_for(int ss=0; ssoSites(); ss+=LLs){ // adds LLs int vs = (this->pm == 1) ? LLs-1 : 0; Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(0)(0) : psi[ss+vs]()(2)(0); Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(0)(1) : psi[ss+vs]()(2)(1); Simd hs_02 = (this->pm == 1) ? 
psi[ss+vs]()(0)(2) : psi[ss+vs]()(2)(2); Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(1)(0) : psi[ss+vs]()(3)(0); Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(1)(1) : psi[ss+vs]()(3)(1); Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(1)(2) : psi[ss+vs]()(3)(2); for(int v=0; v(hp_00.v); hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v); hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v); hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v); hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v); hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v); } if(this->pm == 1 && vs <= v){ hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v); hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v); hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v); hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v); hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v); hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v); } if(vm >= v){ hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v); hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v); hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v); hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v); hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v); hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v); } if(this->pm == -1 && vs >= v){ hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v); hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v); hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v); hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v); hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v); hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v); } Simd p_00 = (this->pm == 1) ? 
switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo::mult(u[v]()()(), hp_00) + switcheroo::mult(s[v]()()(), hs_00) : switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo::mult(u[v]()()(), hp_00); Simd p_01 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo::mult(u[v]()()(), hp_01) + switcheroo::mult(s[v]()()(), hs_01) : switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo::mult(u[v]()()(), hp_01); Simd p_02 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo::mult(u[v]()()(), hp_02) + switcheroo::mult(s[v]()()(), hs_02) : switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo::mult(u[v]()()(), hp_02); Simd p_10 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo::mult(u[v]()()(), hp_10) + switcheroo::mult(s[v]()()(), hs_10) : switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo::mult(u[v]()()(), hp_10); Simd p_11 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo::mult(u[v]()()(), hp_11) + switcheroo::mult(s[v]()()(), hs_11) : switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo::mult(u[v]()()(), hp_11); Simd p_12 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo::mult(u[v]()()(), hp_12) + switcheroo::mult(s[v]()()(), hs_12) : switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo::mult(u[v]()()(), hp_12); Simd p_20 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo::mult(l[v]()()(), hm_00) : switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo::mult(l[v]()()(), hm_00) + switcheroo::mult(s[v]()()(), hs_00); Simd p_21 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo::mult(l[v]()()(), hm_01) : switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo::mult(l[v]()()(), hm_01) + switcheroo::mult(s[v]()()(), hs_01); Simd p_22 = (this->pm == 1) ? 
switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo::mult(l[v]()()(), hm_02) : switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo::mult(l[v]()()(), hm_02) + switcheroo::mult(s[v]()()(), hs_02); Simd p_30 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo::mult(l[v]()()(), hm_10) : switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo::mult(l[v]()()(), hm_10) + switcheroo::mult(s[v]()()(), hs_10); Simd p_31 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo::mult(l[v]()()(), hm_11) : switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo::mult(l[v]()()(), hm_11) + switcheroo::mult(s[v]()()(), hs_11); Simd p_32 = (this->pm == 1) ? switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo::mult(l[v]()()(), hm_12) : switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo::mult(l[v]()()(), hm_12) + switcheroo::mult(s[v]()()(), hs_12); vstream(chi[ss+v]()(0)(0), p_00); vstream(chi[ss+v]()(0)(1), p_01); vstream(chi[ss+v]()(0)(2), p_02); vstream(chi[ss+v]()(1)(0), p_10); vstream(chi[ss+v]()(1)(1), p_11); vstream(chi[ss+v]()(1)(2), p_12); vstream(chi[ss+v]()(2)(0), p_20); vstream(chi[ss+v]()(2)(1), p_21); vstream(chi[ss+v]()(2)(2), p_22); vstream(chi[ss+v]()(3)(0), p_30); vstream(chi[ss+v]()(3)(1), p_31); vstream(chi[ss+v]()(3)(2), p_32); } } this->M5Dtime += usecond(); #endif } #ifdef AVX512 #include #include #include #endif /* MooeeInternalAsm: takes psi, chi, LLs, a site index and the matrices Matp and Matm. The branch compiled when AVX512 is not defined declares the SiteHalfSpinor temporaries BcastP, BcastM, SiteChiP and SiteChiM. NOTE(review): almost all of the body, including both loop nests and the AVX512 branch, is missing from this copy, so its behaviour cannot be documented from here. */ template void MobiusEOFAFermion::MooeeInternalAsm(const FermionField& psi, FermionField& chi, int LLs, int site, Vector >& Matp, Vector >& Matm) { #ifndef AVX512 { SiteHalfSpinor BcastP; SiteHalfSpinor BcastM; SiteHalfSpinor SiteChiP; SiteHalfSpinor SiteChiM; // Ls*Ls * 2 * 12 * vol flops for(int s1=0; s1); for(int s1=0; s1 /* MooeeInternalZAsm: not implemented for EOFA. Prints an error message to std::cout and terminates the process with exit(-1); none of the arguments are used. */ void MobiusEOFAFermion::MooeeInternalZAsm(const FermionField& psi, FermionField& chi, int LLs, int site, Vector >& Matp, Vector >& Matm) { std::cout << "Error: zMobius not implemented for EOFA" << std::endl; exit(-1); }; /* MooeeInternal: copies the checkerboard of psi to chi and selects the matrices to apply. With inv and dag both set it uses the members MatpInvDag and MatmInvDag; with inv set and dag clear it uses MatpInv and MatmInv; with inv clear it fills the local Matp and Matm through MooeeInternalCompute. The three cases cover every combination, so the pointers are always assigned before use. It then asserts that the selected Matp holds Ls*LLs entries and brackets the site loop with MooeeInvCalls and MooeeInvTime bookkeeping. NOTE(review): the body of the site loop is missing from this copy. */ template void 
MobiusEOFAFermion::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv) { int Ls = this->Ls; int LLs = psi._grid->_rdimensions[0]; int vol = psi._grid->oSites()/LLs; chi.checkerboard = psi.checkerboard; Vector> Matp; Vector> Matm; Vector>* _Matp; Vector>* _Matm; // MooeeInternalCompute(dag,inv,Matp,Matm); if(inv && dag){ _Matp = &this->MatpInvDag; _Matm = &this->MatmInvDag; } if(inv && (!dag)){ _Matp = &this->MatpInv; _Matm = &this->MatmInv; } if(!inv){ MooeeInternalCompute(dag, inv, Matp, Matm); _Matp = &Matp; _Matm = &Matm; } assert(_Matp->size() == Ls*LLs); this->MooeeInvCalls++; this->MooeeInvTime -= usecond(); if(switcheroo::iscomplex()){ parallel_for(auto site=0; siteMooeeInvTime += usecond(); } /* Explicit instantiations, compiled only when MOBIUS_EOFA_DPERP_VEC is defined: one INSTANTIATE_DPERP_MOBIUS_EOFA per implementation type, followed by eight MooeeInternal instantiations. NOTE(review): the template arguments of the MooeeInternal instantiations are missing from this copy, which leaves eight textually identical lines; presumably they matched the eight implementation types listed in the macro calls - confirm against upstream. */ #ifdef MOBIUS_EOFA_DPERP_VEC INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplD); INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplF); INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplD); INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplF); INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplDF); INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplFH); INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplDF); INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplFH); template void MobiusEOFAFermion::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv); template void MobiusEOFAFermion::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv); template void MobiusEOFAFermion::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv); template void MobiusEOFAFermion::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv); template void MobiusEOFAFermion::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv); template void MobiusEOFAFermion::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv); template void MobiusEOFAFermion::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv); template void 
MobiusEOFAFermion::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv); #endif }}