mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-04 19:25:56 +01:00
Vectorized Mobius EOFA Dperp + shift operation
This commit is contained in:
parent
459f70e8d4
commit
dd8f1ea189
@ -213,15 +213,180 @@ namespace QCD {
|
|||||||
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
|
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
|
||||||
std::vector<Coeff_t>& shift_coeffs)
|
std::vector<Coeff_t>& shift_coeffs)
|
||||||
{
|
{
|
||||||
this->M5D(psi, phi, chi, lower, diag, upper);
|
#if 0
|
||||||
|
|
||||||
// FIXME: possible gain from vectorizing shift operation as well?
|
this->M5D(psi, phi, chi, lower, diag, upper);
|
||||||
Coeff_t one(1.0);
|
|
||||||
int Ls = this->Ls;
|
// FIXME: possible gain from vectorizing shift operation as well?
|
||||||
for(int s=0; s<Ls; s++){
|
Coeff_t one(1.0);
|
||||||
if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, s, Ls-1); }
|
int Ls = this->Ls;
|
||||||
else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, s, 0); }
|
for(int s=0; s<Ls; s++){
|
||||||
}
|
if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, s, Ls-1); }
|
||||||
|
else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, s, 0); }
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
GridBase* grid = psi._grid;
|
||||||
|
int Ls = this->Ls;
|
||||||
|
int LLs = grid->_rdimensions[0];
|
||||||
|
const int nsimd = Simd::Nsimd();
|
||||||
|
|
||||||
|
Vector<iSinglet<Simd>> u(LLs);
|
||||||
|
Vector<iSinglet<Simd>> l(LLs);
|
||||||
|
Vector<iSinglet<Simd>> d(LLs);
|
||||||
|
Vector<iSinglet<Simd>> s(LLs);
|
||||||
|
|
||||||
|
assert(Ls/LLs == nsimd);
|
||||||
|
assert(phi.checkerboard == psi.checkerboard);
|
||||||
|
|
||||||
|
chi.checkerboard = psi.checkerboard;
|
||||||
|
|
||||||
|
// just directly address via type pun
|
||||||
|
typedef typename Simd::scalar_type scalar_type;
|
||||||
|
scalar_type* u_p = (scalar_type*) &u[0];
|
||||||
|
scalar_type* l_p = (scalar_type*) &l[0];
|
||||||
|
scalar_type* d_p = (scalar_type*) &d[0];
|
||||||
|
scalar_type* s_p = (scalar_type*) &s[0];
|
||||||
|
|
||||||
|
for(int o=0; o<LLs; o++){ // outer
|
||||||
|
for(int i=0; i<nsimd; i++){ //inner
|
||||||
|
int s = o + i*LLs;
|
||||||
|
int ss = o*nsimd + i;
|
||||||
|
u_p[ss] = upper[s];
|
||||||
|
l_p[ss] = lower[s];
|
||||||
|
d_p[ss] = diag[s];
|
||||||
|
s_p[ss] = shift_coeffs[s];
|
||||||
|
}}
|
||||||
|
|
||||||
|
this->M5Dcalls++;
|
||||||
|
this->M5Dtime -= usecond();
|
||||||
|
|
||||||
|
assert(Nc == 3);
|
||||||
|
|
||||||
|
parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
|
||||||
|
|
||||||
|
int vs = (this->pm == 1) ? LLs-1 : 0;
|
||||||
|
Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(2)(0) : psi[ss+vs]()(0)(0);
|
||||||
|
Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(2)(1) : psi[ss+vs]()(0)(1);
|
||||||
|
Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(2)(2) : psi[ss+vs]()(0)(2);
|
||||||
|
Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(3)(0) : psi[ss+vs]()(1)(0);
|
||||||
|
Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(3)(1) : psi[ss+vs]()(1)(1);
|
||||||
|
Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(3)(2) : psi[ss+vs]()(1)(2);
|
||||||
|
|
||||||
|
for(int v=0; v<LLs; v++){
|
||||||
|
|
||||||
|
vprefetch(psi[ss+v+LLs]);
|
||||||
|
|
||||||
|
int vp = (v == LLs-1) ? 0 : v+1;
|
||||||
|
int vm = (v == 0) ? LLs-1 : v-1;
|
||||||
|
|
||||||
|
Simd hp_00 = psi[ss+vp]()(2)(0);
|
||||||
|
Simd hp_01 = psi[ss+vp]()(2)(1);
|
||||||
|
Simd hp_02 = psi[ss+vp]()(2)(2);
|
||||||
|
Simd hp_10 = psi[ss+vp]()(3)(0);
|
||||||
|
Simd hp_11 = psi[ss+vp]()(3)(1);
|
||||||
|
Simd hp_12 = psi[ss+vp]()(3)(2);
|
||||||
|
|
||||||
|
Simd hm_00 = psi[ss+vm]()(0)(0);
|
||||||
|
Simd hm_01 = psi[ss+vm]()(0)(1);
|
||||||
|
Simd hm_02 = psi[ss+vm]()(0)(2);
|
||||||
|
Simd hm_10 = psi[ss+vm]()(1)(0);
|
||||||
|
Simd hm_11 = psi[ss+vm]()(1)(1);
|
||||||
|
Simd hm_12 = psi[ss+vm]()(1)(2);
|
||||||
|
|
||||||
|
if(vp <= v){
|
||||||
|
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
|
||||||
|
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
|
||||||
|
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
|
||||||
|
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
|
||||||
|
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
|
||||||
|
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
|
||||||
|
}
|
||||||
|
|
||||||
|
if(this->pm == 1 && vs <= v){
|
||||||
|
hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v);
|
||||||
|
hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v);
|
||||||
|
hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v);
|
||||||
|
hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v);
|
||||||
|
hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v);
|
||||||
|
hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v);
|
||||||
|
}
|
||||||
|
|
||||||
|
if(vm >= v){
|
||||||
|
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
|
||||||
|
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
|
||||||
|
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
|
||||||
|
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
|
||||||
|
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
|
||||||
|
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
|
||||||
|
}
|
||||||
|
|
||||||
|
if(this->pm == -1 && vs >= v){
|
||||||
|
hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v);
|
||||||
|
hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v);
|
||||||
|
hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v);
|
||||||
|
hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v);
|
||||||
|
hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v);
|
||||||
|
hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Can force these to real arithmetic and save 2x.
|
||||||
|
Simd p_00 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
|
||||||
|
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
|
||||||
|
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00);
|
||||||
|
Simd p_01 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
|
||||||
|
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
|
||||||
|
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01);
|
||||||
|
Simd p_02 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
|
||||||
|
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
|
||||||
|
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02);
|
||||||
|
Simd p_10 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
|
||||||
|
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
|
||||||
|
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10);
|
||||||
|
Simd p_11 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
|
||||||
|
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
|
||||||
|
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11);
|
||||||
|
Simd p_12 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
|
||||||
|
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
|
||||||
|
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12);
|
||||||
|
Simd p_20 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00)
|
||||||
|
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00)
|
||||||
|
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
|
||||||
|
Simd p_21 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01)
|
||||||
|
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01)
|
||||||
|
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
|
||||||
|
Simd p_22 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02)
|
||||||
|
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02)
|
||||||
|
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
|
||||||
|
Simd p_30 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10)
|
||||||
|
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10)
|
||||||
|
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
|
||||||
|
Simd p_31 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11)
|
||||||
|
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11)
|
||||||
|
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
|
||||||
|
Simd p_32 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12)
|
||||||
|
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12)
|
||||||
|
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
|
||||||
|
|
||||||
|
vstream(chi[ss+v]()(0)(0), p_00);
|
||||||
|
vstream(chi[ss+v]()(0)(1), p_01);
|
||||||
|
vstream(chi[ss+v]()(0)(2), p_02);
|
||||||
|
vstream(chi[ss+v]()(1)(0), p_10);
|
||||||
|
vstream(chi[ss+v]()(1)(1), p_11);
|
||||||
|
vstream(chi[ss+v]()(1)(2), p_12);
|
||||||
|
vstream(chi[ss+v]()(2)(0), p_20);
|
||||||
|
vstream(chi[ss+v]()(2)(1), p_21);
|
||||||
|
vstream(chi[ss+v]()(2)(2), p_22);
|
||||||
|
vstream(chi[ss+v]()(3)(0), p_30);
|
||||||
|
vstream(chi[ss+v]()(3)(1), p_31);
|
||||||
|
vstream(chi[ss+v]()(3)(2), p_32);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
this->M5Dtime += usecond();
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
@ -233,9 +398,9 @@ namespace QCD {
|
|||||||
int LLs = grid->_rdimensions[0];
|
int LLs = grid->_rdimensions[0];
|
||||||
int nsimd = Simd::Nsimd();
|
int nsimd = Simd::Nsimd();
|
||||||
|
|
||||||
Vector<iSinglet<Simd> > u(LLs);
|
Vector<iSinglet<Simd>> u(LLs);
|
||||||
Vector<iSinglet<Simd> > l(LLs);
|
Vector<iSinglet<Simd>> l(LLs);
|
||||||
Vector<iSinglet<Simd> > d(LLs);
|
Vector<iSinglet<Simd>> d(LLs);
|
||||||
|
|
||||||
assert(Ls/LLs == nsimd);
|
assert(Ls/LLs == nsimd);
|
||||||
assert(phi.checkerboard == psi.checkerboard);
|
assert(phi.checkerboard == psi.checkerboard);
|
||||||
@ -371,15 +536,179 @@ namespace QCD {
|
|||||||
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
|
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
|
||||||
std::vector<Coeff_t>& shift_coeffs)
|
std::vector<Coeff_t>& shift_coeffs)
|
||||||
{
|
{
|
||||||
this->M5Ddag(psi, phi, chi, lower, diag, upper);
|
#if 0
|
||||||
|
|
||||||
// FIXME: possible gain from vectorizing shift operation as well?
|
this->M5Ddag(psi, phi, chi, lower, diag, upper);
|
||||||
Coeff_t one(1.0);
|
|
||||||
int Ls = this->Ls;
|
// FIXME: possible gain from vectorizing shift operation as well?
|
||||||
for(int s=0; s<Ls; s++){
|
Coeff_t one(1.0);
|
||||||
if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, Ls-1, s); }
|
int Ls = this->Ls;
|
||||||
else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, 0, s); }
|
for(int s=0; s<Ls; s++){
|
||||||
}
|
if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, Ls-1, s); }
|
||||||
|
else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, 0, s); }
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
GridBase* grid = psi._grid;
|
||||||
|
int Ls = this->Ls;
|
||||||
|
int LLs = grid->_rdimensions[0];
|
||||||
|
int nsimd = Simd::Nsimd();
|
||||||
|
|
||||||
|
Vector<iSinglet<Simd>> u(LLs);
|
||||||
|
Vector<iSinglet<Simd>> l(LLs);
|
||||||
|
Vector<iSinglet<Simd>> d(LLs);
|
||||||
|
Vector<iSinglet<Simd>> s(LLs);
|
||||||
|
|
||||||
|
assert(Ls/LLs == nsimd);
|
||||||
|
assert(phi.checkerboard == psi.checkerboard);
|
||||||
|
|
||||||
|
chi.checkerboard = psi.checkerboard;
|
||||||
|
|
||||||
|
// just directly address via type pun
|
||||||
|
typedef typename Simd::scalar_type scalar_type;
|
||||||
|
scalar_type* u_p = (scalar_type*) &u[0];
|
||||||
|
scalar_type* l_p = (scalar_type*) &l[0];
|
||||||
|
scalar_type* d_p = (scalar_type*) &d[0];
|
||||||
|
scalar_type* s_p = (scalar_type*) &s[0];
|
||||||
|
|
||||||
|
for(int o=0; o<LLs; o++){ // outer
|
||||||
|
for(int i=0; i<nsimd; i++){ //inner
|
||||||
|
int s = o + i*LLs;
|
||||||
|
int ss = o*nsimd + i;
|
||||||
|
u_p[ss] = upper[s];
|
||||||
|
l_p[ss] = lower[s];
|
||||||
|
d_p[ss] = diag[s];
|
||||||
|
s_p[ss] = shift_coeffs[s];
|
||||||
|
}}
|
||||||
|
|
||||||
|
this->M5Dcalls++;
|
||||||
|
this->M5Dtime -= usecond();
|
||||||
|
|
||||||
|
parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
|
||||||
|
|
||||||
|
int vs = (this->pm == 1) ? LLs-1 : 0;
|
||||||
|
Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(0)(0) : psi[ss+vs]()(2)(0);
|
||||||
|
Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(0)(1) : psi[ss+vs]()(2)(1);
|
||||||
|
Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(0)(2) : psi[ss+vs]()(2)(2);
|
||||||
|
Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(1)(0) : psi[ss+vs]()(3)(0);
|
||||||
|
Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(1)(1) : psi[ss+vs]()(3)(1);
|
||||||
|
Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(1)(2) : psi[ss+vs]()(3)(2);
|
||||||
|
|
||||||
|
for(int v=0; v<LLs; v++){
|
||||||
|
|
||||||
|
vprefetch(psi[ss+v+LLs]);
|
||||||
|
|
||||||
|
int vp = (v == LLs-1) ? 0 : v+1;
|
||||||
|
int vm = (v == 0 ) ? LLs-1 : v-1;
|
||||||
|
|
||||||
|
Simd hp_00 = psi[ss+vp]()(0)(0);
|
||||||
|
Simd hp_01 = psi[ss+vp]()(0)(1);
|
||||||
|
Simd hp_02 = psi[ss+vp]()(0)(2);
|
||||||
|
Simd hp_10 = psi[ss+vp]()(1)(0);
|
||||||
|
Simd hp_11 = psi[ss+vp]()(1)(1);
|
||||||
|
Simd hp_12 = psi[ss+vp]()(1)(2);
|
||||||
|
|
||||||
|
Simd hm_00 = psi[ss+vm]()(2)(0);
|
||||||
|
Simd hm_01 = psi[ss+vm]()(2)(1);
|
||||||
|
Simd hm_02 = psi[ss+vm]()(2)(2);
|
||||||
|
Simd hm_10 = psi[ss+vm]()(3)(0);
|
||||||
|
Simd hm_11 = psi[ss+vm]()(3)(1);
|
||||||
|
Simd hm_12 = psi[ss+vm]()(3)(2);
|
||||||
|
|
||||||
|
if (vp <= v){
|
||||||
|
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
|
||||||
|
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
|
||||||
|
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
|
||||||
|
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
|
||||||
|
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
|
||||||
|
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
|
||||||
|
}
|
||||||
|
|
||||||
|
if(this->pm == 1 && vs <= v){
|
||||||
|
hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v);
|
||||||
|
hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v);
|
||||||
|
hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v);
|
||||||
|
hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v);
|
||||||
|
hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v);
|
||||||
|
hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v);
|
||||||
|
}
|
||||||
|
|
||||||
|
if(vm >= v){
|
||||||
|
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
|
||||||
|
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
|
||||||
|
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
|
||||||
|
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
|
||||||
|
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
|
||||||
|
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
|
||||||
|
}
|
||||||
|
|
||||||
|
if(this->pm == -1 && vs >= v){
|
||||||
|
hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v);
|
||||||
|
hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v);
|
||||||
|
hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v);
|
||||||
|
hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v);
|
||||||
|
hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v);
|
||||||
|
hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v);
|
||||||
|
}
|
||||||
|
|
||||||
|
Simd p_00 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00)
|
||||||
|
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00)
|
||||||
|
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
|
||||||
|
Simd p_01 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01)
|
||||||
|
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01)
|
||||||
|
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
|
||||||
|
Simd p_02 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02)
|
||||||
|
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02)
|
||||||
|
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
|
||||||
|
Simd p_10 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10)
|
||||||
|
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10)
|
||||||
|
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
|
||||||
|
Simd p_11 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11)
|
||||||
|
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11)
|
||||||
|
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
|
||||||
|
Simd p_12 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12)
|
||||||
|
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12)
|
||||||
|
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
|
||||||
|
Simd p_20 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
|
||||||
|
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
|
||||||
|
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00);
|
||||||
|
Simd p_21 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
|
||||||
|
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
|
||||||
|
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01);
|
||||||
|
Simd p_22 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
|
||||||
|
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
|
||||||
|
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02);
|
||||||
|
Simd p_30 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
|
||||||
|
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
|
||||||
|
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10);
|
||||||
|
Simd p_31 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
|
||||||
|
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
|
||||||
|
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11);
|
||||||
|
Simd p_32 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
|
||||||
|
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
|
||||||
|
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12);
|
||||||
|
|
||||||
|
vstream(chi[ss+v]()(0)(0), p_00);
|
||||||
|
vstream(chi[ss+v]()(0)(1), p_01);
|
||||||
|
vstream(chi[ss+v]()(0)(2), p_02);
|
||||||
|
vstream(chi[ss+v]()(1)(0), p_10);
|
||||||
|
vstream(chi[ss+v]()(1)(1), p_11);
|
||||||
|
vstream(chi[ss+v]()(1)(2), p_12);
|
||||||
|
vstream(chi[ss+v]()(2)(0), p_20);
|
||||||
|
vstream(chi[ss+v]()(2)(1), p_21);
|
||||||
|
vstream(chi[ss+v]()(2)(2), p_22);
|
||||||
|
vstream(chi[ss+v]()(3)(0), p_30);
|
||||||
|
vstream(chi[ss+v]()(3)(1), p_31);
|
||||||
|
vstream(chi[ss+v]()(3)(2), p_32);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
this->M5Dtime += usecond();
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
|
Loading…
x
Reference in New Issue
Block a user