mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-07 04:35:56 +01:00
Namespace, indent
This commit is contained in:
parent
b4e9211df7
commit
72acb0e48f
@ -1,4 +1,4 @@
|
||||
/*************************************************************************************
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
@ -26,19 +26,19 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
|
||||
|
||||
|
||||
namespace Grid {
|
||||
namespace QCD {
|
||||
/*
|
||||
* Dense matrix versions of routines
|
||||
*/
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
/*
|
||||
* Dense matrix versions of routines
|
||||
*/
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
|
||||
{
|
||||
@ -79,13 +79,13 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
|
||||
scalar_type * d_p = (scalar_type *)&d[0];
|
||||
|
||||
for(int o=0;o<LLs;o++){ // outer
|
||||
for(int i=0;i<nsimd;i++){ //inner
|
||||
int s = o+i*LLs;
|
||||
int ss = o*nsimd+i;
|
||||
u_p[ss] = upper[s];
|
||||
l_p[ss] = lower[s];
|
||||
d_p[ss] = diag[s];
|
||||
}}
|
||||
for(int i=0;i<nsimd;i++){ //inner
|
||||
int s = o+i*LLs;
|
||||
int ss = o*nsimd+i;
|
||||
u_p[ss] = upper[s];
|
||||
l_p[ss] = lower[s];
|
||||
d_p[ss] = diag[s];
|
||||
}}
|
||||
|
||||
|
||||
M5Dcalls++;
|
||||
@ -95,100 +95,100 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
|
||||
|
||||
parallel_for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
|
||||
#if 0
|
||||
alignas(64) SiteHalfSpinor hp;
|
||||
alignas(64) SiteHalfSpinor hm;
|
||||
alignas(64) SiteSpinor fp;
|
||||
alignas(64) SiteSpinor fm;
|
||||
alignas(64) SiteHalfSpinor hp;
|
||||
alignas(64) SiteHalfSpinor hm;
|
||||
alignas(64) SiteSpinor fp;
|
||||
alignas(64) SiteSpinor fm;
|
||||
|
||||
for(int v=0;v<LLs;v++){
|
||||
for(int v=0;v<LLs;v++){
|
||||
|
||||
int vp=(v+1)%LLs;
|
||||
int vm=(v+LLs-1)%LLs;
|
||||
int vp=(v+1)%LLs;
|
||||
int vm=(v+LLs-1)%LLs;
|
||||
|
||||
spProj5m(hp,psi[ss+vp]);
|
||||
spProj5p(hm,psi[ss+vm]);
|
||||
spProj5m(hp,psi[ss+vp]);
|
||||
spProj5p(hm,psi[ss+vm]);
|
||||
|
||||
if ( vp<=v ) rotate(hp,hp,1);
|
||||
if ( vm>=v ) rotate(hm,hm,nsimd-1);
|
||||
if ( vp<=v ) rotate(hp,hp,1);
|
||||
if ( vm>=v ) rotate(hm,hm,nsimd-1);
|
||||
|
||||
hp=0.5*hp;
|
||||
hm=0.5*hm;
|
||||
hp=0.5*hp;
|
||||
hm=0.5*hm;
|
||||
|
||||
spRecon5m(fp,hp);
|
||||
spRecon5p(fm,hm);
|
||||
spRecon5m(fp,hp);
|
||||
spRecon5p(fm,hm);
|
||||
|
||||
chi[ss+v] = d[v]*phi[ss+v];
|
||||
chi[ss+v] = chi[ss+v] +u[v]*fp;
|
||||
chi[ss+v] = chi[ss+v] +l[v]*fm;
|
||||
chi[ss+v] = d[v]*phi[ss+v];
|
||||
chi[ss+v] = chi[ss+v] +u[v]*fp;
|
||||
chi[ss+v] = chi[ss+v] +l[v]*fm;
|
||||
|
||||
}
|
||||
}
|
||||
#else
|
||||
for(int v=0;v<LLs;v++){
|
||||
for(int v=0;v<LLs;v++){
|
||||
|
||||
vprefetch(psi[ss+v+LLs]);
|
||||
vprefetch(psi[ss+v+LLs]);
|
||||
|
||||
int vp= (v==LLs-1) ? 0 : v+1;
|
||||
int vm= (v==0 ) ? LLs-1 : v-1;
|
||||
int vp= (v==LLs-1) ? 0 : v+1;
|
||||
int vm= (v==0 ) ? LLs-1 : v-1;
|
||||
|
||||
Simd hp_00 = psi[ss+vp]()(2)(0);
|
||||
Simd hp_01 = psi[ss+vp]()(2)(1);
|
||||
Simd hp_02 = psi[ss+vp]()(2)(2);
|
||||
Simd hp_10 = psi[ss+vp]()(3)(0);
|
||||
Simd hp_11 = psi[ss+vp]()(3)(1);
|
||||
Simd hp_12 = psi[ss+vp]()(3)(2);
|
||||
Simd hp_00 = psi[ss+vp]()(2)(0);
|
||||
Simd hp_01 = psi[ss+vp]()(2)(1);
|
||||
Simd hp_02 = psi[ss+vp]()(2)(2);
|
||||
Simd hp_10 = psi[ss+vp]()(3)(0);
|
||||
Simd hp_11 = psi[ss+vp]()(3)(1);
|
||||
Simd hp_12 = psi[ss+vp]()(3)(2);
|
||||
|
||||
Simd hm_00 = psi[ss+vm]()(0)(0);
|
||||
Simd hm_01 = psi[ss+vm]()(0)(1);
|
||||
Simd hm_02 = psi[ss+vm]()(0)(2);
|
||||
Simd hm_10 = psi[ss+vm]()(1)(0);
|
||||
Simd hm_11 = psi[ss+vm]()(1)(1);
|
||||
Simd hm_12 = psi[ss+vm]()(1)(2);
|
||||
|
||||
if ( vp<=v ) {
|
||||
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
|
||||
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
|
||||
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
|
||||
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
|
||||
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
|
||||
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
|
||||
}
|
||||
if ( vm>=v ) {
|
||||
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
|
||||
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
|
||||
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
|
||||
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
|
||||
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
|
||||
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
|
||||
}
|
||||
|
||||
// Can force these to real arithmetic and save 2x.
|
||||
Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00);
|
||||
Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01);
|
||||
Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02);
|
||||
Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10);
|
||||
Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11);
|
||||
Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12);
|
||||
Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00);
|
||||
Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01);
|
||||
Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02);
|
||||
Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10);
|
||||
Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11);
|
||||
Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12);
|
||||
|
||||
vstream(chi[ss+v]()(0)(0),p_00);
|
||||
vstream(chi[ss+v]()(0)(1),p_01);
|
||||
vstream(chi[ss+v]()(0)(2),p_02);
|
||||
vstream(chi[ss+v]()(1)(0),p_10);
|
||||
vstream(chi[ss+v]()(1)(1),p_11);
|
||||
vstream(chi[ss+v]()(1)(2),p_12);
|
||||
vstream(chi[ss+v]()(2)(0),p_20);
|
||||
vstream(chi[ss+v]()(2)(1),p_21);
|
||||
vstream(chi[ss+v]()(2)(2),p_22);
|
||||
vstream(chi[ss+v]()(3)(0),p_30);
|
||||
vstream(chi[ss+v]()(3)(1),p_31);
|
||||
vstream(chi[ss+v]()(3)(2),p_32);
|
||||
Simd hm_00 = psi[ss+vm]()(0)(0);
|
||||
Simd hm_01 = psi[ss+vm]()(0)(1);
|
||||
Simd hm_02 = psi[ss+vm]()(0)(2);
|
||||
Simd hm_10 = psi[ss+vm]()(1)(0);
|
||||
Simd hm_11 = psi[ss+vm]()(1)(1);
|
||||
Simd hm_12 = psi[ss+vm]()(1)(2);
|
||||
|
||||
if ( vp<=v ) {
|
||||
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
|
||||
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
|
||||
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
|
||||
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
|
||||
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
|
||||
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
|
||||
}
|
||||
if ( vm>=v ) {
|
||||
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
|
||||
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
|
||||
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
|
||||
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
|
||||
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
|
||||
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
|
||||
}
|
||||
|
||||
// Can force these to real arithmetic and save 2x.
|
||||
Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00);
|
||||
Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01);
|
||||
Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02);
|
||||
Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10);
|
||||
Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11);
|
||||
Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12);
|
||||
Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00);
|
||||
Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01);
|
||||
Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02);
|
||||
Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10);
|
||||
Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11);
|
||||
Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12);
|
||||
|
||||
vstream(chi[ss+v]()(0)(0),p_00);
|
||||
vstream(chi[ss+v]()(0)(1),p_01);
|
||||
vstream(chi[ss+v]()(0)(2),p_02);
|
||||
vstream(chi[ss+v]()(1)(0),p_10);
|
||||
vstream(chi[ss+v]()(1)(1),p_11);
|
||||
vstream(chi[ss+v]()(1)(2),p_12);
|
||||
vstream(chi[ss+v]()(2)(0),p_20);
|
||||
vstream(chi[ss+v]()(2)(1),p_21);
|
||||
vstream(chi[ss+v]()(2)(2),p_22);
|
||||
vstream(chi[ss+v]()(3)(0),p_30);
|
||||
vstream(chi[ss+v]()(3)(1),p_31);
|
||||
vstream(chi[ss+v]()(3)(2),p_32);
|
||||
|
||||
}
|
||||
#endif
|
||||
}
|
||||
M5Dtime+=usecond();
|
||||
@ -223,13 +223,13 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
|
||||
scalar_type * d_p = (scalar_type *)&d[0];
|
||||
|
||||
for(int o=0;o<LLs;o++){ // outer
|
||||
for(int i=0;i<nsimd;i++){ //inner
|
||||
int s = o+i*LLs;
|
||||
int ss = o*nsimd+i;
|
||||
u_p[ss] = upper[s];
|
||||
l_p[ss] = lower[s];
|
||||
d_p[ss] = diag[s];
|
||||
}}
|
||||
for(int i=0;i<nsimd;i++){ //inner
|
||||
int s = o+i*LLs;
|
||||
int ss = o*nsimd+i;
|
||||
u_p[ss] = upper[s];
|
||||
l_p[ss] = lower[s];
|
||||
d_p[ss] = diag[s];
|
||||
}}
|
||||
|
||||
M5Dcalls++;
|
||||
M5Dtime-=usecond();
|
||||
@ -261,71 +261,71 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
|
||||
|
||||
}
|
||||
#else
|
||||
for(int v=0;v<LLs;v++){
|
||||
for(int v=0;v<LLs;v++){
|
||||
|
||||
vprefetch(psi[ss+v+LLs]);
|
||||
vprefetch(psi[ss+v+LLs]);
|
||||
|
||||
int vp= (v==LLs-1) ? 0 : v+1;
|
||||
int vm= (v==0 ) ? LLs-1 : v-1;
|
||||
int vp= (v==LLs-1) ? 0 : v+1;
|
||||
int vm= (v==0 ) ? LLs-1 : v-1;
|
||||
|
||||
Simd hp_00 = psi[ss+vp]()(0)(0);
|
||||
Simd hp_01 = psi[ss+vp]()(0)(1);
|
||||
Simd hp_02 = psi[ss+vp]()(0)(2);
|
||||
Simd hp_10 = psi[ss+vp]()(1)(0);
|
||||
Simd hp_11 = psi[ss+vp]()(1)(1);
|
||||
Simd hp_12 = psi[ss+vp]()(1)(2);
|
||||
Simd hp_00 = psi[ss+vp]()(0)(0);
|
||||
Simd hp_01 = psi[ss+vp]()(0)(1);
|
||||
Simd hp_02 = psi[ss+vp]()(0)(2);
|
||||
Simd hp_10 = psi[ss+vp]()(1)(0);
|
||||
Simd hp_11 = psi[ss+vp]()(1)(1);
|
||||
Simd hp_12 = psi[ss+vp]()(1)(2);
|
||||
|
||||
Simd hm_00 = psi[ss+vm]()(2)(0);
|
||||
Simd hm_01 = psi[ss+vm]()(2)(1);
|
||||
Simd hm_02 = psi[ss+vm]()(2)(2);
|
||||
Simd hm_10 = psi[ss+vm]()(3)(0);
|
||||
Simd hm_11 = psi[ss+vm]()(3)(1);
|
||||
Simd hm_12 = psi[ss+vm]()(3)(2);
|
||||
Simd hm_00 = psi[ss+vm]()(2)(0);
|
||||
Simd hm_01 = psi[ss+vm]()(2)(1);
|
||||
Simd hm_02 = psi[ss+vm]()(2)(2);
|
||||
Simd hm_10 = psi[ss+vm]()(3)(0);
|
||||
Simd hm_11 = psi[ss+vm]()(3)(1);
|
||||
Simd hm_12 = psi[ss+vm]()(3)(2);
|
||||
|
||||
if ( vp<=v ) {
|
||||
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
|
||||
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
|
||||
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
|
||||
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
|
||||
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
|
||||
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
|
||||
}
|
||||
if ( vm>=v ) {
|
||||
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
|
||||
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
|
||||
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
|
||||
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
|
||||
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
|
||||
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
|
||||
}
|
||||
|
||||
Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00);
|
||||
Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01);
|
||||
Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02);
|
||||
Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10);
|
||||
Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11);
|
||||
Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12);
|
||||
|
||||
Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00);
|
||||
Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01);
|
||||
Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02);
|
||||
Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10);
|
||||
Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11);
|
||||
Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12);
|
||||
|
||||
vstream(chi[ss+v]()(0)(0),p_00);
|
||||
vstream(chi[ss+v]()(0)(1),p_01);
|
||||
vstream(chi[ss+v]()(0)(2),p_02);
|
||||
vstream(chi[ss+v]()(1)(0),p_10);
|
||||
vstream(chi[ss+v]()(1)(1),p_11);
|
||||
vstream(chi[ss+v]()(1)(2),p_12);
|
||||
vstream(chi[ss+v]()(2)(0),p_20);
|
||||
vstream(chi[ss+v]()(2)(1),p_21);
|
||||
vstream(chi[ss+v]()(2)(2),p_22);
|
||||
vstream(chi[ss+v]()(3)(0),p_30);
|
||||
vstream(chi[ss+v]()(3)(1),p_31);
|
||||
vstream(chi[ss+v]()(3)(2),p_32);
|
||||
if ( vp<=v ) {
|
||||
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
|
||||
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
|
||||
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
|
||||
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
|
||||
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
|
||||
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
|
||||
}
|
||||
if ( vm>=v ) {
|
||||
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
|
||||
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
|
||||
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
|
||||
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
|
||||
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
|
||||
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
|
||||
}
|
||||
|
||||
Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00);
|
||||
Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01);
|
||||
Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02);
|
||||
Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10);
|
||||
Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11);
|
||||
Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12);
|
||||
|
||||
Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00);
|
||||
Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01);
|
||||
Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02);
|
||||
Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10);
|
||||
Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11);
|
||||
Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12);
|
||||
|
||||
vstream(chi[ss+v]()(0)(0),p_00);
|
||||
vstream(chi[ss+v]()(0)(1),p_01);
|
||||
vstream(chi[ss+v]()(0)(2),p_02);
|
||||
vstream(chi[ss+v]()(1)(0),p_10);
|
||||
vstream(chi[ss+v]()(1)(1),p_11);
|
||||
vstream(chi[ss+v]()(1)(2),p_12);
|
||||
vstream(chi[ss+v]()(2)(0),p_20);
|
||||
vstream(chi[ss+v]()(2)(1),p_21);
|
||||
vstream(chi[ss+v]()(2)(2),p_22);
|
||||
vstream(chi[ss+v]()(3)(0),p_30);
|
||||
vstream(chi[ss+v]()(3)(1),p_31);
|
||||
vstream(chi[ss+v]()(3)(2),p_32);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
M5Dtime+=usecond();
|
||||
@ -346,54 +346,54 @@ void CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi, FermionFie
|
||||
{
|
||||
#ifndef AVX512
|
||||
{
|
||||
SiteHalfSpinor BcastP;
|
||||
SiteHalfSpinor BcastM;
|
||||
SiteHalfSpinor SiteChiP;
|
||||
SiteHalfSpinor SiteChiM;
|
||||
SiteHalfSpinor BcastP;
|
||||
SiteHalfSpinor BcastM;
|
||||
SiteHalfSpinor SiteChiP;
|
||||
SiteHalfSpinor SiteChiM;
|
||||
|
||||
// Ls*Ls * 2 * 12 * vol flops
|
||||
for(int s1=0;s1<LLs;s1++){
|
||||
for(int s2=0;s2<LLs;s2++){
|
||||
for(int l=0; l<Simd::Nsimd();l++){ // simd lane
|
||||
// Ls*Ls * 2 * 12 * vol flops
|
||||
for(int s1=0;s1<LLs;s1++){
|
||||
for(int s2=0;s2<LLs;s2++){
|
||||
for(int l=0; l<Simd::Nsimd();l++){ // simd lane
|
||||
|
||||
int s=s2+l*LLs;
|
||||
int lex=s2+LLs*site;
|
||||
int s=s2+l*LLs;
|
||||
int lex=s2+LLs*site;
|
||||
|
||||
if ( s2==0 && l==0) {
|
||||
SiteChiP=zero;
|
||||
SiteChiM=zero;
|
||||
}
|
||||
if ( s2==0 && l==0) {
|
||||
SiteChiP=zero;
|
||||
SiteChiM=zero;
|
||||
}
|
||||
|
||||
for(int sp=0;sp<2;sp++){
|
||||
for(int co=0;co<Nc;co++){
|
||||
vbroadcast(BcastP()(sp )(co),psi[lex]()(sp)(co),l);
|
||||
}}
|
||||
for(int sp=0;sp<2;sp++){
|
||||
for(int co=0;co<Nc;co++){
|
||||
vbroadcast(BcastM()(sp )(co),psi[lex]()(sp+2)(co),l);
|
||||
}}
|
||||
for(int sp=0;sp<2;sp++){
|
||||
for(int co=0;co<Nc;co++){
|
||||
vbroadcast(BcastP()(sp )(co),psi[lex]()(sp)(co),l);
|
||||
}}
|
||||
for(int sp=0;sp<2;sp++){
|
||||
for(int co=0;co<Nc;co++){
|
||||
vbroadcast(BcastM()(sp )(co),psi[lex]()(sp+2)(co),l);
|
||||
}}
|
||||
|
||||
for(int sp=0;sp<2;sp++){
|
||||
for(int co=0;co<Nc;co++){
|
||||
SiteChiP()(sp)(co)=real_madd(Matp[LLs*s+s1]()()(),BcastP()(sp)(co),SiteChiP()(sp)(co)); // 1100 us.
|
||||
SiteChiM()(sp)(co)=real_madd(Matm[LLs*s+s1]()()(),BcastM()(sp)(co),SiteChiM()(sp)(co)); // each found by commenting out
|
||||
}}
|
||||
for(int sp=0;sp<2;sp++){
|
||||
for(int co=0;co<Nc;co++){
|
||||
SiteChiP()(sp)(co)=real_madd(Matp[LLs*s+s1]()()(),BcastP()(sp)(co),SiteChiP()(sp)(co)); // 1100 us.
|
||||
SiteChiM()(sp)(co)=real_madd(Matm[LLs*s+s1]()()(),BcastM()(sp)(co),SiteChiM()(sp)(co)); // each found by commenting out
|
||||
}}
|
||||
|
||||
}}
|
||||
{
|
||||
int lex = s1+LLs*site;
|
||||
for(int sp=0;sp<2;sp++){
|
||||
for(int co=0;co<Nc;co++){
|
||||
vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
|
||||
vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
|
||||
}}
|
||||
}}
|
||||
{
|
||||
int lex = s1+LLs*site;
|
||||
for(int sp=0;sp<2;sp++){
|
||||
for(int co=0;co<Nc;co++){
|
||||
vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
|
||||
vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
|
||||
}}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
#else
|
||||
{
|
||||
// pointers
|
||||
// pointers
|
||||
// MASK_REGS;
|
||||
#define Chi_00 %%zmm1
|
||||
#define Chi_01 %%zmm2
|
||||
@ -421,68 +421,68 @@ void CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi, FermionFie
|
||||
#define BCAST10 %%zmm23
|
||||
#define BCAST11 %%zmm24
|
||||
|
||||
int incr=LLs*LLs*sizeof(iSinglet<Simd>);
|
||||
for(int s1=0;s1<LLs;s1++){
|
||||
for(int s2=0;s2<LLs;s2++){
|
||||
int lex=s2+LLs*site;
|
||||
uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
|
||||
uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
|
||||
uint64_t a2 = (uint64_t)&psi[lex];
|
||||
for(int l=0; l<Simd::Nsimd();l++){ // simd lane
|
||||
if ( (s2+l)==0 ) {
|
||||
asm (
|
||||
VPREFETCH1(0,%2) VPREFETCH1(0,%1)
|
||||
VPREFETCH1(12,%2) VPREFETCH1(13,%2)
|
||||
VPREFETCH1(14,%2) VPREFETCH1(15,%2)
|
||||
VBCASTCDUP(0,%2,BCAST0)
|
||||
VBCASTCDUP(1,%2,BCAST1)
|
||||
VBCASTCDUP(2,%2,BCAST2)
|
||||
VBCASTCDUP(3,%2,BCAST3)
|
||||
VBCASTCDUP(4,%2,BCAST4) VMULMEM (0,%0,BCAST0,Chi_00)
|
||||
VBCASTCDUP(5,%2,BCAST5) VMULMEM (0,%0,BCAST1,Chi_01)
|
||||
VBCASTCDUP(6,%2,BCAST6) VMULMEM (0,%0,BCAST2,Chi_02)
|
||||
VBCASTCDUP(7,%2,BCAST7) VMULMEM (0,%0,BCAST3,Chi_10)
|
||||
VBCASTCDUP(8,%2,BCAST8) VMULMEM (0,%0,BCAST4,Chi_11)
|
||||
VBCASTCDUP(9,%2,BCAST9) VMULMEM (0,%0,BCAST5,Chi_12)
|
||||
VBCASTCDUP(10,%2,BCAST10) VMULMEM (0,%1,BCAST6,Chi_20)
|
||||
VBCASTCDUP(11,%2,BCAST11) VMULMEM (0,%1,BCAST7,Chi_21)
|
||||
VMULMEM (0,%1,BCAST8,Chi_22)
|
||||
VMULMEM (0,%1,BCAST9,Chi_30)
|
||||
VMULMEM (0,%1,BCAST10,Chi_31)
|
||||
VMULMEM (0,%1,BCAST11,Chi_32)
|
||||
: : "r" (a0), "r" (a1), "r" (a2) );
|
||||
} else {
|
||||
asm (
|
||||
VBCASTCDUP(0,%2,BCAST0) VMADDMEM (0,%0,BCAST0,Chi_00)
|
||||
VBCASTCDUP(1,%2,BCAST1) VMADDMEM (0,%0,BCAST1,Chi_01)
|
||||
VBCASTCDUP(2,%2,BCAST2) VMADDMEM (0,%0,BCAST2,Chi_02)
|
||||
VBCASTCDUP(3,%2,BCAST3) VMADDMEM (0,%0,BCAST3,Chi_10)
|
||||
VBCASTCDUP(4,%2,BCAST4) VMADDMEM (0,%0,BCAST4,Chi_11)
|
||||
VBCASTCDUP(5,%2,BCAST5) VMADDMEM (0,%0,BCAST5,Chi_12)
|
||||
VBCASTCDUP(6,%2,BCAST6) VMADDMEM (0,%1,BCAST6,Chi_20)
|
||||
VBCASTCDUP(7,%2,BCAST7) VMADDMEM (0,%1,BCAST7,Chi_21)
|
||||
VBCASTCDUP(8,%2,BCAST8) VMADDMEM (0,%1,BCAST8,Chi_22)
|
||||
VBCASTCDUP(9,%2,BCAST9) VMADDMEM (0,%1,BCAST9,Chi_30)
|
||||
VBCASTCDUP(10,%2,BCAST10) VMADDMEM (0,%1,BCAST10,Chi_31)
|
||||
VBCASTCDUP(11,%2,BCAST11) VMADDMEM (0,%1,BCAST11,Chi_32)
|
||||
: : "r" (a0), "r" (a1), "r" (a2) );
|
||||
}
|
||||
a0 = a0+incr;
|
||||
a1 = a1+incr;
|
||||
a2 = a2+sizeof(Simd::scalar_type);
|
||||
}}
|
||||
{
|
||||
int lexa = s1+LLs*site;
|
||||
asm (
|
||||
VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01) VSTORE(2 ,%0,Chi_02)
|
||||
VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11) VSTORE(5 ,%0,Chi_12)
|
||||
VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21) VSTORE(8 ,%0,Chi_22)
|
||||
VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31) VSTORE(11,%0,Chi_32)
|
||||
: : "r" ((uint64_t)&chi[lexa]) : "memory" );
|
||||
int incr=LLs*LLs*sizeof(iSinglet<Simd>);
|
||||
for(int s1=0;s1<LLs;s1++){
|
||||
for(int s2=0;s2<LLs;s2++){
|
||||
int lex=s2+LLs*site;
|
||||
uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
|
||||
uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
|
||||
uint64_t a2 = (uint64_t)&psi[lex];
|
||||
for(int l=0; l<Simd::Nsimd();l++){ // simd lane
|
||||
if ( (s2+l)==0 ) {
|
||||
asm (
|
||||
VPREFETCH1(0,%2) VPREFETCH1(0,%1)
|
||||
VPREFETCH1(12,%2) VPREFETCH1(13,%2)
|
||||
VPREFETCH1(14,%2) VPREFETCH1(15,%2)
|
||||
VBCASTCDUP(0,%2,BCAST0)
|
||||
VBCASTCDUP(1,%2,BCAST1)
|
||||
VBCASTCDUP(2,%2,BCAST2)
|
||||
VBCASTCDUP(3,%2,BCAST3)
|
||||
VBCASTCDUP(4,%2,BCAST4) VMULMEM (0,%0,BCAST0,Chi_00)
|
||||
VBCASTCDUP(5,%2,BCAST5) VMULMEM (0,%0,BCAST1,Chi_01)
|
||||
VBCASTCDUP(6,%2,BCAST6) VMULMEM (0,%0,BCAST2,Chi_02)
|
||||
VBCASTCDUP(7,%2,BCAST7) VMULMEM (0,%0,BCAST3,Chi_10)
|
||||
VBCASTCDUP(8,%2,BCAST8) VMULMEM (0,%0,BCAST4,Chi_11)
|
||||
VBCASTCDUP(9,%2,BCAST9) VMULMEM (0,%0,BCAST5,Chi_12)
|
||||
VBCASTCDUP(10,%2,BCAST10) VMULMEM (0,%1,BCAST6,Chi_20)
|
||||
VBCASTCDUP(11,%2,BCAST11) VMULMEM (0,%1,BCAST7,Chi_21)
|
||||
VMULMEM (0,%1,BCAST8,Chi_22)
|
||||
VMULMEM (0,%1,BCAST9,Chi_30)
|
||||
VMULMEM (0,%1,BCAST10,Chi_31)
|
||||
VMULMEM (0,%1,BCAST11,Chi_32)
|
||||
: : "r" (a0), "r" (a1), "r" (a2) );
|
||||
} else {
|
||||
asm (
|
||||
VBCASTCDUP(0,%2,BCAST0) VMADDMEM (0,%0,BCAST0,Chi_00)
|
||||
VBCASTCDUP(1,%2,BCAST1) VMADDMEM (0,%0,BCAST1,Chi_01)
|
||||
VBCASTCDUP(2,%2,BCAST2) VMADDMEM (0,%0,BCAST2,Chi_02)
|
||||
VBCASTCDUP(3,%2,BCAST3) VMADDMEM (0,%0,BCAST3,Chi_10)
|
||||
VBCASTCDUP(4,%2,BCAST4) VMADDMEM (0,%0,BCAST4,Chi_11)
|
||||
VBCASTCDUP(5,%2,BCAST5) VMADDMEM (0,%0,BCAST5,Chi_12)
|
||||
VBCASTCDUP(6,%2,BCAST6) VMADDMEM (0,%1,BCAST6,Chi_20)
|
||||
VBCASTCDUP(7,%2,BCAST7) VMADDMEM (0,%1,BCAST7,Chi_21)
|
||||
VBCASTCDUP(8,%2,BCAST8) VMADDMEM (0,%1,BCAST8,Chi_22)
|
||||
VBCASTCDUP(9,%2,BCAST9) VMADDMEM (0,%1,BCAST9,Chi_30)
|
||||
VBCASTCDUP(10,%2,BCAST10) VMADDMEM (0,%1,BCAST10,Chi_31)
|
||||
VBCASTCDUP(11,%2,BCAST11) VMADDMEM (0,%1,BCAST11,Chi_32)
|
||||
: : "r" (a0), "r" (a1), "r" (a2) );
|
||||
}
|
||||
a0 = a0+incr;
|
||||
a1 = a1+incr;
|
||||
a2 = a2+sizeof(Simd::scalar_type);
|
||||
}}
|
||||
{
|
||||
int lexa = s1+LLs*site;
|
||||
asm (
|
||||
VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01) VSTORE(2 ,%0,Chi_02)
|
||||
VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11) VSTORE(5 ,%0,Chi_12)
|
||||
VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21) VSTORE(8 ,%0,Chi_22)
|
||||
VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31) VSTORE(11,%0,Chi_32)
|
||||
: : "r" ((uint64_t)&chi[lexa]) : "memory" );
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#undef Chi_00
|
||||
#undef Chi_01
|
||||
#undef Chi_02
|
||||
@ -511,63 +511,63 @@ void CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi, FermionFie
|
||||
#endif
|
||||
};
|
||||
|
||||
// Z-mobius version
|
||||
// Z-mobius version
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi, FermionField &chi,
|
||||
int LLs, int site, Vector<iSinglet<Simd> > &Matp, Vector<iSinglet<Simd> > &Matm)
|
||||
int LLs, int site, Vector<iSinglet<Simd> > &Matp, Vector<iSinglet<Simd> > &Matm)
|
||||
{
|
||||
#ifndef AVX512
|
||||
{
|
||||
SiteHalfSpinor BcastP;
|
||||
SiteHalfSpinor BcastM;
|
||||
SiteHalfSpinor SiteChiP;
|
||||
SiteHalfSpinor SiteChiM;
|
||||
SiteHalfSpinor BcastP;
|
||||
SiteHalfSpinor BcastM;
|
||||
SiteHalfSpinor SiteChiP;
|
||||
SiteHalfSpinor SiteChiM;
|
||||
|
||||
// Ls*Ls * 2 * 12 * vol flops
|
||||
for(int s1=0;s1<LLs;s1++){
|
||||
for(int s2=0;s2<LLs;s2++){
|
||||
for(int l=0; l<Simd::Nsimd();l++){ // simd lane
|
||||
// Ls*Ls * 2 * 12 * vol flops
|
||||
for(int s1=0;s1<LLs;s1++){
|
||||
for(int s2=0;s2<LLs;s2++){
|
||||
for(int l=0; l<Simd::Nsimd();l++){ // simd lane
|
||||
|
||||
int s=s2+l*LLs;
|
||||
int lex=s2+LLs*site;
|
||||
int s=s2+l*LLs;
|
||||
int lex=s2+LLs*site;
|
||||
|
||||
if ( s2==0 && l==0) {
|
||||
SiteChiP=zero;
|
||||
SiteChiM=zero;
|
||||
}
|
||||
if ( s2==0 && l==0) {
|
||||
SiteChiP=zero;
|
||||
SiteChiM=zero;
|
||||
}
|
||||
|
||||
for(int sp=0;sp<2;sp++){
|
||||
for(int co=0;co<Nc;co++){
|
||||
vbroadcast(BcastP()(sp )(co),psi[lex]()(sp)(co),l);
|
||||
}}
|
||||
for(int sp=0;sp<2;sp++){
|
||||
for(int co=0;co<Nc;co++){
|
||||
vbroadcast(BcastM()(sp )(co),psi[lex]()(sp+2)(co),l);
|
||||
}}
|
||||
for(int sp=0;sp<2;sp++){
|
||||
for(int co=0;co<Nc;co++){
|
||||
vbroadcast(BcastP()(sp )(co),psi[lex]()(sp)(co),l);
|
||||
}}
|
||||
for(int sp=0;sp<2;sp++){
|
||||
for(int co=0;co<Nc;co++){
|
||||
vbroadcast(BcastM()(sp )(co),psi[lex]()(sp+2)(co),l);
|
||||
}}
|
||||
|
||||
for(int sp=0;sp<2;sp++){
|
||||
for(int co=0;co<Nc;co++){
|
||||
SiteChiP()(sp)(co)=SiteChiP()(sp)(co)+ Matp[LLs*s+s1]()()()*BcastP()(sp)(co);
|
||||
SiteChiM()(sp)(co)=SiteChiM()(sp)(co)+ Matm[LLs*s+s1]()()()*BcastM()(sp)(co);
|
||||
}}
|
||||
for(int sp=0;sp<2;sp++){
|
||||
for(int co=0;co<Nc;co++){
|
||||
SiteChiP()(sp)(co)=SiteChiP()(sp)(co)+ Matp[LLs*s+s1]()()()*BcastP()(sp)(co);
|
||||
SiteChiM()(sp)(co)=SiteChiM()(sp)(co)+ Matm[LLs*s+s1]()()()*BcastM()(sp)(co);
|
||||
}}
|
||||
|
||||
|
||||
}}
|
||||
{
|
||||
int lex = s1+LLs*site;
|
||||
for(int sp=0;sp<2;sp++){
|
||||
for(int co=0;co<Nc;co++){
|
||||
vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
|
||||
vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
|
||||
}}
|
||||
}}
|
||||
{
|
||||
int lex = s1+LLs*site;
|
||||
for(int sp=0;sp<2;sp++){
|
||||
for(int co=0;co<Nc;co++){
|
||||
vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
|
||||
vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
|
||||
}}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
#else
|
||||
{
|
||||
// pointers
|
||||
// MASK_REGS;
|
||||
// pointers
|
||||
// MASK_REGS;
|
||||
#define Chi_00 %zmm0
|
||||
#define Chi_01 %zmm1
|
||||
#define Chi_02 %zmm2
|
||||
@ -611,123 +611,123 @@ void CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi, FermionFi
|
||||
#define Mm %zmm26
|
||||
#define Mms %zmm27
|
||||
#define N 8
|
||||
int incr=LLs*LLs*sizeof(iSinglet<Simd>);
|
||||
for(int s1=0;s1<LLs;s1++){
|
||||
for(int s2=0;s2<LLs;s2++){
|
||||
int lex=s2+LLs*site;
|
||||
uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
|
||||
uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
|
||||
uint64_t a2 = (uint64_t)&psi[lex];
|
||||
for(int l=0; l<Simd::Nsimd();l++){ // simd lane
|
||||
if ( (s2+l)==0 ) {
|
||||
LOAD64(%r8,a0);
|
||||
LOAD64(%r9,a1);
|
||||
LOAD64(%r10,a2);
|
||||
int incr=LLs*LLs*sizeof(iSinglet<Simd>);
|
||||
for(int s1=0;s1<LLs;s1++){
|
||||
for(int s2=0;s2<LLs;s2++){
|
||||
int lex=s2+LLs*site;
|
||||
uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
|
||||
uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
|
||||
uint64_t a2 = (uint64_t)&psi[lex];
|
||||
for(int l=0; l<Simd::Nsimd();l++){ // simd lane
|
||||
if ( (s2+l)==0 ) {
|
||||
LOAD64(%r8,a0);
|
||||
LOAD64(%r9,a1);
|
||||
LOAD64(%r10,a2);
|
||||
asm (
|
||||
VLOAD(0,%r8,Mp)// i r
|
||||
VLOAD(0,%r9,Mm)
|
||||
VSHUF(Mp,Mps) // r i
|
||||
VSHUF(Mm,Mms)
|
||||
VPREFETCH1(12,%r10) VPREFETCH1(13,%r10)
|
||||
VPREFETCH1(14,%r10) VPREFETCH1(15,%r10)
|
||||
|
||||
VMULIDUP(0*N,%r10,Mps,Chi_00)
|
||||
VMULIDUP(1*N,%r10,Mps,Chi_01)
|
||||
VMULIDUP(2*N,%r10,Mps,Chi_02)
|
||||
VMULIDUP(3*N,%r10,Mps,Chi_10)
|
||||
VMULIDUP(4*N,%r10,Mps,Chi_11)
|
||||
VMULIDUP(5*N,%r10,Mps,Chi_12)
|
||||
|
||||
VMULIDUP(6*N ,%r10,Mms,Chi_20)
|
||||
VMULIDUP(7*N ,%r10,Mms,Chi_21)
|
||||
VMULIDUP(8*N ,%r10,Mms,Chi_22)
|
||||
VMULIDUP(9*N ,%r10,Mms,Chi_30)
|
||||
VMULIDUP(10*N,%r10,Mms,Chi_31)
|
||||
VMULIDUP(11*N,%r10,Mms,Chi_32)
|
||||
|
||||
VMADDSUBRDUP(0*N,%r10,Mp,Chi_00)
|
||||
VMADDSUBRDUP(1*N,%r10,Mp,Chi_01)
|
||||
VMADDSUBRDUP(2*N,%r10,Mp,Chi_02)
|
||||
VMADDSUBRDUP(3*N,%r10,Mp,Chi_10)
|
||||
VMADDSUBRDUP(4*N,%r10,Mp,Chi_11)
|
||||
VMADDSUBRDUP(5*N,%r10,Mp,Chi_12)
|
||||
|
||||
VMADDSUBRDUP(6*N ,%r10,Mm,Chi_20)
|
||||
VMADDSUBRDUP(7*N ,%r10,Mm,Chi_21)
|
||||
VMADDSUBRDUP(8*N ,%r10,Mm,Chi_22)
|
||||
VMADDSUBRDUP(9*N ,%r10,Mm,Chi_30)
|
||||
VMADDSUBRDUP(10*N,%r10,Mm,Chi_31)
|
||||
VMADDSUBRDUP(11*N,%r10,Mm,Chi_32)
|
||||
);
|
||||
} else {
|
||||
LOAD64(%r8,a0);
|
||||
LOAD64(%r9,a1);
|
||||
LOAD64(%r10,a2);
|
||||
asm (
|
||||
VLOAD(0,%r8,Mp)
|
||||
VSHUF(Mp,Mps)
|
||||
|
||||
VLOAD(0,%r9,Mm)
|
||||
VSHUF(Mm,Mms)
|
||||
|
||||
VMADDSUBIDUP(0*N,%r10,Mps,Chi_00) // Mri * Pii +- Cir
|
||||
VMADDSUBIDUP(1*N,%r10,Mps,Chi_01)
|
||||
VMADDSUBIDUP(2*N,%r10,Mps,Chi_02)
|
||||
VMADDSUBIDUP(3*N,%r10,Mps,Chi_10)
|
||||
VMADDSUBIDUP(4*N,%r10,Mps,Chi_11)
|
||||
VMADDSUBIDUP(5*N,%r10,Mps,Chi_12)
|
||||
|
||||
VMADDSUBIDUP(6 *N,%r10,Mms,Chi_20)
|
||||
VMADDSUBIDUP(7 *N,%r10,Mms,Chi_21)
|
||||
VMADDSUBIDUP(8 *N,%r10,Mms,Chi_22)
|
||||
VMADDSUBIDUP(9 *N,%r10,Mms,Chi_30)
|
||||
VMADDSUBIDUP(10*N,%r10,Mms,Chi_31)
|
||||
VMADDSUBIDUP(11*N,%r10,Mms,Chi_32)
|
||||
|
||||
VMADDSUBRDUP(0*N,%r10,Mp,Chi_00) // Cir = Mir * Prr +- ( Mri * Pii +- Cir)
|
||||
VMADDSUBRDUP(1*N,%r10,Mp,Chi_01) // Ci = MiPr + Ci + MrPi ; Cr = MrPr - ( MiPi - Cr)
|
||||
VMADDSUBRDUP(2*N,%r10,Mp,Chi_02)
|
||||
VMADDSUBRDUP(3*N,%r10,Mp,Chi_10)
|
||||
VMADDSUBRDUP(4*N,%r10,Mp,Chi_11)
|
||||
VMADDSUBRDUP(5*N,%r10,Mp,Chi_12)
|
||||
|
||||
VMADDSUBRDUP(6 *N,%r10,Mm,Chi_20)
|
||||
VMADDSUBRDUP(7 *N,%r10,Mm,Chi_21)
|
||||
VMADDSUBRDUP(8 *N,%r10,Mm,Chi_22)
|
||||
VMADDSUBRDUP(9 *N,%r10,Mm,Chi_30)
|
||||
VMADDSUBRDUP(10*N,%r10,Mm,Chi_31)
|
||||
VMADDSUBRDUP(11*N,%r10,Mm,Chi_32)
|
||||
);
|
||||
}
|
||||
a0 = a0+incr;
|
||||
a1 = a1+incr;
|
||||
a2 = a2+sizeof(Simd::scalar_type);
|
||||
}}
|
||||
{
|
||||
int lexa = s1+LLs*site;
|
||||
/*
|
||||
SiteSpinor tmp;
|
||||
asm (
|
||||
VLOAD(0,%r8,Mp)// i r
|
||||
VLOAD(0,%r9,Mm)
|
||||
VSHUF(Mp,Mps) // r i
|
||||
VSHUF(Mm,Mms)
|
||||
VPREFETCH1(12,%r10) VPREFETCH1(13,%r10)
|
||||
VPREFETCH1(14,%r10) VPREFETCH1(15,%r10)
|
||||
VSTORE(0,%0,pChi_00) VSTORE(1 ,%0,pChi_01) VSTORE(2 ,%0,pChi_02)
|
||||
VSTORE(3,%0,pChi_10) VSTORE(4 ,%0,pChi_11) VSTORE(5 ,%0,pChi_12)
|
||||
VSTORE(6,%0,pChi_20) VSTORE(7 ,%0,pChi_21) VSTORE(8 ,%0,pChi_22)
|
||||
VSTORE(9,%0,pChi_30) VSTORE(10,%0,pChi_31) VSTORE(11,%0,pChi_32)
|
||||
: : "r" ((uint64_t)&tmp) : "memory" );
|
||||
*/
|
||||
|
||||
VMULIDUP(0*N,%r10,Mps,Chi_00)
|
||||
VMULIDUP(1*N,%r10,Mps,Chi_01)
|
||||
VMULIDUP(2*N,%r10,Mps,Chi_02)
|
||||
VMULIDUP(3*N,%r10,Mps,Chi_10)
|
||||
VMULIDUP(4*N,%r10,Mps,Chi_11)
|
||||
VMULIDUP(5*N,%r10,Mps,Chi_12)
|
||||
asm (
|
||||
VSTORE(0,%0,pChi_00) VSTORE(1 ,%0,pChi_01) VSTORE(2 ,%0,pChi_02)
|
||||
VSTORE(3,%0,pChi_10) VSTORE(4 ,%0,pChi_11) VSTORE(5 ,%0,pChi_12)
|
||||
VSTORE(6,%0,pChi_20) VSTORE(7 ,%0,pChi_21) VSTORE(8 ,%0,pChi_22)
|
||||
VSTORE(9,%0,pChi_30) VSTORE(10,%0,pChi_31) VSTORE(11,%0,pChi_32)
|
||||
: : "r" ((uint64_t)&chi[lexa]) : "memory" );
|
||||
|
||||
VMULIDUP(6*N ,%r10,Mms,Chi_20)
|
||||
VMULIDUP(7*N ,%r10,Mms,Chi_21)
|
||||
VMULIDUP(8*N ,%r10,Mms,Chi_22)
|
||||
VMULIDUP(9*N ,%r10,Mms,Chi_30)
|
||||
VMULIDUP(10*N,%r10,Mms,Chi_31)
|
||||
VMULIDUP(11*N,%r10,Mms,Chi_32)
|
||||
|
||||
VMADDSUBRDUP(0*N,%r10,Mp,Chi_00)
|
||||
VMADDSUBRDUP(1*N,%r10,Mp,Chi_01)
|
||||
VMADDSUBRDUP(2*N,%r10,Mp,Chi_02)
|
||||
VMADDSUBRDUP(3*N,%r10,Mp,Chi_10)
|
||||
VMADDSUBRDUP(4*N,%r10,Mp,Chi_11)
|
||||
VMADDSUBRDUP(5*N,%r10,Mp,Chi_12)
|
||||
|
||||
VMADDSUBRDUP(6*N ,%r10,Mm,Chi_20)
|
||||
VMADDSUBRDUP(7*N ,%r10,Mm,Chi_21)
|
||||
VMADDSUBRDUP(8*N ,%r10,Mm,Chi_22)
|
||||
VMADDSUBRDUP(9*N ,%r10,Mm,Chi_30)
|
||||
VMADDSUBRDUP(10*N,%r10,Mm,Chi_31)
|
||||
VMADDSUBRDUP(11*N,%r10,Mm,Chi_32)
|
||||
);
|
||||
} else {
|
||||
LOAD64(%r8,a0);
|
||||
LOAD64(%r9,a1);
|
||||
LOAD64(%r10,a2);
|
||||
asm (
|
||||
VLOAD(0,%r8,Mp)
|
||||
VSHUF(Mp,Mps)
|
||||
|
||||
VLOAD(0,%r9,Mm)
|
||||
VSHUF(Mm,Mms)
|
||||
|
||||
VMADDSUBIDUP(0*N,%r10,Mps,Chi_00) // Mri * Pii +- Cir
|
||||
VMADDSUBIDUP(1*N,%r10,Mps,Chi_01)
|
||||
VMADDSUBIDUP(2*N,%r10,Mps,Chi_02)
|
||||
VMADDSUBIDUP(3*N,%r10,Mps,Chi_10)
|
||||
VMADDSUBIDUP(4*N,%r10,Mps,Chi_11)
|
||||
VMADDSUBIDUP(5*N,%r10,Mps,Chi_12)
|
||||
|
||||
VMADDSUBIDUP(6 *N,%r10,Mms,Chi_20)
|
||||
VMADDSUBIDUP(7 *N,%r10,Mms,Chi_21)
|
||||
VMADDSUBIDUP(8 *N,%r10,Mms,Chi_22)
|
||||
VMADDSUBIDUP(9 *N,%r10,Mms,Chi_30)
|
||||
VMADDSUBIDUP(10*N,%r10,Mms,Chi_31)
|
||||
VMADDSUBIDUP(11*N,%r10,Mms,Chi_32)
|
||||
|
||||
VMADDSUBRDUP(0*N,%r10,Mp,Chi_00) // Cir = Mir * Prr +- ( Mri * Pii +- Cir)
|
||||
VMADDSUBRDUP(1*N,%r10,Mp,Chi_01) // Ci = MiPr + Ci + MrPi ; Cr = MrPr - ( MiPi - Cr)
|
||||
VMADDSUBRDUP(2*N,%r10,Mp,Chi_02)
|
||||
VMADDSUBRDUP(3*N,%r10,Mp,Chi_10)
|
||||
VMADDSUBRDUP(4*N,%r10,Mp,Chi_11)
|
||||
VMADDSUBRDUP(5*N,%r10,Mp,Chi_12)
|
||||
|
||||
VMADDSUBRDUP(6 *N,%r10,Mm,Chi_20)
|
||||
VMADDSUBRDUP(7 *N,%r10,Mm,Chi_21)
|
||||
VMADDSUBRDUP(8 *N,%r10,Mm,Chi_22)
|
||||
VMADDSUBRDUP(9 *N,%r10,Mm,Chi_30)
|
||||
VMADDSUBRDUP(10*N,%r10,Mm,Chi_31)
|
||||
VMADDSUBRDUP(11*N,%r10,Mm,Chi_32)
|
||||
);
|
||||
}
|
||||
a0 = a0+incr;
|
||||
a1 = a1+incr;
|
||||
a2 = a2+sizeof(Simd::scalar_type);
|
||||
}}
|
||||
{
|
||||
int lexa = s1+LLs*site;
|
||||
/*
|
||||
SiteSpinor tmp;
|
||||
asm (
|
||||
VSTORE(0,%0,pChi_00) VSTORE(1 ,%0,pChi_01) VSTORE(2 ,%0,pChi_02)
|
||||
VSTORE(3,%0,pChi_10) VSTORE(4 ,%0,pChi_11) VSTORE(5 ,%0,pChi_12)
|
||||
VSTORE(6,%0,pChi_20) VSTORE(7 ,%0,pChi_21) VSTORE(8 ,%0,pChi_22)
|
||||
VSTORE(9,%0,pChi_30) VSTORE(10,%0,pChi_31) VSTORE(11,%0,pChi_32)
|
||||
: : "r" ((uint64_t)&tmp) : "memory" );
|
||||
*/
|
||||
|
||||
asm (
|
||||
VSTORE(0,%0,pChi_00) VSTORE(1 ,%0,pChi_01) VSTORE(2 ,%0,pChi_02)
|
||||
VSTORE(3,%0,pChi_10) VSTORE(4 ,%0,pChi_11) VSTORE(5 ,%0,pChi_12)
|
||||
VSTORE(6,%0,pChi_20) VSTORE(7 ,%0,pChi_21) VSTORE(8 ,%0,pChi_22)
|
||||
VSTORE(9,%0,pChi_30) VSTORE(10,%0,pChi_31) VSTORE(11,%0,pChi_32)
|
||||
: : "r" ((uint64_t)&chi[lexa]) : "memory" );
|
||||
|
||||
// if ( 1 || (site==0) ) {
|
||||
// std::cout<<site << " s1 "<<s1<<"\n\t"<<tmp << "\n't" << chi[lexa] <<"\n\t"<<tmp-chi[lexa]<<std::endl;
|
||||
// }
|
||||
// if ( 1 || (site==0) ) {
|
||||
// std::cout<<site << " s1 "<<s1<<"\n\t"<<tmp << "\n't" << chi[lexa] <<"\n\t"<<tmp-chi[lexa]<<std::endl;
|
||||
// }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#undef Chi_00
|
||||
#undef Chi_01
|
||||
#undef Chi_02
|
||||
@ -823,6 +823,4 @@ template void CayleyFermion5D<DomainWallVec5dImplDF>::MooeeInternal(const Fermio
|
||||
template void CayleyFermion5D<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
|
||||
template void CayleyFermion5D<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
|
||||
|
||||
|
||||
|
||||
}}
|
||||
NAMESPACE_END(Grid);
|
||||
|
Loading…
x
Reference in New Issue
Block a user