mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-14 01:35:36 +00:00
Lots of debug on performance Mobius
This commit is contained in:
parent
ff71a8e847
commit
fb8d4b2357
@ -54,12 +54,11 @@ template<class Impl>
|
|||||||
void CayleyFermion5D<Impl>::Dminus(const FermionField &psi, FermionField &chi)
|
void CayleyFermion5D<Impl>::Dminus(const FermionField &psi, FermionField &chi)
|
||||||
{
|
{
|
||||||
int Ls=this->Ls;
|
int Ls=this->Ls;
|
||||||
FermionField tmp(psi._grid);
|
|
||||||
|
|
||||||
this->DW(psi,tmp,DaggerNo);
|
this->DW(psi,this->tmp(),DaggerNo);
|
||||||
|
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp,s,s);// chi = (1-c[s] D_W) psi
|
axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],this->tmp(),s,s);// chi = (1-c[s] D_W) psi
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -87,8 +86,8 @@ template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
|
|||||||
std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl;
|
std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl;
|
||||||
std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
|
std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
|
||||||
|
|
||||||
// Flops = 9*12*Ls*vol/2
|
// Flops = MADD * Ls *Ls *4dvol * spin/colour/complex
|
||||||
RealD mflops = 9.0*12*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
|
RealD mflops = 2.0*24*this->Ls*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
|
||||||
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
||||||
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
|
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
|
||||||
}
|
}
|
||||||
@ -110,12 +109,11 @@ template<class Impl>
|
|||||||
void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi)
|
void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi)
|
||||||
{
|
{
|
||||||
int Ls=this->Ls;
|
int Ls=this->Ls;
|
||||||
FermionField tmp(psi._grid);
|
|
||||||
|
|
||||||
this->DW(psi,tmp,DaggerYes);
|
this->DW(psi,this->tmp(),DaggerYes);
|
||||||
|
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp,s,s);// chi = (1-c[s] D_W) psi
|
axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],this->tmp(),s,s);// chi = (1-c[s] D_W) psi
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
@ -138,6 +136,7 @@ void CayleyFermion5D<Impl>::Meooe5D (const FermionField &psi, FermionField &D
|
|||||||
lower[0] =-mass*lower[0];
|
lower[0] =-mass*lower[0];
|
||||||
M5D(psi,psi,Din,lower,diag,upper);
|
M5D(psi,psi,Din,lower,diag,upper);
|
||||||
}
|
}
|
||||||
|
// FIXME Redundant with the above routine; check this and eliminate
|
||||||
template<class Impl> void CayleyFermion5D<Impl>::Meo5D (const FermionField &psi, FermionField &chi)
|
template<class Impl> void CayleyFermion5D<Impl>::Meo5D (const FermionField &psi, FermionField &chi)
|
||||||
{
|
{
|
||||||
int Ls=this->Ls;
|
int Ls=this->Ls;
|
||||||
@ -259,36 +258,33 @@ template<class Impl>
|
|||||||
void CayleyFermion5D<Impl>::Meooe (const FermionField &psi, FermionField &chi)
|
void CayleyFermion5D<Impl>::Meooe (const FermionField &psi, FermionField &chi)
|
||||||
{
|
{
|
||||||
int Ls=this->Ls;
|
int Ls=this->Ls;
|
||||||
FermionField tmp(psi._grid);
|
|
||||||
|
|
||||||
Meooe5D(psi,tmp);
|
Meooe5D(psi,this->tmp());
|
||||||
|
|
||||||
if ( psi.checkerboard == Odd ) {
|
if ( psi.checkerboard == Odd ) {
|
||||||
this->DhopEO(tmp,chi,DaggerNo);
|
this->DhopEO(this->tmp(),chi,DaggerNo);
|
||||||
} else {
|
} else {
|
||||||
this->DhopOE(tmp,chi,DaggerNo);
|
this->DhopOE(this->tmp(),chi,DaggerNo);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void CayleyFermion5D<Impl>::MeooeDag (const FermionField &psi, FermionField &chi)
|
void CayleyFermion5D<Impl>::MeooeDag (const FermionField &psi, FermionField &chi)
|
||||||
{
|
{
|
||||||
FermionField tmp(psi._grid);
|
|
||||||
// Apply 4d dslash
|
// Apply 4d dslash
|
||||||
if ( psi.checkerboard == Odd ) {
|
if ( psi.checkerboard == Odd ) {
|
||||||
this->DhopEO(psi,tmp,DaggerYes);
|
this->DhopEO(psi,this->tmp(),DaggerYes);
|
||||||
} else {
|
} else {
|
||||||
this->DhopOE(psi,tmp,DaggerYes);
|
this->DhopOE(psi,this->tmp(),DaggerYes);
|
||||||
}
|
}
|
||||||
MeooeDag5D(tmp,chi);
|
MeooeDag5D(this->tmp(),chi);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void CayleyFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
|
void CayleyFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
|
||||||
FermionField tmp(psi._grid);
|
Meo5D(psi,this->tmp());
|
||||||
Meo5D(psi,tmp);
|
|
||||||
// Apply 4d dslash fragment
|
// Apply 4d dslash fragment
|
||||||
this->DhopDir(tmp,chi,dir,disp);
|
this->DhopDir(this->tmp(),chi,dir,disp);
|
||||||
}
|
}
|
||||||
// force terms; five routines; default to Dhop on diagonal
|
// force terms; five routines; default to Dhop on diagonal
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
|
@ -76,6 +76,11 @@ namespace Grid {
|
|||||||
std::vector<Coeff_t> &diag,
|
std::vector<Coeff_t> &diag,
|
||||||
std::vector<Coeff_t> &upper);
|
std::vector<Coeff_t> &upper);
|
||||||
void MooeeInternal(const FermionField &in, FermionField &out,int dag,int inv);
|
void MooeeInternal(const FermionField &in, FermionField &out,int dag,int inv);
|
||||||
|
void MooeeInternalAsm(const FermionField &in, FermionField &out,
|
||||||
|
int LLs, int site,
|
||||||
|
Vector<iSinglet<Simd> > &Matp,
|
||||||
|
Vector<iSinglet<Simd> > &Matm);
|
||||||
|
|
||||||
|
|
||||||
virtual void Instantiatable(void)=0;
|
virtual void Instantiatable(void)=0;
|
||||||
|
|
||||||
|
@ -34,8 +34,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
namespace QCD {
|
namespace QCD { /*
|
||||||
/*
|
|
||||||
* Dense matrix versions of routines
|
* Dense matrix versions of routines
|
||||||
*/
|
*/
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
@ -126,7 +125,6 @@ PARALLEL_FOR_LOOP
|
|||||||
for(int v=0;v<LLs;v++){
|
for(int v=0;v<LLs;v++){
|
||||||
|
|
||||||
vprefetch(psi[ss+v+LLs]);
|
vprefetch(psi[ss+v+LLs]);
|
||||||
// vprefetch(phi[ss+v+LLs]);
|
|
||||||
|
|
||||||
int vp= (v==LLs-1) ? 0 : v+1;
|
int vp= (v==LLs-1) ? 0 : v+1;
|
||||||
int vm= (v==0 ) ? LLs-1 : v-1;
|
int vm= (v==0 ) ? LLs-1 : v-1;
|
||||||
@ -145,9 +143,6 @@ PARALLEL_FOR_LOOP
|
|||||||
Simd hm_11 = psi[ss+vm]()(1)(1);
|
Simd hm_11 = psi[ss+vm]()(1)(1);
|
||||||
Simd hm_12 = psi[ss+vm]()(1)(2);
|
Simd hm_12 = psi[ss+vm]()(1)(2);
|
||||||
|
|
||||||
// if ( ss==0) std::cout << " hp_00 " <<hp_00<<std::endl;
|
|
||||||
// if ( ss==0) std::cout << " hm_00 " <<hm_00<<std::endl;
|
|
||||||
|
|
||||||
if ( vp<=v ) {
|
if ( vp<=v ) {
|
||||||
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
|
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
|
||||||
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
|
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
|
||||||
@ -165,42 +160,20 @@ PARALLEL_FOR_LOOP
|
|||||||
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
|
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
// Can force these to real arithmetic and save 2x.
|
||||||
if ( ss==0) std::cout << " dphi_00 " <<d[v]()()() * phi[ss+v]()(0)(0) <<std::endl;
|
Simd p_00 = real_mult(d[v]()()(), phi[ss+v]()(0)(0)) + real_mult(l[v]()()(),hm_00);
|
||||||
if ( ss==0) std::cout << " dphi_10 " <<d[v]()()() * phi[ss+v]()(1)(0) <<std::endl;
|
Simd p_01 = real_mult(d[v]()()(), phi[ss+v]()(0)(1)) + real_mult(l[v]()()(),hm_01);
|
||||||
if ( ss==0) std::cout << " dphi_20 " <<d[v]()()() * phi[ss+v]()(2)(0) <<std::endl;
|
Simd p_02 = real_mult(d[v]()()(), phi[ss+v]()(0)(2)) + real_mult(l[v]()()(),hm_02);
|
||||||
if ( ss==0) std::cout << " dphi_30 " <<d[v]()()() * phi[ss+v]()(3)(0) <<std::endl;
|
Simd p_10 = real_mult(d[v]()()(), phi[ss+v]()(1)(0)) + real_mult(l[v]()()(),hm_10);
|
||||||
*/
|
Simd p_11 = real_mult(d[v]()()(), phi[ss+v]()(1)(1)) + real_mult(l[v]()()(),hm_11);
|
||||||
Simd p_00 = d[v]()()() * phi[ss+v]()(0)(0) + l[v]()()()*hm_00;
|
Simd p_12 = real_mult(d[v]()()(), phi[ss+v]()(1)(2)) + real_mult(l[v]()()(),hm_12);
|
||||||
Simd p_01 = d[v]()()() * phi[ss+v]()(0)(1) + l[v]()()()*hm_01;
|
Simd p_20 = real_mult(d[v]()()(), phi[ss+v]()(2)(0)) + real_mult(u[v]()()(),hp_00);
|
||||||
Simd p_02 = d[v]()()() * phi[ss+v]()(0)(2) + l[v]()()()*hm_02;
|
Simd p_21 = real_mult(d[v]()()(), phi[ss+v]()(2)(1)) + real_mult(u[v]()()(),hp_01);
|
||||||
Simd p_10 = d[v]()()() * phi[ss+v]()(1)(0) + l[v]()()()*hm_10;
|
Simd p_22 = real_mult(d[v]()()(), phi[ss+v]()(2)(2)) + real_mult(u[v]()()(),hp_02);
|
||||||
Simd p_11 = d[v]()()() * phi[ss+v]()(1)(1) + l[v]()()()*hm_11;
|
Simd p_30 = real_mult(d[v]()()(), phi[ss+v]()(3)(0)) + real_mult(u[v]()()(),hp_10);
|
||||||
Simd p_12 = d[v]()()() * phi[ss+v]()(1)(2) + l[v]()()()*hm_12;
|
Simd p_31 = real_mult(d[v]()()(), phi[ss+v]()(3)(1)) + real_mult(u[v]()()(),hp_11);
|
||||||
Simd p_20 = d[v]()()() * phi[ss+v]()(2)(0) + u[v]()()()*hp_00;
|
Simd p_32 = real_mult(d[v]()()(), phi[ss+v]()(3)(2)) + real_mult(u[v]()()(),hp_12);
|
||||||
Simd p_21 = d[v]()()() * phi[ss+v]()(2)(1) + u[v]()()()*hp_01;
|
|
||||||
Simd p_22 = d[v]()()() * phi[ss+v]()(2)(2) + u[v]()()()*hp_02;
|
|
||||||
Simd p_30 = d[v]()()() * phi[ss+v]()(3)(0) + u[v]()()()*hp_10;
|
|
||||||
Simd p_31 = d[v]()()() * phi[ss+v]()(3)(1) + u[v]()()()*hp_11;
|
|
||||||
Simd p_32 = d[v]()()() * phi[ss+v]()(3)(2) + u[v]()()()*hp_12;
|
|
||||||
|
|
||||||
|
|
||||||
// if ( ss==0){
|
|
||||||
/*
|
|
||||||
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(0) << " bad "<<p_00<<" diff "<<chi[ss+v]()(0)(0)-p_00<<std::endl;
|
|
||||||
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(1) << " bad "<<p_01<<" diff "<<chi[ss+v]()(0)(1)-p_01<<std::endl;
|
|
||||||
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(2) << " bad "<<p_02<<" diff "<<chi[ss+v]()(0)(2)-p_02<<std::endl;
|
|
||||||
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(0) << " bad "<<p_10<<" diff "<<chi[ss+v]()(1)(0)-p_10<<std::endl;
|
|
||||||
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(1) << " bad "<<p_11<<" diff "<<chi[ss+v]()(1)(1)-p_11<<std::endl;
|
|
||||||
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(2) << " bad "<<p_12<<" diff "<<chi[ss+v]()(1)(2)-p_12<<std::endl;
|
|
||||||
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(0) << " bad "<<p_20<<" diff "<<chi[ss+v]()(2)(0)-p_20<<std::endl;
|
|
||||||
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(1) << " bad "<<p_21<<" diff "<<chi[ss+v]()(2)(1)-p_21<<std::endl;
|
|
||||||
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(2) << " bad "<<p_22<<" diff "<<chi[ss+v]()(2)(2)-p_22<<std::endl;
|
|
||||||
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(0) << " bad "<<p_30<<" diff "<<chi[ss+v]()(3)(0)-p_30<<std::endl;
|
|
||||||
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(1) << " bad "<<p_31<<" diff "<<chi[ss+v]()(3)(1)-p_31<<std::endl;
|
|
||||||
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(2) << " bad "<<p_32<<" diff "<<chi[ss+v]()(3)(2)-p_32<<std::endl;
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
vstream(chi[ss+v]()(0)(0),p_00);
|
vstream(chi[ss+v]()(0)(0),p_00);
|
||||||
vstream(chi[ss+v]()(0)(1),p_01);
|
vstream(chi[ss+v]()(0)(1),p_01);
|
||||||
vstream(chi[ss+v]()(0)(2),p_02);
|
vstream(chi[ss+v]()(0)(2),p_02);
|
||||||
@ -261,7 +234,7 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
|
|||||||
M5Dtime-=usecond();
|
M5Dtime-=usecond();
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
|
for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
|
||||||
|
#if 0
|
||||||
alignas(64) SiteHalfSpinor hp;
|
alignas(64) SiteHalfSpinor hp;
|
||||||
alignas(64) SiteHalfSpinor hm;
|
alignas(64) SiteHalfSpinor hm;
|
||||||
alignas(64) SiteSpinor fp;
|
alignas(64) SiteSpinor fp;
|
||||||
@ -287,9 +260,231 @@ PARALLEL_FOR_LOOP
|
|||||||
chi[ss+v] = chi[ss+v] +l[v]*fm;
|
chi[ss+v] = chi[ss+v] +l[v]*fm;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
for(int v=0;v<LLs;v++){
|
||||||
|
|
||||||
|
vprefetch(psi[ss+v+LLs]);
|
||||||
|
|
||||||
|
int vp= (v==LLs-1) ? 0 : v+1;
|
||||||
|
int vm= (v==0 ) ? LLs-1 : v-1;
|
||||||
|
|
||||||
|
Simd hp_00 = psi[ss+vp]()(0)(0);
|
||||||
|
Simd hp_01 = psi[ss+vp]()(0)(1);
|
||||||
|
Simd hp_02 = psi[ss+vp]()(0)(2);
|
||||||
|
Simd hp_10 = psi[ss+vp]()(1)(0);
|
||||||
|
Simd hp_11 = psi[ss+vp]()(1)(1);
|
||||||
|
Simd hp_12 = psi[ss+vp]()(1)(2);
|
||||||
|
|
||||||
|
Simd hm_00 = psi[ss+vm]()(2)(0);
|
||||||
|
Simd hm_01 = psi[ss+vm]()(2)(1);
|
||||||
|
Simd hm_02 = psi[ss+vm]()(2)(2);
|
||||||
|
Simd hm_10 = psi[ss+vm]()(3)(0);
|
||||||
|
Simd hm_11 = psi[ss+vm]()(3)(1);
|
||||||
|
Simd hm_12 = psi[ss+vm]()(3)(2);
|
||||||
|
|
||||||
|
if ( vp<=v ) {
|
||||||
|
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
|
||||||
|
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
|
||||||
|
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
|
||||||
|
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
|
||||||
|
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
|
||||||
|
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
|
||||||
|
}
|
||||||
|
if ( vm>=v ) {
|
||||||
|
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
|
||||||
|
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
|
||||||
|
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
|
||||||
|
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
|
||||||
|
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
|
||||||
|
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
|
||||||
|
}
|
||||||
|
|
||||||
|
Simd p_00 = real_mult(d[v]()()(), phi[ss+v]()(0)(0)) + real_mult(u[v]()()(),hp_00);
|
||||||
|
Simd p_01 = real_mult(d[v]()()(), phi[ss+v]()(0)(1)) + real_mult(u[v]()()(),hp_01);
|
||||||
|
Simd p_02 = real_mult(d[v]()()(), phi[ss+v]()(0)(2)) + real_mult(u[v]()()(),hp_02);
|
||||||
|
Simd p_10 = real_mult(d[v]()()(), phi[ss+v]()(1)(0)) + real_mult(u[v]()()(),hp_10);
|
||||||
|
Simd p_11 = real_mult(d[v]()()(), phi[ss+v]()(1)(1)) + real_mult(u[v]()()(),hp_11);
|
||||||
|
Simd p_12 = real_mult(d[v]()()(), phi[ss+v]()(1)(2)) + real_mult(u[v]()()(),hp_12);
|
||||||
|
|
||||||
|
Simd p_20 = real_mult(d[v]()()(), phi[ss+v]()(2)(0)) + real_mult(l[v]()()(),hm_00);
|
||||||
|
Simd p_21 = real_mult(d[v]()()(), phi[ss+v]()(2)(1)) + real_mult(l[v]()()(),hm_01);
|
||||||
|
Simd p_22 = real_mult(d[v]()()(), phi[ss+v]()(2)(2)) + real_mult(l[v]()()(),hm_02);
|
||||||
|
Simd p_30 = real_mult(d[v]()()(), phi[ss+v]()(3)(0)) + real_mult(l[v]()()(),hm_10);
|
||||||
|
Simd p_31 = real_mult(d[v]()()(), phi[ss+v]()(3)(1)) + real_mult(l[v]()()(),hm_11);
|
||||||
|
Simd p_32 = real_mult(d[v]()()(), phi[ss+v]()(3)(2)) + real_mult(l[v]()()(),hm_12);
|
||||||
|
|
||||||
|
vstream(chi[ss+v]()(0)(0),p_00);
|
||||||
|
vstream(chi[ss+v]()(0)(1),p_01);
|
||||||
|
vstream(chi[ss+v]()(0)(2),p_02);
|
||||||
|
vstream(chi[ss+v]()(1)(0),p_10);
|
||||||
|
vstream(chi[ss+v]()(1)(1),p_11);
|
||||||
|
vstream(chi[ss+v]()(1)(2),p_12);
|
||||||
|
vstream(chi[ss+v]()(2)(0),p_20);
|
||||||
|
vstream(chi[ss+v]()(2)(1),p_21);
|
||||||
|
vstream(chi[ss+v]()(2)(2),p_22);
|
||||||
|
vstream(chi[ss+v]()(3)(0),p_30);
|
||||||
|
vstream(chi[ss+v]()(3)(1),p_31);
|
||||||
|
vstream(chi[ss+v]()(3)(2),p_32);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
M5Dtime+=usecond();
|
M5Dtime+=usecond();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#include <simd/Intel512common.h>
|
||||||
|
#include <simd/Intel512avx.h>
|
||||||
|
#include <simd/Intel512single.h>
|
||||||
|
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi, FermionField &chi,
|
||||||
|
int LLs, int site,
|
||||||
|
Vector<iSinglet<Simd> > &Matp,
|
||||||
|
Vector<iSinglet<Simd> > &Matm)
|
||||||
|
{
|
||||||
|
#if 0
|
||||||
|
{
|
||||||
|
SiteHalfSpinor BcastP;
|
||||||
|
SiteHalfSpinor BcastM;
|
||||||
|
SiteHalfSpinor SiteChiP;
|
||||||
|
SiteHalfSpinor SiteChiM;
|
||||||
|
|
||||||
|
// Ls*Ls * 2 * 12 * vol flops
|
||||||
|
for(int s1=0;s1<LLs;s1++){
|
||||||
|
for(int s2=0;s2<LLs;s2++){
|
||||||
|
for(int l=0; l<Simd::Nsimd();l++){ // simd lane
|
||||||
|
|
||||||
|
int s=s2+l*LLs;
|
||||||
|
int lex=s2+LLs*site;
|
||||||
|
|
||||||
|
if ( s2==0 && l==0) {
|
||||||
|
SiteChiP=zero;
|
||||||
|
SiteChiM=zero;
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int sp=0;sp<2;sp++){
|
||||||
|
for(int co=0;co<Nc;co++){
|
||||||
|
vbroadcast(BcastP()(sp )(co),psi[lex]()(sp)(co),l);
|
||||||
|
}}
|
||||||
|
for(int sp=0;sp<2;sp++){
|
||||||
|
for(int co=0;co<Nc;co++){
|
||||||
|
vbroadcast(BcastM()(sp )(co),psi[lex]()(sp+2)(co),l);
|
||||||
|
}}
|
||||||
|
|
||||||
|
for(int sp=0;sp<2;sp++){
|
||||||
|
for(int co=0;co<Nc;co++){
|
||||||
|
SiteChiP()(sp)(co)=real_madd(Matp[LLs*s+s1]()()(),BcastP()(sp)(co),SiteChiP()(sp)(co)); // 1100 us.
|
||||||
|
SiteChiM()(sp)(co)=real_madd(Matm[LLs*s+s1]()()(),BcastM()(sp)(co),SiteChiM()(sp)(co)); // each found by commenting out
|
||||||
|
}}
|
||||||
|
|
||||||
|
}}
|
||||||
|
{
|
||||||
|
int lex = s1+LLs*site;
|
||||||
|
for(int sp=0;sp<2;sp++){
|
||||||
|
for(int co=0;co<Nc;co++){
|
||||||
|
vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
|
||||||
|
vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
|
||||||
|
}}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
{
|
||||||
|
// pointers
|
||||||
|
// MASK_REGS;
|
||||||
|
#define Chi_00 %%zmm1
|
||||||
|
#define Chi_01 %%zmm2
|
||||||
|
#define Chi_02 %%zmm3
|
||||||
|
#define Chi_10 %%zmm4
|
||||||
|
#define Chi_11 %%zmm5
|
||||||
|
#define Chi_12 %%zmm6
|
||||||
|
#define Chi_20 %%zmm7
|
||||||
|
#define Chi_21 %%zmm8
|
||||||
|
#define Chi_22 %%zmm9
|
||||||
|
#define Chi_30 %%zmm10
|
||||||
|
#define Chi_31 %%zmm11
|
||||||
|
#define Chi_32 %%zmm12
|
||||||
|
|
||||||
|
#define BCAST0 %%zmm13
|
||||||
|
#define BCAST1 %%zmm14
|
||||||
|
#define BCAST2 %%zmm15
|
||||||
|
#define BCAST3 %%zmm16
|
||||||
|
#define BCAST4 %%zmm17
|
||||||
|
#define BCAST5 %%zmm18
|
||||||
|
#define BCAST6 %%zmm19
|
||||||
|
#define BCAST7 %%zmm20
|
||||||
|
#define BCAST8 %%zmm21
|
||||||
|
#define BCAST9 %%zmm22
|
||||||
|
#define BCAST10 %%zmm23
|
||||||
|
#define BCAST11 %%zmm24
|
||||||
|
|
||||||
|
int incr=LLs*LLs*sizeof(iSinglet<Simd>);
|
||||||
|
for(int s1=0;s1<LLs;s1++){
|
||||||
|
for(int s2=0;s2<LLs;s2++){
|
||||||
|
int lex=s2+LLs*site;
|
||||||
|
uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
|
||||||
|
uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
|
||||||
|
uint64_t a2 = (uint64_t)&psi[lex];
|
||||||
|
for(int l=0; l<Simd::Nsimd();l++){ // simd lane
|
||||||
|
if ( (s2+l)==0 ) {
|
||||||
|
asm (
|
||||||
|
VPREFETCH1(0,%2) VPREFETCH1(0,%1)
|
||||||
|
VPREFETCH1(12,%2) VPREFETCH1(13,%2)
|
||||||
|
VPREFETCH1(14,%2) VPREFETCH1(15,%2)
|
||||||
|
VBCASTCDUP(0,%2,BCAST0)
|
||||||
|
VBCASTCDUP(1,%2,BCAST1)
|
||||||
|
VBCASTCDUP(2,%2,BCAST2)
|
||||||
|
VBCASTCDUP(3,%2,BCAST3)
|
||||||
|
VBCASTCDUP(4,%2,BCAST4) VMULMEM (0,%0,BCAST0,Chi_00)
|
||||||
|
VBCASTCDUP(5,%2,BCAST5) VMULMEM (0,%0,BCAST1,Chi_01)
|
||||||
|
VBCASTCDUP(6,%2,BCAST6) VMULMEM (0,%0,BCAST2,Chi_02)
|
||||||
|
VBCASTCDUP(7,%2,BCAST7) VMULMEM (0,%0,BCAST3,Chi_10)
|
||||||
|
VBCASTCDUP(8,%2,BCAST8) VMULMEM (0,%0,BCAST4,Chi_11)
|
||||||
|
VBCASTCDUP(9,%2,BCAST9) VMULMEM (0,%0,BCAST5,Chi_12)
|
||||||
|
VBCASTCDUP(10,%2,BCAST10) VMULMEM (0,%1,BCAST6,Chi_20)
|
||||||
|
VBCASTCDUP(11,%2,BCAST11) VMULMEM (0,%1,BCAST7,Chi_21)
|
||||||
|
VMULMEM (0,%1,BCAST8,Chi_22)
|
||||||
|
VMULMEM (0,%1,BCAST9,Chi_30)
|
||||||
|
VMULMEM (0,%1,BCAST10,Chi_31)
|
||||||
|
VMULMEM (0,%1,BCAST11,Chi_32)
|
||||||
|
: : "r" (a0), "r" (a1), "r" (a2) );
|
||||||
|
} else {
|
||||||
|
asm (
|
||||||
|
VBCASTCDUP(0,%2,BCAST0) VMADDMEM (0,%0,BCAST0,Chi_00)
|
||||||
|
VBCASTCDUP(1,%2,BCAST1) VMADDMEM (0,%0,BCAST1,Chi_01)
|
||||||
|
VBCASTCDUP(2,%2,BCAST2) VMADDMEM (0,%0,BCAST2,Chi_02)
|
||||||
|
VBCASTCDUP(3,%2,BCAST3) VMADDMEM (0,%0,BCAST3,Chi_10)
|
||||||
|
VBCASTCDUP(4,%2,BCAST4) VMADDMEM (0,%0,BCAST4,Chi_11)
|
||||||
|
VBCASTCDUP(5,%2,BCAST5) VMADDMEM (0,%0,BCAST5,Chi_12)
|
||||||
|
VBCASTCDUP(6,%2,BCAST6) VMADDMEM (0,%1,BCAST6,Chi_20)
|
||||||
|
VBCASTCDUP(7,%2,BCAST7) VMADDMEM (0,%1,BCAST7,Chi_21)
|
||||||
|
VBCASTCDUP(8,%2,BCAST8) VMADDMEM (0,%1,BCAST8,Chi_22)
|
||||||
|
VBCASTCDUP(9,%2,BCAST9) VMADDMEM (0,%1,BCAST9,Chi_30)
|
||||||
|
VBCASTCDUP(10,%2,BCAST10) VMADDMEM (0,%1,BCAST10,Chi_31)
|
||||||
|
VBCASTCDUP(11,%2,BCAST11) VMADDMEM (0,%1,BCAST11,Chi_32)
|
||||||
|
: : "r" (a0), "r" (a1), "r" (a2) );
|
||||||
|
}
|
||||||
|
a0 = a0+incr;
|
||||||
|
a1 = a1+incr;
|
||||||
|
a2 = a2+sizeof(Simd::scalar_type);
|
||||||
|
}}
|
||||||
|
{
|
||||||
|
int lexa = s1+LLs*site;
|
||||||
|
asm (
|
||||||
|
VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01) VSTORE(2 ,%0,Chi_02)
|
||||||
|
VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11) VSTORE(5 ,%0,Chi_12)
|
||||||
|
VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21) VSTORE(8 ,%0,Chi_22)
|
||||||
|
VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31) VSTORE(11,%0,Chi_32)
|
||||||
|
: : "r" ((uint64_t)&chi[lexa]) : "memory" );
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
|
void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
|
||||||
{
|
{
|
||||||
@ -347,32 +542,33 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
|
|||||||
scalar_type *sp = (scalar_type *)&Vp;
|
scalar_type *sp = (scalar_type *)&Vp;
|
||||||
scalar_type *sm = (scalar_type *)&Vm;
|
scalar_type *sm = (scalar_type *)&Vm;
|
||||||
for(int l=0;l<Nsimd;l++){
|
for(int l=0;l<Nsimd;l++){
|
||||||
sp[l] = PplusMat (l*istride+s1*ostride ,s2);
|
sp[l] = PplusMat (l*istride+s1*ostride,s2);
|
||||||
|
sp[l] = scalar_type(sp[l].real(),sp[l].real());
|
||||||
sm[l] = PminusMat(l*istride+s1*ostride,s2);
|
sm[l] = PminusMat(l*istride+s1*ostride,s2);
|
||||||
|
sm[l] = scalar_type(sm[l].real(),sm[l].real());
|
||||||
}
|
}
|
||||||
Matp[LLs*s2+s1] = Vp;
|
Matp[LLs*s2+s1] = Vp;
|
||||||
Matm[LLs*s2+s1] = Vm;
|
Matm[LLs*s2+s1] = Vm;
|
||||||
}
|
}}
|
||||||
}
|
|
||||||
|
|
||||||
MooeeInvCalls++;
|
MooeeInvCalls++;
|
||||||
MooeeInvTime-=usecond();
|
MooeeInvTime-=usecond();
|
||||||
|
|
||||||
// Dynamic allocate on stack to get per thread without serialised heap access
|
// Dynamic allocate on stack to get per thread without serialised heap access
|
||||||
|
#if 0
|
||||||
#pragma omp parallel
|
#pragma omp parallel
|
||||||
{
|
{
|
||||||
|
std::vector<SiteHalfSpinor> SitePplus(LLs);
|
||||||
Vector<SiteHalfSpinor> SitePplus(LLs);
|
std::vector<SiteHalfSpinor> SitePminus(LLs);
|
||||||
Vector<SiteHalfSpinor> SitePminus(LLs);
|
std::vector<SiteHalfSpinor> SiteChiP(LLs);
|
||||||
Vector<SiteHalfSpinor> SiteChiP(LLs);
|
std::vector<SiteHalfSpinor> SiteChiM(LLs);
|
||||||
Vector<SiteHalfSpinor> SiteChiM(LLs);
|
std::vector<SiteSpinor> SiteChi(LLs);
|
||||||
Vector<SiteSpinor> SiteChi(LLs);
|
|
||||||
|
|
||||||
SiteHalfSpinor BcastP;
|
|
||||||
SiteHalfSpinor BcastM;
|
|
||||||
|
|
||||||
#pragma omp for
|
#pragma omp for
|
||||||
for(auto site=0;site<vol;site++){
|
for(auto site=0;site<vol;site++){
|
||||||
|
SiteHalfSpinor BcastP;
|
||||||
|
SiteHalfSpinor BcastM;
|
||||||
for(int s=0;s<LLs;s++){
|
for(int s=0;s<LLs;s++){
|
||||||
int lex = s+LLs*site;
|
int lex = s+LLs*site;
|
||||||
spProj5p(SitePplus[s] ,psi[lex]);
|
spProj5p(SitePplus[s] ,psi[lex]);
|
||||||
@ -391,7 +587,8 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
|
|||||||
SiteChiM[s1]=SiteChiM[s1]+Matm[LLs*s+s1]*BcastM;
|
SiteChiM[s1]=SiteChiM[s1]+Matm[LLs*s+s1]*BcastM;
|
||||||
}
|
}
|
||||||
s++;
|
s++;
|
||||||
}}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
for(int s=0;s<LLs;s++){
|
for(int s=0;s<LLs;s++){
|
||||||
int lex = s+LLs*site;
|
int lex = s+LLs*site;
|
||||||
@ -399,8 +596,16 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
|
|||||||
accumRecon5m(SiteChi[s],SiteChiM[s]);
|
accumRecon5m(SiteChi[s],SiteChiM[s]);
|
||||||
chi[lex] = SiteChi[s]*0.5;
|
chi[lex] = SiteChi[s]*0.5;
|
||||||
}
|
}
|
||||||
|
}}
|
||||||
|
#else
|
||||||
|
PARALLEL_FOR_LOOP
|
||||||
|
for(auto site=0;site<vol;site++){
|
||||||
|
MooeeInternalAsm(psi,chi,
|
||||||
|
LLs,site,
|
||||||
|
Matp,Matm);
|
||||||
}
|
}
|
||||||
}
|
#endif
|
||||||
|
|
||||||
MooeeInvTime+=usecond();
|
MooeeInvTime+=usecond();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -48,6 +48,8 @@ namespace Grid {
|
|||||||
|
|
||||||
FermionOperator(const ImplParams &p= ImplParams()) : Impl(p) {};
|
FermionOperator(const ImplParams &p= ImplParams()) : Impl(p) {};
|
||||||
|
|
||||||
|
virtual FermionField &tmp(void) = 0;
|
||||||
|
|
||||||
GridBase * Grid(void) { return FermionGrid(); }; // this is all the linalg routines need to know
|
GridBase * Grid(void) { return FermionGrid(); }; // this is all the linalg routines need to know
|
||||||
GridBase * RedBlackGrid(void) { return FermionRedBlackGrid(); };
|
GridBase * RedBlackGrid(void) { return FermionRedBlackGrid(); };
|
||||||
|
|
||||||
|
@ -61,7 +61,9 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
|
|||||||
LebesgueEvenOdd(_cbgrid),
|
LebesgueEvenOdd(_cbgrid),
|
||||||
Umu(&Fgrid),
|
Umu(&Fgrid),
|
||||||
UmuEven(&Hgrid),
|
UmuEven(&Hgrid),
|
||||||
UmuOdd(&Hgrid) {
|
UmuOdd(&Hgrid),
|
||||||
|
_tmp(&Hgrid)
|
||||||
|
{
|
||||||
// Allocate the required comms buffer
|
// Allocate the required comms buffer
|
||||||
ImportGauge(_Umu);
|
ImportGauge(_Umu);
|
||||||
}
|
}
|
||||||
|
@ -58,6 +58,9 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
|
|||||||
GridBase *FermionGrid(void) { return _grid; }
|
GridBase *FermionGrid(void) { return _grid; }
|
||||||
GridBase *FermionRedBlackGrid(void) { return _cbgrid; }
|
GridBase *FermionRedBlackGrid(void) { return _cbgrid; }
|
||||||
|
|
||||||
|
FermionField _tmp;
|
||||||
|
FermionField &tmp(void) { return _tmp; }
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////
|
||||||
// override multiply; cut number routines if pass dagger argument
|
// override multiply; cut number routines if pass dagger argument
|
||||||
// and also make interface more uniformly consistent
|
// and also make interface more uniformly consistent
|
||||||
|
@ -60,7 +60,8 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
|
|||||||
UmuEven(_FourDimRedBlackGrid),
|
UmuEven(_FourDimRedBlackGrid),
|
||||||
UmuOdd (_FourDimRedBlackGrid),
|
UmuOdd (_FourDimRedBlackGrid),
|
||||||
Lebesgue(_FourDimGrid),
|
Lebesgue(_FourDimGrid),
|
||||||
LebesgueEvenOdd(_FourDimRedBlackGrid)
|
LebesgueEvenOdd(_FourDimRedBlackGrid),
|
||||||
|
_tmp(&FiveDimRedBlackGrid)
|
||||||
{
|
{
|
||||||
if (Impl::LsVectorised) {
|
if (Impl::LsVectorised) {
|
||||||
|
|
||||||
|
@ -74,6 +74,9 @@ namespace QCD {
|
|||||||
typedef WilsonKernels<Impl> Kernels;
|
typedef WilsonKernels<Impl> Kernels;
|
||||||
PmuStat stat;
|
PmuStat stat;
|
||||||
|
|
||||||
|
FermionField _tmp;
|
||||||
|
FermionField &tmp(void) { return _tmp; }
|
||||||
|
|
||||||
void Report(void);
|
void Report(void);
|
||||||
void ZeroCounters(void);
|
void ZeroCounters(void);
|
||||||
double DhopCalls;
|
double DhopCalls;
|
||||||
|
Loading…
Reference in New Issue
Block a user