mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-12 20:27:06 +01:00
Merge branch 'feature/hadrons' into feature/qed-fvol
# Conflicts: # Makefile.am # configure.ac # lib/qcd/action/gauge/Photon.h
This commit is contained in:
@ -57,7 +57,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
////////////////////////////////////////////
|
||||
// Gauge Actions
|
||||
////////////////////////////////////////////
|
||||
#include <Grid/qcd/action/gauge/Photon.h>
|
||||
#include <Grid/qcd/action/gauge/WilsonGaugeAction.h>
|
||||
#include <Grid/qcd/action/gauge/PlaqPlusRectangleAction.h>
|
||||
|
||||
@ -196,6 +195,7 @@ typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
|
||||
typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
|
||||
typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
|
||||
typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
|
||||
|
||||
typedef MobiusFermion<WilsonImplR> MobiusFermionR;
|
||||
typedef MobiusFermion<WilsonImplF> MobiusFermionF;
|
||||
typedef MobiusFermion<WilsonImplD> MobiusFermionD;
|
||||
@ -204,6 +204,20 @@ typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR;
|
||||
typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
|
||||
typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;
|
||||
|
||||
// Ls vectorised
|
||||
typedef DomainWallFermion<DomainWallVec5dImplR> DomainWallFermionVec5dR;
|
||||
typedef DomainWallFermion<DomainWallVec5dImplF> DomainWallFermionVec5dF;
|
||||
typedef DomainWallFermion<DomainWallVec5dImplD> DomainWallFermionVec5dD;
|
||||
|
||||
typedef MobiusFermion<DomainWallVec5dImplR> MobiusFermionVec5dR;
|
||||
typedef MobiusFermion<DomainWallVec5dImplF> MobiusFermionVec5dF;
|
||||
typedef MobiusFermion<DomainWallVec5dImplD> MobiusFermionVec5dD;
|
||||
|
||||
typedef ZMobiusFermion<ZDomainWallVec5dImplR> ZMobiusFermionVec5dR;
|
||||
typedef ZMobiusFermion<ZDomainWallVec5dImplF> ZMobiusFermionVec5dF;
|
||||
typedef ZMobiusFermion<ZDomainWallVec5dImplD> ZMobiusFermionVec5dD;
|
||||
|
||||
|
||||
typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
|
||||
typedef ScaledShamirFermion<WilsonImplF> ScaledShamirFermionF;
|
||||
typedef ScaledShamirFermion<WilsonImplD> ScaledShamirFermionD;
|
||||
@ -255,6 +269,7 @@ typedef MobiusFermion<GparityWilsonImplF> GparityMobiusFermionF;
|
||||
typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD;
|
||||
|
||||
|
||||
|
||||
}}
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
|
||||
|
@ -62,6 +62,50 @@ void CayleyFermion5D<Impl>::Dminus(const FermionField &psi, FermionField &chi)
|
||||
axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp,s,s);// chi = (1-c[s] D_W) psi
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
|
||||
{
|
||||
this->Report();
|
||||
std::vector<int> latt = GridDefaultLatt();
|
||||
RealD volume = this->Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
|
||||
RealD NP = this->_FourDimGrid->_Nprocessors;
|
||||
if ( M5Dcalls > 0 ) {
|
||||
std::cout << GridLogMessage << "#### M5D calls report " << std::endl;
|
||||
std::cout << GridLogMessage << "CayleyFermion5D Number of M5D Calls : " << M5Dcalls << std::endl;
|
||||
std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << M5Dtime / M5Dcalls << " us" << std::endl;
|
||||
|
||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||
RealD mflops = 6.0*12*volume*M5Dcalls/M5Dtime/2; // 2 for red black counting
|
||||
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
|
||||
}
|
||||
|
||||
if ( MooeeInvCalls > 0 ) {
|
||||
|
||||
std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
|
||||
std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl;
|
||||
std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
|
||||
|
||||
// Flops = 9*12*Ls*vol/2
|
||||
RealD mflops = 9.0*12*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
|
||||
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
|
||||
}
|
||||
|
||||
}
|
||||
template<class Impl> void CayleyFermion5D<Impl>::CayleyZeroCounters(void)
|
||||
{
|
||||
this->ZeroCounters();
|
||||
M5Dflops=0;
|
||||
M5Dcalls=0;
|
||||
M5Dtime=0;
|
||||
MooeeInvFlops=0;
|
||||
MooeeInvCalls=0;
|
||||
MooeeInvTime=0;
|
||||
}
|
||||
|
||||
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi)
|
||||
{
|
||||
|
@ -120,6 +120,18 @@ namespace Grid {
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
RealD _mass,RealD _M5,const ImplParams &p= ImplParams());
|
||||
|
||||
|
||||
|
||||
void CayleyReport(void);
|
||||
void CayleyZeroCounters(void);
|
||||
|
||||
double M5Dflops;
|
||||
double M5Dcalls;
|
||||
double M5Dtime;
|
||||
|
||||
double MooeeInvFlops;
|
||||
double MooeeInvCalls;
|
||||
double MooeeInvTime;
|
||||
|
||||
protected:
|
||||
void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
|
||||
|
@ -51,6 +51,9 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
|
||||
GridBase *grid=psi._grid;
|
||||
assert(phi.checkerboard == psi.checkerboard);
|
||||
chi.checkerboard=psi.checkerboard;
|
||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||
M5Dcalls++;
|
||||
M5Dtime-=usecond();
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
|
||||
for(int s=0;s<Ls;s++){
|
||||
@ -76,6 +79,7 @@ PARALLEL_FOR_LOOP
|
||||
}
|
||||
}
|
||||
}
|
||||
M5Dtime+=usecond();
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
@ -91,6 +95,9 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
|
||||
assert(phi.checkerboard == psi.checkerboard);
|
||||
chi.checkerboard=psi.checkerboard;
|
||||
|
||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||
M5Dcalls++;
|
||||
M5Dtime-=usecond();
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
|
||||
auto tmp = psi._odata[0];
|
||||
@ -116,6 +123,7 @@ PARALLEL_FOR_LOOP
|
||||
}
|
||||
}
|
||||
}
|
||||
M5Dtime+=usecond();
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
@ -126,10 +134,14 @@ void CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi, FermionField &
|
||||
|
||||
chi.checkerboard=psi.checkerboard;
|
||||
|
||||
MooeeInvCalls++;
|
||||
MooeeInvTime-=usecond();
|
||||
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
|
||||
auto tmp = psi._odata[0];
|
||||
|
||||
// flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls = 12*Ls * (9) = 108*Ls flops
|
||||
// Apply (L^{\prime})^{-1}
|
||||
chi[ss]=psi[ss]; // chi[0]=psi[0]
|
||||
for(int s=1;s<Ls;s++){
|
||||
@ -155,6 +167,9 @@ PARALLEL_FOR_LOOP
|
||||
chi[ss+s] = chi[ss+s] - uee[s]*tmp;
|
||||
}
|
||||
}
|
||||
|
||||
MooeeInvTime+=usecond();
|
||||
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
@ -166,6 +181,8 @@ void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &
|
||||
assert(psi.checkerboard == psi.checkerboard);
|
||||
chi.checkerboard=psi.checkerboard;
|
||||
|
||||
MooeeInvCalls++;
|
||||
MooeeInvTime-=usecond();
|
||||
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
|
||||
@ -197,6 +214,9 @@ PARALLEL_FOR_LOOP
|
||||
chi[ss+s] = chi[ss+s] - lee[s]*tmp;
|
||||
}
|
||||
}
|
||||
|
||||
MooeeInvTime+=usecond();
|
||||
|
||||
}
|
||||
|
||||
#ifdef CAYLEY_DPERP_CACHE
|
||||
|
@ -60,7 +60,7 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
|
||||
GridBase *grid=psi._grid;
|
||||
int Ls = this->Ls;
|
||||
int LLs = grid->_rdimensions[0];
|
||||
int nsimd= Simd::Nsimd();
|
||||
const int nsimd= Simd::Nsimd();
|
||||
|
||||
Vector<iSinglet<Simd> > u(LLs);
|
||||
Vector<iSinglet<Simd> > l(LLs);
|
||||
@ -86,35 +86,138 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
|
||||
d_p[ss] = diag[s];
|
||||
}}
|
||||
|
||||
|
||||
M5Dcalls++;
|
||||
M5Dtime-=usecond();
|
||||
|
||||
assert(Nc==3);
|
||||
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
|
||||
#if 0
|
||||
alignas(64) SiteHalfSpinor hp;
|
||||
alignas(64) SiteHalfSpinor hm;
|
||||
alignas(64) SiteSpinor fp;
|
||||
alignas(64) SiteSpinor fm;
|
||||
|
||||
alignas(64) SiteHalfSpinor hp;
|
||||
alignas(64) SiteHalfSpinor hm;
|
||||
alignas(64) SiteSpinor fp;
|
||||
alignas(64) SiteSpinor fm;
|
||||
for(int v=0;v<LLs;v++){
|
||||
|
||||
for(int v=0;v<LLs;v++){
|
||||
int vp=(v+1)%LLs;
|
||||
int vm=(v+LLs-1)%LLs;
|
||||
|
||||
int vp=(v+1)%LLs;
|
||||
int vm=(v+LLs-1)%LLs;
|
||||
spProj5m(hp,psi[ss+vp]);
|
||||
spProj5p(hm,psi[ss+vm]);
|
||||
|
||||
spProj5m(hp,psi[ss+vp]);
|
||||
spProj5p(hm,psi[ss+vm]);
|
||||
|
||||
if ( vp<=v ) rotate(hp,hp,1);
|
||||
if ( vm>=v ) rotate(hm,hm,nsimd-1);
|
||||
if ( vp<=v ) rotate(hp,hp,1);
|
||||
if ( vm>=v ) rotate(hm,hm,nsimd-1);
|
||||
|
||||
hp=0.5*hp;
|
||||
hm=0.5*hm;
|
||||
|
||||
hp=hp*0.5;
|
||||
hm=hm*0.5;
|
||||
spRecon5m(fp,hp);
|
||||
spRecon5p(fm,hm);
|
||||
spRecon5m(fp,hp);
|
||||
spRecon5p(fm,hm);
|
||||
|
||||
chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
|
||||
chi[ss+v] = chi[ss+v] +l[v]*fm;
|
||||
chi[ss+v] = d[v]*phi[ss+v];
|
||||
chi[ss+v] = chi[ss+v] +u[v]*fp;
|
||||
chi[ss+v] = chi[ss+v] +l[v]*fm;
|
||||
|
||||
}
|
||||
}
|
||||
#else
|
||||
for(int v=0;v<LLs;v++){
|
||||
|
||||
vprefetch(psi[ss+v+LLs]);
|
||||
// vprefetch(phi[ss+v+LLs]);
|
||||
|
||||
int vp= (v==LLs-1) ? 0 : v+1;
|
||||
int vm= (v==0 ) ? LLs-1 : v-1;
|
||||
|
||||
Simd hp_00 = psi[ss+vp]()(2)(0);
|
||||
Simd hp_01 = psi[ss+vp]()(2)(1);
|
||||
Simd hp_02 = psi[ss+vp]()(2)(2);
|
||||
Simd hp_10 = psi[ss+vp]()(3)(0);
|
||||
Simd hp_11 = psi[ss+vp]()(3)(1);
|
||||
Simd hp_12 = psi[ss+vp]()(3)(2);
|
||||
|
||||
Simd hm_00 = psi[ss+vm]()(0)(0);
|
||||
Simd hm_01 = psi[ss+vm]()(0)(1);
|
||||
Simd hm_02 = psi[ss+vm]()(0)(2);
|
||||
Simd hm_10 = psi[ss+vm]()(1)(0);
|
||||
Simd hm_11 = psi[ss+vm]()(1)(1);
|
||||
Simd hm_12 = psi[ss+vm]()(1)(2);
|
||||
|
||||
// if ( ss==0) std::cout << " hp_00 " <<hp_00<<std::endl;
|
||||
// if ( ss==0) std::cout << " hm_00 " <<hm_00<<std::endl;
|
||||
|
||||
if ( vp<=v ) {
|
||||
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
|
||||
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
|
||||
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
|
||||
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
|
||||
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
|
||||
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
|
||||
}
|
||||
if ( vm>=v ) {
|
||||
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
|
||||
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
|
||||
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
|
||||
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
|
||||
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
|
||||
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
|
||||
}
|
||||
|
||||
/*
|
||||
if ( ss==0) std::cout << " dphi_00 " <<d[v]()()() * phi[ss+v]()(0)(0) <<std::endl;
|
||||
if ( ss==0) std::cout << " dphi_10 " <<d[v]()()() * phi[ss+v]()(1)(0) <<std::endl;
|
||||
if ( ss==0) std::cout << " dphi_20 " <<d[v]()()() * phi[ss+v]()(2)(0) <<std::endl;
|
||||
if ( ss==0) std::cout << " dphi_30 " <<d[v]()()() * phi[ss+v]()(3)(0) <<std::endl;
|
||||
*/
|
||||
Simd p_00 = d[v]()()() * phi[ss+v]()(0)(0) + l[v]()()()*hm_00;
|
||||
Simd p_01 = d[v]()()() * phi[ss+v]()(0)(1) + l[v]()()()*hm_01;
|
||||
Simd p_02 = d[v]()()() * phi[ss+v]()(0)(2) + l[v]()()()*hm_02;
|
||||
Simd p_10 = d[v]()()() * phi[ss+v]()(1)(0) + l[v]()()()*hm_10;
|
||||
Simd p_11 = d[v]()()() * phi[ss+v]()(1)(1) + l[v]()()()*hm_11;
|
||||
Simd p_12 = d[v]()()() * phi[ss+v]()(1)(2) + l[v]()()()*hm_12;
|
||||
Simd p_20 = d[v]()()() * phi[ss+v]()(2)(0) + u[v]()()()*hp_00;
|
||||
Simd p_21 = d[v]()()() * phi[ss+v]()(2)(1) + u[v]()()()*hp_01;
|
||||
Simd p_22 = d[v]()()() * phi[ss+v]()(2)(2) + u[v]()()()*hp_02;
|
||||
Simd p_30 = d[v]()()() * phi[ss+v]()(3)(0) + u[v]()()()*hp_10;
|
||||
Simd p_31 = d[v]()()() * phi[ss+v]()(3)(1) + u[v]()()()*hp_11;
|
||||
Simd p_32 = d[v]()()() * phi[ss+v]()(3)(2) + u[v]()()()*hp_12;
|
||||
|
||||
|
||||
// if ( ss==0){
|
||||
/*
|
||||
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(0) << " bad "<<p_00<<" diff "<<chi[ss+v]()(0)(0)-p_00<<std::endl;
|
||||
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(1) << " bad "<<p_01<<" diff "<<chi[ss+v]()(0)(1)-p_01<<std::endl;
|
||||
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(2) << " bad "<<p_02<<" diff "<<chi[ss+v]()(0)(2)-p_02<<std::endl;
|
||||
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(0) << " bad "<<p_10<<" diff "<<chi[ss+v]()(1)(0)-p_10<<std::endl;
|
||||
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(1) << " bad "<<p_11<<" diff "<<chi[ss+v]()(1)(1)-p_11<<std::endl;
|
||||
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(2) << " bad "<<p_12<<" diff "<<chi[ss+v]()(1)(2)-p_12<<std::endl;
|
||||
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(0) << " bad "<<p_20<<" diff "<<chi[ss+v]()(2)(0)-p_20<<std::endl;
|
||||
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(1) << " bad "<<p_21<<" diff "<<chi[ss+v]()(2)(1)-p_21<<std::endl;
|
||||
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(2) << " bad "<<p_22<<" diff "<<chi[ss+v]()(2)(2)-p_22<<std::endl;
|
||||
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(0) << " bad "<<p_30<<" diff "<<chi[ss+v]()(3)(0)-p_30<<std::endl;
|
||||
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(1) << " bad "<<p_31<<" diff "<<chi[ss+v]()(3)(1)-p_31<<std::endl;
|
||||
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(2) << " bad "<<p_32<<" diff "<<chi[ss+v]()(3)(2)-p_32<<std::endl;
|
||||
}
|
||||
*/
|
||||
vstream(chi[ss+v]()(0)(0),p_00);
|
||||
vstream(chi[ss+v]()(0)(1),p_01);
|
||||
vstream(chi[ss+v]()(0)(2),p_02);
|
||||
vstream(chi[ss+v]()(1)(0),p_10);
|
||||
vstream(chi[ss+v]()(1)(1),p_11);
|
||||
vstream(chi[ss+v]()(1)(2),p_12);
|
||||
vstream(chi[ss+v]()(2)(0),p_20);
|
||||
vstream(chi[ss+v]()(2)(1),p_21);
|
||||
vstream(chi[ss+v]()(2)(2),p_22);
|
||||
vstream(chi[ss+v]()(3)(0),p_30);
|
||||
vstream(chi[ss+v]()(3)(1),p_31);
|
||||
vstream(chi[ss+v]()(3)(2),p_32);
|
||||
|
||||
}
|
||||
#endif
|
||||
}
|
||||
M5Dtime+=usecond();
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
@ -154,6 +257,8 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
|
||||
d_p[ss] = diag[s];
|
||||
}}
|
||||
|
||||
M5Dcalls++;
|
||||
M5Dtime-=usecond();
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
|
||||
|
||||
@ -183,8 +288,8 @@ PARALLEL_FOR_LOOP
|
||||
|
||||
}
|
||||
}
|
||||
M5Dtime+=usecond();
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
|
||||
{
|
||||
@ -250,13 +355,11 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
|
||||
}
|
||||
}
|
||||
|
||||
MooeeInvCalls++;
|
||||
MooeeInvTime-=usecond();
|
||||
// Dynamic allocate on stack to get per thread without serialised heap acces
|
||||
PARALLEL_FOR_LOOP
|
||||
for(auto site=0;site<vol;site++){
|
||||
|
||||
// SiteHalfSpinor *SitePplus =(SiteHalfSpinor *) alloca(LLs*sizeof(SiteHalfSpinor));
|
||||
// SiteHalfSpinor *SitePminus=(SiteHalfSpinor *) alloca(LLs*sizeof(SiteHalfSpinor));
|
||||
// SiteSpinor *SiteChi =(SiteSpinor *) alloca(LLs*sizeof(SiteSpinor));
|
||||
#pragma omp parallel
|
||||
{
|
||||
|
||||
Vector<SiteHalfSpinor> SitePplus(LLs);
|
||||
Vector<SiteHalfSpinor> SitePminus(LLs);
|
||||
@ -267,6 +370,9 @@ PARALLEL_FOR_LOOP
|
||||
SiteHalfSpinor BcastP;
|
||||
SiteHalfSpinor BcastM;
|
||||
|
||||
#pragma omp for
|
||||
for(auto site=0;site<vol;site++){
|
||||
|
||||
for(int s=0;s<LLs;s++){
|
||||
int lex = s+LLs*site;
|
||||
spProj5p(SitePplus[s] ,psi[lex]);
|
||||
@ -294,6 +400,8 @@ PARALLEL_FOR_LOOP
|
||||
chi[lex] = SiteChi[s]*0.5;
|
||||
}
|
||||
}
|
||||
}
|
||||
MooeeInvTime+=usecond();
|
||||
}
|
||||
|
||||
INSTANTIATE_DPERP(DomainWallVec5dImplD);
|
||||
|
@ -48,8 +48,10 @@ namespace QCD {
|
||||
// typedef typename XXX GaugeField;
|
||||
// typedef typename XXX GaugeActField;
|
||||
// typedef typename XXX FermionField;
|
||||
// typedef typename XXX PropagatorField;
|
||||
// typedef typename XXX DoubledGaugeField;
|
||||
// typedef typename XXX SiteSpinor;
|
||||
// typedef typename XXX SitePropagator;
|
||||
// typedef typename XXX SiteHalfSpinor;
|
||||
// typedef typename XXX Compressor;
|
||||
//
|
||||
@ -95,13 +97,15 @@ namespace QCD {
|
||||
|
||||
#define INHERIT_FIMPL_TYPES(Impl)\
|
||||
typedef typename Impl::FermionField FermionField; \
|
||||
typedef typename Impl::PropagatorField PropagatorField; \
|
||||
typedef typename Impl::DoubledGaugeField DoubledGaugeField; \
|
||||
typedef typename Impl::SiteSpinor SiteSpinor; \
|
||||
typedef typename Impl::SitePropagator SitePropagator; \
|
||||
typedef typename Impl::SiteHalfSpinor SiteHalfSpinor; \
|
||||
typedef typename Impl::Compressor Compressor; \
|
||||
typedef typename Impl::StencilImpl StencilImpl; \
|
||||
typedef typename Impl::ImplParams ImplParams; \
|
||||
typedef typename Impl::Coeff_t Coeff_t;
|
||||
typedef typename Impl::ImplParams ImplParams; \
|
||||
typedef typename Impl::Coeff_t Coeff_t; \
|
||||
|
||||
#define INHERIT_IMPL_TYPES(Base) \
|
||||
INHERIT_GIMPL_TYPES(Base) \
|
||||
@ -127,14 +131,17 @@ namespace QCD {
|
||||
INHERIT_GIMPL_TYPES(Gimpl);
|
||||
|
||||
template <typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Dimension>, Ns> >;
|
||||
template <typename vtype> using iImplPropagator = iScalar<iMatrix<iMatrix<vtype, Dimension>, Ns> >;
|
||||
template <typename vtype> using iImplHalfSpinor = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
|
||||
template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
|
||||
|
||||
typedef iImplSpinor<Simd> SiteSpinor;
|
||||
typedef iImplPropagator<Simd> SitePropagator;
|
||||
typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
|
||||
typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
|
||||
|
||||
typedef Lattice<SiteSpinor> FermionField;
|
||||
typedef Lattice<SitePropagator> PropagatorField;
|
||||
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
|
||||
|
||||
typedef WilsonCompressor<SiteHalfSpinor, SiteSpinor> Compressor;
|
||||
@ -216,14 +223,17 @@ class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
|
||||
INHERIT_GIMPL_TYPES(Gimpl);
|
||||
|
||||
template <typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
|
||||
template <typename vtype> using iImplPropagator = iScalar<iMatrix<iMatrix<vtype, Nrepresentation>, Ns> >;
|
||||
template <typename vtype> using iImplHalfSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhs> >;
|
||||
template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>;
|
||||
template <typename vtype> using iImplGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd>;
|
||||
template <typename vtype> using iImplGaugeLink = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
|
||||
|
||||
typedef iImplSpinor<Simd> SiteSpinor;
|
||||
typedef iImplPropagator<Simd> SitePropagator;
|
||||
typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
|
||||
typedef Lattice<SiteSpinor> FermionField;
|
||||
typedef Lattice<SitePropagator> PropagatorField;
|
||||
|
||||
// Make the doubled gauge field a *scalar*
|
||||
typedef iImplDoubledGaugeField<typename Simd::scalar_type> SiteDoubledGaugeField; // This is a scalar
|
||||
@ -315,14 +325,17 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
|
||||
INHERIT_GIMPL_TYPES(Gimpl);
|
||||
|
||||
template <typename vtype> using iImplSpinor = iVector<iVector<iVector<vtype, Nrepresentation>, Ns>, Ngp>;
|
||||
template <typename vtype> using iImplPropagator = iVector<iMatrix<iMatrix<vtype, Nrepresentation>, Ns>, Ngp >;
|
||||
template <typename vtype> using iImplHalfSpinor = iVector<iVector<iVector<vtype, Nrepresentation>, Nhs>, Ngp>;
|
||||
template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>, Ngp>;
|
||||
|
||||
typedef iImplSpinor<Simd> SiteSpinor;
|
||||
typedef iImplPropagator<Simd> SitePropagator;
|
||||
typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
|
||||
typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
|
||||
|
||||
typedef Lattice<SiteSpinor> FermionField;
|
||||
typedef Lattice<SitePropagator> PropagatorField;
|
||||
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
|
||||
|
||||
typedef WilsonCompressor<SiteHalfSpinor, SiteSpinor> Compressor;
|
||||
|
@ -194,6 +194,11 @@ void WilsonFermion5D<Impl>::Report(void)
|
||||
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
|
||||
|
||||
RealD Fullmflops = 1344*volume*DhopCalls/(DhopComputeTime+DhopCommTime)/2; // 2 for red black counting
|
||||
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
|
||||
|
||||
|
||||
}
|
||||
|
||||
if ( DerivCalls > 0 ) {
|
||||
@ -209,12 +214,15 @@ void WilsonFermion5D<Impl>::Report(void)
|
||||
RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NP << std::endl;
|
||||
}
|
||||
|
||||
RealD Fullmflops = 144*volume*DerivCalls/(DerivDhopComputeTime+DerivCommTime)/2; // 2 for red black counting
|
||||
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NP << std::endl; }
|
||||
|
||||
if (DerivCalls > 0 || DhopCalls > 0){
|
||||
std::cout << GridLogMessage << "WilsonFermion5D Stencil"<<std::endl; Stencil.Report();
|
||||
std::cout << GridLogMessage << "WilsonFermion5D Stencil" <<std::endl; Stencil.Report();
|
||||
std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl; StencilEven.Report();
|
||||
std::cout << GridLogMessage << "WilsonFermion5D StencilOdd"<<std::endl; StencilOdd.Report();
|
||||
std::cout << GridLogMessage << "WilsonFermion5D StencilOdd" <<std::endl; StencilOdd.Report();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -61,14 +61,8 @@ public:
|
||||
switch(Opt) {
|
||||
#ifdef AVX512
|
||||
case OptInlineAsm:
|
||||
for (int site = 0; site < Ns; site++) {
|
||||
for (int s = 0; s < Ls; s++) {
|
||||
WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
|
||||
sF++;
|
||||
}
|
||||
sU++;
|
||||
}
|
||||
break;
|
||||
WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
|
||||
break;
|
||||
#endif
|
||||
case OptHandUnroll:
|
||||
for (int site = 0; site < Ns; site++) {
|
||||
@ -115,13 +109,7 @@ public:
|
||||
switch(Opt) {
|
||||
#ifdef AVX512
|
||||
case OptInlineAsm:
|
||||
for (int site = 0; site < Ns; site++) {
|
||||
for (int s = 0; s < Ls; s++) {
|
||||
WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
|
||||
sF++;
|
||||
}
|
||||
sU++;
|
||||
}
|
||||
WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
|
||||
break;
|
||||
#endif
|
||||
case OptHandUnroll:
|
||||
|
@ -10,6 +10,7 @@
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Guido Cossu <guido.cossu@ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
@ -53,24 +54,26 @@ WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,
|
||||
}
|
||||
|
||||
#if defined(AVX512)
|
||||
|
||||
#include <simd/Intel512wilson.h>
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
// If we are AVX512 specialise the single precision routine
|
||||
///////////////////////////////////////////////////////////
|
||||
|
||||
#include <simd/Intel512wilson.h>
|
||||
|
||||
#include <simd/Intel512single.h>
|
||||
|
||||
static Vector<vComplexF> signs;
|
||||
|
||||
int setupSigns(void ){
|
||||
Vector<vComplexF> bother(2);
|
||||
static Vector<vComplexF> signsF;
|
||||
|
||||
template<typename vtype>
|
||||
int setupSigns(Vector<vtype>& signs ){
|
||||
Vector<vtype> bother(2);
|
||||
signs = bother;
|
||||
vrsign(signs[0]);
|
||||
visign(signs[1]);
|
||||
return 1;
|
||||
}
|
||||
static int signInit = setupSigns();
|
||||
|
||||
static int signInitF = setupSigns(signsF);
|
||||
|
||||
#define label(A) ilabel(A)
|
||||
#define ilabel(A) ".globl\n" #A ":\n"
|
||||
@ -78,6 +81,8 @@ static Vector<vComplexF> signs;
|
||||
#define MAYBEPERM(A,perm) if (perm) { A ; }
|
||||
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
|
||||
#define FX(A) WILSONASM_ ##A
|
||||
#define COMPLEX_TYPE vComplexF
|
||||
#define signs signsF
|
||||
|
||||
#undef KERNEL_DAG
|
||||
template<> void
|
||||
@ -98,8 +103,8 @@ WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder
|
||||
#undef FX
|
||||
#define FX(A) DWFASM_ ## A
|
||||
#define MAYBEPERM(A,B)
|
||||
#define VMOVIDUP(A,B,C) VBCASTIDUPf(A,B,C)
|
||||
#define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C)
|
||||
//#define VMOVIDUP(A,B,C) VBCASTIDUPf(A,B,C)
|
||||
//#define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C)
|
||||
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
|
||||
|
||||
#undef KERNEL_DAG
|
||||
@ -113,8 +118,71 @@ template<> void
|
||||
WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
#undef COMPLEX_TYPE
|
||||
#undef signs
|
||||
#undef VMOVRDUP
|
||||
#undef MAYBEPERM
|
||||
#undef MULT_2SPIN
|
||||
#undef FX
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
// If we are AVX512 specialise the double precision routine
|
||||
///////////////////////////////////////////////////////////
|
||||
|
||||
#include <simd/Intel512double.h>
|
||||
|
||||
static Vector<vComplexD> signsD;
|
||||
#define signs signsD
|
||||
static int signInitD = setupSigns(signsD);
|
||||
|
||||
#define MAYBEPERM(A,perm) if (perm) { A ; }
|
||||
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
|
||||
#define FX(A) WILSONASM_ ##A
|
||||
#define COMPLEX_TYPE vComplexD
|
||||
|
||||
#undef KERNEL_DAG
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
#define KERNEL_DAG
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
#endif
|
||||
#undef VMOVIDUP
|
||||
#undef VMOVRDUP
|
||||
#undef MAYBEPERM
|
||||
#undef MULT_2SPIN
|
||||
#undef FX
|
||||
#define FX(A) DWFASM_ ## A
|
||||
#define MAYBEPERM(A,B)
|
||||
//#define VMOVIDUP(A,B,C) VBCASTIDUPd(A,B,C)
|
||||
//#define VMOVRDUP(A,B,C) VBCASTRDUPd(A,B,C)
|
||||
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
|
||||
|
||||
#undef KERNEL_DAG
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
#define KERNEL_DAG
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
#undef COMPLEX_TYPE
|
||||
#undef signs
|
||||
#undef VMOVRDUP
|
||||
#undef MAYBEPERM
|
||||
#undef MULT_2SPIN
|
||||
#undef FX
|
||||
|
||||
#endif //AVX512
|
||||
|
||||
#define INSTANTIATE_ASM(A)\
|
||||
template void WilsonKernels<A>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
|
||||
|
@ -5,7 +5,9 @@
|
||||
const uint64_t plocal =(uint64_t) & in._odata[0];
|
||||
|
||||
// vComplexF isigns[2] = { signs[0], signs[1] };
|
||||
vComplexF *isigns = &signs[0];
|
||||
//COMPLEX_TYPE is vComplexF of vComplexD depending
|
||||
//on the chosen precision
|
||||
COMPLEX_TYPE *isigns = &signs[0];
|
||||
|
||||
MASK_REGS;
|
||||
int nmax=U._grid->oSites();
|
||||
|
@ -116,7 +116,7 @@ class NerscHmcRunnerTemplate {
|
||||
NoSmearing<Gimpl> SmearingPolicy;
|
||||
typedef MinimumNorm2<GaugeField, NoSmearing<Gimpl>, RepresentationsPolicy >
|
||||
IntegratorType; // change here to change the algorithm
|
||||
IntegratorParameters MDpar(20, 1.0);
|
||||
IntegratorParameters MDpar(40, 1.0);
|
||||
IntegratorType MDynamics(UGrid, MDpar, TheAction, SmearingPolicy);
|
||||
|
||||
// Checkpoint strategy
|
||||
|
Reference in New Issue
Block a user