1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-10 19:36:56 +01:00

Global changes to parallel_for structure.

Move the comms flags to more sensible names
This commit is contained in:
paboyle
2017-02-21 05:24:27 -05:00
parent 3906cd2149
commit 3ae92fa2e6
43 changed files with 271 additions and 513 deletions

View File

@ -54,8 +54,8 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
// Flops = 6.0*(Nc*Ns) *Ls*vol
M5Dcalls++;
M5Dtime-=usecond();
PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
for(int s=0;s<Ls;s++){
auto tmp = psi._odata[0];
if ( s==0 ) {
@ -98,8 +98,8 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
// Flops = 6.0*(Nc*Ns) *Ls*vol
M5Dcalls++;
M5Dtime-=usecond();
PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
auto tmp = psi._odata[0];
for(int s=0;s<Ls;s++){
if ( s==0 ) {
@ -137,8 +137,7 @@ void CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi, FermionField &
MooeeInvCalls++;
MooeeInvTime-=usecond();
PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
auto tmp = psi._odata[0];
// flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls = 12*Ls * (9) = 108*Ls flops
@ -184,8 +183,7 @@ void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &
MooeeInvCalls++;
MooeeInvTime-=usecond();
PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
auto tmp = psi._odata[0];

View File

@ -91,8 +91,7 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
assert(Nc==3);
PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
parallel_for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
#if 0
alignas(64) SiteHalfSpinor hp;
alignas(64) SiteHalfSpinor hm;
@ -232,8 +231,7 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
M5Dcalls++;
M5Dtime-=usecond();
PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
parallel_for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
#if 0
alignas(64) SiteHalfSpinor hp;
alignas(64) SiteHalfSpinor hm;
@ -792,13 +790,11 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
MooeeInvTime-=usecond();
if ( switcheroo<Coeff_t>::iscomplex() ) {
PARALLEL_FOR_LOOP
for(auto site=0;site<vol;site++){
parallel_for(auto site=0;site<vol;site++){
MooeeInternalZAsm(psi,chi,LLs,site,*_Matp,*_Matm);
}
} else {
PARALLEL_FOR_LOOP
for(auto site=0;site<vol;site++){
parallel_for(auto site=0;site<vol;site++){
MooeeInternalAsm(psi,chi,LLs,site,*_Matp,*_Matm);
}
}

View File

@ -194,8 +194,7 @@ namespace QCD {
GaugeLinkField tmp(mat._grid);
tmp = zero;
PARALLEL_FOR_LOOP
for(int sss=0;sss<tmp._grid->oSites();sss++){
parallel_for(int sss=0;sss<tmp._grid->oSites();sss++){
int sU=sss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
@ -445,8 +444,7 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
Uconj = where(coor==neglink,-Uconj,Uconj);
}
PARALLEL_FOR_LOOP
for(auto ss=U.begin();ss<U.end();ss++){
parallel_for(auto ss=U.begin();ss<U.end();ss++){
Uds[ss](0)(mu) = U[ss]();
Uds[ss](1)(mu) = Uconj[ss]();
}
@ -459,8 +457,7 @@ PARALLEL_FOR_LOOP
Utmp = where(coor==0,Uconj,Utmp);
}
PARALLEL_FOR_LOOP
for(auto ss=U.begin();ss<U.end();ss++){
parallel_for(auto ss=U.begin();ss<U.end();ss++){
Uds[ss](0)(mu+4) = Utmp[ss]();
}
@ -469,8 +466,7 @@ PARALLEL_FOR_LOOP
Utmp = where(coor==0,U,Utmp);
}
PARALLEL_FOR_LOOP
for(auto ss=U.begin();ss<U.end();ss++){
parallel_for(auto ss=U.begin();ss<U.end();ss++){
Uds[ss](1)(mu+4) = Utmp[ss]();
}
@ -484,8 +480,7 @@ PARALLEL_FOR_LOOP
GaugeLinkField link(mat._grid);
// use lorentz for flavour as hack.
auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A));
PARALLEL_FOR_LOOP
for (auto ss = tmp.begin(); ss < tmp.end(); ss++) {
parallel_for(auto ss = tmp.begin(); ss < tmp.end(); ss++) {
link[ss]() = tmp[ss](0, 0) - conjugate(tmp[ss](1, 1));
}
PokeIndex<LorentzIndex>(mat, link, mu);
@ -498,8 +493,7 @@ PARALLEL_FOR_LOOP
GaugeLinkField tmp(mat._grid);
tmp = zero;
PARALLEL_FOR_LOOP
for (int ss = 0; ss < tmp._grid->oSites(); ss++) {
parallel_for(int ss = 0; ss < tmp._grid->oSites(); ss++) {
for (int s = 0; s < Ls; s++) {
int sF = s + Ls * ss;
auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde[sF], Atilde[sF]));

View File

@ -222,8 +222,7 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
////////////////////////
// Call the single hop
////////////////////////
PARALLEL_FOR_LOOP
for (int sss = 0; sss < B._grid->oSites(); sss++) {
parallel_for (int sss = 0; sss < B._grid->oSites(); sss++) {
Kernels::DhopDir(st, U, st.CommBuf(), sss, sss, B, Btilde, mu,
gamma);
}
@ -333,8 +332,7 @@ void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,
Stencil.HaloExchange(in, compressor);
PARALLEL_FOR_LOOP
for (int sss = 0; sss < in._grid->oSites(); sss++) {
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, dirdisp, gamma);
}
};
@ -350,13 +348,11 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
st.HaloExchange(in, compressor);
if (dag == DaggerYes) {
PARALLEL_FOR_LOOP
for (int sss = 0; sss < in._grid->oSites(); sss++) {
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
}
} else {
PARALLEL_FOR_LOOP
for (int sss = 0; sss < in._grid->oSites(); sss++) {
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
}
}

View File

@ -275,8 +275,7 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
assert(dirdisp<=7);
assert(dirdisp>=0);
PARALLEL_FOR_LOOP
for(int ss=0;ss<Umu._grid->oSites();ss++){
parallel_for(int ss=0;ss<Umu._grid->oSites();ss++){
for(int s=0;s<Ls;s++){
int sU=ss;
int sF = s+Ls*sU;
@ -323,8 +322,7 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
////////////////////////
DerivDhopComputeTime -= usecond();
PARALLEL_FOR_LOOP
for (int sss = 0; sss < U._grid->oSites(); sss++) {
parallel_for (int sss = 0; sss < U._grid->oSites(); sss++) {
for (int s = 0; s < Ls; s++) {
int sU = sss;
int sF = s + Ls * sU;
@ -493,73 +491,18 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOr
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
if (dag == DaggerYes) {
PARALLEL_FOR_LOOP
for (int ss = 0; ss < U._grid->oSites(); ss++) {
parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
int sU = ss;
int sF = LLs * sU;
Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
}
} else {
PARALLEL_FOR_LOOP
for (int ss = 0; ss < U._grid->oSites(); ss++) {
parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
int sU = ss;
int sF = LLs * sU;
Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
}
}
/*
if (dag == DaggerYes) {
PARALLEL_FOR_LOOP
for (int ss = 0; ss < U._grid->oSites(); ss++) {
int sU = ss;
int sF = LLs * sU;
Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
}
#ifdef AVX512_SWITCHOFF
} else if (stat.is_init() ) {
int nthreads;
stat.start();
#pragma omp parallel
{
#pragma omp master
nthreads = omp_get_num_threads();
int mythread = omp_get_thread_num();
stat.enter(mythread);
#pragma omp for nowait
for(int ss=0;ss<U._grid->oSites();ss++) {
int sU=ss;
int sF=LLs*sU;
Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
}
stat.exit(mythread);
}
stat.accum(nthreads);
#endif
} else {
#if 1
PARALLEL_FOR_LOOP
for (int ss = 0; ss < U._grid->oSites(); ss++) {
int sU = ss;
int sF = LLs * sU;
Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
}
#else
#ifdef GRID_OMP
#pragma omp parallel
#endif
{
int len = U._grid->oSites();
int me, myoff,mywork;
GridThread::GetWorkBarrier(len,me, mywork,myoff);
int sF = LLs * myoff;
Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out);
}
#endif
}
*/
DhopComputeTime+=usecond();
}

View File

@ -66,8 +66,7 @@ public:
// Move this elsewhere? FIXME
static inline void AddGaugeLink(GaugeField &U, GaugeLinkField &W,
int mu) { // U[mu] += W
PARALLEL_FOR_LOOP
for (auto ss = 0; ss < U._grid->oSites(); ss++) {
parallel_for (auto ss = 0; ss < U._grid->oSites(); ss++) {
U._odata[ss]._internal[mu] =
U._odata[ss]._internal[mu] + W._odata[ss]._internal;
}

View File

@ -48,8 +48,7 @@ void axpibg5x(Lattice<vobj> &z,const Lattice<vobj> &x,Coeff a,Coeff b)
GridBase *grid=x._grid;
Gamma G5(Gamma::Gamma5);
PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss++){
parallel_for(int ss=0;ss<grid->oSites();ss++){
vobj tmp;
tmp = a*x._odata[ss];
tmp = tmp + G5*(b*timesI(x._odata[ss]));
@ -65,8 +64,7 @@ void axpby_ssp(Lattice<vobj> &z, Coeff a,const Lattice<vobj> &x,Coeff b,const La
conformable(x,z);
GridBase *grid=x._grid;
int Ls = grid->_rdimensions[0];
PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
vobj tmp = a*x._odata[ss+s]+b*y._odata[ss+sp];
vstream(z._odata[ss+s],tmp);
}
@ -81,8 +79,7 @@ void ag5xpby_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const L
GridBase *grid=x._grid;
int Ls = grid->_rdimensions[0];
Gamma G5(Gamma::Gamma5);
PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
vobj tmp;
tmp = G5*x._odata[ss+s]*a;
tmp = tmp + b*y._odata[ss+sp];
@ -99,8 +96,7 @@ void axpbg5y_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const L
GridBase *grid=x._grid;
int Ls = grid->_rdimensions[0];
Gamma G5(Gamma::Gamma5);
PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
vobj tmp;
tmp = G5*y._odata[ss+sp]*b;
tmp = tmp + a*x._odata[ss+s];
@ -117,8 +113,7 @@ void ag5xpbg5y_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const
GridBase *grid=x._grid;
int Ls = grid->_rdimensions[0];
Gamma G5(Gamma::Gamma5);
PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
vobj tmp1;
vobj tmp2;
tmp1 = a*x._odata[ss+s]+b*y._odata[ss+sp];
@ -135,8 +130,7 @@ void axpby_ssp_pminus(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,co
conformable(x,z);
GridBase *grid=x._grid;
int Ls = grid->_rdimensions[0];
PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
vobj tmp;
spProj5m(tmp,y._odata[ss+sp]);
tmp = a*x._odata[ss+s]+b*tmp;
@ -152,8 +146,7 @@ void axpby_ssp_pplus(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,con
conformable(x,z);
GridBase *grid=x._grid;
int Ls = grid->_rdimensions[0];
PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
vobj tmp;
spProj5p(tmp,y._odata[ss+sp]);
tmp = a*x._odata[ss+s]+b*tmp;
@ -169,8 +162,7 @@ void G5R5(Lattice<vobj> &z,const Lattice<vobj> &x)
conformable(x,z);
int Ls = grid->_rdimensions[0];
Gamma G5(Gamma::Gamma5);
PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
vobj tmp;
for(int s=0;s<Ls;s++){
int sp = Ls-1-s;

View File

@ -221,8 +221,7 @@ class SU {
int i0, i1;
su2SubGroupIndex(i0, i1, su2_index);
PARALLEL_FOR_LOOP
for (int ss = 0; ss < grid->oSites(); ss++) {
parallel_for (int ss = 0; ss < grid->oSites(); ss++) {
subgroup._odata[ss]()()(0, 0) = source._odata[ss]()()(i0, i0);
subgroup._odata[ss]()()(0, 1) = source._odata[ss]()()(i0, i1);
subgroup._odata[ss]()()(1, 0) = source._odata[ss]()()(i1, i0);
@ -252,8 +251,7 @@ class SU {
su2SubGroupIndex(i0, i1, su2_index);
dest = 1.0; // start out with identity
PARALLEL_FOR_LOOP
for (int ss = 0; ss < grid->oSites(); ss++) {
parallel_for (int ss = 0; ss < grid->oSites(); ss++) {
dest._odata[ss]()()(i0, i0) = subgroup._odata[ss]()()(0, 0);
dest._odata[ss]()()(i0, i1) = subgroup._odata[ss]()()(0, 1);
dest._odata[ss]()()(i1, i0) = subgroup._odata[ss]()()(1, 0);