1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-13 04:37:05 +01:00

Global changes to parallel_for structure.

Move the comms flags to more sensible names
This commit is contained in:
paboyle
2017-02-21 05:24:27 -05:00
parent 3906cd2149
commit 3ae92fa2e6
43 changed files with 271 additions and 513 deletions

View File

@ -54,8 +54,8 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
// Flops = 6.0*(Nc*Ns) *Ls*vol
M5Dcalls++;
M5Dtime-=usecond();
PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
for(int s=0;s<Ls;s++){
auto tmp = psi._odata[0];
if ( s==0 ) {
@ -98,8 +98,8 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
// Flops = 6.0*(Nc*Ns) *Ls*vol
M5Dcalls++;
M5Dtime-=usecond();
PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
auto tmp = psi._odata[0];
for(int s=0;s<Ls;s++){
if ( s==0 ) {
@ -137,8 +137,7 @@ void CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi, FermionField &
MooeeInvCalls++;
MooeeInvTime-=usecond();
PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
auto tmp = psi._odata[0];
// flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls = 12*Ls * (9) = 108*Ls flops
@ -184,8 +183,7 @@ void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &
MooeeInvCalls++;
MooeeInvTime-=usecond();
PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
auto tmp = psi._odata[0];

View File

@ -91,8 +91,7 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
assert(Nc==3);
PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
parallel_for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
#if 0
alignas(64) SiteHalfSpinor hp;
alignas(64) SiteHalfSpinor hm;
@ -232,8 +231,7 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
M5Dcalls++;
M5Dtime-=usecond();
PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
parallel_for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
#if 0
alignas(64) SiteHalfSpinor hp;
alignas(64) SiteHalfSpinor hm;
@ -792,13 +790,11 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
MooeeInvTime-=usecond();
if ( switcheroo<Coeff_t>::iscomplex() ) {
PARALLEL_FOR_LOOP
for(auto site=0;site<vol;site++){
parallel_for(auto site=0;site<vol;site++){
MooeeInternalZAsm(psi,chi,LLs,site,*_Matp,*_Matm);
}
} else {
PARALLEL_FOR_LOOP
for(auto site=0;site<vol;site++){
parallel_for(auto site=0;site<vol;site++){
MooeeInternalAsm(psi,chi,LLs,site,*_Matp,*_Matm);
}
}

View File

@ -194,8 +194,7 @@ namespace QCD {
GaugeLinkField tmp(mat._grid);
tmp = zero;
PARALLEL_FOR_LOOP
for(int sss=0;sss<tmp._grid->oSites();sss++){
parallel_for(int sss=0;sss<tmp._grid->oSites();sss++){
int sU=sss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
@ -445,8 +444,7 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
Uconj = where(coor==neglink,-Uconj,Uconj);
}
PARALLEL_FOR_LOOP
for(auto ss=U.begin();ss<U.end();ss++){
parallel_for(auto ss=U.begin();ss<U.end();ss++){
Uds[ss](0)(mu) = U[ss]();
Uds[ss](1)(mu) = Uconj[ss]();
}
@ -459,8 +457,7 @@ PARALLEL_FOR_LOOP
Utmp = where(coor==0,Uconj,Utmp);
}
PARALLEL_FOR_LOOP
for(auto ss=U.begin();ss<U.end();ss++){
parallel_for(auto ss=U.begin();ss<U.end();ss++){
Uds[ss](0)(mu+4) = Utmp[ss]();
}
@ -469,8 +466,7 @@ PARALLEL_FOR_LOOP
Utmp = where(coor==0,U,Utmp);
}
PARALLEL_FOR_LOOP
for(auto ss=U.begin();ss<U.end();ss++){
parallel_for(auto ss=U.begin();ss<U.end();ss++){
Uds[ss](1)(mu+4) = Utmp[ss]();
}
@ -484,8 +480,7 @@ PARALLEL_FOR_LOOP
GaugeLinkField link(mat._grid);
// use lorentz for flavour as hack.
auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A));
PARALLEL_FOR_LOOP
for (auto ss = tmp.begin(); ss < tmp.end(); ss++) {
parallel_for(auto ss = tmp.begin(); ss < tmp.end(); ss++) {
link[ss]() = tmp[ss](0, 0) - conjugate(tmp[ss](1, 1));
}
PokeIndex<LorentzIndex>(mat, link, mu);
@ -498,8 +493,7 @@ PARALLEL_FOR_LOOP
GaugeLinkField tmp(mat._grid);
tmp = zero;
PARALLEL_FOR_LOOP
for (int ss = 0; ss < tmp._grid->oSites(); ss++) {
parallel_for(int ss = 0; ss < tmp._grid->oSites(); ss++) {
for (int s = 0; s < Ls; s++) {
int sF = s + Ls * ss;
auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde[sF], Atilde[sF]));

View File

@ -222,8 +222,7 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
////////////////////////
// Call the single hop
////////////////////////
PARALLEL_FOR_LOOP
for (int sss = 0; sss < B._grid->oSites(); sss++) {
parallel_for (int sss = 0; sss < B._grid->oSites(); sss++) {
Kernels::DhopDir(st, U, st.CommBuf(), sss, sss, B, Btilde, mu,
gamma);
}
@ -333,8 +332,7 @@ void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,
Stencil.HaloExchange(in, compressor);
PARALLEL_FOR_LOOP
for (int sss = 0; sss < in._grid->oSites(); sss++) {
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, dirdisp, gamma);
}
};
@ -350,13 +348,11 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
st.HaloExchange(in, compressor);
if (dag == DaggerYes) {
PARALLEL_FOR_LOOP
for (int sss = 0; sss < in._grid->oSites(); sss++) {
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
}
} else {
PARALLEL_FOR_LOOP
for (int sss = 0; sss < in._grid->oSites(); sss++) {
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
}
}

View File

@ -275,8 +275,7 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
assert(dirdisp<=7);
assert(dirdisp>=0);
PARALLEL_FOR_LOOP
for(int ss=0;ss<Umu._grid->oSites();ss++){
parallel_for(int ss=0;ss<Umu._grid->oSites();ss++){
for(int s=0;s<Ls;s++){
int sU=ss;
int sF = s+Ls*sU;
@ -323,8 +322,7 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
////////////////////////
DerivDhopComputeTime -= usecond();
PARALLEL_FOR_LOOP
for (int sss = 0; sss < U._grid->oSites(); sss++) {
parallel_for (int sss = 0; sss < U._grid->oSites(); sss++) {
for (int s = 0; s < Ls; s++) {
int sU = sss;
int sF = s + Ls * sU;
@ -493,73 +491,18 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOr
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
if (dag == DaggerYes) {
PARALLEL_FOR_LOOP
for (int ss = 0; ss < U._grid->oSites(); ss++) {
parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
int sU = ss;
int sF = LLs * sU;
Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
}
} else {
PARALLEL_FOR_LOOP
for (int ss = 0; ss < U._grid->oSites(); ss++) {
parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
int sU = ss;
int sF = LLs * sU;
Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
}
}
/*
if (dag == DaggerYes) {
PARALLEL_FOR_LOOP
for (int ss = 0; ss < U._grid->oSites(); ss++) {
int sU = ss;
int sF = LLs * sU;
Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
}
#ifdef AVX512_SWITCHOFF
} else if (stat.is_init() ) {
int nthreads;
stat.start();
#pragma omp parallel
{
#pragma omp master
nthreads = omp_get_num_threads();
int mythread = omp_get_thread_num();
stat.enter(mythread);
#pragma omp for nowait
for(int ss=0;ss<U._grid->oSites();ss++) {
int sU=ss;
int sF=LLs*sU;
Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
}
stat.exit(mythread);
}
stat.accum(nthreads);
#endif
} else {
#if 1
PARALLEL_FOR_LOOP
for (int ss = 0; ss < U._grid->oSites(); ss++) {
int sU = ss;
int sF = LLs * sU;
Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
}
#else
#ifdef GRID_OMP
#pragma omp parallel
#endif
{
int len = U._grid->oSites();
int me, myoff,mywork;
GridThread::GetWorkBarrier(len,me, mywork,myoff);
int sF = LLs * myoff;
Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out);
}
#endif
}
*/
DhopComputeTime+=usecond();
}