From b722889234a02200bbafde2a73e8d43b5c9a5282 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 22 Apr 2017 19:27:41 +0100 Subject: [PATCH] Try a better load balancing loop --- lib/qcd/action/fermion/WilsonFermion5D.cc | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 9d325ed5..daddb605 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -406,6 +406,7 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg // Load imbalance alert. Should use dynamic schedule OMP for loop // Perhaps create a list of only those sites with face work, and // load balance process the list. +#if 1 #pragma omp parallel { int nthreads = omp_get_num_threads(); @@ -421,9 +422,28 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg else Kernels::DhopSite (st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,0,1); if ( me==0 ) DhopComputeTime2+=usecond(); }// end parallel region +#else +DhopComputeTime2-=usecond(); + if (dag == DaggerYes) { + parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) { + int sU = ss; + int sF = LLs * sU; + Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1); + } + } else { + parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) { + int sU = ss; + int sF = LLs * sU; + Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1); + } + } +DhopComputeTime2+=usecond(); +#endif + #else assert(0); #endif + } template void WilsonFermion5D::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,