From b722889234a02200bbafde2a73e8d43b5c9a5282 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sat, 22 Apr 2017 19:27:41 +0100
Subject: [PATCH] Try a better load balancing loop

---
 lib/qcd/action/fermion/WilsonFermion5D.cc | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)
diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc
index 9d325ed5..daddb605 100644
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -406,6 +406,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
   // Load imbalance alert. Should use dynamic schedule OMP for loop
   // Perhaps create a list of only those sites with face work, and 
   // load balance process the list.
+#if 1
 #pragma omp parallel 
   {
     int nthreads = omp_get_num_threads();
@@ -421,9 +422,28 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
     else                  Kernels::DhopSite   (st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,0,1);
     if ( me==0 ) DhopComputeTime2+=usecond();
   }// end parallel region
+#else 
+DhopComputeTime2-=usecond();
+  if (dag == DaggerYes) {
+    parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
+      int sU = ss;
+      int sF = LLs * sU;
+      Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1);
+    }
+  } else {
+    parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
+      int sU = ss;
+      int sF = LLs * sU;
+      Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1);
+    }
+  }
+DhopComputeTime2+=usecond();
+#endif
+
 #else 
   assert(0);
 #endif
+
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,