Merge branch 'develop' into feature/hmc_generalise

2025-10-27 10:09:33 +00:00 · 2017-05-01 12:13:56 +01:00
parent 8c540333d5 99220f6531
commit 3344788fa1
69 changed files with 3971 additions and 3179 deletions
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -118,6 +118,18 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
    
  // Allocate the required comms buffer
  ImportGauge(_Umu);
+  // Build lists of exterior only nodes
+  int LLs = FiveDimGrid._rdimensions[0];
+  int vol4;
+  vol4=FourDimGrid.oSites();
+  Stencil.BuildSurfaceList(LLs,vol4);
+  vol4=FourDimRedBlackGrid.oSites();
+  StencilEven.BuildSurfaceList(LLs,vol4);
+   StencilOdd.BuildSurfaceList(LLs,vol4);
+
+  std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size()
+                       <<" " << StencilEven.surface_list.size()<<std::endl;
+
 }
     
 template<class Impl>
@@ -359,6 +371,7 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
  DhopTotalTime+=usecond();
 }

+
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
 							DoubledGaugeField & U,
@@ -372,12 +385,21 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg

  int LLs = in._grid->_rdimensions[0];
  int len =  U._grid->oSites();
-  
+
  DhopFaceTime-=usecond();
  st.HaloExchangeOptGather(in,compressor);
  DhopFaceTime+=usecond();
  std::vector<std::vector<CommsRequest_t> > reqs;

+  // Rely on async comms; start comms before merge of local data
+  DhopCommTime-=usecond();
+  st.CommunicateBegin(reqs);
+
+  DhopFaceTime-=usecond();
+  st.CommsMergeSHM(compressor);
+  DhopFaceTime+=usecond();
+
+  // Perhaps use omp task and region
 #pragma omp parallel 
  { 
    int nthreads = omp_get_num_threads();
@@ -388,8 +410,6 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
    int sF = LLs * myoff;

    if ( me == 0 ) {
-      DhopCommTime-=usecond();
-      st.CommunicateBegin(reqs);
      st.CommunicateComplete(reqs);
      DhopCommTime+=usecond();
    } else { 
@@ -402,28 +422,37 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
  }

  DhopFaceTime-=usecond();
-  st.CommsMerge();
+  st.CommsMerge(compressor);
  DhopFaceTime+=usecond();

-#pragma omp parallel 
-  {
-    int nthreads = omp_get_num_threads();
-    int me = omp_get_thread_num();
-    int myoff, mywork;
-
-    GridThread::GetWork(len,me,mywork,myoff,nthreads);
-    int sF = LLs * myoff;
-
-    // Exterior links in stencil
-    if ( me==0 ) DhopComputeTime2-=usecond();
-    if (dag == DaggerYes) Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,0,1);
-    else                  Kernels::DhopSite   (st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,0,1);
-    if ( me==0 ) DhopComputeTime2+=usecond();
-  }// end parallel region
+  // Load imbalance alert: should use an OpenMP for loop with a dynamic schedule.
+  // Perhaps create a list of only those sites with face work, and
+  // process that list in a load-balanced way.
+  DhopComputeTime2-=usecond();
+  if (dag == DaggerYes) {
+    int sz=st.surface_list.size();
+    parallel_for (int ss = 0; ss < sz; ss++) {
+      int sU = st.surface_list[ss];
+      int sF = LLs * sU;
+      Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1);
+    }
+  } else {
+    int sz=st.surface_list.size();
+    parallel_for (int ss = 0; ss < sz; ss++) {
+      int sU = st.surface_list[ss];
+      int sF = LLs * sU;
+      Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1);
+    }
+  }
+  DhopComputeTime2+=usecond();
 #else 
  assert(0);
 #endif
+
 }
+
+
+
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
 					 DoubledGaugeField & U,
@@ -642,7 +671,6 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe

 }

-
 FermOpTemplateInstantiate(WilsonFermion5D);
 GparityFermOpTemplateInstantiate(WilsonFermion5D);