
Merge branch 'develop' of https://github.com/paboyle/Grid into feature/rare_kaon

# Conflicts:
#	lib/qcd/action/fermion/WilsonFermion5D.cc
#	tests/hadrons/Test_hadrons_rarekaon.cc
This commit is contained in: Lanny91, 2017-05-04 16:22:33 +01:00
109 changed files with 4537 additions and 3328 deletions

lib/qcd/action/fermion/WilsonFermion5D.cc

@@ -117,49 +117,20 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
// Allocate the required comms buffer
ImportGauge(_Umu);
// Build lists of exterior only nodes
int LLs = FiveDimGrid._rdimensions[0];
int vol4;
vol4=FourDimGrid.oSites();
Stencil.BuildSurfaceList(LLs,vol4);
vol4=FourDimRedBlackGrid.oSites();
StencilEven.BuildSurfaceList(LLs,vol4);
StencilOdd.BuildSurfaceList(LLs,vol4);
std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size()
<<" " << StencilEven.surface_list.size()<<std::endl;
}
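// --- Illustrative aside (not part of the commit) ---------------------------
// A minimal sketch of what a "surface list" could contain: the indices of
// 4D outer sites that sit near the local boundary, so only those sites need
// to be revisited once halo data arrives. Stencil.BuildSurfaceList above is
// the real Grid implementation; the helper below (build_surface_list, with
// hypothetical ldims/depth arguments and lexicographic indexing) is only an
// assumption-laden toy.
#include <vector>
#include <array>

std::vector<int> build_surface_list(const std::array<int,4>& ldims, int depth = 1) {
  std::vector<int> surface;
  int vol = ldims[0]*ldims[1]*ldims[2]*ldims[3];
  for (int site = 0; site < vol; ++site) {
    // Decode the lexicographic site index into 4D coordinates.
    int rem = site;
    bool on_surface = false;
    for (int d = 0; d < 4; ++d) {
      int coord = rem % ldims[d];
      rem /= ldims[d];
      // A site counts as "surface" if it lies within `depth` of either local boundary.
      if (coord < depth || coord >= ldims[d] - depth) on_surface = true;
    }
    if (on_surface) surface.push_back(site);
  }
  return surface; // analogous in spirit to Stencil.surface_list
}
// ---------------------------------------------------------------------------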
/*
template<class Impl>
WilsonFermion5D<Impl>::WilsonFermion5D(int simd,GaugeField &_Umu,
GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
RealD _M5,const ImplParams &p) :
{
int nsimd = Simd::Nsimd();
// some assertions
assert(FiveDimGrid._ndimension==5);
assert(FiveDimRedBlackGrid._ndimension==5);
assert(FiveDimRedBlackGrid._checker_dim==0); // Checkerboard the s-direction
assert(FourDimGrid._ndimension==4);
// Dimension zero of the five-d is the Ls direction
Ls=FiveDimGrid._fdimensions[0];
assert(FiveDimGrid._processors[0] ==1);
assert(FiveDimGrid._simd_layout[0] ==nsimd);
assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
assert(FiveDimRedBlackGrid._processors[0] ==1);
assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);
// Other dimensions must match the decomposition of the four-D fields
for(int d=0;d<4;d++){
assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
assert(FourDimGrid._simd_layout[d]==1);
assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
assert(FiveDimGrid._fdimensions[d+1] ==FourDimGrid._fdimensions[d]);
assert(FiveDimGrid._processors[d+1] ==FourDimGrid._processors[d]);
assert(FiveDimGrid._simd_layout[d+1] ==FourDimGrid._simd_layout[d]);
}
{
}
}
*/
template<class Impl>
void WilsonFermion5D<Impl>::Report(void)
@@ -396,6 +367,7 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
DhopTotalTime+=usecond();
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,
@@ -409,12 +381,21 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
int LLs = in._grid->_rdimensions[0];
int len = U._grid->oSites();
DhopFaceTime-=usecond();
st.HaloExchangeOptGather(in,compressor);
DhopFaceTime+=usecond();
std::vector<std::vector<CommsRequest_t> > reqs;
// Rely on async comms; start comms before merge of local data
DhopCommTime-=usecond();
st.CommunicateBegin(reqs);
DhopFaceTime-=usecond();
st.CommsMergeSHM(compressor);
DhopFaceTime+=usecond();
// Perhaps use omp task and region
#pragma omp parallel
{
int nthreads = omp_get_num_threads();
@@ -425,8 +406,6 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
int sF = LLs * myoff;
if ( me == 0 ) {
DhopCommTime-=usecond();
st.CommunicateBegin(reqs);
st.CommunicateComplete(reqs);
DhopCommTime+=usecond();
} else {
@@ -439,28 +418,37 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
}
DhopFaceTime-=usecond();
st.CommsMerge();
st.CommsMerge(compressor);
DhopFaceTime+=usecond();
#pragma omp parallel
{
int nthreads = omp_get_num_threads();
int me = omp_get_thread_num();
int myoff, mywork;
GridThread::GetWork(len,me,mywork,myoff,nthreads);
int sF = LLs * myoff;
// Exterior links in stencil
if ( me==0 ) DhopComputeTime2-=usecond();
if (dag == DaggerYes) Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,0,1);
else Kernels::DhopSite (st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,0,1);
if ( me==0 ) DhopComputeTime2+=usecond();
}// end parallel region
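// --- Illustrative aside (not part of the commit) ---------------------------
// A minimal sketch of the even block partition that GridThread::GetWork is
// used for above: split `len` outer sites into (mywork, myoff) chunks, one
// per thread, with the remainder spread over the first threads. The function
// name get_work_sketch and its exact remainder handling are assumptions, not
// Grid's actual implementation.
inline void get_work_sketch(int nwork, int me, int nthreads, int &mywork, int &myoff) {
  int base = nwork / nthreads;   // minimum chunk per thread
  int rem  = nwork % nthreads;   // the first `rem` threads take one extra item
  mywork = base + (me < rem ? 1 : 0);
  myoff  = me * base + (me < rem ? me : rem);
}
// e.g. nwork=10, nthreads=4 -> chunks of 3,3,2,2 at offsets 0,3,6,8
// ---------------------------------------------------------------------------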
// Load imbalance alert. Should use a dynamic-schedule OMP for loop.
// Perhaps create a list of only those sites with face work, and
// load-balance processing over that list.
DhopComputeTime2-=usecond();
if (dag == DaggerYes) {
int sz=st.surface_list.size();
parallel_for (int ss = 0; ss < sz; ss++) {
int sU = st.surface_list[ss];
int sF = LLs * sU;
Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1);
}
} else {
int sz=st.surface_list.size();
parallel_for (int ss = 0; ss < sz; ss++) {
int sU = st.surface_list[ss];
int sF = LLs * sU;
Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1);
}
}
DhopComputeTime2+=usecond();
#else
assert(0);
#endif
}
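// --- Illustrative aside (not part of the commit) ---------------------------
// A self-contained sketch of the comms/compute overlap pattern this hunk
// moves to: start the halo exchange asynchronously, compute the interior
// (sites needing no remote data), wait for the halo, then revisit only the
// surface-list sites. All names below (fake_halo_exchange, compute_site,
// surface_list) are hypothetical stand-ins, not Grid API.
#include <future>
#include <thread>
#include <chrono>
#include <vector>
#include <cstdio>

static void fake_halo_exchange() {                 // stands in for CommunicateBegin/Complete
  std::this_thread::sleep_for(std::chrono::milliseconds(5));
}
static void compute_site(std::vector<double> &out, int s) { out[s] += 1.0; }

int main() {
  const int vol = 1 << 12;
  std::vector<double> field(vol, 0.0);
  std::vector<int> surface_list;                   // sites that also need remote halo data
  for (int s = 0; s < vol; ++s) if (s % 64 == 0) surface_list.push_back(s);

  auto comms = std::async(std::launch::async, fake_halo_exchange);  // overlap begins

  // Interior work proceeds while the "halo" is in flight.
  #pragma omp parallel for
  for (int s = 0; s < vol; ++s) compute_site(field, s);

  comms.wait();                                    // analogue of CommunicateComplete

  // Exterior pass touches only the surface list, as in the new code above.
  #pragma omp parallel for
  for (std::size_t i = 0; i < surface_list.size(); ++i) compute_site(field, surface_list[i]);

  std::printf("done: %zu surface sites revisited\n", surface_list.size());
  return 0;
}
// ---------------------------------------------------------------------------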
template<class Impl>
void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,