Build a list of what's on the surface

2025-12-21 21:24:30 +00:00 · 2017-04-24 17:06:15 +01:00
parent 916e9e1d3e
commit 56277a11c8
5 changed files with 101 additions and 13 deletions
--- a/7
+++ b/7
@@ -2,21 +2,20 @@ TODO:
 ---------------

 Peter's work list:
-1)- Half-precision comms                                <-- started -- SIMD is prepared
 2)- Precision conversion and sort out localConvert      <-- 
-
 3)- Remove DenseVector, DenseMatrix; Use Eigen instead. <-- started 
 4)- Binary I/O speed up & x-strips
-
 -- Profile CG, BlockCG, etc... Flop count/rate -- PARTIAL, time but no flop/s yet
 -- Physical propagator interface
 -- Conserved currents
 -- GaugeFix into central location
-
 -- Multigrid Wilson and DWF, compare to other Multigrid implementations
 -- HDCR resume

 Recent DONE 
+-- Cut down the exterior overhead                      <-- DONE
+-- Interior legs from SHM comms                        <-- DONE
+-- Half-precision comms                                <-- DONE
 -- Merge high precision reduction into develop        
 -- multiRHS DWF; benchmark on Cori/BNL for comms elimination
   -- slice* linalg routines for multiRHS, BlockCG    
--- a/lib/qcd/action/fermion/WilsonCompressor.h
+++ b/lib/qcd/action/fermion/WilsonCompressor.h
@@ -242,6 +242,7 @@ public:
  typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;

  std::vector<int> same_node;
+  std::vector<int> surface_list;

  WilsonStencil(GridBase *grid,
 		int npoints,
@@ -249,11 +250,33 @@ public:
 		const std::vector<int> &directions,
 		const std::vector<int> &distances)  
    : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) ,
-      same_node(npoints)
+    same_node(npoints)
  { 
-    assert(npoints==8);// or 10 if do naive DWF 5d red black ?
+    surface_list.resize(0);
  };

+  void BuildSurfaceList(int Ls,int vol4){
+
+    // find same node for SHM
+    // Here we know the distance is 1 for WilsonStencil
+    for(int point=0;point<this->_npoints;point++){
+      same_node[point] = this->SameNode(point);
+      std::cout << " dir " <<point<<" same_node " <<same_node[point]<<std::endl;
+    }
+    
+    for(int site = 0 ;site< vol4;site++){
+      int local = 1;
+      for(int point=0;point<this->_npoints;point++){
+	if( (!this->GetNodeLocal(site*Ls,point)) && (!same_node[point]) ){ 
+	  local = 0;
+	}
+      }
+      if(local == 0) { 
+	surface_list.push_back(site);
+      }
+    }
+  }
+
  template < class compressor>
  void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress) 
  {
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -117,6 +117,19 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
    
  // Allocate the required comms buffer
  ImportGauge(_Umu);
+
+  // Build lists of exterior only nodes
+  int LLs = FourDimGrid._rdimensions[0];
+  int vol4;
+  vol4=FourDimGrid.oSites();
+  Stencil.BuildSurfaceList(LLs,vol4);
+  vol4=FourDimRedBlackGrid.oSites();
+  StencilEven.BuildSurfaceList(LLs,vol4);
+   StencilOdd.BuildSurfaceList(LLs,vol4);
+
+   std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size()
+                       <<" " << StencilEven.surface_list.size()<<std::endl;
+
 }
     
 template<class Impl>
@@ -406,6 +419,8 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
  // Load imbalance alert. Should use dynamic schedule OMP for loop
  // Perhaps create a list of only those sites with face work, and 
  // load balance process the list.
+#if 1
+
 #if 0
 #pragma omp parallel 
  {
@@ -422,6 +437,27 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
    else                  Kernels::DhopSite   (st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,0,1);
    if ( me==0 ) DhopComputeTime2+=usecond();
  }// end parallel region
+#else
+  DhopComputeTime2-=usecond();
+  if (dag == DaggerYes) {
+#pragma omp parallel for schedule(static,1)
+    for (int ss = 0; ss < st.surface_list.size(); ss++) {
+      int sU = st.surface_list[ss];
+      int sF = LLs * sU;
+      Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1);
+    }
+  } else {
+#pragma omp parallel for schedule(static,1)
+    for (int ss = 0; ss < st.surface_list.size(); ss++) {
+      int sU = st.surface_list[ss];
+      int sF = LLs * sU;
+      Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1);
+    }
+  }
+  DhopComputeTime2+=usecond();
+#endif 
+
+
 #else 
 DhopComputeTime2-=usecond();
  if (dag == DaggerYes) {
--- a/lib/simd/Grid_avx.h
+++ b/lib/simd/Grid_avx.h
@@ -473,12 +473,12 @@ namespace Optimization {
 #define USE_FP16
  struct PrecisionChange {
    static inline __m256i StoH (__m256 a,__m256 b) {
-      __m256 h;
+      __m256i h;
 #ifdef USE_FP16
      __m128i ha = _mm256_cvtps_ph(a,0);
      __m128i hb = _mm256_cvtps_ph(b,0);
-      h = _mm256_castps128_ps256(ha);
-      h = _mm256_insertf128_ps(h,hb,1);
+      h =(__m256i) _mm256_castps128_ps256((__m128)ha);
+      h =(__m256i) _mm256_insertf128_ps((__m256)h,(__m128)hb,1);
 #else 
      assert(0);
 #endif
@@ -486,8 +486,8 @@ namespace Optimization {
    }
    static inline void  HtoS (__m256i h,__m256 &sa,__m256 &sb) {
 #ifdef USE_FP16
-      sa = _mm256_cvtph_ps(_mm256_extractf128_ps(h,0));
-      sb = _mm256_cvtph_ps(_mm256_extractf128_ps(h,1));
+      sa = _mm256_cvtph_ps((__m128i)_mm256_extractf128_ps((__m256)h,0));
+      sb = _mm256_cvtph_ps((__m128i)_mm256_extractf128_ps((__m256)h,1));
 #else 
      assert(0);
 #endif
--- a/lib/stencil/Stencil.h
+++ b/lib/stencil/Stencil.h
@@ -190,8 +190,38 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
  ////////////////////////////////////////
  // Stencil query
  ////////////////////////////////////////
-  inline int GetNodeLocal(int osite) { 
-    return _entries[_npoints*osite]._node_local;
+  inline int SameNode(int point) { 
+
+    int dimension    = _directions[point];
+    int displacement = _distances[point];
+    assert( (displacement==1) || (displacement==-1));
+
+    int pd              = _grid->_processors[dimension];
+    int fd              = _grid->_fdimensions[dimension];
+    int ld              = _grid->_ldimensions[dimension];
+    int rd              = _grid->_rdimensions[dimension];
+    int simd_layout     = _grid->_simd_layout[dimension];
+    int comm_dim        = _grid->_processors[dimension] >1 ;
+
+    int recv_from_rank;
+    int xmit_to_rank;
+
+    if ( ! comm_dim ) return 1;
+
+    int nbr_proc;
+    if (displacement==1) nbr_proc = 1;
+    else                 nbr_proc = pd-1;
+
+    _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 
+
+    void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,u_recv_buf_p);
+
+    if ( shm==NULL ) return 0;
+
+    return 1;
+  }
+  inline int GetNodeLocal(int osite,int point) { 
+    return _entries[point+_npoints*osite]._node_local;
  }
  inline StencilEntry * GetEntry(int &ptype,int point,int osite) { 
    ptype = _permute_type[point]; return & _entries[point+_npoints*osite];