Build a list of what's on the surface

2025-12-20 04:34:29 +00:00 · 2017-04-24 17:06:15 +01:00
parent 916e9e1d3e
commit 56277a11c8
5 changed files with 101 additions and 13 deletions
--- a/7
+++ b/7
@@ -2,21 +2,20 @@ TODO:
 ---------------
 Peter's work list:
 1)- Half-precision comms                                <-- started -- SIMD is prepared
 2)- Precision conversion and sort out localConvert      <-- 
 3)- Remove DenseVector, DenseMatrix; Use Eigen instead. <-- started 
 4)- Binary I/O speed up & x-strips
 -- Profile CG, BlockCG, etc... Flop count/rate -- PARTIAL, time but no flop/s yet
 -- Physical propagator interface
 -- Conserved currents
 -- GaugeFix into central location
 -- Multigrid Wilson and DWF, compare to other Multigrid implementations
 -- HDCR resume
 Recent DONE 
 -- Cut down the exterior overhead                      <-- DONE
 -- Interior legs from SHM comms                        <-- DONE
 -- Half-precision comms                                <-- DONE
 -- Merge high precision reduction into develop        
 -- multiRHS DWF; benchmark on Cori/BNL for comms elimination
   -- slice* linalg routines for multiRHS, BlockCG    
--- a/lib/qcd/action/fermion/WilsonCompressor.h
+++ b/lib/qcd/action/fermion/WilsonCompressor.h
@@ -242,6 +242,7 @@ public:
  typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
  std::vector<int> same_node;
  std::vector<int> surface_list;
  WilsonStencil(GridBase *grid,
 		int npoints,
@@ -249,11 +250,33 @@ public:
 		const std::vector<int> &directions,
 		const std::vector<int> &distances)  
    : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) ,
-      same_node(npoints)
+    same_node(npoints)
  { 
-    assert(npoints==8);// or 10 if do naive DWF 5d red black ?
+    surface_list.resize(0);
  };
  // Build surface_list: the 4d sites for which at least one stencil point is
  // neither reported node-local by GetNodeLocal nor flagged in same_node[].
  //
  //   Ls   : multiplier applied to the 4d site index when querying
  //          GetNodeLocal (the query is made at site*Ls)
  //   vol4 : number of 4d sites to scan; sites 0 .. vol4-1 are visited
  //
  // Side effects: refreshes same_node[] for every stencil point, prints one
  // diagnostic line per point to std::cout, and rebuilds surface_list.
  void BuildSurfaceList(int Ls,int vol4){

    // find same node for SHM
    // Here we know the distance is 1 for WilsonStencil
    for(int point=0;point<this->_npoints;point++){
      same_node[point] = this->SameNode(point);
      std::cout << " dir " <<point<<" same_node " <<same_node[point]<<std::endl;
    }

    // Start from an empty list so that calling this more than once rebuilds
    // the list instead of appending every surface site a second time.
    surface_list.clear();

    for(int site = 0 ;site< vol4;site++){
      for(int point=0;point<this->_npoints;point++){
        if( (!this->GetNodeLocal(site*Ls,point)) && (!same_node[point]) ){
          surface_list.push_back(site);
          break; // one qualifying point is enough; each site is recorded once
        }
      }
    }
  }
  template < class compressor>
  void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress) 
  {
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -117,6 +117,19 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  // Allocate the required comms buffer
  ImportGauge(_Umu);
  // Build lists of exterior only nodes
  int LLs = FourDimGrid._rdimensions[0];
  int vol4;
  vol4=FourDimGrid.oSites();
  Stencil.BuildSurfaceList(LLs,vol4);
  vol4=FourDimRedBlackGrid.oSites();
  StencilEven.BuildSurfaceList(LLs,vol4);
   StencilOdd.BuildSurfaceList(LLs,vol4);
   std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size()
                       <<" " << StencilEven.surface_list.size()<<std::endl;
 }
 template<class Impl>
@@ -406,6 +419,8 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
  // Load imbalance alert. Should use dynamic schedule OMP for loop
  // Perhaps create a list of only those sites with face work, and 
  // load balance process the list.
 #if 1
 #if 0
 #pragma omp parallel 
  {
@@ -422,6 +437,27 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
    else                  Kernels::DhopSite   (st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,0,1);
    if ( me==0 ) DhopComputeTime2+=usecond();
  }// end parallel region
 #else
  DhopComputeTime2-=usecond();
  if (dag == DaggerYes) {
 #pragma omp parallel for schedule(static,1)
    for (int ss = 0; ss < st.surface_list.size(); ss++) {
      int sU = st.surface_list[ss];
      int sF = LLs * sU;
      Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1);
    }
  } else {
 #pragma omp parallel for schedule(static,1)
    for (int ss = 0; ss < st.surface_list.size(); ss++) {
      int sU = st.surface_list[ss];
      int sF = LLs * sU;
      Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1);
    }
  }
  DhopComputeTime2+=usecond();
 #endif 
 #else 
 DhopComputeTime2-=usecond();
  if (dag == DaggerYes) {
--- a/lib/simd/Grid_avx.h
+++ b/lib/simd/Grid_avx.h
@@ -473,12 +473,12 @@ namespace Optimization {
 #define USE_FP16
  struct PrecisionChange {
    static inline __m256i StoH (__m256 a,__m256 b) {
-      __m256 h;
+      __m256i h;
 #ifdef USE_FP16
      __m128i ha = _mm256_cvtps_ph(a,0);
      __m128i hb = _mm256_cvtps_ph(b,0);
-      h = _mm256_castps128_ps256(ha);
+      h =(__m256i) _mm256_castps128_ps256((__m128)ha);
-      h = _mm256_insertf128_ps(h,hb,1);
+      h =(__m256i) _mm256_insertf128_ps((__m256)h,(__m128)hb,1);
 #else 
      assert(0);
 #endif
@@ -486,8 +486,8 @@ namespace Optimization {
    }
    static inline void  HtoS (__m256i h,__m256 &sa,__m256 &sb) {
 #ifdef USE_FP16
-      sa = _mm256_cvtph_ps(_mm256_extractf128_ps(h,0));
+      sa = _mm256_cvtph_ps((__m128i)_mm256_extractf128_ps((__m256)h,0));
-      sb = _mm256_cvtph_ps(_mm256_extractf128_ps(h,1));
+      sb = _mm256_cvtph_ps((__m128i)_mm256_extractf128_ps((__m256)h,1));
 #else 
      assert(0);
 #endif
--- a/lib/stencil/Stencil.h
+++ b/lib/stencil/Stencil.h
@@ -190,8 +190,38 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
  ////////////////////////////////////////
  // Stencil query
  ////////////////////////////////////////
-  inline int GetNodeLocal(int osite) { 
+  inline int SameNode(int point) { 
-    return _entries[_npoints*osite]._node_local;
+
    // Decide, for stencil point `point`, whether its neighbour counts as being
    // on the same node. Returns 1 when the point's dimension is not spread over
    // more than one processor, or when ShmBufferTranslate gives a non-NULL
    // pointer for the rank we receive from; returns 0 otherwise.
    int dimension    = _directions[point];
    int displacement = _distances[point];
    // Only displacements of exactly +1 or -1 are handled here.
    assert( (displacement==1) || (displacement==-1));
    int pd              = _grid->_processors[dimension];
    // NOTE(review): fd, ld, rd and simd_layout are not used anywhere below in
    // this function.
    int fd              = _grid->_fdimensions[dimension];
    int ld              = _grid->_ldimensions[dimension];
    int rd              = _grid->_rdimensions[dimension];
    int simd_layout     = _grid->_simd_layout[dimension];
    // Non-zero only when this dimension has more than one processor.
    int comm_dim        = _grid->_processors[dimension] >1 ;
    int recv_from_rank;
    int xmit_to_rank;
    // No communication in this dimension: report "same node".
    if ( ! comm_dim ) return 1;
    // Shift passed to ShiftedRanks: 1 for displacement +1, pd-1 for
    // displacement -1 (presumably the periodic equivalent of a shift by -1 --
    // confirm against ShiftedRanks).
    int nbr_proc;
    if (displacement==1) nbr_proc = 1;
    else                 nbr_proc = pd-1;
    _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 
    // NOTE(review): presumably a NULL translation means the receive buffer of
    // recv_from_rank is not reachable through shared memory -- confirm against
    // ShmBufferTranslate. Only recv_from_rank is consulted; xmit_to_rank is
    // filled in by ShiftedRanks but not read afterwards.
    void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,u_recv_buf_p);
    if ( shm==NULL ) return 0;
    return 1;
  }
  // Return the _node_local flag of the stencil entry stored for stencil point
  // `point` at outer site `osite`.
  inline int GetNodeLocal(int osite,int point) { 
    // Entries are indexed as point + _npoints*osite.
    const auto entry_index = point+_npoints*osite;
    return _entries[entry_index]._node_local;
  }
  inline StencilEntry * GetEntry(int &ptype,int point,int osite) { 
    ptype = _permute_type[point]; return & _entries[point+_npoints*osite];