mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-03 21:44:33 +00:00 
			
		
		
		
	Build a list of what's on the surface
This commit is contained in:
		
							
								
								
									
										7
									
								
								TODO
									
									
									
									
									
								
							
							
						
						
									
										7
									
								
								TODO
									
									
									
									
									
								
							@@ -2,21 +2,20 @@ TODO:
 | 
			
		||||
---------------
 | 
			
		||||
 | 
			
		||||
Peter's work list:
 | 
			
		||||
1)- Half-precision comms                                <-- started -- SIMD is prepared
 | 
			
		||||
2)- Precision conversion and sort out localConvert      <-- 
 | 
			
		||||
 | 
			
		||||
3)- Remove DenseVector, DenseMatrix; Use Eigen instead. <-- started 
 | 
			
		||||
4)- Binary I/O speed up & x-strips
 | 
			
		||||
 | 
			
		||||
-- Profile CG, BlockCG, etc... Flop count/rate -- PARTIAL, time but no flop/s yet
 | 
			
		||||
-- Physical propagator interface
 | 
			
		||||
-- Conserved currents
 | 
			
		||||
-- GaugeFix into central location
 | 
			
		||||
 | 
			
		||||
-- Multigrid Wilson and DWF, compare to other Multigrid implementations
 | 
			
		||||
-- HDCR resume
 | 
			
		||||
 | 
			
		||||
Recent DONE 
 | 
			
		||||
-- Cut down the exterior overhead                      <-- DONE
 | 
			
		||||
-- Interior legs from SHM comms                        <-- DONE
 | 
			
		||||
-- Half-precision comms                                <-- DONE
 | 
			
		||||
-- Merge high precision reduction into develop        
 | 
			
		||||
-- multiRHS DWF; benchmark on Cori/BNL for comms elimination
 | 
			
		||||
   -- slice* linalg routines for multiRHS, BlockCG    
 | 
			
		||||
 
 | 
			
		||||
@@ -242,6 +242,7 @@ public:
 | 
			
		||||
  typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
 | 
			
		||||
 | 
			
		||||
  std::vector<int> same_node;
 | 
			
		||||
  std::vector<int> surface_list;
 | 
			
		||||
 | 
			
		||||
  WilsonStencil(GridBase *grid,
 | 
			
		||||
		int npoints,
 | 
			
		||||
@@ -249,11 +250,33 @@ public:
 | 
			
		||||
		const std::vector<int> &directions,
 | 
			
		||||
		const std::vector<int> &distances)  
 | 
			
		||||
    : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) ,
 | 
			
		||||
      same_node(npoints)
 | 
			
		||||
    same_node(npoints)
 | 
			
		||||
  { 
 | 
			
		||||
    assert(npoints==8);// or 10 if do naive DWF 5d red black ?
 | 
			
		||||
    surface_list.resize(0);
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  void BuildSurfaceList(int Ls,int vol4){
 | 
			
		||||
 | 
			
		||||
    // find same node for SHM
 | 
			
		||||
    // Here we know the distance is 1 for WilsonStencil
 | 
			
		||||
    for(int point=0;point<this->_npoints;point++){
 | 
			
		||||
      same_node[point] = this->SameNode(point);
 | 
			
		||||
      std::cout << " dir " <<point<<" same_node " <<same_node[point]<<std::endl;
 | 
			
		||||
    }
 | 
			
		||||
    
 | 
			
		||||
    for(int site = 0 ;site< vol4;site++){
 | 
			
		||||
      int local = 1;
 | 
			
		||||
      for(int point=0;point<this->_npoints;point++){
 | 
			
		||||
	if( (!this->GetNodeLocal(site*Ls,point)) && (!same_node[point]) ){ 
 | 
			
		||||
	  local = 0;
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
      if(local == 0) { 
 | 
			
		||||
	surface_list.push_back(site);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template < class compressor>
 | 
			
		||||
  void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress) 
 | 
			
		||||
  {
 | 
			
		||||
 
 | 
			
		||||
@@ -117,6 +117,19 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
 | 
			
		||||
    
 | 
			
		||||
  // Allocate the required comms buffer
 | 
			
		||||
  ImportGauge(_Umu);
 | 
			
		||||
 | 
			
		||||
  // Build lists of exterior only nodes
 | 
			
		||||
  int LLs = FourDimGrid._rdimensions[0];
 | 
			
		||||
  int vol4;
 | 
			
		||||
  vol4=FourDimGrid.oSites();
 | 
			
		||||
  Stencil.BuildSurfaceList(LLs,vol4);
 | 
			
		||||
  vol4=FourDimRedBlackGrid.oSites();
 | 
			
		||||
  StencilEven.BuildSurfaceList(LLs,vol4);
 | 
			
		||||
   StencilOdd.BuildSurfaceList(LLs,vol4);
 | 
			
		||||
 | 
			
		||||
   std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size()
 | 
			
		||||
                       <<" " << StencilEven.surface_list.size()<<std::endl;
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
     
 | 
			
		||||
template<class Impl>
 | 
			
		||||
@@ -406,6 +419,8 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 | 
			
		||||
  // Load imbalance alert. Should use dynamic schedule OMP for loop
 | 
			
		||||
  // Perhaps create a list of only those sites with face work, and 
 | 
			
		||||
  // load balance process the list.
 | 
			
		||||
#if 1
 | 
			
		||||
 | 
			
		||||
#if 0
 | 
			
		||||
#pragma omp parallel 
 | 
			
		||||
  {
 | 
			
		||||
@@ -422,6 +437,27 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 | 
			
		||||
    else                  Kernels::DhopSite   (st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,0,1);
 | 
			
		||||
    if ( me==0 ) DhopComputeTime2+=usecond();
 | 
			
		||||
  }// end parallel region
 | 
			
		||||
#else
 | 
			
		||||
  DhopComputeTime2-=usecond();
 | 
			
		||||
  if (dag == DaggerYes) {
 | 
			
		||||
#pragma omp parallel for schedule(static,1)
 | 
			
		||||
    for (int ss = 0; ss < st.surface_list.size(); ss++) {
 | 
			
		||||
      int sU = st.surface_list[ss];
 | 
			
		||||
      int sF = LLs * sU;
 | 
			
		||||
      Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
#pragma omp parallel for schedule(static,1)
 | 
			
		||||
    for (int ss = 0; ss < st.surface_list.size(); ss++) {
 | 
			
		||||
      int sU = st.surface_list[ss];
 | 
			
		||||
      int sF = LLs * sU;
 | 
			
		||||
      Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  DhopComputeTime2+=usecond();
 | 
			
		||||
#endif 
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#else 
 | 
			
		||||
DhopComputeTime2-=usecond();
 | 
			
		||||
  if (dag == DaggerYes) {
 | 
			
		||||
 
 | 
			
		||||
@@ -473,12 +473,12 @@ namespace Optimization {
 | 
			
		||||
#define USE_FP16
 | 
			
		||||
  struct PrecisionChange {
 | 
			
		||||
    static inline __m256i StoH (__m256 a,__m256 b) {
 | 
			
		||||
      __m256 h;
 | 
			
		||||
      __m256i h;
 | 
			
		||||
#ifdef USE_FP16
 | 
			
		||||
      __m128i ha = _mm256_cvtps_ph(a,0);
 | 
			
		||||
      __m128i hb = _mm256_cvtps_ph(b,0);
 | 
			
		||||
      h = _mm256_castps128_ps256(ha);
 | 
			
		||||
      h = _mm256_insertf128_ps(h,hb,1);
 | 
			
		||||
      h =(__m256i) _mm256_castps128_ps256((__m128)ha);
 | 
			
		||||
      h =(__m256i) _mm256_insertf128_ps((__m256)h,(__m128)hb,1);
 | 
			
		||||
#else 
 | 
			
		||||
      assert(0);
 | 
			
		||||
#endif
 | 
			
		||||
@@ -486,8 +486,8 @@ namespace Optimization {
 | 
			
		||||
    }
 | 
			
		||||
    static inline void  HtoS (__m256i h,__m256 &sa,__m256 &sb) {
 | 
			
		||||
#ifdef USE_FP16
 | 
			
		||||
      sa = _mm256_cvtph_ps(_mm256_extractf128_ps(h,0));
 | 
			
		||||
      sb = _mm256_cvtph_ps(_mm256_extractf128_ps(h,1));
 | 
			
		||||
      sa = _mm256_cvtph_ps((__m128i)_mm256_extractf128_ps((__m256)h,0));
 | 
			
		||||
      sb = _mm256_cvtph_ps((__m128i)_mm256_extractf128_ps((__m256)h,1));
 | 
			
		||||
#else 
 | 
			
		||||
      assert(0);
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
@@ -190,8 +190,38 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 | 
			
		||||
  ////////////////////////////////////////
 | 
			
		||||
  // Stencil query
 | 
			
		||||
  ////////////////////////////////////////
 | 
			
		||||
  inline int GetNodeLocal(int osite) { 
 | 
			
		||||
    return _entries[_npoints*osite]._node_local;
 | 
			
		||||
  inline int SameNode(int point) { 
 | 
			
		||||
 | 
			
		||||
    int dimension    = _directions[point];
 | 
			
		||||
    int displacement = _distances[point];
 | 
			
		||||
    assert( (displacement==1) || (displacement==-1));
 | 
			
		||||
 | 
			
		||||
    int pd              = _grid->_processors[dimension];
 | 
			
		||||
    int fd              = _grid->_fdimensions[dimension];
 | 
			
		||||
    int ld              = _grid->_ldimensions[dimension];
 | 
			
		||||
    int rd              = _grid->_rdimensions[dimension];
 | 
			
		||||
    int simd_layout     = _grid->_simd_layout[dimension];
 | 
			
		||||
    int comm_dim        = _grid->_processors[dimension] >1 ;
 | 
			
		||||
 | 
			
		||||
    int recv_from_rank;
 | 
			
		||||
    int xmit_to_rank;
 | 
			
		||||
 | 
			
		||||
    if ( ! comm_dim ) return 1;
 | 
			
		||||
 | 
			
		||||
    int nbr_proc;
 | 
			
		||||
    if (displacement==1) nbr_proc = 1;
 | 
			
		||||
    else                 nbr_proc = pd-1;
 | 
			
		||||
 | 
			
		||||
    _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 
 | 
			
		||||
 | 
			
		||||
    void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,u_recv_buf_p);
 | 
			
		||||
 | 
			
		||||
    if ( shm==NULL ) return 0;
 | 
			
		||||
 | 
			
		||||
    return 1;
 | 
			
		||||
  }
 | 
			
		||||
  inline int GetNodeLocal(int osite,int point) { 
 | 
			
		||||
    return _entries[point+_npoints*osite]._node_local;
 | 
			
		||||
  }
 | 
			
		||||
  inline StencilEntry * GetEntry(int &ptype,int point,int osite) { 
 | 
			
		||||
    ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; 
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user