mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-10-29 19:14:33 +00:00 
			
		
		
		
	Build a list of what's on the surface
This commit is contained in:
		
							
								
								
									
										7
									
								
								TODO
									
									
									
									
									
								
							
							
						
						
									
										7
									
								
								TODO
									
									
									
									
									
								
							| @@ -2,21 +2,20 @@ TODO: | |||||||
| --------------- | --------------- | ||||||
|  |  | ||||||
| Peter's work list: | Peter's work list: | ||||||
| 1)- Half-precision comms                                <-- started -- SIMD is prepared |  | ||||||
| 2)- Precision conversion and sort out localConvert      <--  | 2)- Precision conversion and sort out localConvert      <--  | ||||||
|  |  | ||||||
| 3)- Remove DenseVector, DenseMatrix; Use Eigen instead. <-- started  | 3)- Remove DenseVector, DenseMatrix; Use Eigen instead. <-- started  | ||||||
| 4)- Binary I/O speed up & x-strips | 4)- Binary I/O speed up & x-strips | ||||||
|  |  | ||||||
| -- Profile CG, BlockCG, etc... Flop count/rate -- PARTIAL, time but no flop/s yet | -- Profile CG, BlockCG, etc... Flop count/rate -- PARTIAL, time but no flop/s yet | ||||||
| -- Physical propagator interface | -- Physical propagator interface | ||||||
| -- Conserved currents | -- Conserved currents | ||||||
| -- GaugeFix into central location | -- GaugeFix into central location | ||||||
|  |  | ||||||
| -- Multigrid Wilson and DWF, compare to other Multigrid implementations | -- Multigrid Wilson and DWF, compare to other Multigrid implementations | ||||||
| -- HDCR resume | -- HDCR resume | ||||||
|  |  | ||||||
| Recent DONE  | Recent DONE  | ||||||
|  | -- Cut down the exterior overhead                      <-- DONE | ||||||
|  | -- Interior legs from SHM comms                        <-- DONE | ||||||
|  | -- Half-precision comms                                <-- DONE | ||||||
| -- Merge high precision reduction into develop         | -- Merge high precision reduction into develop         | ||||||
| -- multiRHS DWF; benchmark on Cori/BNL for comms elimination | -- multiRHS DWF; benchmark on Cori/BNL for comms elimination | ||||||
|    -- slice* linalg routines for multiRHS, BlockCG     |    -- slice* linalg routines for multiRHS, BlockCG     | ||||||
|   | |||||||
| @@ -242,6 +242,7 @@ public: | |||||||
|   typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; |   typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; | ||||||
|  |  | ||||||
|   std::vector<int> same_node; |   std::vector<int> same_node; | ||||||
|  |   std::vector<int> surface_list; | ||||||
|  |  | ||||||
|   WilsonStencil(GridBase *grid, |   WilsonStencil(GridBase *grid, | ||||||
| 		int npoints, | 		int npoints, | ||||||
| @@ -249,11 +250,33 @@ public: | |||||||
| 		const std::vector<int> &directions, | 		const std::vector<int> &directions, | ||||||
| 		const std::vector<int> &distances)   | 		const std::vector<int> &distances)   | ||||||
|     : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) , |     : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) , | ||||||
|       same_node(npoints) |     same_node(npoints) | ||||||
|   {  |   {  | ||||||
|     assert(npoints==8);// or 10 if do naive DWF 5d red black ? |     surface_list.resize(0); | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|  |   void BuildSurfaceList(int Ls,int vol4){ | ||||||
|  |  | ||||||
|  |     // find same node for SHM | ||||||
|  |     // Here we know the distance is 1 for WilsonStencil | ||||||
|  |     for(int point=0;point<this->_npoints;point++){ | ||||||
|  |       same_node[point] = this->SameNode(point); | ||||||
|  |       std::cout << " dir " <<point<<" same_node " <<same_node[point]<<std::endl; | ||||||
|  |     } | ||||||
|  |      | ||||||
|  |     for(int site = 0 ;site< vol4;site++){ | ||||||
|  |       int local = 1; | ||||||
|  |       for(int point=0;point<this->_npoints;point++){ | ||||||
|  | 	if( (!this->GetNodeLocal(site*Ls,point)) && (!same_node[point]) ){  | ||||||
|  | 	  local = 0; | ||||||
|  | 	} | ||||||
|  |       } | ||||||
|  |       if(local == 0) {  | ||||||
|  | 	surface_list.push_back(site); | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |  | ||||||
|   template < class compressor> |   template < class compressor> | ||||||
|   void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)  |   void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)  | ||||||
|   { |   { | ||||||
|   | |||||||
| @@ -117,6 +117,19 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu, | |||||||
|      |      | ||||||
|   // Allocate the required comms buffer |   // Allocate the required comms buffer | ||||||
|   ImportGauge(_Umu); |   ImportGauge(_Umu); | ||||||
|  |  | ||||||
|  |   // Build lists of exterior only nodes | ||||||
|  |   int LLs = FourDimGrid._rdimensions[0]; | ||||||
|  |   int vol4; | ||||||
|  |   vol4=FourDimGrid.oSites(); | ||||||
|  |   Stencil.BuildSurfaceList(LLs,vol4); | ||||||
|  |   vol4=FourDimRedBlackGrid.oSites(); | ||||||
|  |   StencilEven.BuildSurfaceList(LLs,vol4); | ||||||
|  |    StencilOdd.BuildSurfaceList(LLs,vol4); | ||||||
|  |  | ||||||
|  |    std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size() | ||||||
|  |                        <<" " << StencilEven.surface_list.size()<<std::endl; | ||||||
|  |  | ||||||
| } | } | ||||||
|       |       | ||||||
| template<class Impl> | template<class Impl> | ||||||
| @@ -406,6 +419,8 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg | |||||||
|   // Load imbalance alert. Should use dynamic schedule OMP for loop |   // Load imbalance alert. Should use dynamic schedule OMP for loop | ||||||
|   // Perhaps create a list of only those sites with face work, and  |   // Perhaps create a list of only those sites with face work, and  | ||||||
|   // load balance process the list. |   // load balance process the list. | ||||||
|  | #if 1 | ||||||
|  |  | ||||||
| #if 0 | #if 0 | ||||||
| #pragma omp parallel  | #pragma omp parallel  | ||||||
|   { |   { | ||||||
| @@ -422,6 +437,27 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg | |||||||
|     else                  Kernels::DhopSite   (st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,0,1); |     else                  Kernels::DhopSite   (st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,0,1); | ||||||
|     if ( me==0 ) DhopComputeTime2+=usecond(); |     if ( me==0 ) DhopComputeTime2+=usecond(); | ||||||
|   }// end parallel region |   }// end parallel region | ||||||
|  | #else | ||||||
|  |   DhopComputeTime2-=usecond(); | ||||||
|  |   if (dag == DaggerYes) { | ||||||
|  | #pragma omp parallel for schedule(static,1) | ||||||
|  |     for (int ss = 0; ss < st.surface_list.size(); ss++) { | ||||||
|  |       int sU = st.surface_list[ss]; | ||||||
|  |       int sF = LLs * sU; | ||||||
|  |       Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1); | ||||||
|  |     } | ||||||
|  |   } else { | ||||||
|  | #pragma omp parallel for schedule(static,1) | ||||||
|  |     for (int ss = 0; ss < st.surface_list.size(); ss++) { | ||||||
|  |       int sU = st.surface_list[ss]; | ||||||
|  |       int sF = LLs * sU; | ||||||
|  |       Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1); | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |   DhopComputeTime2+=usecond(); | ||||||
|  | #endif  | ||||||
|  |  | ||||||
|  |  | ||||||
| #else  | #else  | ||||||
| DhopComputeTime2-=usecond(); | DhopComputeTime2-=usecond(); | ||||||
|   if (dag == DaggerYes) { |   if (dag == DaggerYes) { | ||||||
|   | |||||||
| @@ -473,12 +473,12 @@ namespace Optimization { | |||||||
| #define USE_FP16 | #define USE_FP16 | ||||||
|   struct PrecisionChange { |   struct PrecisionChange { | ||||||
|     static inline __m256i StoH (__m256 a,__m256 b) { |     static inline __m256i StoH (__m256 a,__m256 b) { | ||||||
|       __m256 h; |       __m256i h; | ||||||
| #ifdef USE_FP16 | #ifdef USE_FP16 | ||||||
|       __m128i ha = _mm256_cvtps_ph(a,0); |       __m128i ha = _mm256_cvtps_ph(a,0); | ||||||
|       __m128i hb = _mm256_cvtps_ph(b,0); |       __m128i hb = _mm256_cvtps_ph(b,0); | ||||||
|       h = _mm256_castps128_ps256(ha); |       h =(__m256i) _mm256_castps128_ps256((__m128)ha); | ||||||
|       h = _mm256_insertf128_ps(h,hb,1); |       h =(__m256i) _mm256_insertf128_ps((__m256)h,(__m128)hb,1); | ||||||
| #else  | #else  | ||||||
|       assert(0); |       assert(0); | ||||||
| #endif | #endif | ||||||
| @@ -486,8 +486,8 @@ namespace Optimization { | |||||||
|     } |     } | ||||||
|     static inline void  HtoS (__m256i h,__m256 &sa,__m256 &sb) { |     static inline void  HtoS (__m256i h,__m256 &sa,__m256 &sb) { | ||||||
| #ifdef USE_FP16 | #ifdef USE_FP16 | ||||||
|       sa = _mm256_cvtph_ps(_mm256_extractf128_ps(h,0)); |       sa = _mm256_cvtph_ps((__m128i)_mm256_extractf128_ps((__m256)h,0)); | ||||||
|       sb = _mm256_cvtph_ps(_mm256_extractf128_ps(h,1)); |       sb = _mm256_cvtph_ps((__m128i)_mm256_extractf128_ps((__m256)h,1)); | ||||||
| #else  | #else  | ||||||
|       assert(0); |       assert(0); | ||||||
| #endif | #endif | ||||||
|   | |||||||
| @@ -190,8 +190,38 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
|   //////////////////////////////////////// |   //////////////////////////////////////// | ||||||
|   // Stencil query |   // Stencil query | ||||||
|   //////////////////////////////////////// |   //////////////////////////////////////// | ||||||
|   inline int GetNodeLocal(int osite) {  |   inline int SameNode(int point) {  | ||||||
|     return _entries[_npoints*osite]._node_local; |  | ||||||
|  |     int dimension    = _directions[point]; | ||||||
|  |     int displacement = _distances[point]; | ||||||
|  |     assert( (displacement==1) || (displacement==-1)); | ||||||
|  |  | ||||||
|  |     int pd              = _grid->_processors[dimension]; | ||||||
|  |     int fd              = _grid->_fdimensions[dimension]; | ||||||
|  |     int ld              = _grid->_ldimensions[dimension]; | ||||||
|  |     int rd              = _grid->_rdimensions[dimension]; | ||||||
|  |     int simd_layout     = _grid->_simd_layout[dimension]; | ||||||
|  |     int comm_dim        = _grid->_processors[dimension] >1 ; | ||||||
|  |  | ||||||
|  |     int recv_from_rank; | ||||||
|  |     int xmit_to_rank; | ||||||
|  |  | ||||||
|  |     if ( ! comm_dim ) return 1; | ||||||
|  |  | ||||||
|  |     int nbr_proc; | ||||||
|  |     if (displacement==1) nbr_proc = 1; | ||||||
|  |     else                 nbr_proc = pd-1; | ||||||
|  |  | ||||||
|  |     _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);  | ||||||
|  |  | ||||||
|  |     void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,u_recv_buf_p); | ||||||
|  |  | ||||||
|  |     if ( shm==NULL ) return 0; | ||||||
|  |  | ||||||
|  |     return 1; | ||||||
|  |   } | ||||||
|  |   inline int GetNodeLocal(int osite,int point) {  | ||||||
|  |     return _entries[point+_npoints*osite]._node_local; | ||||||
|   } |   } | ||||||
|   inline StencilEntry * GetEntry(int &ptype,int point,int osite) {  |   inline StencilEntry * GetEntry(int &ptype,int point,int osite) {  | ||||||
|     ptype = _permute_type[point]; return & _entries[point+_npoints*osite];  |     ptype = _permute_type[point]; return & _entries[point+_npoints*osite];  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user