mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-10-31 03:54:33 +00:00 
			
		
		
		
	Changing accelerator loop. Still have work to do for multi-GPU code
This commit is contained in:
		| @@ -68,29 +68,14 @@ void Gather_plane_simple_table (Vector<std::pair<int,int> >& table,const Lattice | |||||||
|   int num=table.size(); |   int num=table.size(); | ||||||
|   std::pair<int,int> *table_v = & table[0]; |   std::pair<int,int> *table_v = & table[0]; | ||||||
|   auto rhs_v = rhs.View(); |   auto rhs_v = rhs.View(); | ||||||
| #ifdef GRID_NVCC |   accelerator_forNB( i,num, vobj::Nsimd(), { | ||||||
|   typedef typename vobj::scalar_type scalar_type; |  | ||||||
|   typedef typename vobj::vector_type vector_type; |  | ||||||
|   constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type); |  | ||||||
|   accelerator_loopNB( ii,num*Nsimd, { |  | ||||||
|  |  | ||||||
|     typedef decltype(coalescedRead(buffer[0])) compressed_t; |     typedef decltype(coalescedRead(buffer[0])) compressed_t; | ||||||
|       typedef decltype(coalescedRead(rhs_v [0])) uncompressed_t; |  | ||||||
|  |  | ||||||
|       int i = ii/Nsimd; |  | ||||||
|     compressed_t   tmp_c; |     compressed_t   tmp_c; | ||||||
|       uncompressed_t tmp_uc = coalescedRead(rhs_v[so+table_v[i].second]); |  | ||||||
|     uint64_t o = table_v[i].first; |     uint64_t o = table_v[i].first; | ||||||
|       compress.Compress(&tmp_c,0,tmp_uc); |     compress.Compress(&tmp_c,0,rhs_v(so+table_v[i].second)); | ||||||
|     coalescedWrite(buffer[off+o],tmp_c); |     coalescedWrite(buffer[off+o],tmp_c); | ||||||
|   }); |   }); | ||||||
| #else | // Further optimisation: i) software prefetch the first element of the next table entry, prefetch the table | ||||||
|   accelerator_loopN( i,num, { |  | ||||||
|       compress.Compress(&buffer[off],table_v[i].first,rhs_v[so+table_v[i].second]); |  | ||||||
|   }); |  | ||||||
| #endif |  | ||||||
| // Further optimisation: i) streaming store the result |  | ||||||
| //                       ii) software prefetch the first element of the next table entry |  | ||||||
| } | } | ||||||
|  |  | ||||||
| /////////////////////////////////////////////////////////////////// | /////////////////////////////////////////////////////////////////// | ||||||
| @@ -110,9 +95,14 @@ void Gather_plane_exchange_table(Vector<std::pair<int,int> >& table,const Lattic | |||||||
|   int so  = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane  |   int so  = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane  | ||||||
|  |  | ||||||
|   auto rhs_v = rhs.View(); |   auto rhs_v = rhs.View(); | ||||||
|   accelerator_loopN( j,num, { |   thread_for(j, num, { | ||||||
|     compress.CompressExchange(&pointers[0][0],&pointers[1][0],&rhs_v[0], |     compress.CompressExchange(&pointers[0][0], | ||||||
| 			      j,so+table[2*j].second,so+table[2*j+1].second,type); | 			      &pointers[1][0], | ||||||
|  | 			      &rhs_v[0], | ||||||
|  | 			      j, | ||||||
|  | 			      so+table[2*j].second, | ||||||
|  | 			      so+table[2*j+1].second, | ||||||
|  | 			      type); | ||||||
|   }); |   }); | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -579,7 +569,7 @@ public: | |||||||
|  |  | ||||||
|       mergetime-=usecond(); |       mergetime-=usecond(); | ||||||
|     for(int i=0;i<mm.size();i++){	 |     for(int i=0;i<mm.size();i++){	 | ||||||
|       thread_loop( (int o=0;o<mm[i].buffer_size/2;o++),{ |       thread_for(o,mm[i].buffer_size/2,{ | ||||||
| 	decompress.Exchange(mm[i].mpointer, | 	decompress.Exchange(mm[i].mpointer, | ||||||
| 			    mm[i].vpointers[0], | 			    mm[i].vpointers[0], | ||||||
| 			    mm[i].vpointers[1], | 			    mm[i].vpointers[1], | ||||||
| @@ -590,7 +580,7 @@ public: | |||||||
|  |  | ||||||
|     decompresstime-=usecond(); |     decompresstime-=usecond(); | ||||||
|     for(int i=0;i<dd.size();i++){	 |     for(int i=0;i<dd.size();i++){	 | ||||||
|       thread_loop( (int o=0;o<dd[i].buffer_size;o++),{ |       thread_for(o,dd[i].buffer_size,{ | ||||||
| 	decompress.Decompress(dd[i].kernel_p,dd[i].mpi_p,o); | 	decompress.Decompress(dd[i].kernel_p,dd[i].mpi_p,o); | ||||||
|       }); |       }); | ||||||
|     } |     } | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user