1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-09 21:50:45 +01:00

Changing accelerator loop. Still have work to do for multi-GPU code

This commit is contained in:
Peter Boyle 2019-06-15 08:10:24 +01:00
parent 0074ef7f69
commit cefaacbc07

View File

@ -68,29 +68,14 @@ void Gather_plane_simple_table (Vector<std::pair<int,int> >& table,const Lattice
int num=table.size(); int num=table.size();
std::pair<int,int> *table_v = & table[0]; std::pair<int,int> *table_v = & table[0];
auto rhs_v = rhs.View(); auto rhs_v = rhs.View();
#ifdef GRID_NVCC accelerator_forNB( i,num, vobj::Nsimd(), {
typedef typename vobj::scalar_type scalar_type; typedef decltype(coalescedRead(buffer[0])) compressed_t;
typedef typename vobj::vector_type vector_type; compressed_t tmp_c;
constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type); uint64_t o = table_v[i].first;
accelerator_loopNB( ii,num*Nsimd, { compress.Compress(&tmp_c,0,rhs_v(so+table_v[i].second));
coalescedWrite(buffer[off+o],tmp_c);
typedef decltype(coalescedRead(buffer[0])) compressed_t;
typedef decltype(coalescedRead(rhs_v [0])) uncompressed_t;
int i = ii/Nsimd;
compressed_t tmp_c;
uncompressed_t tmp_uc = coalescedRead(rhs_v[so+table_v[i].second]);
uint64_t o = table_v[i].first;
compress.Compress(&tmp_c,0,tmp_uc);
coalescedWrite(buffer[off+o],tmp_c);
}); });
#else // Further optimisation: i) software prefetch the first element of the next table entry, prefetch the table
accelerator_loopN( i,num, {
compress.Compress(&buffer[off],table_v[i].first,rhs_v[so+table_v[i].second]);
});
#endif
// Further optimisation: i) streaming store the result
// ii) software prefetch the first element of the next table entry
} }
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
@ -110,9 +95,14 @@ void Gather_plane_exchange_table(Vector<std::pair<int,int> >& table,const Lattic
int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
auto rhs_v = rhs.View(); auto rhs_v = rhs.View();
accelerator_loopN( j,num, { thread_for(j, num, {
compress.CompressExchange(&pointers[0][0],&pointers[1][0],&rhs_v[0], compress.CompressExchange(&pointers[0][0],
j,so+table[2*j].second,so+table[2*j+1].second,type); &pointers[1][0],
&rhs_v[0],
j,
so+table[2*j].second,
so+table[2*j+1].second,
type);
}); });
} }
@ -579,7 +569,7 @@ public:
mergetime-=usecond(); mergetime-=usecond();
for(int i=0;i<mm.size();i++){ for(int i=0;i<mm.size();i++){
thread_loop( (int o=0;o<mm[i].buffer_size/2;o++),{ thread_for(o,mm[i].buffer_size/2,{
decompress.Exchange(mm[i].mpointer, decompress.Exchange(mm[i].mpointer,
mm[i].vpointers[0], mm[i].vpointers[0],
mm[i].vpointers[1], mm[i].vpointers[1],
@ -590,7 +580,7 @@ public:
decompresstime-=usecond(); decompresstime-=usecond();
for(int i=0;i<dd.size();i++){ for(int i=0;i<dd.size();i++){
thread_loop( (int o=0;o<dd[i].buffer_size;o++),{ thread_for(o,dd[i].buffer_size,{
decompress.Decompress(dd[i].kernel_p,dd[i].mpi_p,o); decompress.Decompress(dd[i].kernel_p,dd[i].mpi_p,o);
}); });
} }