mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-05 11:45:56 +01:00
Changing accelerator loop. Still have work to do for multi-GPU code
This commit is contained in:
parent
0074ef7f69
commit
cefaacbc07
@ -68,29 +68,14 @@ void Gather_plane_simple_table (Vector<std::pair<int,int> >& table,const Lattice
|
||||
int num=table.size();
|
||||
std::pair<int,int> *table_v = & table[0];
|
||||
auto rhs_v = rhs.View();
|
||||
#ifdef GRID_NVCC
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
|
||||
accelerator_loopNB( ii,num*Nsimd, {
|
||||
|
||||
typedef decltype(coalescedRead(buffer[0])) compressed_t;
|
||||
typedef decltype(coalescedRead(rhs_v [0])) uncompressed_t;
|
||||
|
||||
int i = ii/Nsimd;
|
||||
compressed_t tmp_c;
|
||||
uncompressed_t tmp_uc = coalescedRead(rhs_v[so+table_v[i].second]);
|
||||
uint64_t o = table_v[i].first;
|
||||
compress.Compress(&tmp_c,0,tmp_uc);
|
||||
coalescedWrite(buffer[off+o],tmp_c);
|
||||
accelerator_forNB( i,num, vobj::Nsimd(), {
|
||||
typedef decltype(coalescedRead(buffer[0])) compressed_t;
|
||||
compressed_t tmp_c;
|
||||
uint64_t o = table_v[i].first;
|
||||
compress.Compress(&tmp_c,0,rhs_v(so+table_v[i].second));
|
||||
coalescedWrite(buffer[off+o],tmp_c);
|
||||
});
|
||||
#else
|
||||
accelerator_loopN( i,num, {
|
||||
compress.Compress(&buffer[off],table_v[i].first,rhs_v[so+table_v[i].second]);
|
||||
});
|
||||
#endif
|
||||
// Further optimisation: i) streaming store the result
|
||||
// ii) software prefetch the first element of the next table entry
|
||||
// Further optimisation: i) software prefetch the first element of the next table entry, prefetch the table
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////
|
||||
@ -110,9 +95,14 @@ void Gather_plane_exchange_table(Vector<std::pair<int,int> >& table,const Lattic
|
||||
int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
|
||||
|
||||
auto rhs_v = rhs.View();
|
||||
accelerator_loopN( j,num, {
|
||||
compress.CompressExchange(&pointers[0][0],&pointers[1][0],&rhs_v[0],
|
||||
j,so+table[2*j].second,so+table[2*j+1].second,type);
|
||||
thread_for(j, num, {
|
||||
compress.CompressExchange(&pointers[0][0],
|
||||
&pointers[1][0],
|
||||
&rhs_v[0],
|
||||
j,
|
||||
so+table[2*j].second,
|
||||
so+table[2*j+1].second,
|
||||
type);
|
||||
});
|
||||
}
|
||||
|
||||
@ -579,7 +569,7 @@ public:
|
||||
|
||||
mergetime-=usecond();
|
||||
for(int i=0;i<mm.size();i++){
|
||||
thread_loop( (int o=0;o<mm[i].buffer_size/2;o++),{
|
||||
thread_for(o,mm[i].buffer_size/2,{
|
||||
decompress.Exchange(mm[i].mpointer,
|
||||
mm[i].vpointers[0],
|
||||
mm[i].vpointers[1],
|
||||
@ -590,7 +580,7 @@ public:
|
||||
|
||||
decompresstime-=usecond();
|
||||
for(int i=0;i<dd.size();i++){
|
||||
thread_loop( (int o=0;o<dd[i].buffer_size;o++),{
|
||||
thread_for(o,dd[i].buffer_size,{
|
||||
decompress.Decompress(dd[i].kernel_p,dd[i].mpi_p,o);
|
||||
});
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user