From cefaacbc0737383a0092168043230faf6ba8c3da Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 15 Jun 2019 08:10:24 +0100 Subject: [PATCH] Changing accelerator loop. Still have work to do for multi-GPU code --- Grid/stencil/Stencil.h | 44 ++++++++++++++++-------------------------- 1 file changed, 17 insertions(+), 27 deletions(-) diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index ae0f889b..f7b31fed 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -68,29 +68,14 @@ void Gather_plane_simple_table (Vector >& table,const Lattice int num=table.size(); std::pair *table_v = & table[0]; auto rhs_v = rhs.View(); -#ifdef GRID_NVCC - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type); - accelerator_loopNB( ii,num*Nsimd, { - - typedef decltype(coalescedRead(buffer[0])) compressed_t; - typedef decltype(coalescedRead(rhs_v [0])) uncompressed_t; - - int i = ii/Nsimd; - compressed_t tmp_c; - uncompressed_t tmp_uc = coalescedRead(rhs_v[so+table_v[i].second]); - uint64_t o = table_v[i].first; - compress.Compress(&tmp_c,0,tmp_uc); - coalescedWrite(buffer[off+o],tmp_c); + accelerator_forNB( i,num, vobj::Nsimd(), { + typedef decltype(coalescedRead(buffer[0])) compressed_t; + compressed_t tmp_c; + uint64_t o = table_v[i].first; + compress.Compress(&tmp_c,0,rhs_v(so+table_v[i].second)); + coalescedWrite(buffer[off+o],tmp_c); }); -#else - accelerator_loopN( i,num, { - compress.Compress(&buffer[off],table_v[i].first,rhs_v[so+table_v[i].second]); - }); -#endif -// Further optimisatoin: i) streaming store the result -// ii) software prefetch the first element of the next table entry +// Further optimisatoin: i) software prefetch the first element of the next table entry, prefetch the table } /////////////////////////////////////////////////////////////////// @@ -110,9 +95,14 @@ void Gather_plane_exchange_table(Vector >& table,const Lattic int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane auto rhs_v = rhs.View(); - accelerator_loopN( j,num, { - compress.CompressExchange(&pointers[0][0],&pointers[1][0],&rhs_v[0], - j,so+table[2*j].second,so+table[2*j+1].second,type); + thread_for(j, num, { + compress.CompressExchange(&pointers[0][0], + &pointers[1][0], + &rhs_v[0], + j, + so+table[2*j].second, + so+table[2*j+1].second, + type); }); } @@ -579,7 +569,7 @@ public: mergetime-=usecond(); for(int i=0;i