Changing accelerator loop. Still have work to do for multi-GPU code

2026-01-31 12:23:28 +00:00 · 2019-06-15 08:10:24 +01:00
parent 0074ef7f69
commit cefaacbc07
1 changed file with 17 additions and 27 deletions
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -68,29 +68,14 @@ void Gather_plane_simple_table (Vector<std::pair<int,int> >& table,const Lattice
  int num=table.size();
  std::pair<int,int> *table_v = & table[0];
  auto rhs_v = rhs.View();
-#ifdef GRID_NVCC
+  accelerator_forNB( i,num, vobj::Nsimd(), {
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
  accelerator_loopNB( ii,num*Nsimd, {
    typedef decltype(coalescedRead(buffer[0])) compressed_t;
      typedef decltype(coalescedRead(rhs_v [0])) uncompressed_t;
      int i = ii/Nsimd;
    compressed_t   tmp_c;
      uncompressed_t tmp_uc = coalescedRead(rhs_v[so+table_v[i].second]);
    uint64_t o = table_v[i].first;
-      compress.Compress(&tmp_c,0,tmp_uc);
+    compress.Compress(&tmp_c,0,rhs_v(so+table_v[i].second));
    coalescedWrite(buffer[off+o],tmp_c);
  });
-#else
+// Further optimisation: i) software prefetch the first element of the next table entry, prefetch the table
  accelerator_loopN( i,num, {
      compress.Compress(&buffer[off],table_v[i].first,rhs_v[so+table_v[i].second]);
  });
 #endif
 // Further optimisatoin: i) streaming store the result
 //                       ii) software prefetch the first element of the next table entry
 }
 ///////////////////////////////////////////////////////////////////
@@ -110,9 +95,14 @@ void Gather_plane_exchange_table(Vector<std::pair<int,int> >& table,const Lattic
  int so  = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane 
  auto rhs_v = rhs.View();
-  accelerator_loopN( j,num, {
+  thread_for(j, num, {
-    compress.CompressExchange(&pointers[0][0],&pointers[1][0],&rhs_v[0],
+    compress.CompressExchange(&pointers[0][0],
-			      j,so+table[2*j].second,so+table[2*j+1].second,type);
+			      &pointers[1][0],
 			      &rhs_v[0],
 			      j,
 			      so+table[2*j].second,
 			      so+table[2*j+1].second,
 			      type);
  });
 }
@@ -579,7 +569,7 @@ public:
      mergetime-=usecond();
    for(int i=0;i<mm.size();i++){	
-      thread_loop( (int o=0;o<mm[i].buffer_size/2;o++),{
+      thread_for(o,mm[i].buffer_size/2,{
 	decompress.Exchange(mm[i].mpointer,
 			    mm[i].vpointers[0],
 			    mm[i].vpointers[1],
@@ -590,7 +580,7 @@ public:
    decompresstime-=usecond();
    for(int i=0;i<dd.size();i++){	
-      thread_loop( (int o=0;o<dd[i].buffer_size;o++),{
+      thread_for(o,dd[i].buffer_size,{
 	decompress.Decompress(dd[i].kernel_p,dd[i].mpi_p,o);
      });
    }