Stencil gather improvements - SVM was running slowly and was being used for a pointer array that did not need to be in SVM

2025-07-27 17:57:08 +01:00 · 2022-10-04 11:11:10 -07:00
parent 9296299b61
commit e1e5c75023
1 changed files with 19 additions and 13 deletions
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -80,11 +80,14 @@ void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lat
 ///////////////////////////////////////////////////////////////////
 template<class cobj,class vobj,class compressor>
 void Gather_plane_exchange_table(const Lattice<vobj> &rhs,
-				 commVector<cobj *> pointers,int dimension,int plane,int cbmask,compressor &compress,int type) __attribute__((noinline));
+				 commVector<cobj *> pointers,
+				 int dimension,int plane,
+				 int cbmask,compressor &compress,int type) __attribute__((noinline));

 template<class cobj,class vobj,class compressor>
-void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
-				 Vector<cobj *> pointers,int dimension,int plane,int cbmask,
+void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,
+				 const Lattice<vobj> &rhs,
+				 std::vector<cobj *> &pointers,int dimension,int plane,int cbmask,
 				 compressor &compress,int type)
 {
  assert( (table.size()&0x1)==0);
@@ -92,14 +95,15 @@ void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,const La
  int so  = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane

  auto rhs_v = rhs.View(AcceleratorRead);
+  auto rhs_p = &rhs_v[0];
  auto p0=&pointers[0][0];
  auto p1=&pointers[1][0];
  auto tp=&table[0];
  accelerator_forNB(j, num, vobj::Nsimd(), {
-      compress.CompressExchange(p0,p1, &rhs_v[0], j,
-			      so+tp[2*j  ].second,
-			      so+tp[2*j+1].second,
-			      type);
+      compress.CompressExchange(p0,p1, rhs_p, j,
+				so+tp[2*j  ].second,
+				so+tp[2*j+1].second,
+				type);
  });
  rhs_v.ViewClose();
 }
@@ -230,8 +234,8 @@ public:
  };
  struct Merge {
    cobj * mpointer;
-    Vector<scalar_object *> rpointers;
-    Vector<cobj *> vpointers;
+    //    std::vector<scalar_object *> rpointers;
+    std::vector<cobj *> vpointers;
    Integer buffer_size;
    Integer type;
  };
@@ -406,6 +410,7 @@ public:
      comms_bytes+=bytes;
      shm_bytes  +=2*Packets[i].bytes-bytes;
    }
+    _grid->StencilBarrier();// Synch shared memory on a single nodes
  }

  void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
@@ -420,7 +425,7 @@ public:
  ////////////////////////////////////////////////////////////////////////
  void Communicate(void)
  {
-    if ( CartesianCommunicator::CommunicatorPolicy == CartesianCommunicator::CommunicatorPolicySequential ){
+    if ( 0 ){
      thread_region {
 	// must be called in parallel region
 	int mythread  = thread_num();
@@ -569,7 +574,7 @@ public:
    d.buffer_size = buffer_size;
    dv.push_back(d);
  }
-  void AddMerge(cobj *merge_p,Vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
+  void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
    Merge m;
    m.type     = type;
    m.mpointer = merge_p;
@@ -582,6 +587,7 @@ public:
  }
  template<class decompressor>  void CommsMergeSHM(decompressor decompress) {
    mpi3synctime-=usecond();
+    accelerator_barrier();
    _grid->StencilBarrier();// Synch shared memory on a single nodes
    mpi3synctime+=usecond();
    shmmergetime-=usecond();
@@ -1114,8 +1120,8 @@ public:
    int bytes = (reduced_buffer_size*datum_bytes)/simd_layout;
    assert(bytes*simd_layout == reduced_buffer_size*datum_bytes);

-    Vector<cobj *> rpointers(maxl);
-    Vector<cobj *> spointers(maxl);
+    std::vector<cobj *> rpointers(maxl);
+    std::vector<cobj *> spointers(maxl);

    ///////////////////////////////////////////
    // Work out what to send where