mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-04 11:15:55 +01:00
Stencil now runs with coalesced accesses
This commit is contained in:
parent a584b16c4a
commit 6c4da3bbc7
@@ -55,6 +55,45 @@ NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////
// Gather for when there *is* need to SIMD split with compression
///////////////////////////////////////////////////////////////////

#ifdef __CUDA_ARCH__
//////////////////////////////////////////
// Extract and insert slices on the GPU
//////////////////////////////////////////
template<class vobj> accelerator_inline
typename vobj::scalar_object coalescedRead(const vobj & __restrict__ vec)
{
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
  int lane = threadIdx.x % Nsimd;
  return extractLane(lane,vec);
}
template<class vobj> accelerator_inline
void coalescedWrite(vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted)
{
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
  int lane = threadIdx.x % Nsimd;
  insertLane(lane,vec,extracted);
}
#else
//////////////////////////////////////////
// Trivial mapping of vectors on host
//////////////////////////////////////////
template<class vobj> accelerator_inline
vobj coalescedRead(const vobj & __restrict__ vec)
{
  return vec;
}
template<class vobj> accelerator_inline
void coalescedWrite(vobj & __restrict__ vec,const vobj & __restrict__ extracted)
{
  vec = extracted;
}
#endif
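
A rough standalone sketch of the lane mapping above, not part of this commit (the vec4 struct, kernel name and Nsimd=4 are illustrative assumptions): Nsimd consecutive threads each own one scalar lane of a vector object, so adjacent threads touch adjacent words and the global loads/stores coalesce. Launched with a block size that is a multiple of Nsimd, ii%Nsimd equals threadIdx.x%Nsimd, which is the lane index coalescedRead/coalescedWrite use.

struct vec4 { float lane[4]; };                    // stand-in for a SIMD vobj, Nsimd=4

__global__ void scale_coalesced(vec4 *field, float a, int nsite)
{
  constexpr int Nsimd = 4;
  int ii = blockIdx.x*blockDim.x + threadIdx.x;    // one thread per (site,lane) pair
  if (ii >= nsite*Nsimd) return;
  int site = ii / Nsimd;                           // which vector object
  int lane = ii % Nsimd;                           // which scalar lane inside it
  float tmp = field[site].lane[lane];              // analogue of coalescedRead
  field[site].lane[lane] = a*tmp;                  // analogue of coalescedWrite
}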

void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
                                 int off,Vector<std::pair<int,int> > & table);

@@ -66,11 +105,30 @@ void Gather_plane_simple_table (Vector<std::pair<int,int> >& table,const Lattice
{
  int num=table.size();
  std::pair<int,int> *table_v = & table[0];

  auto rhs_v = rhs.View();
  //  auto tmp_ucc = coalescedRead(rhs_v[so+table_v[0].second]);
  //  coalescedWrite(rhs_v[so+table_v[0].second],tmp_ucc);
#if 1
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
  accelerator_loopNB( ii,num*Nsimd, {

    typedef decltype(coalescedRead(buffer[0])) compressed_t;
    typedef decltype(coalescedRead(rhs_v [0])) uncompressed_t;

    int i = ii/Nsimd;
    compressed_t   tmp_c;
    uncompressed_t tmp_uc = coalescedRead(rhs_v[so+table_v[i].second]);
    uint64_t o = table_v[i].first;
    compress.Compress(&tmp_c,0,tmp_uc);
    coalescedWrite(buffer[off+o],tmp_c);
  });
#else
  accelerator_loopN( i,num, {
    compress.Compress(&buffer[off],table_v[i].first,rhs_v[so+table_v[i].second]);
  });
#endif
  // Further optimisation: i) streaming store the result
  //                      ii) software prefetch the first element of the next table entry
}
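
A small host-side worked example, again an illustration rather than Grid code, of how the flattened index in accelerator_loopNB splits: each of the num table entries is handed Nsimd consecutive work items, one per SIMD lane, so the entry is i = ii/Nsimd and the lane is picked up from threadIdx.x inside coalescedRead/coalescedWrite (effectively ii%Nsimd when the block size is a multiple of Nsimd). The sizes num=3 and Nsimd=4 are made up for the printout.

#include <cstdio>

int main()
{
  const int num = 3, Nsimd = 4;
  for (int ii = 0; ii < num*Nsimd; ii++) {
    int i    = ii / Nsimd;                 // table entry / site pair, as in the loop above
    int lane = ii % Nsimd;                 // SIMD lane handled by this work item
    printf("work item %2d -> entry %d lane %d\n", ii, i, lane);
  }
  return 0;
}
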
@@ -224,7 +282,7 @@ public:
  }

  int face_table_computed;
  Vector<Vector<std::pair<int,int> > > face_table ;
  std::vector<Vector<std::pair<int,int> > > face_table ;
  Vector<int> surface_list;

  Vector<StencilEntry> _entries; // Resident in managed memory
@@ -259,10 +317,12 @@ public:
  double mergetime;
  double decompresstime;
  double comms_bytes;
  double shm_bytes;
  double splicetime;
  double nosplicetime;
  double calls;
  std::vector<double> comm_bytes_thr;
  std::vector<double> shm_bytes_thr;
  std::vector<double> comm_time_thr;
  std::vector<double> comm_enter_thr;
  std::vector<double> comm_leave_thr;
@@ -326,6 +386,8 @@ public:
                                                   Packets[i].from_rank,
                                                   Packets[i].bytes,i);
        comm_bytes_thr[mythread] += bytes;
        shm_bytes_thr[mythread]  += 2*Packets[i].bytes-bytes; // Send + Recv.

      }
      comm_leave_thr[mythread]= usecond();
      comm_time_thr[mythread] += comm_leave_thr[mythread] - comm_enter_thr[mythread];
@@ -343,11 +405,13 @@ public:
      double t0 = comm_enter_thr[t];
      double t1 = comm_leave_thr[t];
      comms_bytes+=comm_bytes_thr[t];
      shm_bytes  +=shm_bytes_thr[t];

      comm_enter_thr[t] = 0.0;
      comm_leave_thr[t] = 0.0;
      comm_time_thr[t]  = 0.0;
      comm_bytes_thr[t]=0;
      shm_bytes_thr[t]=0;

      if ( first == 0.0 ) first = t0; // first is t0
      if ( (t0 > 0.0) && ( t0 < first ) ) first = t0; // min time seen
@@ -362,12 +426,14 @@ public:
    reqs.resize(Packets.size());
    commtime-=usecond();
    for(int i=0;i<Packets.size();i++){
      comms_bytes+=_grid->StencilSendToRecvFromBegin(reqs[i],
      uint64_t bytes=_grid->StencilSendToRecvFromBegin(reqs[i],
                                                       Packets[i].send_buf,
                                                       Packets[i].to_rank,
                                                       Packets[i].recv_buf,
                                                       Packets[i].from_rank,
                                                       Packets[i].bytes,i);
      comms_bytes+=bytes;
      shm_bytes  +=2*Packets[i].bytes-bytes;
    }
  }
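
One way to read the byte accounting above, as a standalone sketch rather than Grid code: assuming the value returned by StencilSendToRecvFromBegin counts only the bytes that actually crossed the network (send plus receive), everything else in a packet of Packets[i].bytes moved through shared memory, hence the 2*Packets[i].bytes-bytes term. That assumption about the return value is mine, not stated in this commit.

#include <cstdint>
#include <cassert>

static uint64_t shm_traffic(uint64_t packet_bytes, uint64_t mpi_bytes)
{
  // Whatever did not go over MPI moved through shared memory,
  // counted once for the send and once for the receive: 2*P - B.
  return 2*packet_bytes - mpi_bytes;
}

int main()
{
  assert(shm_traffic(1024, 0)      == 2048); // fully intra-node: both halves via shm
  assert(shm_traffic(1024, 2*1024) == 0);    // fully off-node: nothing via shm
  return 0;
}
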
@@ -391,12 +457,14 @@ public:
    if (mythread < nthreads) {
      for (int i = mythread; i < Packets.size(); i += nthreads) {
        double start = usecond();
        comm_bytes_thr[mythread] += _grid->StencilSendToRecvFrom(Packets[i].send_buf,
                                                                 Packets[i].to_rank,
                                                                 Packets[i].recv_buf,
                                                                 Packets[i].from_rank,
                                                                 Packets[i].bytes,i);
        comm_time_thr[mythread] += usecond() - start;
        uint64_t bytes= _grid->StencilSendToRecvFrom(Packets[i].send_buf,
                                                     Packets[i].to_rank,
                                                     Packets[i].recv_buf,
                                                     Packets[i].from_rank,
                                                     Packets[i].bytes,i);
        comm_bytes_thr[mythread] += bytes;
        shm_bytes_thr[mythread]  += Packets[i].bytes - bytes;
        comm_time_thr[mythread] += usecond() - start;
      }
    }
  }
@@ -610,7 +678,8 @@ public:
          const std::vector<int> &directions,
          const std::vector<int> &distances,
          Parameters p)
    : comm_bytes_thr(npoints),
    : shm_bytes_thr(npoints),
      comm_bytes_thr(npoints),
      comm_enter_thr(npoints),
      comm_leave_thr(npoints),
      comm_time_thr(npoints)
@@ -1189,6 +1258,7 @@ public:
      comm_bytes_thr[i]=0;
      comm_enter_thr[i]=0;
      comm_leave_thr[i]=0;
      shm_bytes_thr[i]=0;
    }
    halogtime = 0.;
    mergetime = 0.;
@@ -1197,6 +1267,7 @@ public:
    splicetime = 0.;
    nosplicetime = 0.;
    comms_bytes = 0.;
    shm_bytes = 0.;
    calls = 0.;
  };

@@ -1213,6 +1284,7 @@ public:
      if ( comm_time_thr[i]>0.0 ) {
        threaded = 1;
        comms_bytes += comm_bytes_thr[i];
        shm_bytes   += shm_bytes_thr[i];
        if (t < comm_time_thr[i]) t = comm_time_thr[i];
      }
    }
@@ -1232,6 +1304,19 @@ public:
      std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<<std::endl;
      std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000.*NP/NN << " GB/s per node"<<std::endl;
    }
    if(shm_bytes>1.0){
      PRINTIT(shm_bytes); // X bytes + R bytes
      // Double this to include spin projection overhead with 2:1 ratio in wilson
      auto gatheralltime = gathertime+gathermtime;
      auto allbytes = comms_bytes+shm_bytes;
      std::cout << GridLogMessage << " Stencil SHM " << (shm_bytes)/gatheralltime/1000. << " GB/s per rank"<<std::endl;
      std::cout << GridLogMessage << " Stencil SHM " << (shm_bytes)/gatheralltime/1000.*NP/NN << " GB/s per node"<<std::endl;

      auto membytes = (shm_bytes + comms_bytes/2) // read/write
                    + (shm_bytes+comms_bytes)/2 * sizeof(vobj)/sizeof(cobj);
      std::cout << GridLogMessage << " Stencil SHM mem " << (membytes)/gatheralltime/1000. << " GB/s per rank"<<std::endl;
      std::cout << GridLogMessage << " Stencil SHM mem " << (membytes)/gatheralltime/1000.*NP/NN << " GB/s per node"<<std::endl;
    }
    PRINTIT(mpi3synctime);
    PRINTIT(mpi3synctime_g);
    PRINTIT(shmmergetime);