From 6c4da3bbc736a35a743877ce67ad2483c6a523e2 Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Sat, 18 May 2019 17:40:35 +0100
Subject: [PATCH] Stencil now runs with coalesced accesses

---
 Grid/stencil/Stencil.h | 105 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 95 insertions(+), 10 deletions(-)

diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h
index 58b0e764..a64db192 100644
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -55,6 +55,45 @@ NAMESPACE_BEGIN(Grid);
 ///////////////////////////////////////////////////////////////////
 // Gather for when there *is* need to SIMD split with compression
 ///////////////////////////////////////////////////////////////////
+
+#ifdef __CUDA_ARCH__
+//////////////////////////////////////////
+// Extract and insert slices on the GPU
+//////////////////////////////////////////
+template<class vobj> accelerator_inline
+typename vobj::scalar_object coalescedRead(const vobj & __restrict__ vec)
+{
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
+  int lane = threadIdx.x % Nsimd;
+  return extractLane(lane,vec);
+}
+template<class vobj> accelerator_inline
+void coalescedWrite(vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted)
+{
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
+  int lane = threadIdx.x % Nsimd;
+  insertLane(lane,vec,extracted);
+}
+#else
+//////////////////////////////////////////
+// Trivial mapping of vectors on host
+//////////////////////////////////////////
+template<class vobj> accelerator_inline
+vobj coalescedRead(const vobj & __restrict__ vec)
+{
+  return vec;
+}
+template<class vobj> accelerator_inline
+void coalescedWrite(vobj & __restrict__ vec,const vobj & __restrict__ extracted)
+{
+  vec = extracted;
+}
+#endif
+
 void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
                                  int off,Vector<std::pair<int,int> > & table);
 
@@ -66,11 +105,30 @@ void Gather_plane_simple_table (Vector<std::pair<int,int> >& table,const Lattice
 {
   int num=table.size();
   std::pair<int,int> *table_v = & table[0];
   auto rhs_v = rhs.View();
+  //  auto tmp_ucc = coalescedRead(rhs_v[so+table_v[0].second]);
+  //  coalescedWrite(rhs_v[so+table_v[0].second],tmp_ucc);
+#if 1
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
+  accelerator_loopNB( ii,num*Nsimd, {
+
+    typedef decltype(coalescedRead(buffer[0])) compressed_t;
+    typedef decltype(coalescedRead(rhs_v [0])) uncompressed_t;
+
+    int i = ii/Nsimd;
+    compressed_t   tmp_c;
+    uncompressed_t tmp_uc = coalescedRead(rhs_v[so+table_v[i].second]);
+    uint64_t o = table_v[i].first;
+    compress.Compress(&tmp_c,0,tmp_uc);
+    coalescedWrite(buffer[off+o],tmp_c);
+  });
+#else
   accelerator_loopN( i,num, {
     compress.Compress(&buffer[off],table_v[i].first,rhs_v[so+table_v[i].second]);
   });
+#endif
   // Further optimisation: i) streaming store the result
   //                       ii) software prefetch the first element of the next table entry
 }
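The two hunks above carry the core idea of the patch: on the device a vector object is never read whole. Each CUDA thread extracts the single SIMD lane given by threadIdx.x % Nsimd, and the gather loop launches num*Nsimd work items so that table entry i = ii/Nsimd is serviced by Nsimd consecutive threads touching consecutive scalars of the same vector word, which is the access pattern a coalesced GPU load wants. The sketch below models that index decomposition on the host; the toy vReal type, the simplified extractLane/insertLane signatures, and the tid variable standing in for threadIdx.x are illustrative assumptions, not Grid's actual definitions.

#include <cstdio>

constexpr int Nsimd = 8;

// Toy stand-in for a Grid vobj: one scalar per SIMD lane (assumption).
struct vReal { float v[Nsimd]; };
inline float extractLane(int lane, const vReal &x)    { return x.v[lane]; }
inline void  insertLane (int lane, vReal &x, float s) { x.v[lane] = s; }

int main() {
  const int num = 4;                 // number of gathered table entries
  vReal rhs[num], buf[num];
  for (int i = 0; i < num; i++)
    for (int l = 0; l < Nsimd; l++) rhs[i].v[l] = 100.0f*i + l;

  // Host model of accelerator_loopNB( ii,num*Nsimd, {...} ): "thread" tid
  // works on site tid/Nsimd, lane tid%Nsimd, so adjacent threads read
  // adjacent scalars of the same vector word.
  for (int tid = 0; tid < num*Nsimd; tid++) {
    int i    = tid / Nsimd;
    int lane = tid % Nsimd;                  // threadIdx.x % Nsimd on device
    float tmp = extractLane(lane, rhs[i]);   // coalescedRead
    insertLane(lane, buf[i], tmp);           // coalescedWrite
  }
  printf("buf[2].v[5] = %g\n", buf[2].v[5]); // 205
  return 0;
}

On the host branch the same coalescedRead/coalescedWrite names degenerate to whole-vector copies, so the one gather kernel body compiles for both targets.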
@@ -224,7 +282,7 @@ public:
   }
   int face_table_computed;
-  Vector<Vector<std::pair<int,int> > > face_table ;
+  std::vector<Vector<std::pair<int,int> > > face_table ;
   Vector<int> surface_list;
 
   Vector<StencilEntry> _entries; // Resident in managed memory
@@ -259,10 +317,12 @@ public:
   double mergetime;
   double decompresstime;
   double comms_bytes;
+  double shm_bytes;
   double splicetime;
   double nosplicetime;
   double calls;
   std::vector<double> comm_bytes_thr;
+  std::vector<double> shm_bytes_thr;
   std::vector<double> comm_time_thr;
   std::vector<double> comm_enter_thr;
   std::vector<double> comm_leave_thr;
@@ -326,6 +386,8 @@ public:
                                             Packets[i].from_rank,
                                             Packets[i].bytes,i);
           comm_bytes_thr[mythread] += bytes;
+          shm_bytes_thr[mythread]  += 2*Packets[i].bytes-bytes; // Send + Recv.
+
         }
         comm_leave_thr[mythread]= usecond();
         comm_time_thr[mythread] += comm_leave_thr[mythread] - comm_enter_thr[mythread];
@@ -343,11 +405,13 @@ public:
         double t0 = comm_enter_thr[t];
         double t1 = comm_leave_thr[t];
         comms_bytes+=comm_bytes_thr[t];
+        shm_bytes  +=shm_bytes_thr[t];
         comm_enter_thr[t] = 0.0;
         comm_leave_thr[t] = 0.0;
         comm_time_thr[t]  = 0.0;
         comm_bytes_thr[t]=0;
+        shm_bytes_thr[t]=0;
         if ( first == 0.0 ) first = t0;                 // first is t0
         if ( (t0 > 0.0) && ( t0 < first ) ) first = t0; // min time seen
@@ -362,12 +426,14 @@ public:
     reqs.resize(Packets.size());
     commtime-=usecond();
     for(int i=0;i<Packets.size();i++){
-      _grid->StencilSendToRecvFromBegin(reqs[i],
+      uint64_t bytes=_grid->StencilSendToRecvFromBegin(reqs[i],
                                         Packets[i].send_buf,
                                         Packets[i].to_rank,
                                         Packets[i].recv_buf,
                                         Packets[i].from_rank,
                                         Packets[i].bytes,i);
+      comms_bytes+=bytes;
+      shm_bytes  +=2*Packets[i].bytes-bytes;
     }
   }
@@ -391,12 +457,14 @@ public:
       if (mythread < nthreads) {
         for (int i = mythread; i < Packets.size(); i += nthreads) {
           double start = usecond();
-          comm_bytes_thr[mythread] += _grid->StencilSendToRecvFrom(Packets[i].send_buf,
-                                                                   Packets[i].to_rank,
-                                                                   Packets[i].recv_buf,
-                                                                   Packets[i].from_rank,
-                                                                   Packets[i].bytes,i);
-          comm_time_thr[mythread] += usecond() - start;
+          uint64_t bytes= _grid->StencilSendToRecvFrom(Packets[i].send_buf,
                                                        Packets[i].to_rank,
                                                        Packets[i].recv_buf,
                                                        Packets[i].from_rank,
                                                        Packets[i].bytes,i);
+          comm_bytes_thr[mythread] += bytes;
+          shm_bytes_thr[mythread]  += Packets[i].bytes - bytes;
+          comm_time_thr[mythread]  += usecond() - start;
         }
       }
     }
@@ -610,7 +678,8 @@ public:
                    const std::vector<int> &directions,
                    const std::vector<int> &distances,
                    Parameters p)
-    : comm_bytes_thr(npoints),
+    : shm_bytes_thr(npoints),
+      comm_bytes_thr(npoints),
       comm_enter_thr(npoints),
       comm_leave_thr(npoints),
       comm_time_thr(npoints)
@@ -1189,6 +1258,7 @@ public:
       comm_bytes_thr[i]=0;
       comm_enter_thr[i]=0;
       comm_leave_thr[i]=0;
+      shm_bytes_thr[i]=0;
     }
     halogtime = 0.;
     mergetime = 0.;
@@ -1197,6 +1267,7 @@ public:
     splicetime = 0.;
     nosplicetime = 0.;
     comms_bytes = 0.;
+    shm_bytes = 0.;
     calls = 0.;
   };
@@ -1213,6 +1284,7 @@ public:
       if ( comm_time_thr[i]>0.0 ) {
         threaded = 1;
         comms_bytes += comm_bytes_thr[i];
+        shm_bytes   += shm_bytes_thr[i];
         if (t < comm_time_thr[i]) t = comm_time_thr[i];
       }
     }
@@ -1232,6 +1304,19 @@ public:
       std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<<std::endl;
     }
+    if(shm_bytes>1.0){
+      PRINTIT(shm_bytes); // X bytes + R bytes
+      // Double this to include spin projection overhead with 2:1 ratio in wilson
+      auto gatheralltime = gathertime+gathermtime;
+      auto allbytes = comms_bytes+shm_bytes;
+      std::cout << GridLogMessage << " Stencil SHM " << (shm_bytes)/gatheralltime/1000. << " GB/s per rank"<<std::endl;
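A note on the new accounting in the hunks above: the accumulation comms_bytes+=bytes with shm_bytes+=2*Packets[i].bytes-bytes implies that StencilSendToRecvFrom and StencilSendToRecvFromBegin now return the byte count that actually crossed the network, with the remainder of each packet's traffic assumed to have moved through intra-node shared memory. The Begin path credits both the send and the receive of a packet (hence the factor 2), while the blocking threaded path credits one side only. A minimal sketch of the arithmetic, with made-up values purely for illustration:

#include <cstdint>
#include <cstdio>

// Model of the shm_bytes bookkeeping: `bytes` is what the transport reports
// as moved over MPI; the rest of the 2*P moved per packet (send + recv) is
// attributed to shared memory.
int main() {
  const uint64_t P = 1 << 20;            // Packets[i].bytes: a 1 MiB payload

  uint64_t bytes = 2*P;                  // peer off-node: everything over MPI
  printf("shm share: %llu\n", (unsigned long long)(2*P - bytes)); // 0

  bytes = 0;                             // peer on-node: all shared memory
  printf("shm share: %llu\n", (unsigned long long)(2*P - bytes)); // 2097152
  return 0;
}

With that split, the final hunk can report a shared-memory bandwidth figure (shm_bytes over the gather time) alongside the existing MPI figure, which is why shm_bytes is reset in ZeroCounters, accumulated per thread, and printed via PRINTIT like the other counters.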