From 3906cd21495d6b756873f4cc9d90bb62fd1c377b Mon Sep 17 00:00:00 2001 From: paboyle Date: Mon, 20 Feb 2017 17:51:31 -0500 Subject: [PATCH] Stencil fix on BNL KNL system --- lib/Stencil.cc | 6 +- lib/Stencil.h | 178 ++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 157 insertions(+), 27 deletions(-) diff --git a/lib/Stencil.cc b/lib/Stencil.cc index 16fb736f..c492efa0 100644 --- a/lib/Stencil.cc +++ b/lib/Stencil.cc @@ -29,19 +29,18 @@ namespace Grid { -void Gather_plane_simple_table_compute (GridBase *grid,int dimension,int plane,int cbmask, +void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask, int off,std::vector > & table) { table.resize(0); - int rd = grid->_rdimensions[dimension]; if ( !grid->CheckerBoarded(dimension) ) { cbmask = 0x3; } + int rd = grid->_rdimensions[dimension]; int so= plane*grid->_ostride[dimension]; // base offset for start of plane int e1=grid->_slice_nblock[dimension]; int e2=grid->_slice_block[dimension]; - int stride=grid->_slice_stride[dimension]; if ( cbmask == 0x3 ) { table.resize(e1*e2); @@ -66,4 +65,5 @@ void Gather_plane_simple_table_compute (GridBase *grid,int dimension,int plane,i } } } + } diff --git a/lib/Stencil.h b/lib/Stencil.h index 1821419a..e5afa251 100644 --- a/lib/Stencil.h +++ b/lib/Stencil.h @@ -29,7 +29,7 @@ #define GRID_STENCIL_H #include // subdir aggregate - +#define NEW_XYZT_GATHER ////////////////////////////////////////////////////////////////////////////////////////// // Must not lose sight that goal is to be able to construct really efficient // gather to a point stencil code. CSHIFT is not the best way, so need @@ -68,7 +68,10 @@ namespace Grid { -void Gather_plane_simple_table_compute (GridBase *grid,int dimension,int plane,int cbmask, +/////////////////////////////////////////////////////////////////// +// Gather for when there *is* need to SIMD split with compression +/////////////////////////////////////////////////////////////////// +void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask, int off,std::vector > & table); template @@ -85,6 +88,95 @@ PARALLEL_FOR_LOOP } } +/////////////////////////////////////////////////////////////////// +// Gather for when there *is* need to SIMD split with compression +/////////////////////////////////////////////////////////////////// +/* +template double +Gather_plane_exchange(const Lattice &rhs, + std::vector pointers,int dimension,int plane,int cbmask,compressor &compress,int type) +{ + int rd = rhs._grid->_rdimensions[dimension]; + double t1,t2; + if ( !rhs._grid->CheckerBoarded(dimension) ) { + cbmask = 0x3; + } + + int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane + int e1 =rhs._grid->_slice_nblock[dimension]; + int e2 =rhs._grid->_slice_block [dimension]; + int n1 =rhs._grid->_slice_stride[dimension]; + + // Need to switch to a table loop + std::vector > table; + + if ( cbmask ==0x3){ + for(int n=0;n (offset,o+b)); + } + } + } else { + // Case of SIMD split AND checker dim cannot currently be hit, except in + // Test_cshift_red_black code. + for(int n=0;nCheckerBoardFromOindex(o+b); + int offset = b+n*e2; + + if ( ocb & cbmask ) { + table.push_back(std::pair (offset,o+b)); + } + } + } + } + + assert( (table.size()&0x1)==0); + t1=usecond(); +PARALLEL_FOR_LOOP + for(int j=0;j +void Gather_plane_exchange_table(const Lattice &rhs, + std::vector pointers,int dimension,int plane,int cbmask,compressor &compress,int type) __attribute__((noinline)); + +template +void Gather_plane_exchange_table(std::vector >& table,const Lattice &rhs, + std::vector pointers,int dimension,int plane,int cbmask, + compressor &compress,int type) +{ + assert( (table.size()&0x1)==0); + int num=table.size()/2; + int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane +PARALLEL_FOR_LOOP + for(int j=0;jStencilSendToRecvFromBegin(reqs[i], + comms_bytes+=_grid->StencilSendToRecvFromBegin(reqs[i], Packets[i].send_buf, Packets[i].to_rank, Packets[i].recv_buf, Packets[i].from_rank, Packets[i].bytes); + if( _grid->CommunicatorPolicy == CartesianCommunicator::CommunicatorPolicySendrecv ) { + _grid->StencilSendToRecvFromComplete(reqs[i]); + } } commtime+=usecond(); } void CommunicateComplete(std::vector > &reqs) { commtime-=usecond(); - for(int i=0;iCommunicatorPolicy == CartesianCommunicator::CommunicatorPolicyIsend ) { + for(int i=0;iStencilSendToRecvFromComplete(reqs[i]); + } } _grid->StencilBarrier();// Synch shared memory on a single nodes commtime+=usecond(); + /* int dump=1; if(dump){ for(int i=0;i u_simd_send_buf; - std::vector u_simd_recv_buf; std::vector new_simd_send_buf; std::vector new_simd_recv_buf; + std::vector u_simd_send_buf; + std::vector u_simd_recv_buf; int u_comm_offset; int _unified_buffer_size; @@ -358,6 +455,10 @@ PARALLEL_FOR_LOOP void Report(void) { #define PRINTIT(A) \ std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<_Nprocessors; + RealD NN = _grid->NodeCount(); + if ( calls > 0. ) { std::cout << GridLogMessage << " Stencil calls "<1.0){ PRINTIT(comms_bytes); PRINTIT(commtime); - std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s "<ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); u_recv_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); - for(int l=0;lShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object)); - u_simd_send_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object)); +#ifdef NEW_XYZT_GATHER + for(int l=0;l<2;l++){ new_simd_recv_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); new_simd_send_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); } +#else + for(int l=0;lShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object)); + u_simd_send_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object)); + } +#endif PrecomputeByteOffsets(); } @@ -740,7 +847,11 @@ PARALLEL_FOR_LOOP splicetime-=usecond(); // GatherSimd(source,dimension,shift,0x3,compress,face_idx); // std::cout << "GatherSimdNew"<>1; + + int bytes = (reduced_buffer_size*sizeof(cobj))/simd_layout; + assert(bytes*simd_layout == reduced_buffer_size*sizeof(cobj)); std::vector rpointers(maxl); std::vector spointers(maxl); @@ -1034,15 +1152,28 @@ PARALLEL_FOR_LOOP int any_offnode = ( ((x+sshift)%fd) >= rd ); if ( any_offnode ) { + for(int i=0;i > table; + t_table-=usecond(); + if ( !face_table_computed ) { + face_table.resize(face_idx+1); + Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table[face_idx]); + // std::cout << " face table size "<