From 485ad6fde09c1ae7150ecdd554be6865a2f3a348 Mon Sep 17 00:00:00 2001 From: paboyle Date: Tue, 7 Feb 2017 01:20:39 -0500 Subject: [PATCH] Stencil working in SHM MPI3 --- lib/Stencil.h | 126 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 77 insertions(+), 49 deletions(-) diff --git a/lib/Stencil.h b/lib/Stencil.h index 82e818d2..71f086af 100644 --- a/lib/Stencil.h +++ b/lib/Stencil.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -25,12 +25,10 @@ See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ - #ifndef GRID_STENCIL_H - #define GRID_STENCIL_H +#ifndef GRID_STENCIL_H +#define GRID_STENCIL_H - #include - - #include // subdir aggregate +#include // subdir aggregate ////////////////////////////////////////////////////////////////////////////////////////// // Must not lose sight that goal is to be able to construct really efficient @@ -80,9 +78,10 @@ template void Gather_plane_simple_table (std::vector >& table,const Lattice &rhs,cobj *buffer,compressor &compress, int off,int so) { int num=table.size(); - PARALLEL_FOR_LOOP +PARALLEL_FOR_LOOP for(int i=0;i class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in. public: @@ -143,30 +144,38 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal Packets[i].recv_buf, Packets[i].from_rank, Packets[i].bytes); - /* - }else{ - _grid->SendToRecvFromBegin(reqs[i], - Packets[i].send_buf, - Packets[i].to_rank, - Packets[i].recv_buf, - Packets[i].from_rank, - Packets[i].bytes); - } - */ } commtime+=usecond(); } void CommunicateComplete(std::vector > &reqs) { commtime-=usecond(); - for(int i=0;iStencilSendToRecvFromComplete(reqs[i]); - // else - // _grid->SendToRecvFromComplete(reqs[i]); } + _grid->StencilBarrier();// Synch shared memory on a single nodes commtime+=usecond(); + /* + if(dump){ + for(int i=0;i_ndimension;d++){ + ss<<"."<<_grid->_processor_coor[d]; + } + ss<<"_mu_"<_ndimension;d++){ + // ss<<"."<<_grid->_processor_coor[d]; + // } + // ss<<"_m_"<_simd_layout[dimension]; int comm_dim = _grid->_processors[dimension] >1 ; int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); @@ -373,9 +394,11 @@ PARALLEL_FOR_LOOP int sshift[2]; + ////////////////////////// // Underlying approach. For each local site build // up a table containing the npoint "neighbours" and whether they // live in lattice or a comms buffer. + ////////////////////////// if ( !comm_dim ) { sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); @@ -386,11 +409,11 @@ PARALLEL_FOR_LOOP Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes Local(point,dimension,shift,0x2);// both with block stride loop iteration } - } else { // All permute extract done in comms phase prior to Stencil application + } else { + // All permute extract done in comms phase prior to Stencil application // So tables are the same whether comm_dim or splice_dim sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); - if ( sshift[0] == sshift[1] ) { Comms(point,dimension,shift,0x3); } else { @@ -482,9 +505,11 @@ PARALLEL_FOR_LOOP assert(shift>=0); assert(shift_slice_nblock[dimension]*_grid->_slice_block[dimension]; // done in reduced dims, so SIMD factored - + // done in reduced dims, so SIMD factored + int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension]; + _comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and + // send to one or more remote nodes. int cb= (cbmask==0x2)? Odd : Even; @@ -707,6 +732,8 @@ PARALLEL_FOR_LOOP template void HaloGather(const Lattice &source,compressor &compress) { + _grid->StencilBarrier();// Synch shared memory on a single nodes + // conformable(source._grid,_grid); assert(source._grid==_grid); halogtime-=usecond(); @@ -767,8 +794,7 @@ PARALLEL_FOR_LOOP if ( !face_table_computed ) { t_table-=usecond(); face_table.resize(face_idx+1); - Gather_plane_simple_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset, - face_table[face_idx]); + Gather_plane_simple_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table[face_idx]); t_table+=usecond(); } @@ -789,12 +815,11 @@ PARALLEL_FOR_LOOP cobj *send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,u_recv_buf_p); if ( send_buf==NULL ) { send_buf = u_send_buf_p; - } - // std::cout << " send_bufs "< rpointers(Nsimd); std::vector spointers(Nsimd); - + + // std::cout << "GatherSimd " << dimension << " shift "<= rd ); - + if ( any_offnode ) { for(int i=0;i2 - // std::cout << "GatherSimd : lane 1st elem " << i << u_simd_send_buf[i ][u_comm_offset]<2 + // for(int w=0;w : lane " << i <<" elem "<>(permute_type+1)); int ic= (i&inner_bit)? 1:0; - int my_coor = rd*ic + x; - int nbr_coor = my_coor+sshift; + int my_coor = rd*ic + x; + int nbr_coor = my_coor+sshift; int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors int nbr_lcoor= (nbr_coor%ld); int nbr_ic = (nbr_lcoor)/rd; // inner coord of peer @@ -885,10 +912,10 @@ PARALLEL_FOR_LOOP if (nbr_ic) nbr_lane|=inner_bit; assert (sx == nbr_ox); - + auto rp = &u_simd_recv_buf[i ][u_comm_offset]; auto sp = &u_simd_send_buf[nbr_lane][u_comm_offset]; - + if(nbr_proc){ int recv_from_rank; @@ -896,16 +923,17 @@ PARALLEL_FOR_LOOP _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); + // shm == receive pointer if offnode + // shm == Translate[send pointer] if on node -- my view of his send pointer scalar_object *shm = (scalar_object *) _grid->ShmBufferTranslate(recv_from_rank,sp); - // if ((ShmDirectCopy==0)||(shm==NULL)) { if (shm==NULL) { shm = rp; - } - + } + // if Direct, StencilSendToRecvFrom will suppress copy to a peer on node // assuming above pointer flip AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes); - + rpointers[i] = shm; } else {