diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index e23ff258..dd8b646e 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -59,6 +59,7 @@ NAMESPACE_BEGIN(Grid); void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask, int off,std::vector > & table); +/* template void Gather_plane_simple_table (commVector >& table,const Lattice &rhs,cobj *buffer,compressor &compress, int off,int so) __attribute__((noinline)); @@ -103,6 +104,7 @@ void Gather_plane_exchange_table(commVector >& table,const La }); rhs_v.ViewClose(); } +*/ struct StencilEntry { #ifdef GRID_CUDA @@ -133,8 +135,18 @@ class CartesianStencilAccelerator { int _osites; StencilVector _directions; StencilVector _distances; - StencilVector _comms_send; - StencilVector _comms_recv; + /////////////////////////////////////////////////// + // If true, this is FULLY communicated per face + // Otherwise will either be full or partial dirichlet + /////////////////////////////////////////////////// + StencilVector _comms_send; + StencilVector _comms_recv; // this is FULLY communicated per face + /////////////////////////////////////////////////// + // If true, this is partially communicated per face + /////////////////////////////////////////////////// + StencilVector _comms_partial_send; + StencilVector _comms_partial_recv; + // StencilVector _comm_buf_size; StencilVector _permute_type; StencilVector same_node; @@ -229,7 +241,8 @@ public: Integer from_rank; Integer do_send; Integer do_recv; - Integer bytes; + Integer xbytes; + Integer rbytes; }; struct Merge { cobj * mpointer; @@ -237,11 +250,15 @@ public: Vector vpointers; Integer buffer_size; Integer type; + Integer partial; // partial dirichlet BCs + Coordinate dims; }; struct Decompress { cobj * kernel_p; cobj * mpi_p; Integer buffer_size; + Integer partial; // partial dirichlet BCs + Coordinate dims; }; struct CopyReceiveBuffer { void * from_p; @@ -252,7 +269,8 @@ public: Integer direction; Integer OrthogPlane; Integer DestProc; - Integer bytes; + Integer xbytes; + Integer rbytes; Integer lane; Integer cb; void *recv_buf; @@ -277,6 +295,7 @@ public: } int face_table_computed; + int partialDirichlet; std::vector > > face_table ; Vector surface_list; @@ -365,7 +384,7 @@ public: Packets[i].to_rank,Packets[i].do_send, Packets[i].recv_buf, Packets[i].from_rank,Packets[i].do_recv, - Packets[i].bytes,i); + Packets[i].xbytes,Packets[i].rbytes,i); } } @@ -501,7 +520,9 @@ public: } } - Integer CheckForDuplicate(Integer direction, Integer OrthogPlane, Integer DestProc, void *recv_buf,Integer lane,Integer bytes,Integer cb) + Integer CheckForDuplicate(Integer direction, Integer OrthogPlane, Integer DestProc, void *recv_buf,Integer lane, + Integer xbytes,Integer rbytes, + Integer cb) { CachedTransfer obj; obj.direction = direction; @@ -509,19 +530,22 @@ public: obj.DestProc = DestProc; obj.recv_buf = recv_buf; obj.lane = lane; - obj.bytes = bytes; + obj.xbytes = xbytes; + obj.rbytes = rbytes; obj.cb = cb; for(int i=0;i &dv) { Decompress d; + d.partial = this->partialDirichlet; + d.dims = _grid->_fdimensions; d.kernel_p = k_p; d.mpi_p = m_p; d.buffer_size = buffer_size; @@ -552,6 +581,8 @@ public: } void AddMerge(cobj *merge_p,Vector &rpointers,Integer buffer_size,Integer type,std::vector &mv) { Merge m; + m.partial = this->partialDirichlet; + m.dims = _grid->_fdimensions; m.type = type; m.mpointer = merge_p; m.vpointers= rpointers; @@ -571,21 +602,10 @@ public: void CommsMerge(decompressor decompress,std::vector &mm,std::vector &dd) { for(int i=0;i_comms_send[ii] = 0; if ( ( (ld*pc ) % block ) == 0 ) this->_comms_recv[ii] = 0; } + if ( partialDirichlet ) { + this->_comms_partial_send[ii] = !this->_comms_send[ii]; + this->_comms_partial_recv[ii] = !this->_comms_recv[ii]; + } } } } @@ -691,7 +715,7 @@ public: this->same_node.resize(npoints); if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0); - + partialDirichlet = p.partialDirichlet; DirichletBlock(p.dirichlet); // comms send/recv set up _unified_buffer_size=0; @@ -827,7 +851,7 @@ public: GridBase *grid=_grid; const int Nsimd = grid->Nsimd(); - int comms_recv = this->_comms_recv[point]; + int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ; int fd = _grid->_fdimensions[dimension]; int ld = _grid->_ldimensions[dimension]; int rd = _grid->_rdimensions[dimension]; @@ -1014,8 +1038,10 @@ public: { typedef typename cobj::vector_type vector_type; - int comms_send = this->_comms_send[point] ; - int comms_recv = this->_comms_recv[point] ; + int comms_send = this->_comms_send[point]; + int comms_recv = this->_comms_recv[point]; + int comms_partial_send = this->_comms_partial_send[point] ; + int comms_partial_recv = this->_comms_partial_recv[point] ; assert(rhs.Grid()==_grid); // conformable(_grid,rhs.Grid()); @@ -1046,7 +1072,17 @@ public: if (cbmask != 0x3) words=words>>1; int bytes = words * compress.CommDatumSize(); + int xbytes; + int rbytes; + if ( comms_send ) xbytes = bytes; // Full send + else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid); + else xbytes = 0; // full dirichlet + + if ( comms_recv ) rbytes = bytes; + else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid); + else rbytes = 0; + int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane int comm_off = u_comm_offset; @@ -1059,49 +1095,47 @@ public: assert (xmit_to_rank != _grid->ThisRank()); assert (recv_from_rank != _grid->ThisRank()); - if( comms_send ) { - - if ( !face_table_computed ) { - face_table.resize(face_idx+1); - std::vector > face_table_host ; - Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,comm_off,face_table_host); + if ( !face_table_computed ) { + face_table.resize(face_idx+1); + std::vector > face_table_host ; + Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,comm_off,face_table_host); + // std::cout << "bytes expect "<< bytes << " " << face_table_host.size()* compress.CommDatumSize()<u_recv_buf_p; - } - - send_buf = this->u_send_buf_p; // Gather locally, must send - - //////////////////////////////////////////////////////// - // Gather locally - //////////////////////////////////////////////////////// - assert(send_buf!=NULL); - Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,comm_off,so); } - int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,bytes,cbmask); - if ( (!duplicate) ) { // Force comms for now + if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) { + recv_buf=u_simd_recv_buf[0]; + } else { + recv_buf=this->u_recv_buf_p; + } + + //////////////////////////////////////////////////////// + // Gather locally + //////////////////////////////////////////////////////// + send_buf = this->u_send_buf_p; // Gather locally, must send + assert(send_buf!=NULL); + + compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send); + + int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,xbytes,rbytes,cbmask); + if ( !duplicate ) { // Force comms for now + /////////////////////////////////////////////////////////// // Build a list of things to do after we synchronise GPUs // Start comms now??? /////////////////////////////////////////////////////////// AddPacket((void *)&send_buf[comm_off], (void *)&recv_buf[comm_off], - xmit_to_rank, comms_send, - recv_from_rank, comms_recv, - bytes); + xmit_to_rank, comms_send|comms_partial_send, + recv_from_rank, comms_recv|comms_partial_recv, + xbytes,rbytes); } - - if ( compress.DecompressionStep() && comms_recv ) { + + if ( (compress.DecompressionStep() && comms_recv) || comms_partial_recv ) { AddDecompress(&this->u_recv_buf_p[comm_off], &recv_buf[comm_off], words,Decompressions); @@ -1109,7 +1143,6 @@ public: u_comm_offset+=words; face_idx++; - } } return 0; @@ -1122,8 +1155,10 @@ public: const int maxl =2;// max layout in a direction - int comms_send = this->_comms_send[point] ; - int comms_recv = this->_comms_recv[point] ; + int comms_send = this->_comms_send[point]; + int comms_recv = this->_comms_recv[point]; + int comms_partial_send = this->_comms_partial_send[point] ; + int comms_partial_recv = this->_comms_partial_recv[point] ; int fd = _grid->_fdimensions[dimension]; int rd = _grid->_rdimensions[dimension]; @@ -1153,6 +1188,11 @@ public: int datum_bytes = compress.CommDatumSize(); int bytes = (reduced_buffer_size*datum_bytes)/simd_layout; + + // how many bytes on wire : partial dirichlet or dirichlet may set to < bytes + int xbytes; + int rbytes; + assert(bytes*simd_layout == reduced_buffer_size*datum_bytes); Vector rpointers(maxl); @@ -1182,22 +1222,37 @@ public: if ( !face_table_computed ) { face_table.resize(face_idx+1); std::vector > face_table_host ; - + Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,comm_off,face_table_host); face_table[face_idx].resize(face_table_host.size()); acceleratorCopyToDevice(&face_table_host[0], &face_table[face_idx][0], face_table[face_idx].size()*sizeof(face_table_host[0])); + } - if ( comms_send || comms_recv ) { - Gather_plane_exchange_table(face_table[face_idx],rhs,spointers,dimension,sx,cbmask,compress,permute_type); + + if ( comms_send ) xbytes = bytes; + else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid); + else xbytes = 0; + + if ( comms_recv ) rbytes = bytes; + else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid); + else rbytes = 0; + + // Gathers SIMD lanes for send and merge + // Different faces can be full comms or partial comms with multiple ranks per node + if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) { + + int partial = partialDirichlet; + compressor::Gather_plane_exchange(face_table[face_idx],rhs, + spointers,dimension,sx,cbmask, + compress,permute_type,partial ); } face_idx++; - //spointers[0] -- low - //spointers[1] -- high - + //spointers[0] -- low simd coor + //spointers[1] -- high simd coor for(int i=0;iShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); rpointers[i] = rp; - - int duplicate = CheckForDuplicate(dimension,sx,nbr_proc,(void *)rp,i,bytes,cbmask); + + int duplicate = CheckForDuplicate(dimension,sx,nbr_proc,(void *)rp,i,xbytes,rbytes,cbmask); if ( !duplicate ) { + if ( (bytes != rbytes) && (rbytes!=0) ){ + acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero + } AddPacket((void *)sp,(void *)rp, - xmit_to_rank,comms_send, - recv_from_rank,comms_recv, - bytes); + xmit_to_rank,comms_send|comms_partial_send, + recv_from_rank,comms_recv|comms_partial_recv, + xbytes,rbytes); } } else { @@ -1238,7 +1296,7 @@ public: } } - if ( comms_recv ) { + if ( comms_recv|comms_partial_recv ) { AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers); }