diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index c2bc8dab..269ecfe4 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -240,6 +240,19 @@ public: cobj * mpi_p; Integer buffer_size; }; + struct CopyReceiveBuffer { + void * from_p; + void * to_p; + Integer bytes; + }; + struct CachedTransfer { + Integer direction; + Integer OrthogPlane; + Integer DestProc; + Integer bytes; + Integer lane; + void *recv_buf; + }; protected: @@ -271,7 +284,8 @@ public: std::vector MergersSHM; std::vector Decompressions; std::vector DecompressionsSHM; - + std::vector CopyReceiveBuffers ; + std::vector CachedTransfers; /////////////////////////////////////////////////////////// // Unified Comms buffers for all directions /////////////////////////////////////////////////////////// @@ -551,8 +565,57 @@ public: Mergers.resize(0); MergersSHM.resize(0); Packets.resize(0); + CopyReceiveBuffers.resize(0); + CachedTransfers.resize(0); calls++; } + void AddCopy(void *from,void * to, Integer bytes) + { + CopyReceiveBuffer obj; + obj.from_p = from; + obj.to_p = to; + obj.bytes= bytes; + CopyReceiveBuffers.push_back(obj); + } + void CommsCopy() + { + // These are device resident MPI buffers. + for(int i=0;i void CommsMerge(decompressor decompress) { + CommsCopy(); CommsMerge(decompress,Mergers,Decompressions); } template void CommsMergeSHM(decompressor decompress) { @@ -590,8 +654,8 @@ public: } template - void CommsMerge(decompressor decompress,std::vector &mm,std::vector &dd) { - + void CommsMerge(decompressor decompress,std::vector &mm,std::vector &dd) + { mergetime-=usecond(); for(int i=0;i>1; @@ -1045,9 +1111,10 @@ public: recv_buf=this->u_recv_buf_p; } + cobj *send_buf; send_buf = this->u_send_buf_p; // Gather locally, must send - + //////////////////////////////////////////////////////// // Gather locally //////////////////////////////////////////////////////// @@ -1056,23 +1123,27 @@ public: Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so); face_idx++; gathertime+=usecond(); - /////////////////////////////////////////////////////////// - // Build a list of things to do after we synchronise GPUs - // Start comms now??? - /////////////////////////////////////////////////////////// - AddPacket((void *)&send_buf[u_comm_offset], - (void *)&recv_buf[u_comm_offset], - xmit_to_rank, - recv_from_rank, - bytes); + int duplicate = CheckForDuplicate(dimension,x,comm_proc,(void *)&recv_buf[u_comm_offset],0,bytes); + if (!duplicate || 1) { // Force comms for now + /////////////////////////////////////////////////////////// + // Build a list of things to do after we synchronise GPUs + // Start comms now??? + /////////////////////////////////////////////////////////// + AddPacket((void *)&send_buf[u_comm_offset], + (void *)&recv_buf[u_comm_offset], + xmit_to_rank, + recv_from_rank, + bytes); + } + if ( compress.DecompressionStep() ) { AddDecompress(&this->u_recv_buf_p[u_comm_offset], &recv_buf[u_comm_offset], words,Decompressions); } u_comm_offset+=words; - } + } } return 0; } @@ -1181,8 +1252,10 @@ public: rpointers[i] = rp; - AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes); - + int duplicate = CheckForDuplicate(dimension,x,nbr_proc,(void *)rp,i,bytes); + if (!duplicate || 1 ) { // Force comms for now + AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes); + } } else {