diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 65d878cb..6296df4e 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -398,6 +398,8 @@ public: //////////////////////////////////////////////////////////////////////// void CommunicateBegin(std::vector > &reqs) { + // Buffers are gathered AND synchronised + // Copies are MPI ISend OR asynch copy on copy stream reqs.resize(Packets.size()); commtime-=usecond(); for(int i=0;iStencilBarrier();// Synch shared memory on a single nodes } void CommunicateComplete(std::vector > &reqs) { + // complete intranode + acceleratorCopySynchronise(); + // complete MPI for(int i=0;iStencilSendToRecvFromComplete(reqs[i],i); } + // Everyone agrees we are all done + _grid->StencilBarrier(); commtime+=usecond(); } //////////////////////////////////////////////////////////////////////// @@ -425,33 +431,9 @@ public: //////////////////////////////////////////////////////////////////////// void Communicate(void) { - if ( 0 ){ - thread_region { - // must be called in parallel region - int mythread = thread_num(); - int maxthreads= thread_max(); - int nthreads = CartesianCommunicator::nCommThreads; - assert(nthreads <= maxthreads); - if (nthreads == -1) nthreads = 1; - if (mythread < nthreads) { - for (int i = mythread; i < Packets.size(); i += nthreads) { - double start = usecond(); - uint64_t bytes= _grid->StencilSendToRecvFrom(Packets[i].send_buf, - Packets[i].to_rank, - Packets[i].recv_buf, - Packets[i].from_rank, - Packets[i].bytes,i); - comm_bytes_thr[mythread] += bytes; - shm_bytes_thr[mythread] += Packets[i].bytes - bytes; - comm_time_thr[mythread] += usecond() - start; - } - } - } - } else { // Concurrent and non-threaded asynch calls to MPI - std::vector > reqs; - this->CommunicateBegin(reqs); - this->CommunicateComplete(reqs); - } + std::vector > reqs; + this->CommunicateBegin(reqs); + this->CommunicateComplete(reqs); } template void HaloExchange(const Lattice &source,compressor &compress) @@ -527,7 +509,6 @@ public: _grid->StencilBarrier();// Synch shared memory on a single nodes mpi3synctime_g+=usecond(); - // conformable(source.Grid(),_grid); assert(source.Grid()==_grid); halogtime-=usecond(); @@ -586,13 +567,8 @@ public: CommsMerge(decompress,Mergers,Decompressions); } template void CommsMergeSHM(decompressor decompress) { - mpi3synctime-=usecond(); - accelerator_barrier(); - _grid->StencilBarrier();// Synch shared memory on a single nodes - mpi3synctime+=usecond(); - shmmergetime-=usecond(); - CommsMerge(decompress,MergersSHM,DecompressionsSHM); - shmmergetime+=usecond(); + assert(MergersSHM.size()==0); + assert(DecompressionsSHM.size()==0); } template