diff --git a/Grid/lattice/PaddedCell.h b/Grid/lattice/PaddedCell.h index 568fa65d..e50343d9 100644 --- a/Grid/lattice/PaddedCell.h +++ b/Grid/lattice/PaddedCell.h @@ -46,6 +46,13 @@ struct CshiftImplGauge: public CshiftImplBase inline void ScatterSlice(const cshiftVector &buf, Lattice &lat, @@ -155,10 +162,6 @@ template inline void GatherSlice(cshiftVector &buf, //for cross platform //For CPU perhaps just run a loop over Nsimd auto buf_p = & buf[0]; - std::cout << " simd["< fwd_req; + std::vector bwd_req; + int words = buffer_size; int bytes = words * sizeof(vobj); - //////////////////////////////////////////////////////////////////////////// - // Gather all surface terms up to depth "d" - //////////////////////////////////////////////////////////////////////////// - RealD t=usecond(); - int plane=0; - for ( int d=0;d < depth ; d ++ ) { - GatherSlice(send_buf,from,d,dimension,plane*buffer_size); plane++; - } - for ( int d=0;d < depth ; d ++ ) { - GatherSlice(send_buf,from,ld-depth+d,dimension,plane*buffer_size); plane++; - } - t_gather= usecond() - t; //////////////////////////////////////////////////////////////////////////// - // Communicate + // Communication coords //////////////////////////////////////////////////////////////////////////// int comm_proc = 1; int xmit_to_rank; int recv_from_rank; grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); - t=usecond(); - for(int d = 0; dSendToRecvFrom((void *)&send_buf[d*buffer_size], xmit_to_rank, - (void *)&recv_buf[(d+depth)*buffer_size], recv_from_rank, bytes); - - grid->SendToRecvFrom((void *)&send_buf[(d+depth)*buffer_size], recv_from_rank, - (void *)&recv_buf[d*buffer_size], xmit_to_rank, bytes); + //////////////////////////////////////////////////////////////////////////// + // Gather all surface terms up to depth "d" + //////////////////////////////////////////////////////////////////////////// + RealD t; + int plane=0; + for ( int d=0;d < depth ; d ++ ) { + int tag = d*1024 + dimension*2+0; + + t=usecond(); + GatherSlice(send_buf,from,d,dimension,plane*buffer_size); plane++; + t_gather+=usecond()-t; + + t=usecond(); + grid->SendToRecvFromBegin(fwd_req, + (void *)&send_buf[d*buffer_size], xmit_to_rank, + (void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag); + t_comms+=usecond()-t; + } + for ( int d=0;d < depth ; d ++ ) { + int tag = d*1024 + dimension*2+1; + + t=usecond(); + GatherSlice(send_buf,from,ld-depth+d,dimension,plane*buffer_size); plane++; + t_gather+= usecond() - t; + + t=usecond(); + grid->SendToRecvFromBegin(bwd_req, + (void *)&send_buf[(d+depth)*buffer_size], recv_from_rank, + (void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag); + t_comms+=usecond()-t; } - t_comms= usecond() - t; //////////////////////////////////////////////////////////////////////////// // Copy interior -- overlap this with comms @@ -507,24 +523,38 @@ public: // Scatter all faces //////////////////////////////////////////////////////////////////////////// // DumpSliceNorm(std::string("Face_exchange to before scatter"),to,dimension); + plane=0; + + t=usecond(); + grid->CommsComplete(fwd_req); + t_comms+= usecond() - t; + t=usecond(); - for ( int d=0;d < depth ; d ++ ) { - ScatterSlice(recv_buf,to,d,dimension,plane*buffer_size); plane++; - } - // DumpSliceNorm(std::string("Face_exchange to scatter 1st "),to,dimension); for ( int d=0;d < depth ; d ++ ) { ScatterSlice(recv_buf,to,nld-depth+d,dimension,plane*buffer_size); plane++; } t_scatter= usecond() - t; - // DumpSliceNorm(std::string("Face_exchange to done"),to,dimension); + + t=usecond(); + grid->CommsComplete(bwd_req); + t_comms+= usecond() - t; + + t=usecond(); + for ( int d=0;d < depth ; d ++ ) { + ScatterSlice(recv_buf,to,d,dimension,plane*buffer_size); plane++; + } + t_scatter+= usecond() - t; + // DumpSliceNorm(std::string("Face_exchange to scatter 1st "),to,dimension); + + //DumpSliceNorm(std::string("Face_exchange to done"),to,dimension); std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << t_gather/1000 << "ms"<