1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-09 23:45:36 +00:00

Overlap comms & data copy/buffer assembly in Ghost zone exchange

This commit is contained in:
Peter Boyle 2023-10-20 19:23:00 -04:00
parent 4341d96bde
commit 9ab54c5565

View File

@ -46,6 +46,13 @@ struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::ve
}; };
/*
*
* TODO:
* -- address elements of vobj via thread block in Scatter/Gather
* -- overlap comms with motion in Face_exchange
*
*/
template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf, template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
Lattice<vobj> &lat, Lattice<vobj> &lat,
@ -155,10 +162,6 @@ template<class vobj> inline void GatherSlice(cshiftVector<vobj> &buf,
//for cross platform //for cross platform
//For CPU perhaps just run a loop over Nsimd //For CPU perhaps just run a loop over Nsimd
auto buf_p = & buf[0]; auto buf_p = & buf[0];
std::cout << " simd["<<dim<<"] "<< simd[dim] <<std::endl;
std::cout << " simd "<< simd <<std::endl;
std::cout << " Nsimd "<< Nsimd <<std::endl;
std::cout << " rNsimd "<< rNsimd <<std::endl;
accelerator_for(ss, face_ovol/simd[dim],Nsimd,{ accelerator_for(ss, face_ovol/simd[dim],Nsimd,{
// scalar layout won't coalesce // scalar layout won't coalesce
@ -458,38 +461,51 @@ public:
send_buf.resize(buffer_size*2*depth); send_buf.resize(buffer_size*2*depth);
recv_buf.resize(buffer_size*2*depth); recv_buf.resize(buffer_size*2*depth);
std::vector<CommsRequest_t> fwd_req;
std::vector<CommsRequest_t> bwd_req;
int words = buffer_size; int words = buffer_size;
int bytes = words * sizeof(vobj); int bytes = words * sizeof(vobj);
////////////////////////////////////////////////////////////////////////////
// Gather all surface terms up to depth "d"
////////////////////////////////////////////////////////////////////////////
RealD t=usecond();
int plane=0;
for ( int d=0;d < depth ; d ++ ) {
GatherSlice(send_buf,from,d,dimension,plane*buffer_size); plane++;
}
for ( int d=0;d < depth ; d ++ ) {
GatherSlice(send_buf,from,ld-depth+d,dimension,plane*buffer_size); plane++;
}
t_gather= usecond() - t;
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////
// Communicate // Communication coords
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////
int comm_proc = 1; int comm_proc = 1;
int xmit_to_rank; int xmit_to_rank;
int recv_from_rank; int recv_from_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
t=usecond(); ////////////////////////////////////////////////////////////////////////////
for(int d = 0; d<depth;d++){ // Gather all surface terms up to depth "d"
grid->SendToRecvFrom((void *)&send_buf[d*buffer_size], xmit_to_rank, ////////////////////////////////////////////////////////////////////////////
(void *)&recv_buf[(d+depth)*buffer_size], recv_from_rank, bytes); RealD t;
int plane=0;
for ( int d=0;d < depth ; d ++ ) {
int tag = d*1024 + dimension*2+0;
grid->SendToRecvFrom((void *)&send_buf[(d+depth)*buffer_size], recv_from_rank, t=usecond();
(void *)&recv_buf[d*buffer_size], xmit_to_rank, bytes); GatherSlice(send_buf,from,d,dimension,plane*buffer_size); plane++;
t_gather+=usecond()-t;
t=usecond();
grid->SendToRecvFromBegin(fwd_req,
(void *)&send_buf[d*buffer_size], xmit_to_rank,
(void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
t_comms+=usecond()-t;
}
for ( int d=0;d < depth ; d ++ ) {
int tag = d*1024 + dimension*2+1;
t=usecond();
GatherSlice(send_buf,from,ld-depth+d,dimension,plane*buffer_size); plane++;
t_gather+= usecond() - t;
t=usecond();
grid->SendToRecvFromBegin(bwd_req,
(void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
(void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
t_comms+=usecond()-t;
} }
t_comms= usecond() - t;
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////
// Copy interior -- overlap this with comms // Copy interior -- overlap this with comms
@ -507,24 +523,38 @@ public:
// Scatter all faces // Scatter all faces
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////
// DumpSliceNorm(std::string("Face_exchange to before scatter"),to,dimension); // DumpSliceNorm(std::string("Face_exchange to before scatter"),to,dimension);
plane=0; plane=0;
t=usecond();
grid->CommsComplete(fwd_req);
t_comms+= usecond() - t;
t=usecond(); t=usecond();
for ( int d=0;d < depth ; d ++ ) {
ScatterSlice(recv_buf,to,d,dimension,plane*buffer_size); plane++;
}
// DumpSliceNorm(std::string("Face_exchange to scatter 1st "),to,dimension);
for ( int d=0;d < depth ; d ++ ) { for ( int d=0;d < depth ; d ++ ) {
ScatterSlice(recv_buf,to,nld-depth+d,dimension,plane*buffer_size); plane++; ScatterSlice(recv_buf,to,nld-depth+d,dimension,plane*buffer_size); plane++;
} }
t_scatter= usecond() - t; t_scatter= usecond() - t;
// DumpSliceNorm(std::string("Face_exchange to done"),to,dimension);
t=usecond();
grid->CommsComplete(bwd_req);
t_comms+= usecond() - t;
t=usecond();
for ( int d=0;d < depth ; d ++ ) {
ScatterSlice(recv_buf,to,d,dimension,plane*buffer_size); plane++;
}
t_scatter+= usecond() - t;
// DumpSliceNorm(std::string("Face_exchange to scatter 1st "),to,dimension);
//DumpSliceNorm(std::string("Face_exchange to done"),to,dimension);
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << t_gather/1000 << "ms"<<std::endl; std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << t_gather/1000 << "ms"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << 2.0*bytes/t_gather << "MB/s"<<std::endl; // std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << 2.0*bytes/t_gather << "MB/s"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << t_scatter/1000 << "ms"<<std::endl; std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << t_scatter/1000 << "ms"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << 2.0*bytes/t_scatter<< "MB/s"<<std::endl; // std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << 2.0*bytes/t_scatter<< "MB/s"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: copy :" << t_copy/1000 << "ms"<<std::endl; std::cout << GridLogPerformance << "PaddedCell::Expand new timings: copy :" << t_copy/1000 << "ms"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms :" << t_comms/1000 << "ms"<<std::endl; std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms :" << t_comms/1000 << "ms"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms :" << (RealD)4.0*bytes/t_comms << "MB/s"<<std::endl; // std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms :" << (RealD)4.0*bytes/t_comms << "MB/s"<<std::endl;
} }
}; };
@ -532,3 +562,4 @@ public:
NAMESPACE_END(Grid); NAMESPACE_END(Grid);