mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-09 23:45:36 +00:00
Overlap comms & data copy/buffer assembly in Ghost zone exchange
This commit is contained in:
parent
4341d96bde
commit
9ab54c5565
@ -46,6 +46,13 @@ struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::ve
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
*
|
||||||
|
* TODO:
|
||||||
|
* -- address elements of vobj via thread block in Scatter/Gather
|
||||||
|
* -- overlap comms with motion in Face_exchange
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
|
template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
|
||||||
Lattice<vobj> &lat,
|
Lattice<vobj> &lat,
|
||||||
@ -155,10 +162,6 @@ template<class vobj> inline void GatherSlice(cshiftVector<vobj> &buf,
|
|||||||
//for cross platform
|
//for cross platform
|
||||||
//For CPU perhaps just run a loop over Nsimd
|
//For CPU perhaps just run a loop over Nsimd
|
||||||
auto buf_p = & buf[0];
|
auto buf_p = & buf[0];
|
||||||
std::cout << " simd["<<dim<<"] "<< simd[dim] <<std::endl;
|
|
||||||
std::cout << " simd "<< simd <<std::endl;
|
|
||||||
std::cout << " Nsimd "<< Nsimd <<std::endl;
|
|
||||||
std::cout << " rNsimd "<< rNsimd <<std::endl;
|
|
||||||
accelerator_for(ss, face_ovol/simd[dim],Nsimd,{
|
accelerator_for(ss, face_ovol/simd[dim],Nsimd,{
|
||||||
|
|
||||||
// scalar layout won't coalesce
|
// scalar layout won't coalesce
|
||||||
@ -458,38 +461,51 @@ public:
|
|||||||
send_buf.resize(buffer_size*2*depth);
|
send_buf.resize(buffer_size*2*depth);
|
||||||
recv_buf.resize(buffer_size*2*depth);
|
recv_buf.resize(buffer_size*2*depth);
|
||||||
|
|
||||||
|
std::vector<CommsRequest_t> fwd_req;
|
||||||
|
std::vector<CommsRequest_t> bwd_req;
|
||||||
|
|
||||||
int words = buffer_size;
|
int words = buffer_size;
|
||||||
int bytes = words * sizeof(vobj);
|
int bytes = words * sizeof(vobj);
|
||||||
////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Gather all surface terms up to depth "d"
|
|
||||||
////////////////////////////////////////////////////////////////////////////
|
|
||||||
RealD t=usecond();
|
|
||||||
int plane=0;
|
|
||||||
for ( int d=0;d < depth ; d ++ ) {
|
|
||||||
GatherSlice(send_buf,from,d,dimension,plane*buffer_size); plane++;
|
|
||||||
}
|
|
||||||
for ( int d=0;d < depth ; d ++ ) {
|
|
||||||
GatherSlice(send_buf,from,ld-depth+d,dimension,plane*buffer_size); plane++;
|
|
||||||
}
|
|
||||||
t_gather= usecond() - t;
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////
|
||||||
// Communicate
|
// Communication coords
|
||||||
////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////
|
||||||
int comm_proc = 1;
|
int comm_proc = 1;
|
||||||
int xmit_to_rank;
|
int xmit_to_rank;
|
||||||
int recv_from_rank;
|
int recv_from_rank;
|
||||||
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
|
||||||
t=usecond();
|
////////////////////////////////////////////////////////////////////////////
|
||||||
for(int d = 0; d<depth;d++){
|
// Gather all surface terms up to depth "d"
|
||||||
grid->SendToRecvFrom((void *)&send_buf[d*buffer_size], xmit_to_rank,
|
////////////////////////////////////////////////////////////////////////////
|
||||||
(void *)&recv_buf[(d+depth)*buffer_size], recv_from_rank, bytes);
|
RealD t;
|
||||||
|
int plane=0;
|
||||||
grid->SendToRecvFrom((void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
|
for ( int d=0;d < depth ; d ++ ) {
|
||||||
(void *)&recv_buf[d*buffer_size], xmit_to_rank, bytes);
|
int tag = d*1024 + dimension*2+0;
|
||||||
|
|
||||||
|
t=usecond();
|
||||||
|
GatherSlice(send_buf,from,d,dimension,plane*buffer_size); plane++;
|
||||||
|
t_gather+=usecond()-t;
|
||||||
|
|
||||||
|
t=usecond();
|
||||||
|
grid->SendToRecvFromBegin(fwd_req,
|
||||||
|
(void *)&send_buf[d*buffer_size], xmit_to_rank,
|
||||||
|
(void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
|
||||||
|
t_comms+=usecond()-t;
|
||||||
|
}
|
||||||
|
for ( int d=0;d < depth ; d ++ ) {
|
||||||
|
int tag = d*1024 + dimension*2+1;
|
||||||
|
|
||||||
|
t=usecond();
|
||||||
|
GatherSlice(send_buf,from,ld-depth+d,dimension,plane*buffer_size); plane++;
|
||||||
|
t_gather+= usecond() - t;
|
||||||
|
|
||||||
|
t=usecond();
|
||||||
|
grid->SendToRecvFromBegin(bwd_req,
|
||||||
|
(void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
|
||||||
|
(void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
|
||||||
|
t_comms+=usecond()-t;
|
||||||
}
|
}
|
||||||
t_comms= usecond() - t;
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////
|
||||||
// Copy interior -- overlap this with comms
|
// Copy interior -- overlap this with comms
|
||||||
@ -507,24 +523,38 @@ public:
|
|||||||
// Scatter all faces
|
// Scatter all faces
|
||||||
////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////
|
||||||
// DumpSliceNorm(std::string("Face_exchange to before scatter"),to,dimension);
|
// DumpSliceNorm(std::string("Face_exchange to before scatter"),to,dimension);
|
||||||
|
|
||||||
plane=0;
|
plane=0;
|
||||||
|
|
||||||
|
t=usecond();
|
||||||
|
grid->CommsComplete(fwd_req);
|
||||||
|
t_comms+= usecond() - t;
|
||||||
|
|
||||||
t=usecond();
|
t=usecond();
|
||||||
for ( int d=0;d < depth ; d ++ ) {
|
|
||||||
ScatterSlice(recv_buf,to,d,dimension,plane*buffer_size); plane++;
|
|
||||||
}
|
|
||||||
// DumpSliceNorm(std::string("Face_exchange to scatter 1st "),to,dimension);
|
|
||||||
for ( int d=0;d < depth ; d ++ ) {
|
for ( int d=0;d < depth ; d ++ ) {
|
||||||
ScatterSlice(recv_buf,to,nld-depth+d,dimension,plane*buffer_size); plane++;
|
ScatterSlice(recv_buf,to,nld-depth+d,dimension,plane*buffer_size); plane++;
|
||||||
}
|
}
|
||||||
t_scatter= usecond() - t;
|
t_scatter= usecond() - t;
|
||||||
// DumpSliceNorm(std::string("Face_exchange to done"),to,dimension);
|
|
||||||
|
t=usecond();
|
||||||
|
grid->CommsComplete(bwd_req);
|
||||||
|
t_comms+= usecond() - t;
|
||||||
|
|
||||||
|
t=usecond();
|
||||||
|
for ( int d=0;d < depth ; d ++ ) {
|
||||||
|
ScatterSlice(recv_buf,to,d,dimension,plane*buffer_size); plane++;
|
||||||
|
}
|
||||||
|
t_scatter+= usecond() - t;
|
||||||
|
// DumpSliceNorm(std::string("Face_exchange to scatter 1st "),to,dimension);
|
||||||
|
|
||||||
|
//DumpSliceNorm(std::string("Face_exchange to done"),to,dimension);
|
||||||
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << t_gather/1000 << "ms"<<std::endl;
|
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << t_gather/1000 << "ms"<<std::endl;
|
||||||
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << 2.0*bytes/t_gather << "MB/s"<<std::endl;
|
// std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << 2.0*bytes/t_gather << "MB/s"<<std::endl;
|
||||||
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << t_scatter/1000 << "ms"<<std::endl;
|
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << t_scatter/1000 << "ms"<<std::endl;
|
||||||
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << 2.0*bytes/t_scatter<< "MB/s"<<std::endl;
|
// std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << 2.0*bytes/t_scatter<< "MB/s"<<std::endl;
|
||||||
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: copy :" << t_copy/1000 << "ms"<<std::endl;
|
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: copy :" << t_copy/1000 << "ms"<<std::endl;
|
||||||
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms :" << t_comms/1000 << "ms"<<std::endl;
|
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms :" << t_comms/1000 << "ms"<<std::endl;
|
||||||
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms :" << (RealD)4.0*bytes/t_comms << "MB/s"<<std::endl;
|
// std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms :" << (RealD)4.0*bytes/t_comms << "MB/s"<<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
@ -532,3 +562,4 @@ public:
|
|||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user