mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-09 23:45:36 +00:00
Simplify the comms structure prior to implementing Shared memory direct bouncs
This commit is contained in:
parent
910b8dd6a1
commit
0fcd2e7188
168
lib/Stencil.h
168
lib/Stencil.h
@ -70,20 +70,20 @@
|
||||
|
||||
namespace Grid {
|
||||
|
||||
template<class vobj,class cobj,class compressor> void
|
||||
Gather_plane_simple_table_compute (const Lattice<vobj> &rhs,commVector<cobj> &buffer,int dimension,int plane,int cbmask,compressor &compress, int off,std::vector<std::pair<int,int> >& table)
|
||||
inline void Gather_plane_simple_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
|
||||
int off,std::vector<std::pair<int,int> > & table)
|
||||
{
|
||||
table.resize(0);
|
||||
int rd = rhs._grid->_rdimensions[dimension];
|
||||
int rd = grid->_rdimensions[dimension];
|
||||
|
||||
if ( !rhs._grid->CheckerBoarded(dimension) ) {
|
||||
if ( !grid->CheckerBoarded(dimension) ) {
|
||||
cbmask = 0x3;
|
||||
}
|
||||
int so= plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int e1=rhs._grid->_slice_nblock[dimension];
|
||||
int e2=rhs._grid->_slice_block[dimension];
|
||||
int so= plane*grid->_ostride[dimension]; // base offset for start of plane
|
||||
int e1=grid->_slice_nblock[dimension];
|
||||
int e2=grid->_slice_block[dimension];
|
||||
|
||||
int stride=rhs._grid->_slice_stride[dimension];
|
||||
int stride=grid->_slice_stride[dimension];
|
||||
if ( cbmask == 0x3 ) {
|
||||
table.resize(e1*e2);
|
||||
for(int n=0;n<e1;n++){
|
||||
@ -99,7 +99,7 @@ Gather_plane_simple_table_compute (const Lattice<vobj> &rhs,commVector<cobj> &bu
|
||||
for(int n=0;n<e1;n++){
|
||||
for(int b=0;b<e2;b++){
|
||||
int o = n*stride;
|
||||
int ocb=1<<rhs._grid->CheckerBoardFromOindexTable(o+b);
|
||||
int ocb=1<<grid->CheckerBoardFromOindexTable(o+b);
|
||||
if ( ocb &cbmask ) {
|
||||
table[bo]=std::pair<int,int>(bo,o+b); bo++;
|
||||
}
|
||||
@ -109,8 +109,7 @@ Gather_plane_simple_table_compute (const Lattice<vobj> &rhs,commVector<cobj> &bu
|
||||
}
|
||||
|
||||
template<class vobj,class cobj,class compressor> void
|
||||
Gather_plane_simple_table (std::vector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,commVector<cobj> &buffer,
|
||||
compressor &compress, int off,int so)
|
||||
Gather_plane_simple_table (std::vector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so)
|
||||
{
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int i=0;i<table.size();i++){
|
||||
@ -119,11 +118,11 @@ PARALLEL_FOR_LOOP
|
||||
}
|
||||
|
||||
template<class vobj,class cobj,class compressor> void
|
||||
Gather_plane_simple_stencil (const Lattice<vobj> &rhs,commVector<cobj> &buffer,int dimension,int plane,int cbmask,compressor &compress, int off,
|
||||
Gather_plane_simple_stencil (const Lattice<vobj> &rhs,cobj *buffer,int dimension,int plane,int cbmask,compressor &compress, int off,
|
||||
double &t_table ,double & t_data )
|
||||
{
|
||||
std::vector<std::pair<int,int> > table;
|
||||
Gather_plane_simple_table_compute (rhs, buffer,dimension,plane,cbmask,compress,off,table);
|
||||
Gather_plane_simple_table_compute (rhs._grid,dimension,plane,cbmask,off,table);
|
||||
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
Gather_plane_simple_table (table,rhs,buffer,compress,off,so);
|
||||
}
|
||||
@ -143,10 +142,11 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,commVector<cobj> &buffer,i
|
||||
class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
|
||||
public:
|
||||
|
||||
typedef uint32_t StencilInteger;
|
||||
typedef typename cobj::vector_type vector_type;
|
||||
typedef typename cobj::scalar_type scalar_type;
|
||||
typedef typename cobj::scalar_object scalar_object;
|
||||
typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
|
||||
typedef uint32_t StencilInteger;
|
||||
typedef typename cobj::vector_type vector_type;
|
||||
typedef typename cobj::scalar_type scalar_type;
|
||||
typedef typename cobj::scalar_object scalar_object;
|
||||
|
||||
//////////////////////////////////////////
|
||||
// Comms packet queue for asynch thread
|
||||
@ -158,7 +158,6 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,commVector<cobj> &buffer,i
|
||||
Integer to_rank;
|
||||
Integer from_rank;
|
||||
Integer bytes;
|
||||
volatile Integer done;
|
||||
};
|
||||
|
||||
std::vector<Packet> Packets;
|
||||
@ -166,81 +165,39 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,commVector<cobj> &buffer,i
|
||||
int face_table_computed;
|
||||
std::vector<std::vector<std::pair<int,int> > > face_table ;
|
||||
|
||||
#define SEND_IMMEDIATE
|
||||
#define SERIAL_SENDS
|
||||
|
||||
void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){
|
||||
#ifdef SEND_IMMEDIATE
|
||||
commtime-=usecond();
|
||||
_grid->SendToRecvFrom(xmit,to,rcv,from,bytes);
|
||||
commtime+=usecond();
|
||||
#endif
|
||||
Packet p;
|
||||
p.send_buf = xmit;
|
||||
p.recv_buf = rcv;
|
||||
p.to_rank = to;
|
||||
p.from_rank= from;
|
||||
p.bytes = bytes;
|
||||
p.done = 0;
|
||||
comms_bytes+=2.0*bytes;
|
||||
Packets.push_back(p);
|
||||
|
||||
}
|
||||
|
||||
#ifdef SERIAL_SENDS
|
||||
void Communicate(void ) {
|
||||
commtime-=usecond();
|
||||
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
|
||||
{
|
||||
reqs.resize(Packets.size());
|
||||
commtime-=usecond();
|
||||
for(int i=0;i<Packets.size();i++){
|
||||
#ifndef SEND_IMMEDIATE
|
||||
_grid->SendToRecvFrom(
|
||||
Packets[i].send_buf,
|
||||
Packets[i].to_rank,
|
||||
Packets[i].recv_buf,
|
||||
Packets[i].from_rank,
|
||||
Packets[i].bytes);
|
||||
#endif
|
||||
Packets[i].done = 1;
|
||||
_grid->SendToRecvFromBegin(reqs[i],
|
||||
Packets[i].send_buf,
|
||||
Packets[i].to_rank,
|
||||
Packets[i].recv_buf,
|
||||
Packets[i].from_rank,
|
||||
Packets[i].bytes);
|
||||
}
|
||||
commtime+=usecond();
|
||||
}
|
||||
#else
|
||||
void Communicate(void ) {
|
||||
typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
|
||||
std::vector<std::vector<CommsRequest_t> > reqs(Packets.size());
|
||||
commtime-=usecond();
|
||||
const int concurrency=2;
|
||||
for(int i=0;i<Packets.size();i+=concurrency){
|
||||
for(int ii=0;ii<concurrency;ii++){
|
||||
int j = i+ii;
|
||||
if ( j<Packets.size() ) {
|
||||
#ifndef SEND_IMMEDIATE
|
||||
_grid->SendToRecvFromBegin(reqs[j],
|
||||
Packets[j].send_buf,
|
||||
Packets[j].to_rank,
|
||||
Packets[j].recv_buf,
|
||||
Packets[j].from_rank,
|
||||
Packets[j].bytes);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
for(int ii=0;ii<concurrency;ii++){
|
||||
int j = i+ii;
|
||||
if ( j<Packets.size() ) {
|
||||
#ifndef SEND_IMMEDIATE
|
||||
_grid->SendToRecvFromComplete(reqs[i]);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
for(int ii=0;ii<concurrency;ii++){
|
||||
int j = i+ii;
|
||||
if ( j<Packets.size() ) {
|
||||
Packets[j].done = 1;
|
||||
}
|
||||
}
|
||||
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
|
||||
{
|
||||
commtime-=usecond();
|
||||
for(int i=0;i<Packets.size();i++){
|
||||
_grid->SendToRecvFromComplete(reqs[i]);
|
||||
}
|
||||
commtime+=usecond();
|
||||
}
|
||||
#endif
|
||||
|
||||
///////////////////////////////////////////
|
||||
// Simd merge queue for asynch comms
|
||||
@ -260,36 +217,19 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,commVector<cobj> &buffer,i
|
||||
m.rpointers= rpointers;
|
||||
m.buffer_size = buffer_size;
|
||||
m.packet_id = packet_id;
|
||||
#ifdef SEND_IMMEDIATE
|
||||
mergetime-=usecond();
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int o=0;o<m.buffer_size;o++){
|
||||
merge1(m.mpointer[o],m.rpointers,o);
|
||||
}
|
||||
mergetime+=usecond();
|
||||
#else
|
||||
Mergers.push_back(m);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
void CommsMerge(void ) {
|
||||
//PARALLEL_NESTED_LOOP2
|
||||
|
||||
for(int i=0;i<Mergers.size();i++){
|
||||
|
||||
spintime-=usecond();
|
||||
int packet_id = Mergers[i].packet_id;
|
||||
while(! Packets[packet_id].done ); // spin for completion
|
||||
spintime+=usecond();
|
||||
|
||||
#ifndef SEND_IMMEDIATE
|
||||
mergetime-=usecond();
|
||||
PARALLEL_FOR_LOOP
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int o=0;o<Mergers[i].buffer_size;o++){
|
||||
merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
|
||||
}
|
||||
mergetime+=usecond();
|
||||
#endif
|
||||
|
||||
}
|
||||
}
|
||||
@ -346,11 +286,14 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,commVector<cobj> &buffer,i
|
||||
else return cbase + _entries[ent]._byte_offset;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
// Comms buffers
|
||||
///////////////////////////////////////////////////////////
|
||||
std::vector<commVector<scalar_object> > u_simd_send_buf;
|
||||
std::vector<commVector<scalar_object> > u_simd_recv_buf;
|
||||
commVector<cobj> u_send_buf;
|
||||
commVector<cobj> comm_buf;
|
||||
|
||||
int u_comm_offset;
|
||||
int _unified_buffer_size;
|
||||
|
||||
@ -483,7 +426,7 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,commVector<cobj> &buffer,i
|
||||
}
|
||||
}
|
||||
u_send_buf.resize(_unified_buffer_size);
|
||||
comm_buf.resize(_unified_buffer_size);
|
||||
comm_buf.resize(_unified_buffer_size);
|
||||
|
||||
PrecomputeByteOffsets();
|
||||
|
||||
@ -722,31 +665,16 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,commVector<cobj> &buffer,i
|
||||
template<class compressor>
|
||||
void HaloExchange(const Lattice<vobj> &source,compressor &compress)
|
||||
{
|
||||
std::vector<std::vector<CommsRequest_t> > reqs;
|
||||
calls++;
|
||||
Mergers.resize(0);
|
||||
Packets.resize(0);
|
||||
HaloGather(source,compress);
|
||||
this->Communicate();
|
||||
this->CommunicateBegin(reqs);
|
||||
this->CommunicateComplete(reqs);
|
||||
CommsMerge(); // spins
|
||||
}
|
||||
#if 0
|
||||
// Overlapping comms and compute typically slows down compute and is useless
|
||||
// unless memory bandwidth greatly exceeds network
|
||||
template<class compressor>
|
||||
std::thread HaloExchangeBegin(const Lattice<vobj> &source,compressor &compress) {
|
||||
Mergers.resize(0);
|
||||
Packets.resize(0);
|
||||
HaloGather(source,compress);
|
||||
return std::thread([&] { this->Communicate(); });
|
||||
}
|
||||
void HaloExchangeComplete(std::thread &thr)
|
||||
{
|
||||
CommsMerge(); // spins
|
||||
jointime-=usecond();
|
||||
thr.join();
|
||||
jointime+=usecond();
|
||||
}
|
||||
#endif
|
||||
|
||||
template<class compressor>
|
||||
void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
|
||||
{
|
||||
@ -851,6 +779,9 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,commVector<cobj> &buffer,i
|
||||
int sx = (x+sshift)%rd;
|
||||
int comm_proc = ((x+sshift)/rd)%pd;
|
||||
|
||||
cobj *u_send_buf_p;
|
||||
cobj *comm_buf_p;
|
||||
|
||||
if (comm_proc) {
|
||||
|
||||
int words = buffer_size;
|
||||
@ -863,16 +794,15 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,commVector<cobj> &buffer,i
|
||||
if ( !face_table_computed ) {
|
||||
t_table-=usecond();
|
||||
face_table.resize(face_idx+1);
|
||||
Gather_plane_simple_table_compute (rhs,u_send_buf,dimension,sx,cbmask,compress,u_comm_offset,face_table[face_idx]);
|
||||
cobj *ptr; ptr = &u_send_buf[0];
|
||||
Gather_plane_simple_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,
|
||||
face_table[face_idx]);
|
||||
t_table+=usecond();
|
||||
}
|
||||
t_data-=usecond();
|
||||
Gather_plane_simple_table (face_table[face_idx],rhs,u_send_buf,compress,u_comm_offset,so);
|
||||
face_idx++;
|
||||
Gather_plane_simple_table (face_table[face_idx],rhs,&u_send_buf[0],compress,u_comm_offset,so); face_idx++;
|
||||
t_data+=usecond();
|
||||
gathertime+=usecond();
|
||||
|
||||
// Gather_plane_simple_stencil (rhs,u_send_buf,dimension,sx,cbmask,compress,u_comm_offset,t_table,t_data);
|
||||
|
||||
int rank = _grid->_processor;
|
||||
int recv_from_rank;
|
||||
|
Loading…
Reference in New Issue
Block a user