mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-09 21:50:45 +01:00
Simplify the comms structure prior to implementing Shared memory direct bounce
This commit is contained in:
parent
910b8dd6a1
commit
0fcd2e7188
168
lib/Stencil.h
168
lib/Stencil.h
@ -70,20 +70,20 @@
|
|||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
|
|
||||||
template<class vobj,class cobj,class compressor> void
|
inline void Gather_plane_simple_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
|
||||||
Gather_plane_simple_table_compute (const Lattice<vobj> &rhs,commVector<cobj> &buffer,int dimension,int plane,int cbmask,compressor &compress, int off,std::vector<std::pair<int,int> >& table)
|
int off,std::vector<std::pair<int,int> > & table)
|
||||||
{
|
{
|
||||||
table.resize(0);
|
table.resize(0);
|
||||||
int rd = rhs._grid->_rdimensions[dimension];
|
int rd = grid->_rdimensions[dimension];
|
||||||
|
|
||||||
if ( !rhs._grid->CheckerBoarded(dimension) ) {
|
if ( !grid->CheckerBoarded(dimension) ) {
|
||||||
cbmask = 0x3;
|
cbmask = 0x3;
|
||||||
}
|
}
|
||||||
int so= plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
int so= plane*grid->_ostride[dimension]; // base offset for start of plane
|
||||||
int e1=rhs._grid->_slice_nblock[dimension];
|
int e1=grid->_slice_nblock[dimension];
|
||||||
int e2=rhs._grid->_slice_block[dimension];
|
int e2=grid->_slice_block[dimension];
|
||||||
|
|
||||||
int stride=rhs._grid->_slice_stride[dimension];
|
int stride=grid->_slice_stride[dimension];
|
||||||
if ( cbmask == 0x3 ) {
|
if ( cbmask == 0x3 ) {
|
||||||
table.resize(e1*e2);
|
table.resize(e1*e2);
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
@ -99,7 +99,7 @@ Gather_plane_simple_table_compute (const Lattice<vobj> &rhs,commVector<cobj> &bu
|
|||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
int o = n*stride;
|
int o = n*stride;
|
||||||
int ocb=1<<rhs._grid->CheckerBoardFromOindexTable(o+b);
|
int ocb=1<<grid->CheckerBoardFromOindexTable(o+b);
|
||||||
if ( ocb &cbmask ) {
|
if ( ocb &cbmask ) {
|
||||||
table[bo]=std::pair<int,int>(bo,o+b); bo++;
|
table[bo]=std::pair<int,int>(bo,o+b); bo++;
|
||||||
}
|
}
|
||||||
@ -109,8 +109,7 @@ Gather_plane_simple_table_compute (const Lattice<vobj> &rhs,commVector<cobj> &bu
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj,class cobj,class compressor> void
|
template<class vobj,class cobj,class compressor> void
|
||||||
Gather_plane_simple_table (std::vector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,commVector<cobj> &buffer,
|
Gather_plane_simple_table (std::vector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so)
|
||||||
compressor &compress, int off,int so)
|
|
||||||
{
|
{
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int i=0;i<table.size();i++){
|
for(int i=0;i<table.size();i++){
|
||||||
@ -119,11 +118,11 @@ PARALLEL_FOR_LOOP
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj,class cobj,class compressor> void
|
template<class vobj,class cobj,class compressor> void
|
||||||
Gather_plane_simple_stencil (const Lattice<vobj> &rhs,commVector<cobj> &buffer,int dimension,int plane,int cbmask,compressor &compress, int off,
|
Gather_plane_simple_stencil (const Lattice<vobj> &rhs,cobj *buffer,int dimension,int plane,int cbmask,compressor &compress, int off,
|
||||||
double &t_table ,double & t_data )
|
double &t_table ,double & t_data )
|
||||||
{
|
{
|
||||||
std::vector<std::pair<int,int> > table;
|
std::vector<std::pair<int,int> > table;
|
||||||
Gather_plane_simple_table_compute (rhs, buffer,dimension,plane,cbmask,compress,off,table);
|
Gather_plane_simple_table_compute (rhs._grid,dimension,plane,cbmask,off,table);
|
||||||
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||||
Gather_plane_simple_table (table,rhs,buffer,compress,off,so);
|
Gather_plane_simple_table (table,rhs,buffer,compress,off,so);
|
||||||
}
|
}
|
||||||
@ -143,10 +142,11 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,commVector<cobj> &buffer,i
|
|||||||
class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
|
class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
|
||||||
public:
|
public:
|
||||||
|
|
||||||
typedef uint32_t StencilInteger;
|
typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
|
||||||
typedef typename cobj::vector_type vector_type;
|
typedef uint32_t StencilInteger;
|
||||||
typedef typename cobj::scalar_type scalar_type;
|
typedef typename cobj::vector_type vector_type;
|
||||||
typedef typename cobj::scalar_object scalar_object;
|
typedef typename cobj::scalar_type scalar_type;
|
||||||
|
typedef typename cobj::scalar_object scalar_object;
|
||||||
|
|
||||||
//////////////////////////////////////////
|
//////////////////////////////////////////
|
||||||
// Comms packet queue for asynch thread
|
// Comms packet queue for asynch thread
|
||||||
@ -158,7 +158,6 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,commVector<cobj> &buffer,i
|
|||||||
Integer to_rank;
|
Integer to_rank;
|
||||||
Integer from_rank;
|
Integer from_rank;
|
||||||
Integer bytes;
|
Integer bytes;
|
||||||
volatile Integer done;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
std::vector<Packet> Packets;
|
std::vector<Packet> Packets;
|
||||||
@ -166,81 +165,39 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,commVector<cobj> &buffer,i
|
|||||||
int face_table_computed;
|
int face_table_computed;
|
||||||
std::vector<std::vector<std::pair<int,int> > > face_table ;
|
std::vector<std::vector<std::pair<int,int> > > face_table ;
|
||||||
|
|
||||||
#define SEND_IMMEDIATE
|
|
||||||
#define SERIAL_SENDS
|
|
||||||
|
|
||||||
void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){
|
void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){
|
||||||
#ifdef SEND_IMMEDIATE
|
|
||||||
commtime-=usecond();
|
|
||||||
_grid->SendToRecvFrom(xmit,to,rcv,from,bytes);
|
|
||||||
commtime+=usecond();
|
|
||||||
#endif
|
|
||||||
Packet p;
|
Packet p;
|
||||||
p.send_buf = xmit;
|
p.send_buf = xmit;
|
||||||
p.recv_buf = rcv;
|
p.recv_buf = rcv;
|
||||||
p.to_rank = to;
|
p.to_rank = to;
|
||||||
p.from_rank= from;
|
p.from_rank= from;
|
||||||
p.bytes = bytes;
|
p.bytes = bytes;
|
||||||
p.done = 0;
|
|
||||||
comms_bytes+=2.0*bytes;
|
comms_bytes+=2.0*bytes;
|
||||||
Packets.push_back(p);
|
Packets.push_back(p);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef SERIAL_SENDS
|
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
|
||||||
void Communicate(void ) {
|
{
|
||||||
commtime-=usecond();
|
reqs.resize(Packets.size());
|
||||||
|
commtime-=usecond();
|
||||||
for(int i=0;i<Packets.size();i++){
|
for(int i=0;i<Packets.size();i++){
|
||||||
#ifndef SEND_IMMEDIATE
|
_grid->SendToRecvFromBegin(reqs[i],
|
||||||
_grid->SendToRecvFrom(
|
Packets[i].send_buf,
|
||||||
Packets[i].send_buf,
|
Packets[i].to_rank,
|
||||||
Packets[i].to_rank,
|
Packets[i].recv_buf,
|
||||||
Packets[i].recv_buf,
|
Packets[i].from_rank,
|
||||||
Packets[i].from_rank,
|
Packets[i].bytes);
|
||||||
Packets[i].bytes);
|
|
||||||
#endif
|
|
||||||
Packets[i].done = 1;
|
|
||||||
}
|
}
|
||||||
commtime+=usecond();
|
commtime+=usecond();
|
||||||
}
|
}
|
||||||
#else
|
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
|
||||||
void Communicate(void ) {
|
{
|
||||||
typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
|
commtime-=usecond();
|
||||||
std::vector<std::vector<CommsRequest_t> > reqs(Packets.size());
|
for(int i=0;i<Packets.size();i++){
|
||||||
commtime-=usecond();
|
_grid->SendToRecvFromComplete(reqs[i]);
|
||||||
const int concurrency=2;
|
|
||||||
for(int i=0;i<Packets.size();i+=concurrency){
|
|
||||||
for(int ii=0;ii<concurrency;ii++){
|
|
||||||
int j = i+ii;
|
|
||||||
if ( j<Packets.size() ) {
|
|
||||||
#ifndef SEND_IMMEDIATE
|
|
||||||
_grid->SendToRecvFromBegin(reqs[j],
|
|
||||||
Packets[j].send_buf,
|
|
||||||
Packets[j].to_rank,
|
|
||||||
Packets[j].recv_buf,
|
|
||||||
Packets[j].from_rank,
|
|
||||||
Packets[j].bytes);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for(int ii=0;ii<concurrency;ii++){
|
|
||||||
int j = i+ii;
|
|
||||||
if ( j<Packets.size() ) {
|
|
||||||
#ifndef SEND_IMMEDIATE
|
|
||||||
_grid->SendToRecvFromComplete(reqs[i]);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for(int ii=0;ii<concurrency;ii++){
|
|
||||||
int j = i+ii;
|
|
||||||
if ( j<Packets.size() ) {
|
|
||||||
Packets[j].done = 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
commtime+=usecond();
|
commtime+=usecond();
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
///////////////////////////////////////////
|
///////////////////////////////////////////
|
||||||
// Simd merge queue for asynch comms
|
// Simd merge queue for asynch comms
|
||||||
@ -260,36 +217,19 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,commVector<cobj> &buffer,i
|
|||||||
m.rpointers= rpointers;
|
m.rpointers= rpointers;
|
||||||
m.buffer_size = buffer_size;
|
m.buffer_size = buffer_size;
|
||||||
m.packet_id = packet_id;
|
m.packet_id = packet_id;
|
||||||
#ifdef SEND_IMMEDIATE
|
|
||||||
mergetime-=usecond();
|
|
||||||
PARALLEL_FOR_LOOP
|
|
||||||
for(int o=0;o<m.buffer_size;o++){
|
|
||||||
merge1(m.mpointer[o],m.rpointers,o);
|
|
||||||
}
|
|
||||||
mergetime+=usecond();
|
|
||||||
#else
|
|
||||||
Mergers.push_back(m);
|
Mergers.push_back(m);
|
||||||
#endif
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void CommsMerge(void ) {
|
void CommsMerge(void ) {
|
||||||
//PARALLEL_NESTED_LOOP2
|
|
||||||
for(int i=0;i<Mergers.size();i++){
|
for(int i=0;i<Mergers.size();i++){
|
||||||
|
|
||||||
spintime-=usecond();
|
|
||||||
int packet_id = Mergers[i].packet_id;
|
|
||||||
while(! Packets[packet_id].done ); // spin for completion
|
|
||||||
spintime+=usecond();
|
|
||||||
|
|
||||||
#ifndef SEND_IMMEDIATE
|
|
||||||
mergetime-=usecond();
|
mergetime-=usecond();
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int o=0;o<Mergers[i].buffer_size;o++){
|
for(int o=0;o<Mergers[i].buffer_size;o++){
|
||||||
merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
|
merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
|
||||||
}
|
}
|
||||||
mergetime+=usecond();
|
mergetime+=usecond();
|
||||||
#endif
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -346,11 +286,14 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,commVector<cobj> &buffer,i
|
|||||||
else return cbase + _entries[ent]._byte_offset;
|
else return cbase + _entries[ent]._byte_offset;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////
|
||||||
// Comms buffers
|
// Comms buffers
|
||||||
|
///////////////////////////////////////////////////////////
|
||||||
std::vector<commVector<scalar_object> > u_simd_send_buf;
|
std::vector<commVector<scalar_object> > u_simd_send_buf;
|
||||||
std::vector<commVector<scalar_object> > u_simd_recv_buf;
|
std::vector<commVector<scalar_object> > u_simd_recv_buf;
|
||||||
commVector<cobj> u_send_buf;
|
commVector<cobj> u_send_buf;
|
||||||
commVector<cobj> comm_buf;
|
commVector<cobj> comm_buf;
|
||||||
|
|
||||||
int u_comm_offset;
|
int u_comm_offset;
|
||||||
int _unified_buffer_size;
|
int _unified_buffer_size;
|
||||||
|
|
||||||
@ -483,7 +426,7 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,commVector<cobj> &buffer,i
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
u_send_buf.resize(_unified_buffer_size);
|
u_send_buf.resize(_unified_buffer_size);
|
||||||
comm_buf.resize(_unified_buffer_size);
|
comm_buf.resize(_unified_buffer_size);
|
||||||
|
|
||||||
PrecomputeByteOffsets();
|
PrecomputeByteOffsets();
|
||||||
|
|
||||||
@ -722,31 +665,16 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,commVector<cobj> &buffer,i
|
|||||||
template<class compressor>
|
template<class compressor>
|
||||||
void HaloExchange(const Lattice<vobj> &source,compressor &compress)
|
void HaloExchange(const Lattice<vobj> &source,compressor &compress)
|
||||||
{
|
{
|
||||||
|
std::vector<std::vector<CommsRequest_t> > reqs;
|
||||||
calls++;
|
calls++;
|
||||||
Mergers.resize(0);
|
Mergers.resize(0);
|
||||||
Packets.resize(0);
|
Packets.resize(0);
|
||||||
HaloGather(source,compress);
|
HaloGather(source,compress);
|
||||||
this->Communicate();
|
this->CommunicateBegin(reqs);
|
||||||
|
this->CommunicateComplete(reqs);
|
||||||
CommsMerge(); // spins
|
CommsMerge(); // spins
|
||||||
}
|
}
|
||||||
#if 0
|
|
||||||
// Overlapping comms and compute typically slows down compute and is useless
|
|
||||||
// unless memory bandwidth greatly exceeds network
|
|
||||||
template<class compressor>
|
|
||||||
std::thread HaloExchangeBegin(const Lattice<vobj> &source,compressor &compress) {
|
|
||||||
Mergers.resize(0);
|
|
||||||
Packets.resize(0);
|
|
||||||
HaloGather(source,compress);
|
|
||||||
return std::thread([&] { this->Communicate(); });
|
|
||||||
}
|
|
||||||
void HaloExchangeComplete(std::thread &thr)
|
|
||||||
{
|
|
||||||
CommsMerge(); // spins
|
|
||||||
jointime-=usecond();
|
|
||||||
thr.join();
|
|
||||||
jointime+=usecond();
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
template<class compressor>
|
template<class compressor>
|
||||||
void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
|
void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
|
||||||
{
|
{
|
||||||
@ -851,6 +779,9 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,commVector<cobj> &buffer,i
|
|||||||
int sx = (x+sshift)%rd;
|
int sx = (x+sshift)%rd;
|
||||||
int comm_proc = ((x+sshift)/rd)%pd;
|
int comm_proc = ((x+sshift)/rd)%pd;
|
||||||
|
|
||||||
|
cobj *u_send_buf_p;
|
||||||
|
cobj *comm_buf_p;
|
||||||
|
|
||||||
if (comm_proc) {
|
if (comm_proc) {
|
||||||
|
|
||||||
int words = buffer_size;
|
int words = buffer_size;
|
||||||
@ -863,16 +794,15 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,commVector<cobj> &buffer,i
|
|||||||
if ( !face_table_computed ) {
|
if ( !face_table_computed ) {
|
||||||
t_table-=usecond();
|
t_table-=usecond();
|
||||||
face_table.resize(face_idx+1);
|
face_table.resize(face_idx+1);
|
||||||
Gather_plane_simple_table_compute (rhs,u_send_buf,dimension,sx,cbmask,compress,u_comm_offset,face_table[face_idx]);
|
cobj *ptr; ptr = &u_send_buf[0];
|
||||||
|
Gather_plane_simple_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,
|
||||||
|
face_table[face_idx]);
|
||||||
t_table+=usecond();
|
t_table+=usecond();
|
||||||
}
|
}
|
||||||
t_data-=usecond();
|
t_data-=usecond();
|
||||||
Gather_plane_simple_table (face_table[face_idx],rhs,u_send_buf,compress,u_comm_offset,so);
|
Gather_plane_simple_table (face_table[face_idx],rhs,&u_send_buf[0],compress,u_comm_offset,so); face_idx++;
|
||||||
face_idx++;
|
|
||||||
t_data+=usecond();
|
t_data+=usecond();
|
||||||
gathertime+=usecond();
|
gathertime+=usecond();
|
||||||
|
|
||||||
// Gather_plane_simple_stencil (rhs,u_send_buf,dimension,sx,cbmask,compress,u_comm_offset,t_table,t_data);
|
|
||||||
|
|
||||||
int rank = _grid->_processor;
|
int rank = _grid->_processor;
|
||||||
int recv_from_rank;
|
int recv_from_rank;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user