1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-09 23:45:36 +00:00

Partial Dirichlet BCs

This commit is contained in:
Peter Boyle 2022-11-15 16:24:26 -05:00
parent 0db4f1803f
commit e2e269e03b

View File

@ -59,6 +59,7 @@ NAMESPACE_BEGIN(Grid);
void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
int off,std::vector<std::pair<int,int> > & table);
/*
template<class vobj,class cobj,class compressor>
void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so) __attribute__((noinline));
@ -103,6 +104,7 @@ void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,const La
});
rhs_v.ViewClose();
}
*/
struct StencilEntry {
#ifdef GRID_CUDA
@ -133,8 +135,18 @@ class CartesianStencilAccelerator {
int _osites;
StencilVector _directions;
StencilVector _distances;
StencilVector _comms_send;
StencilVector _comms_recv;
///////////////////////////////////////////////////
// If true, this face is FULLY communicated.
// Otherwise the face is either full Dirichlet (no comms) or partial Dirichlet.
///////////////////////////////////////////////////
StencilVector _comms_send;
StencilVector _comms_recv; // this is FULLY communicated per face
///////////////////////////////////////////////////
// If true, this is partially communicated per face
///////////////////////////////////////////////////
StencilVector _comms_partial_send;
StencilVector _comms_partial_recv;
//
StencilVector _comm_buf_size;
StencilVector _permute_type;
StencilVector same_node;
@ -229,7 +241,8 @@ public:
Integer from_rank;
Integer do_send;
Integer do_recv;
Integer bytes;
Integer xbytes;
Integer rbytes;
};
struct Merge {
cobj * mpointer;
@ -237,11 +250,15 @@ public:
Vector<cobj *> vpointers;
Integer buffer_size;
Integer type;
Integer partial; // partial dirichlet BCs
Coordinate dims;
};
// One queued decompression work item.
// Entries are filled in by AddDecompress() and later handed, one at a time,
// to decompressor::DecompressFace() from CommsMerge().
struct Decompress {
// At the visible AddDecompress() call site this points into u_recv_buf_p at the face's comm offset.
// NOTE(review): presumably the location the stencil kernel reads from after decompression - confirm.
cobj * kernel_p;
// At the visible call site this is the same address that is registered as the
// receive buffer of the corresponding packet (recv_buf at the face's comm offset).
cobj * mpi_p;
// Size forwarded unchanged from AddDecompress(); the visible caller passes `words`,
// i.e. a count of elements rather than a byte count.
Integer buffer_size;
Integer partial; // snapshot of partialDirichlet taken when the entry is queued (partial Dirichlet BCs)
// Copied from _grid->_fdimensions when the entry is queued.
Coordinate dims;
};
struct CopyReceiveBuffer {
void * from_p;
@ -252,7 +269,8 @@ public:
Integer direction;
Integer OrthogPlane;
Integer DestProc;
Integer bytes;
Integer xbytes;
Integer rbytes;
Integer lane;
Integer cb;
void *recv_buf;
@ -277,6 +295,7 @@ public:
}
int face_table_computed;
int partialDirichlet;
std::vector<commVector<std::pair<int,int> > > face_table ;
Vector<int> surface_list;
@ -365,7 +384,7 @@ public:
Packets[i].to_rank,Packets[i].do_send,
Packets[i].recv_buf,
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].bytes,i);
Packets[i].xbytes,Packets[i].rbytes,i);
}
}
@ -501,7 +520,9 @@ public:
}
}
Integer CheckForDuplicate(Integer direction, Integer OrthogPlane, Integer DestProc, void *recv_buf,Integer lane,Integer bytes,Integer cb)
Integer CheckForDuplicate(Integer direction, Integer OrthogPlane, Integer DestProc, void *recv_buf,Integer lane,
Integer xbytes,Integer rbytes,
Integer cb)
{
CachedTransfer obj;
obj.direction = direction;
@ -509,19 +530,22 @@ public:
obj.DestProc = DestProc;
obj.recv_buf = recv_buf;
obj.lane = lane;
obj.bytes = bytes;
obj.xbytes = xbytes;
obj.rbytes = rbytes;
obj.cb = cb;
for(int i=0;i<CachedTransfers.size();i++){
if ( (CachedTransfers[i].direction ==direction)
&&(CachedTransfers[i].OrthogPlane==OrthogPlane)
&&(CachedTransfers[i].DestProc ==DestProc)
&&(CachedTransfers[i].bytes ==bytes)
&&(CachedTransfers[i].xbytes ==xbytes)
&&(CachedTransfers[i].rbytes ==rbytes)
&&(CachedTransfers[i].lane ==lane)
&&(CachedTransfers[i].cb ==cb)
){
AddCopy(CachedTransfers[i].recv_buf,recv_buf,bytes);
// FIXME: duplicates combined with partial compression are not handled here.
// This won't happen in practice as DWF has no duplicates, but...
AddCopy(CachedTransfers[i].recv_buf,recv_buf,rbytes);
return 1;
}
}
@ -532,7 +556,7 @@ public:
void AddPacket(void *xmit,void * rcv,
Integer to, Integer do_send,
Integer from, Integer do_recv,
Integer bytes){
Integer xbytes,Integer rbytes){
Packet p;
p.send_buf = xmit;
p.recv_buf = rcv;
@ -540,11 +564,16 @@ public:
p.from_rank= from;
p.do_send = do_send;
p.do_recv = do_recv;
p.bytes = bytes;
p.xbytes = xbytes;
p.rbytes = rbytes;
// if (do_send) std::cout << GridLogMessage << " MPI packet to "<<to<< " of size "<<xbytes<<std::endl;
// if (do_recv) std::cout << GridLogMessage << " MPI packet from "<<from<< " of size "<<xbytes<<std::endl;
Packets.push_back(p);
}
void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size,std::vector<Decompress> &dv) {
Decompress d;
d.partial = this->partialDirichlet;
d.dims = _grid->_fdimensions;
d.kernel_p = k_p;
d.mpi_p = m_p;
d.buffer_size = buffer_size;
@ -552,6 +581,8 @@ public:
}
void AddMerge(cobj *merge_p,Vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
Merge m;
m.partial = this->partialDirichlet;
m.dims = _grid->_fdimensions;
m.type = type;
m.mpointer = merge_p;
m.vpointers= rpointers;
@ -571,21 +602,10 @@ public:
void CommsMerge(decompressor decompress,std::vector<Merge> &mm,std::vector<Decompress> &dd)
{
for(int i=0;i<mm.size();i++){
auto mp = &mm[i].mpointer[0];
auto vp0= &mm[i].vpointers[0][0];
auto vp1= &mm[i].vpointers[1][0];
auto type= mm[i].type;
accelerator_forNB(o,mm[i].buffer_size/2,vobj::Nsimd(),{
decompress.Exchange(mp,vp0,vp1,type,o);
});
decompressor::MergeFace(decompress,mm[i]);
}
for(int i=0;i<dd.size();i++){
auto kp = dd[i].kernel_p;
auto mp = dd[i].mpi_p;
accelerator_forNB(o,dd[i].buffer_size,1,{
decompress.Decompress(kp,mp,o);
});
decompressor::DecompressFace(decompress,dd[i]);
}
}
////////////////////////////////////////
@ -664,6 +684,10 @@ public:
if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_send[ii] = 0;
if ( ( (ld*pc ) % block ) == 0 ) this->_comms_recv[ii] = 0;
}
if ( partialDirichlet ) {
this->_comms_partial_send[ii] = !this->_comms_send[ii];
this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
}
}
}
}
@ -691,7 +715,7 @@ public:
this->same_node.resize(npoints);
if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0);
partialDirichlet = p.partialDirichlet;
DirichletBlock(p.dirichlet); // comms send/recv set up
_unified_buffer_size=0;
@ -827,7 +851,7 @@ public:
GridBase *grid=_grid;
const int Nsimd = grid->Nsimd();
int comms_recv = this->_comms_recv[point];
int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
int fd = _grid->_fdimensions[dimension];
int ld = _grid->_ldimensions[dimension];
int rd = _grid->_rdimensions[dimension];
@ -1014,8 +1038,10 @@ public:
{
typedef typename cobj::vector_type vector_type;
int comms_send = this->_comms_send[point] ;
int comms_recv = this->_comms_recv[point] ;
int comms_send = this->_comms_send[point];
int comms_recv = this->_comms_recv[point];
int comms_partial_send = this->_comms_partial_send[point] ;
int comms_partial_recv = this->_comms_partial_recv[point] ;
assert(rhs.Grid()==_grid);
// conformable(_grid,rhs.Grid());
@ -1046,7 +1072,17 @@ public:
if (cbmask != 0x3) words=words>>1;
int bytes = words * compress.CommDatumSize();
int xbytes;
int rbytes;
if ( comms_send ) xbytes = bytes; // Full send
else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
else xbytes = 0; // full dirichlet
if ( comms_recv ) rbytes = bytes;
else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
else rbytes = 0;
int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
int comm_off = u_comm_offset;
@ -1059,49 +1095,47 @@ public:
assert (xmit_to_rank != _grid->ThisRank());
assert (recv_from_rank != _grid->ThisRank());
if( comms_send ) {
if ( !face_table_computed ) {
face_table.resize(face_idx+1);
std::vector<std::pair<int,int> > face_table_host ;
Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,comm_off,face_table_host);
if ( !face_table_computed ) {
face_table.resize(face_idx+1);
std::vector<std::pair<int,int> > face_table_host ;
Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,comm_off,face_table_host);
// std::cout << "bytes expect "<< bytes << " " << face_table_host.size()* compress.CommDatumSize()<<std::endl;
face_table[face_idx].resize(face_table_host.size());
acceleratorCopyToDevice(&face_table_host[0],
&face_table[face_idx][0],
face_table[face_idx].size()*sizeof(face_table_host[0]));
}
if ( compress.DecompressionStep() ) {
recv_buf=u_simd_recv_buf[0];
} else {
recv_buf=this->u_recv_buf_p;
}
send_buf = this->u_send_buf_p; // Gather locally, must send
////////////////////////////////////////////////////////
// Gather locally
////////////////////////////////////////////////////////
assert(send_buf!=NULL);
Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,comm_off,so);
}
int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,bytes,cbmask);
if ( (!duplicate) ) { // Force comms for now
if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
recv_buf=u_simd_recv_buf[0];
} else {
recv_buf=this->u_recv_buf_p;
}
////////////////////////////////////////////////////////
// Gather locally
////////////////////////////////////////////////////////
send_buf = this->u_send_buf_p; // Gather locally, must send
assert(send_buf!=NULL);
compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,xbytes,rbytes,cbmask);
if ( !duplicate ) { // Force comms for now
///////////////////////////////////////////////////////////
// Build a list of things to do after we synchronise GPUs
// Start comms now???
///////////////////////////////////////////////////////////
AddPacket((void *)&send_buf[comm_off],
(void *)&recv_buf[comm_off],
xmit_to_rank, comms_send,
recv_from_rank, comms_recv,
bytes);
xmit_to_rank, comms_send|comms_partial_send,
recv_from_rank, comms_recv|comms_partial_recv,
xbytes,rbytes);
}
if ( compress.DecompressionStep() && comms_recv ) {
if ( (compress.DecompressionStep() && comms_recv) || comms_partial_recv ) {
AddDecompress(&this->u_recv_buf_p[comm_off],
&recv_buf[comm_off],
words,Decompressions);
@ -1109,7 +1143,6 @@ public:
u_comm_offset+=words;
face_idx++;
}
}
return 0;
@ -1122,8 +1155,10 @@ public:
const int maxl =2;// max layout in a direction
int comms_send = this->_comms_send[point] ;
int comms_recv = this->_comms_recv[point] ;
int comms_send = this->_comms_send[point];
int comms_recv = this->_comms_recv[point];
int comms_partial_send = this->_comms_partial_send[point] ;
int comms_partial_recv = this->_comms_partial_recv[point] ;
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
@ -1153,6 +1188,11 @@ public:
int datum_bytes = compress.CommDatumSize();
int bytes = (reduced_buffer_size*datum_bytes)/simd_layout;
// how many bytes on wire : partial dirichlet or dirichlet may set to < bytes
int xbytes;
int rbytes;
assert(bytes*simd_layout == reduced_buffer_size*datum_bytes);
Vector<cobj *> rpointers(maxl);
@ -1182,22 +1222,37 @@ public:
if ( !face_table_computed ) {
face_table.resize(face_idx+1);
std::vector<std::pair<int,int> > face_table_host ;
Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,comm_off,face_table_host);
face_table[face_idx].resize(face_table_host.size());
acceleratorCopyToDevice(&face_table_host[0],
&face_table[face_idx][0],
face_table[face_idx].size()*sizeof(face_table_host[0]));
}
if ( comms_send || comms_recv ) {
Gather_plane_exchange_table(face_table[face_idx],rhs,spointers,dimension,sx,cbmask,compress,permute_type);
if ( comms_send ) xbytes = bytes;
else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
else xbytes = 0;
if ( comms_recv ) rbytes = bytes;
else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
else rbytes = 0;
// Gathers SIMD lanes for send and merge
// Different faces can be full comms or partial comms with multiple ranks per node
if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
int partial = partialDirichlet;
compressor::Gather_plane_exchange(face_table[face_idx],rhs,
spointers,dimension,sx,cbmask,
compress,permute_type,partial );
}
face_idx++;
//spointers[0] -- low
//spointers[1] -- high
//spointers[0] -- low simd coor
//spointers[1] -- high simd coor
for(int i=0;i<maxl;i++){
int my_coor = rd*i + x; // self explanatory
@ -1222,13 +1277,16 @@ public:
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
rpointers[i] = rp;
int duplicate = CheckForDuplicate(dimension,sx,nbr_proc,(void *)rp,i,bytes,cbmask);
int duplicate = CheckForDuplicate(dimension,sx,nbr_proc,(void *)rp,i,xbytes,rbytes,cbmask);
if ( !duplicate ) {
if ( (bytes != rbytes) && (rbytes!=0) ){
acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero
}
AddPacket((void *)sp,(void *)rp,
xmit_to_rank,comms_send,
recv_from_rank,comms_recv,
bytes);
xmit_to_rank,comms_send|comms_partial_send,
recv_from_rank,comms_recv|comms_partial_recv,
xbytes,rbytes);
}
} else {
@ -1238,7 +1296,7 @@ public:
}
}
if ( comms_recv ) {
if ( comms_recv|comms_partial_recv ) {
AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers);
}