mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-20 16:56:55 +01:00
Remove partial dirichlet. Favour intro reduced prec comms options
This commit is contained in:
@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid);
|
||||
// These can move into a params header and be given MacroMagic serialisation
|
||||
struct DefaultImplParams {
|
||||
Coordinate dirichlet; // Blocksize of dirichlet BCs
|
||||
int partialDirichlet;
|
||||
// int partialDirichlet;
|
||||
DefaultImplParams() {
|
||||
dirichlet.resize(0);
|
||||
partialDirichlet=0;
|
||||
// partialDirichlet=0;
|
||||
};
|
||||
};
|
||||
|
||||
@ -113,8 +113,8 @@ class CartesianStencilAccelerator {
|
||||
///////////////////////////////////////////////////
|
||||
// If true, this is partially communicated per face
|
||||
///////////////////////////////////////////////////
|
||||
StencilVector _comms_partial_send;
|
||||
StencilVector _comms_partial_recv;
|
||||
// StencilVector _comms_partial_send;
|
||||
// StencilVector _comms_partial_recv;
|
||||
//
|
||||
StencilVector _comm_buf_size;
|
||||
StencilVector _permute_type;
|
||||
@ -205,6 +205,8 @@ public:
|
||||
struct Packet {
|
||||
void * send_buf;
|
||||
void * recv_buf;
|
||||
void * compressed_send_buf;
|
||||
void * compressed_recv_buf;
|
||||
#ifndef ACCELERATOR_AWARE_MPI
|
||||
void * host_send_buf; // Allocate this if not MPI_CUDA_AWARE
|
||||
void * host_recv_buf; // Allocate this if not MPI_CUDA_AWARE
|
||||
@ -215,6 +217,8 @@ public:
|
||||
Integer do_recv;
|
||||
Integer xbytes;
|
||||
Integer rbytes;
|
||||
Integer xbytes_compressed;
|
||||
Integer rbytes_compressed;
|
||||
};
|
||||
struct Merge {
|
||||
static constexpr int Nsimd = vobj::Nsimd();
|
||||
@ -223,7 +227,7 @@ public:
|
||||
std::vector<cobj *> vpointers;
|
||||
Integer buffer_size;
|
||||
Integer type;
|
||||
Integer partial; // partial dirichlet BCs
|
||||
// Integer partial; // partial dirichlet BCs
|
||||
Coordinate dims;
|
||||
};
|
||||
struct Decompress {
|
||||
@ -231,7 +235,7 @@ public:
|
||||
cobj * kernel_p;
|
||||
cobj * mpi_p;
|
||||
Integer buffer_size;
|
||||
Integer partial; // partial dirichlet BCs
|
||||
// Integer partial; // partial dirichlet BCs
|
||||
Coordinate dims;
|
||||
};
|
||||
struct CopyReceiveBuffer {
|
||||
@ -255,6 +259,12 @@ protected:
|
||||
public:
|
||||
GridBase *Grid(void) const { return _grid; }
|
||||
|
||||
/////////////////////////////////////////////////////////
|
||||
// Control reduced precision comms
|
||||
/////////////////////////////////////////////////////////
|
||||
int SloppyComms;
|
||||
void SetSloppyComms(int sloppy) { SloppyComms = sloppy; };
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Needed to conveniently communicate gparity parameters into GPU memory
|
||||
// without adding parameters. Perhaps a template parameter to StenciView is
|
||||
@ -268,7 +278,7 @@ public:
|
||||
}
|
||||
|
||||
int face_table_computed;
|
||||
int partialDirichlet;
|
||||
// int partialDirichlet;
|
||||
int fullDirichlet;
|
||||
std::vector<deviceVector<std::pair<int,int> > > face_table ;
|
||||
deviceVector<int> surface_list;
|
||||
@ -361,6 +371,118 @@ public:
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Non blocking send and receive. Necessarily parallel.
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
void DecompressPacket(Packet &packet)
|
||||
{
|
||||
if ( !SloppyComms ) return;
|
||||
|
||||
if ( packet.do_recv ) {
|
||||
|
||||
typedef typename getPrecision<cobj>::real_scalar_type word;
|
||||
uint64_t words = packet.rbytes/sizeof(word);
|
||||
const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
|
||||
const uint64_t outer = words/nsimd;
|
||||
|
||||
if(sizeof(word)==8) {
|
||||
|
||||
// Can either choose to represent as float vs double and prec change
|
||||
// OR
|
||||
// truncate the mantissa bfp16 style
|
||||
|
||||
static deviceVector<uint32_t> compression_buffer;
|
||||
|
||||
if(words > compression_buffer.size() ) compression_buffer.resize(words);
|
||||
|
||||
uint64_t *fbuf =(uint64_t *) &packet.recv_buf;
|
||||
uint32_t *fhbuf=(uint32_t *) &packet.recv_buf;
|
||||
uint32_t *hbuf =(uint32_t *) &compression_buffer[0];
|
||||
accelerator_for(ss,outer,nsimd,{
|
||||
hbuf[ss*nsimd+lane] = fhbuf[ss*nsimd+lane]; // copy at half precision
|
||||
});
|
||||
accelerator_for(ss,outer,nsimd,{
|
||||
int lane = acceleratorSIMTlane(nsimd);
|
||||
fbuf[ss*nsimd+lane] = ((uint64_t)hbuf[ss*nsimd+lane])<<32; //copy back and pad each word with zeroes
|
||||
});
|
||||
} else if ( sizeof(word)==4){
|
||||
// Can either choose to represent as half vs float and prec change
|
||||
// OR
|
||||
// truncate the mantissa bfp16 style
|
||||
static deviceVector<uint16_t> compression_buffer;
|
||||
|
||||
if(words > compression_buffer.size() ) compression_buffer.resize(words);
|
||||
|
||||
uint32_t *fbuf =(uint32_t *) packet.recv_buf;
|
||||
uint16_t *fhbuf=(uint16_t *) packet.recv_buf;
|
||||
uint16_t *hbuf =(uint16_t *) &compression_buffer[0];
|
||||
accelerator_for(ss,outer,nsimd,{
|
||||
hbuf[ss*nsimd+lane] = fhbuf[ss*nsimd+lane];
|
||||
});
|
||||
accelerator_for(ss,outer,nsimd,{
|
||||
int lane = acceleratorSIMTlane(nsimd);
|
||||
fbuf[ss*nsimd+lane] = ((uint32_t)hbuf[ss*nsimd+lane])<<16; //copy back and pad each word with zeroes
|
||||
});
|
||||
} else {
|
||||
assert(0 && "unknown floating point precision");
|
||||
}
|
||||
}
|
||||
}
|
||||
void CompressPacket(Packet &packet)
|
||||
{
|
||||
if ( !SloppyComms ) {
|
||||
packet.xbytes_compressed = packet.xbytes;
|
||||
packet.rbytes_compressed = packet.rbytes;
|
||||
return;
|
||||
}
|
||||
|
||||
typedef typename getPrecision<cobj>::real_scalar_type word;
|
||||
uint64_t words = packet.xbytes/sizeof(word);
|
||||
const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
|
||||
const uint64_t outer = words/nsimd;
|
||||
|
||||
if (packet.do_send) {
|
||||
|
||||
if(sizeof(word)==8) {
|
||||
|
||||
static deviceVector<uint32_t> compression_buffer;
|
||||
|
||||
if(words > compression_buffer.size() ) compression_buffer.resize(words);
|
||||
|
||||
uint64_t *fbuf =(uint64_t *) packet.send_buf;
|
||||
uint32_t *fhbuf=(uint32_t *) packet.send_buf;
|
||||
uint32_t *hbuf =(uint32_t *) &compression_buffer[0];
|
||||
|
||||
accelerator_for(ss,outer,nsimd,{
|
||||
int lane = acceleratorSIMTlane(nsimd);
|
||||
hbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]>>32; // truncate and copy
|
||||
});
|
||||
accelerator_for(ss,outer,nsimd,{
|
||||
fhbuf[ss*nsimd+lane] = hbuf[ss*nsimd+lane]; // copy back
|
||||
});
|
||||
|
||||
} else if ( sizeof(word)==4){
|
||||
|
||||
static deviceVector<uint16_t> compression_buffer;
|
||||
|
||||
if(words > compression_buffer.size() ) compression_buffer.resize(words);
|
||||
|
||||
uint32_t *fbuf =(uint32_t *) packet.send_buf;
|
||||
uint16_t *fhbuf=(uint16_t *) packet.send_buf;
|
||||
uint16_t *hbuf =(uint16_t *) &compression_buffer[0];
|
||||
accelerator_for(ss,outer,nsimd,{
|
||||
int lane = acceleratorSIMTlane(nsimd);
|
||||
hbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]>>16;
|
||||
});
|
||||
accelerator_for(ss,outer,nsimd,{
|
||||
fhbuf[ss*nsimd+lane] = hbuf[ss*nsimd+lane];
|
||||
});
|
||||
} else {
|
||||
assert(0 && "unknown floating point precision");
|
||||
}
|
||||
}
|
||||
packet.xbytes_compressed = packet.xbytes/2;
|
||||
packet.rbytes_compressed = packet.rbytes/2;
|
||||
|
||||
return;
|
||||
}
|
||||
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
|
||||
{
|
||||
// std::cout << "Communicate Begin "<<std::endl;
|
||||
@ -371,6 +493,9 @@ public:
|
||||
// _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
|
||||
// But the HaloGather had a barrier too.
|
||||
for(int i=0;i<Packets.size();i++){
|
||||
|
||||
this->CompressPacket(Packets[i]);
|
||||
|
||||
// std::cout << "Communicate prepare "<<i<<std::endl;
|
||||
// _grid->Barrier();
|
||||
_grid->StencilSendToRecvFromPrepare(MpiReqs,
|
||||
@ -378,7 +503,7 @@ public:
|
||||
Packets[i].to_rank,Packets[i].do_send,
|
||||
Packets[i].recv_buf,
|
||||
Packets[i].from_rank,Packets[i].do_recv,
|
||||
Packets[i].xbytes,Packets[i].rbytes,i);
|
||||
Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i);
|
||||
}
|
||||
// std::cout << "Communicate PollDtoH "<<std::endl;
|
||||
// _grid->Barrier();
|
||||
@ -394,14 +519,14 @@ public:
|
||||
Packets[i].to_rank,Packets[i].do_send,
|
||||
Packets[i].recv_buf,
|
||||
Packets[i].from_rank,Packets[i].do_recv,
|
||||
Packets[i].xbytes,Packets[i].rbytes,i);
|
||||
Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i);
|
||||
}
|
||||
FlightRecorder::StepLog("Communicate begin has finished");
|
||||
// Get comms started then run checksums
|
||||
// Having this PRIOR to the dslash seems to make Sunspot work... (!)
|
||||
for(int i=0;i<Packets.size();i++){
|
||||
if ( Packets[i].do_send )
|
||||
FlightRecorder::xmitLog(Packets[i].send_buf,Packets[i].xbytes);
|
||||
FlightRecorder::xmitLog(Packets[i].send_buf,Packets[i].xbytes_compressed);
|
||||
}
|
||||
}
|
||||
|
||||
@ -416,14 +541,15 @@ public:
|
||||
// std::cout << "Communicate Complete Complete "<<std::endl;
|
||||
// _grid->Barrier();
|
||||
_grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
|
||||
if ( this->partialDirichlet ) DslashLogPartial();
|
||||
else if ( this->fullDirichlet ) DslashLogDirichlet();
|
||||
// if ( this->partialDirichlet ) DslashLogPartial();
|
||||
if ( this->fullDirichlet ) DslashLogDirichlet();
|
||||
else DslashLogFull();
|
||||
// acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
|
||||
// accelerator_barrier();
|
||||
for(int i=0;i<Packets.size();i++){
|
||||
if ( Packets[i].do_recv )
|
||||
FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
|
||||
FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes_compressed,Packets[i].from_rank);
|
||||
this->DecompressPacket(Packets[i]);
|
||||
}
|
||||
FlightRecorder::StepLog("Finish communicate complete");
|
||||
}
|
||||
@ -618,7 +744,7 @@ public:
|
||||
}
|
||||
void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size,std::vector<Decompress> &dv) {
|
||||
Decompress d;
|
||||
d.partial = this->partialDirichlet;
|
||||
// d.partial = this->partialDirichlet;
|
||||
d.dims = _grid->_fdimensions;
|
||||
d.kernel_p = k_p;
|
||||
d.mpi_p = m_p;
|
||||
@ -627,7 +753,7 @@ public:
|
||||
}
|
||||
void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
|
||||
Merge m;
|
||||
m.partial = this->partialDirichlet;
|
||||
// m.partial = this->partialDirichlet;
|
||||
m.dims = _grid->_fdimensions;
|
||||
m.type = type;
|
||||
m.mpointer = merge_p;
|
||||
@ -732,8 +858,8 @@ public:
|
||||
int block = dirichlet_block[dimension];
|
||||
this->_comms_send[ii] = comm_dim;
|
||||
this->_comms_recv[ii] = comm_dim;
|
||||
this->_comms_partial_send[ii] = 0;
|
||||
this->_comms_partial_recv[ii] = 0;
|
||||
// this->_comms_partial_send[ii] = 0;
|
||||
// this->_comms_partial_recv[ii] = 0;
|
||||
if ( block && comm_dim ) {
|
||||
assert(abs(displacement) < ld );
|
||||
// Quiesce communication across block boundaries
|
||||
@ -754,10 +880,10 @@ public:
|
||||
if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_send[ii] = 0;
|
||||
if ( ( (ld*pc ) % block ) == 0 ) this->_comms_recv[ii] = 0;
|
||||
}
|
||||
if ( partialDirichlet ) {
|
||||
this->_comms_partial_send[ii] = !this->_comms_send[ii];
|
||||
this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
|
||||
}
|
||||
// if ( partialDirichlet ) {
|
||||
// this->_comms_partial_send[ii] = !this->_comms_send[ii];
|
||||
// this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
|
||||
// }
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -769,6 +895,7 @@ public:
|
||||
Parameters p=Parameters(),
|
||||
bool preserve_shm=false)
|
||||
{
|
||||
SloppyComms = 0;
|
||||
face_table_computed=0;
|
||||
_grid = grid;
|
||||
this->parameters=p;
|
||||
@ -786,7 +913,7 @@ public:
|
||||
this->same_node.resize(npoints);
|
||||
|
||||
if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0);
|
||||
partialDirichlet = p.partialDirichlet;
|
||||
// partialDirichlet = p.partialDirichlet;
|
||||
DirichletBlock(p.dirichlet); // comms send/recv set up
|
||||
fullDirichlet=0;
|
||||
for(int d=0;d<p.dirichlet.size();d++){
|
||||
@ -935,7 +1062,8 @@ public:
|
||||
GridBase *grid=_grid;
|
||||
const int Nsimd = grid->Nsimd();
|
||||
|
||||
int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
|
||||
// int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
|
||||
int comms_recv = this->_comms_recv[point];
|
||||
int fd = _grid->_fdimensions[dimension];
|
||||
int ld = _grid->_ldimensions[dimension];
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
@ -1124,8 +1252,8 @@ public:
|
||||
|
||||
int comms_send = this->_comms_send[point];
|
||||
int comms_recv = this->_comms_recv[point];
|
||||
int comms_partial_send = this->_comms_partial_send[point] ;
|
||||
int comms_partial_recv = this->_comms_partial_recv[point] ;
|
||||
// int comms_partial_send = this->_comms_partial_send[point] ;
|
||||
// int comms_partial_recv = this->_comms_partial_recv[point] ;
|
||||
|
||||
assert(rhs.Grid()==_grid);
|
||||
// conformable(_grid,rhs.Grid());
|
||||
@ -1160,11 +1288,11 @@ public:
|
||||
int rbytes;
|
||||
|
||||
if ( comms_send ) xbytes = bytes; // Full send
|
||||
else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
||||
// else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
||||
else xbytes = 0; // full dirichlet
|
||||
|
||||
if ( comms_recv ) rbytes = bytes;
|
||||
else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
||||
// else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
||||
else rbytes = 0;
|
||||
|
||||
int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
|
||||
@ -1191,7 +1319,8 @@ public:
|
||||
}
|
||||
|
||||
|
||||
if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
|
||||
// if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
|
||||
if ( compress.DecompressionStep()&&comms_recv) {
|
||||
recv_buf=u_simd_recv_buf[0];
|
||||
} else {
|
||||
recv_buf=this->u_recv_buf_p;
|
||||
@ -1225,7 +1354,8 @@ public:
|
||||
#endif
|
||||
|
||||
// std::cout << " GatherPlaneSimple partial send "<< comms_partial_send<<std::endl;
|
||||
compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
|
||||
// compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
|
||||
compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,0);
|
||||
|
||||
int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,xbytes,rbytes,cbmask);
|
||||
if ( !duplicate ) { // Force comms for now
|
||||
@ -1234,8 +1364,8 @@ public:
|
||||
// Build a list of things to do after we synchronise GPUs
|
||||
// Start comms now???
|
||||
///////////////////////////////////////////////////////////
|
||||
int do_send = (comms_send|comms_partial_send) && (!shm_send );
|
||||
int do_recv = (comms_send|comms_partial_send) && (!shm_recv );
|
||||
int do_send = (comms_send) && (!shm_send );
|
||||
int do_recv = (comms_send) && (!shm_recv );
|
||||
AddPacket((void *)&send_buf[comm_off],
|
||||
(void *)&recv_buf[comm_off],
|
||||
xmit_to_rank, do_send,
|
||||
@ -1243,7 +1373,7 @@ public:
|
||||
xbytes,rbytes);
|
||||
}
|
||||
|
||||
if ( (compress.DecompressionStep() && comms_recv) || comms_partial_recv ) {
|
||||
if ( (compress.DecompressionStep() && comms_recv) ) {
|
||||
AddDecompress(&this->u_recv_buf_p[comm_off],
|
||||
&recv_buf[comm_off],
|
||||
words,Decompressions);
|
||||
@ -1265,8 +1395,8 @@ public:
|
||||
|
||||
int comms_send = this->_comms_send[point];
|
||||
int comms_recv = this->_comms_recv[point];
|
||||
int comms_partial_send = this->_comms_partial_send[point] ;
|
||||
int comms_partial_recv = this->_comms_partial_recv[point] ;
|
||||
// int comms_partial_send = this->_comms_partial_send[point] ;
|
||||
// int comms_partial_recv = this->_comms_partial_recv[point] ;
|
||||
|
||||
int fd = _grid->_fdimensions[dimension];
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
@ -1341,18 +1471,20 @@ public:
|
||||
|
||||
|
||||
if ( comms_send ) xbytes = bytes;
|
||||
else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
||||
// else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
||||
else xbytes = 0;
|
||||
|
||||
if ( comms_recv ) rbytes = bytes;
|
||||
else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
||||
// else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
||||
else rbytes = 0;
|
||||
|
||||
// Gathers SIMD lanes for send and merge
|
||||
// Different faces can be full comms or partial comms with multiple ranks per node
|
||||
if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
|
||||
// if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
|
||||
if ( comms_send || comms_recv ) {
|
||||
|
||||
int partial = partialDirichlet;
|
||||
// int partial = partialDirichlet;
|
||||
int partial = 0;
|
||||
compressor::Gather_plane_exchange(face_table[face_idx],rhs,
|
||||
spointers,dimension,sx,cbmask,
|
||||
compress,permute_type,partial );
|
||||
@ -1418,7 +1550,8 @@ public:
|
||||
if ( (bytes != rbytes) && (rbytes!=0) ){
|
||||
acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero
|
||||
}
|
||||
int do_send = (comms_send|comms_partial_send) && (!shm_send );
|
||||
// int do_send = (comms_send|comms_partial_send) && (!shm_send );
|
||||
int do_send = (comms_send) && (!shm_send );
|
||||
AddPacket((void *)sp,(void *)rp,
|
||||
xmit_to_rank,do_send,
|
||||
recv_from_rank,do_send,
|
||||
@ -1432,7 +1565,8 @@ public:
|
||||
}
|
||||
}
|
||||
// rpointer may be doing a remote read in the gather over SHM
|
||||
if ( comms_recv|comms_partial_recv ) {
|
||||
// if ( comms_recv|comms_partial_recv ) {
|
||||
if ( comms_recv ) {
|
||||
AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers);
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user