mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-21 01:02:02 +01:00
Remove partial dirichlet. Favour intro reduced prec comms options
This commit is contained in:
@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
// These can move into a params header and be given MacroMagic serialisation
|
// These can move into a params header and be given MacroMagic serialisation
|
||||||
struct DefaultImplParams {
|
struct DefaultImplParams {
|
||||||
Coordinate dirichlet; // Blocksize of dirichlet BCs
|
Coordinate dirichlet; // Blocksize of dirichlet BCs
|
||||||
int partialDirichlet;
|
// int partialDirichlet;
|
||||||
DefaultImplParams() {
|
DefaultImplParams() {
|
||||||
dirichlet.resize(0);
|
dirichlet.resize(0);
|
||||||
partialDirichlet=0;
|
// partialDirichlet=0;
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -113,8 +113,8 @@ class CartesianStencilAccelerator {
|
|||||||
///////////////////////////////////////////////////
|
///////////////////////////////////////////////////
|
||||||
// If true, this is partially communicated per face
|
// If true, this is partially communicated per face
|
||||||
///////////////////////////////////////////////////
|
///////////////////////////////////////////////////
|
||||||
StencilVector _comms_partial_send;
|
// StencilVector _comms_partial_send;
|
||||||
StencilVector _comms_partial_recv;
|
// StencilVector _comms_partial_recv;
|
||||||
//
|
//
|
||||||
StencilVector _comm_buf_size;
|
StencilVector _comm_buf_size;
|
||||||
StencilVector _permute_type;
|
StencilVector _permute_type;
|
||||||
@ -205,6 +205,8 @@ public:
|
|||||||
struct Packet {
|
struct Packet {
|
||||||
void * send_buf;
|
void * send_buf;
|
||||||
void * recv_buf;
|
void * recv_buf;
|
||||||
|
void * compressed_send_buf;
|
||||||
|
void * compressed_recv_buf;
|
||||||
#ifndef ACCELERATOR_AWARE_MPI
|
#ifndef ACCELERATOR_AWARE_MPI
|
||||||
void * host_send_buf; // Allocate this if not MPI_CUDA_AWARE
|
void * host_send_buf; // Allocate this if not MPI_CUDA_AWARE
|
||||||
void * host_recv_buf; // Allocate this if not MPI_CUDA_AWARE
|
void * host_recv_buf; // Allocate this if not MPI_CUDA_AWARE
|
||||||
@ -215,6 +217,8 @@ public:
|
|||||||
Integer do_recv;
|
Integer do_recv;
|
||||||
Integer xbytes;
|
Integer xbytes;
|
||||||
Integer rbytes;
|
Integer rbytes;
|
||||||
|
Integer xbytes_compressed;
|
||||||
|
Integer rbytes_compressed;
|
||||||
};
|
};
|
||||||
struct Merge {
|
struct Merge {
|
||||||
static constexpr int Nsimd = vobj::Nsimd();
|
static constexpr int Nsimd = vobj::Nsimd();
|
||||||
@ -223,7 +227,7 @@ public:
|
|||||||
std::vector<cobj *> vpointers;
|
std::vector<cobj *> vpointers;
|
||||||
Integer buffer_size;
|
Integer buffer_size;
|
||||||
Integer type;
|
Integer type;
|
||||||
Integer partial; // partial dirichlet BCs
|
// Integer partial; // partial dirichlet BCs
|
||||||
Coordinate dims;
|
Coordinate dims;
|
||||||
};
|
};
|
||||||
struct Decompress {
|
struct Decompress {
|
||||||
@ -231,7 +235,7 @@ public:
|
|||||||
cobj * kernel_p;
|
cobj * kernel_p;
|
||||||
cobj * mpi_p;
|
cobj * mpi_p;
|
||||||
Integer buffer_size;
|
Integer buffer_size;
|
||||||
Integer partial; // partial dirichlet BCs
|
// Integer partial; // partial dirichlet BCs
|
||||||
Coordinate dims;
|
Coordinate dims;
|
||||||
};
|
};
|
||||||
struct CopyReceiveBuffer {
|
struct CopyReceiveBuffer {
|
||||||
@ -255,6 +259,12 @@ protected:
|
|||||||
public:
|
public:
|
||||||
GridBase *Grid(void) const { return _grid; }
|
GridBase *Grid(void) const { return _grid; }
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////
|
||||||
|
// Control reduced precision comms
|
||||||
|
/////////////////////////////////////////////////////////
|
||||||
|
int SloppyComms;
|
||||||
|
// Store the reduced precision comms setting. CompressPacket and DecompressPacket
// leave packets at full size when SloppyComms is zero; the constructor resets it to 0.
void SetSloppyComms(int sloppy) { SloppyComms = sloppy; };
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
// Needed to conveniently communicate gparity parameters into GPU memory
|
// Needed to conveniently communicate gparity parameters into GPU memory
|
||||||
// without adding parameters. Perhaps a template parameter to StencilView is
|
// without adding parameters. Perhaps a template parameter to StencilView is
|
||||||
@ -268,7 +278,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
int face_table_computed;
|
int face_table_computed;
|
||||||
int partialDirichlet;
|
// int partialDirichlet;
|
||||||
int fullDirichlet;
|
int fullDirichlet;
|
||||||
std::vector<deviceVector<std::pair<int,int> > > face_table ;
|
std::vector<deviceVector<std::pair<int,int> > > face_table ;
|
||||||
deviceVector<int> surface_list;
|
deviceVector<int> surface_list;
|
||||||
@ -361,6 +371,118 @@ public:
|
|||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
// Non blocking send and receive. Necessarily parallel.
|
// Non blocking send and receive. Necessarily parallel.
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
|
////////////////////////////////////////////////////////////////////////
// Expand a received packet back to full width words after reduced
// precision ("sloppy") comms.
//
// No-op unless SloppyComms is set and packet.do_recv is non-zero.
//
// Reads packet.rbytes/sizeof(word) half width words from the start of
// packet.recv_buf and rewrites the buffer in place as full width words:
// each half width word becomes the upper half of the result and the lower
// half is zero filled. This is the inverse of the shift in CompressPacket.
//
// The half width and full width views alias the same memory, so the data
// is staged through a static device buffer between two kernels.
// NOTE(review): relies on accelerator_for completing before the next
// launch, and on acceleratorSIMTlane(nsimd) visiting every lane; confirm
// for non-SIMT builds.
////////////////////////////////////////////////////////////////////////
void DecompressPacket(Packet &packet)
{
  if ( !SloppyComms ) return;

  if ( packet.do_recv ) {

    typedef typename getPrecision<cobj>::real_scalar_type word;
    uint64_t words = packet.rbytes/sizeof(word);
    const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
    const uint64_t outer = words/nsimd;

    if(sizeof(word)==8) {

      // Can either choose to represent as float vs double and prec change
      // OR
      // truncate the mantissa bfp16 style

      static deviceVector<uint32_t> compression_buffer;

      if(words > compression_buffer.size() ) compression_buffer.resize(words);

      // Fix: view the receive buffer itself, not the address of the pointer
      // member (&packet.recv_buf), matching the 32-bit branch below.
      uint64_t *fbuf =(uint64_t *) packet.recv_buf;
      uint32_t *fhbuf=(uint32_t *) packet.recv_buf;
      uint32_t *hbuf =(uint32_t *) &compression_buffer[0];
      accelerator_for(ss,outer,nsimd,{
        int lane = acceleratorSIMTlane(nsimd); // fix: lane was previously undeclared in this kernel
        hbuf[ss*nsimd+lane] = fhbuf[ss*nsimd+lane]; // copy at half precision
      });
      accelerator_for(ss,outer,nsimd,{
        int lane = acceleratorSIMTlane(nsimd);
        fbuf[ss*nsimd+lane] = ((uint64_t)hbuf[ss*nsimd+lane])<<32; //copy back and pad each word with zeroes
      });

    } else if ( sizeof(word)==4){

      // Can either choose to represent as half vs float and prec change
      // OR
      // truncate the mantissa bfp16 style
      static deviceVector<uint16_t> compression_buffer;

      if(words > compression_buffer.size() ) compression_buffer.resize(words);

      uint32_t *fbuf =(uint32_t *) packet.recv_buf;
      uint16_t *fhbuf=(uint16_t *) packet.recv_buf;
      uint16_t *hbuf =(uint16_t *) &compression_buffer[0];
      accelerator_for(ss,outer,nsimd,{
        int lane = acceleratorSIMTlane(nsimd); // fix: lane was previously undeclared in this kernel
        hbuf[ss*nsimd+lane] = fhbuf[ss*nsimd+lane];
      });
      accelerator_for(ss,outer,nsimd,{
        int lane = acceleratorSIMTlane(nsimd);
        fbuf[ss*nsimd+lane] = ((uint32_t)hbuf[ss*nsimd+lane])<<16; //copy back and pad each word with zeroes
      });

    } else {
      assert(0 && "unknown floating point precision");
    }
  }
}
|
||||||
|
////////////////////////////////////////////////////////////////////////
// Prepare a packet for reduced precision ("sloppy") comms and set the
// byte counts that are actually transferred.
//
// SloppyComms == 0: xbytes_compressed/rbytes_compressed are set to the
// full xbytes/rbytes and the send buffer is untouched.
//
// SloppyComms != 0: both compressed byte counts are set to half of the
// full counts. If packet.do_send is set, packet.send_buf is overwritten
// in place: each full width word is shifted right by half its width
// (keeping the upper half) and the half width results are packed
// contiguously at the start of the buffer.
//
// The half width and full width views alias the same memory, so the data
// is staged through a static device buffer between two kernels.
// NOTE(review): relies on accelerator_for completing before the next
// launch, and on acceleratorSIMTlane(nsimd) visiting every lane; confirm
// for non-SIMT builds.
////////////////////////////////////////////////////////////////////////
void CompressPacket(Packet &packet)
{
  if ( !SloppyComms ) {
    packet.xbytes_compressed = packet.xbytes;
    packet.rbytes_compressed = packet.rbytes;
    return;
  }

  typedef typename getPrecision<cobj>::real_scalar_type word;
  uint64_t words = packet.xbytes/sizeof(word);
  const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
  const uint64_t outer = words/nsimd;

  if (packet.do_send) {

    if(sizeof(word)==8) {

      static deviceVector<uint32_t> compression_buffer;

      if(words > compression_buffer.size() ) compression_buffer.resize(words);

      uint64_t *fbuf =(uint64_t *) packet.send_buf;
      uint32_t *fhbuf=(uint32_t *) packet.send_buf;
      uint32_t *hbuf =(uint32_t *) &compression_buffer[0];

      accelerator_for(ss,outer,nsimd,{
        int lane = acceleratorSIMTlane(nsimd);
        hbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]>>32; // truncate and copy
      });
      accelerator_for(ss,outer,nsimd,{
        int lane = acceleratorSIMTlane(nsimd); // fix: lane was previously undeclared in this kernel
        fhbuf[ss*nsimd+lane] = hbuf[ss*nsimd+lane]; // copy back
      });

    } else if ( sizeof(word)==4){

      static deviceVector<uint16_t> compression_buffer;

      if(words > compression_buffer.size() ) compression_buffer.resize(words);

      uint32_t *fbuf =(uint32_t *) packet.send_buf;
      uint16_t *fhbuf=(uint16_t *) packet.send_buf;
      uint16_t *hbuf =(uint16_t *) &compression_buffer[0];
      accelerator_for(ss,outer,nsimd,{
        int lane = acceleratorSIMTlane(nsimd);
        hbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]>>16;
      });
      accelerator_for(ss,outer,nsimd,{
        int lane = acceleratorSIMTlane(nsimd); // fix: lane was previously undeclared in this kernel
        fhbuf[ss*nsimd+lane] = hbuf[ss*nsimd+lane];
      });

    } else {
      assert(0 && "unknown floating point precision");
    }
  }

  // Byte counts are halved whether or not this packet sends.
  packet.xbytes_compressed = packet.xbytes/2;
  packet.rbytes_compressed = packet.rbytes/2;

  return;
}
|
||||||
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
|
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
|
||||||
{
|
{
|
||||||
// std::cout << "Communicate Begin "<<std::endl;
|
// std::cout << "Communicate Begin "<<std::endl;
|
||||||
@ -371,6 +493,9 @@ public:
|
|||||||
// _grid->StencilBarrier(); // Everyone is here, so no one running slow and still using receive buffer
|
// _grid->StencilBarrier(); // Everyone is here, so no one running slow and still using receive buffer
|
||||||
// But the HaloGather had a barrier too.
|
// But the HaloGather had a barrier too.
|
||||||
for(int i=0;i<Packets.size();i++){
|
for(int i=0;i<Packets.size();i++){
|
||||||
|
|
||||||
|
this->CompressPacket(Packets[i]);
|
||||||
|
|
||||||
// std::cout << "Communicate prepare "<<i<<std::endl;
|
// std::cout << "Communicate prepare "<<i<<std::endl;
|
||||||
// _grid->Barrier();
|
// _grid->Barrier();
|
||||||
_grid->StencilSendToRecvFromPrepare(MpiReqs,
|
_grid->StencilSendToRecvFromPrepare(MpiReqs,
|
||||||
@ -378,7 +503,7 @@ public:
|
|||||||
Packets[i].to_rank,Packets[i].do_send,
|
Packets[i].to_rank,Packets[i].do_send,
|
||||||
Packets[i].recv_buf,
|
Packets[i].recv_buf,
|
||||||
Packets[i].from_rank,Packets[i].do_recv,
|
Packets[i].from_rank,Packets[i].do_recv,
|
||||||
Packets[i].xbytes,Packets[i].rbytes,i);
|
Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i);
|
||||||
}
|
}
|
||||||
// std::cout << "Communicate PollDtoH "<<std::endl;
|
// std::cout << "Communicate PollDtoH "<<std::endl;
|
||||||
// _grid->Barrier();
|
// _grid->Barrier();
|
||||||
@ -394,14 +519,14 @@ public:
|
|||||||
Packets[i].to_rank,Packets[i].do_send,
|
Packets[i].to_rank,Packets[i].do_send,
|
||||||
Packets[i].recv_buf,
|
Packets[i].recv_buf,
|
||||||
Packets[i].from_rank,Packets[i].do_recv,
|
Packets[i].from_rank,Packets[i].do_recv,
|
||||||
Packets[i].xbytes,Packets[i].rbytes,i);
|
Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i);
|
||||||
}
|
}
|
||||||
FlightRecorder::StepLog("Communicate begin has finished");
|
FlightRecorder::StepLog("Communicate begin has finished");
|
||||||
// Get comms started then run checksums
|
// Get comms started then run checksums
|
||||||
// Having this PRIOR to the dslash seems to make Sunspot work... (!)
|
// Having this PRIOR to the dslash seems to make Sunspot work... (!)
|
||||||
for(int i=0;i<Packets.size();i++){
|
for(int i=0;i<Packets.size();i++){
|
||||||
if ( Packets[i].do_send )
|
if ( Packets[i].do_send )
|
||||||
FlightRecorder::xmitLog(Packets[i].send_buf,Packets[i].xbytes);
|
FlightRecorder::xmitLog(Packets[i].send_buf,Packets[i].xbytes_compressed);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -416,14 +541,15 @@ public:
|
|||||||
// std::cout << "Communicate Complete Complete "<<std::endl;
|
// std::cout << "Communicate Complete Complete "<<std::endl;
|
||||||
// _grid->Barrier();
|
// _grid->Barrier();
|
||||||
_grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
|
_grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
|
||||||
if ( this->partialDirichlet ) DslashLogPartial();
|
// if ( this->partialDirichlet ) DslashLogPartial();
|
||||||
else if ( this->fullDirichlet ) DslashLogDirichlet();
|
if ( this->fullDirichlet ) DslashLogDirichlet();
|
||||||
else DslashLogFull();
|
else DslashLogFull();
|
||||||
// acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
|
// acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
|
||||||
// accelerator_barrier();
|
// accelerator_barrier();
|
||||||
for(int i=0;i<Packets.size();i++){
|
for(int i=0;i<Packets.size();i++){
|
||||||
if ( Packets[i].do_recv )
|
if ( Packets[i].do_recv )
|
||||||
FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
|
FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes_compressed,Packets[i].from_rank);
|
||||||
|
this->DecompressPacket(Packets[i]);
|
||||||
}
|
}
|
||||||
FlightRecorder::StepLog("Finish communicate complete");
|
FlightRecorder::StepLog("Finish communicate complete");
|
||||||
}
|
}
|
||||||
@ -618,7 +744,7 @@ public:
|
|||||||
}
|
}
|
||||||
void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size,std::vector<Decompress> &dv) {
|
void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size,std::vector<Decompress> &dv) {
|
||||||
Decompress d;
|
Decompress d;
|
||||||
d.partial = this->partialDirichlet;
|
// d.partial = this->partialDirichlet;
|
||||||
d.dims = _grid->_fdimensions;
|
d.dims = _grid->_fdimensions;
|
||||||
d.kernel_p = k_p;
|
d.kernel_p = k_p;
|
||||||
d.mpi_p = m_p;
|
d.mpi_p = m_p;
|
||||||
@ -627,7 +753,7 @@ public:
|
|||||||
}
|
}
|
||||||
void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
|
void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
|
||||||
Merge m;
|
Merge m;
|
||||||
m.partial = this->partialDirichlet;
|
// m.partial = this->partialDirichlet;
|
||||||
m.dims = _grid->_fdimensions;
|
m.dims = _grid->_fdimensions;
|
||||||
m.type = type;
|
m.type = type;
|
||||||
m.mpointer = merge_p;
|
m.mpointer = merge_p;
|
||||||
@ -732,8 +858,8 @@ public:
|
|||||||
int block = dirichlet_block[dimension];
|
int block = dirichlet_block[dimension];
|
||||||
this->_comms_send[ii] = comm_dim;
|
this->_comms_send[ii] = comm_dim;
|
||||||
this->_comms_recv[ii] = comm_dim;
|
this->_comms_recv[ii] = comm_dim;
|
||||||
this->_comms_partial_send[ii] = 0;
|
// this->_comms_partial_send[ii] = 0;
|
||||||
this->_comms_partial_recv[ii] = 0;
|
// this->_comms_partial_recv[ii] = 0;
|
||||||
if ( block && comm_dim ) {
|
if ( block && comm_dim ) {
|
||||||
assert(abs(displacement) < ld );
|
assert(abs(displacement) < ld );
|
||||||
// Quiesce communication across block boundaries
|
// Quiesce communication across block boundaries
|
||||||
@ -754,10 +880,10 @@ public:
|
|||||||
if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_send[ii] = 0;
|
if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_send[ii] = 0;
|
||||||
if ( ( (ld*pc ) % block ) == 0 ) this->_comms_recv[ii] = 0;
|
if ( ( (ld*pc ) % block ) == 0 ) this->_comms_recv[ii] = 0;
|
||||||
}
|
}
|
||||||
if ( partialDirichlet ) {
|
// if ( partialDirichlet ) {
|
||||||
this->_comms_partial_send[ii] = !this->_comms_send[ii];
|
// this->_comms_partial_send[ii] = !this->_comms_send[ii];
|
||||||
this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
|
// this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
|
||||||
}
|
// }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -769,6 +895,7 @@ public:
|
|||||||
Parameters p=Parameters(),
|
Parameters p=Parameters(),
|
||||||
bool preserve_shm=false)
|
bool preserve_shm=false)
|
||||||
{
|
{
|
||||||
|
SloppyComms = 0;
|
||||||
face_table_computed=0;
|
face_table_computed=0;
|
||||||
_grid = grid;
|
_grid = grid;
|
||||||
this->parameters=p;
|
this->parameters=p;
|
||||||
@ -786,7 +913,7 @@ public:
|
|||||||
this->same_node.resize(npoints);
|
this->same_node.resize(npoints);
|
||||||
|
|
||||||
if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0);
|
if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0);
|
||||||
partialDirichlet = p.partialDirichlet;
|
// partialDirichlet = p.partialDirichlet;
|
||||||
DirichletBlock(p.dirichlet); // comms send/recv set up
|
DirichletBlock(p.dirichlet); // comms send/recv set up
|
||||||
fullDirichlet=0;
|
fullDirichlet=0;
|
||||||
for(int d=0;d<p.dirichlet.size();d++){
|
for(int d=0;d<p.dirichlet.size();d++){
|
||||||
@ -935,7 +1062,8 @@ public:
|
|||||||
GridBase *grid=_grid;
|
GridBase *grid=_grid;
|
||||||
const int Nsimd = grid->Nsimd();
|
const int Nsimd = grid->Nsimd();
|
||||||
|
|
||||||
int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
|
// int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
|
||||||
|
int comms_recv = this->_comms_recv[point];
|
||||||
int fd = _grid->_fdimensions[dimension];
|
int fd = _grid->_fdimensions[dimension];
|
||||||
int ld = _grid->_ldimensions[dimension];
|
int ld = _grid->_ldimensions[dimension];
|
||||||
int rd = _grid->_rdimensions[dimension];
|
int rd = _grid->_rdimensions[dimension];
|
||||||
@ -1124,8 +1252,8 @@ public:
|
|||||||
|
|
||||||
int comms_send = this->_comms_send[point];
|
int comms_send = this->_comms_send[point];
|
||||||
int comms_recv = this->_comms_recv[point];
|
int comms_recv = this->_comms_recv[point];
|
||||||
int comms_partial_send = this->_comms_partial_send[point] ;
|
// int comms_partial_send = this->_comms_partial_send[point] ;
|
||||||
int comms_partial_recv = this->_comms_partial_recv[point] ;
|
// int comms_partial_recv = this->_comms_partial_recv[point] ;
|
||||||
|
|
||||||
assert(rhs.Grid()==_grid);
|
assert(rhs.Grid()==_grid);
|
||||||
// conformable(_grid,rhs.Grid());
|
// conformable(_grid,rhs.Grid());
|
||||||
@ -1160,11 +1288,11 @@ public:
|
|||||||
int rbytes;
|
int rbytes;
|
||||||
|
|
||||||
if ( comms_send ) xbytes = bytes; // Full send
|
if ( comms_send ) xbytes = bytes; // Full send
|
||||||
else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
// else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
||||||
else xbytes = 0; // full dirichlet
|
else xbytes = 0; // full dirichlet
|
||||||
|
|
||||||
if ( comms_recv ) rbytes = bytes;
|
if ( comms_recv ) rbytes = bytes;
|
||||||
else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
// else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
||||||
else rbytes = 0;
|
else rbytes = 0;
|
||||||
|
|
||||||
int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
|
int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
|
||||||
@ -1191,7 +1319,8 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
|
// if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
|
||||||
|
if ( compress.DecompressionStep()&&comms_recv) {
|
||||||
recv_buf=u_simd_recv_buf[0];
|
recv_buf=u_simd_recv_buf[0];
|
||||||
} else {
|
} else {
|
||||||
recv_buf=this->u_recv_buf_p;
|
recv_buf=this->u_recv_buf_p;
|
||||||
@ -1225,7 +1354,8 @@ public:
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// std::cout << " GatherPlaneSimple partial send "<< comms_partial_send<<std::endl;
|
// std::cout << " GatherPlaneSimple partial send "<< comms_partial_send<<std::endl;
|
||||||
compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
|
// compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
|
||||||
|
compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,0);
|
||||||
|
|
||||||
int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,xbytes,rbytes,cbmask);
|
int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,xbytes,rbytes,cbmask);
|
||||||
if ( !duplicate ) { // Force comms for now
|
if ( !duplicate ) { // Force comms for now
|
||||||
@ -1234,8 +1364,8 @@ public:
|
|||||||
// Build a list of things to do after we synchronise GPUs
|
// Build a list of things to do after we synchronise GPUs
|
||||||
// Start comms now???
|
// Start comms now???
|
||||||
///////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////
|
||||||
int do_send = (comms_send|comms_partial_send) && (!shm_send );
|
int do_send = (comms_send) && (!shm_send );
|
||||||
int do_recv = (comms_send|comms_partial_send) && (!shm_recv );
|
int do_recv = (comms_send) && (!shm_recv );
|
||||||
AddPacket((void *)&send_buf[comm_off],
|
AddPacket((void *)&send_buf[comm_off],
|
||||||
(void *)&recv_buf[comm_off],
|
(void *)&recv_buf[comm_off],
|
||||||
xmit_to_rank, do_send,
|
xmit_to_rank, do_send,
|
||||||
@ -1243,7 +1373,7 @@ public:
|
|||||||
xbytes,rbytes);
|
xbytes,rbytes);
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( (compress.DecompressionStep() && comms_recv) || comms_partial_recv ) {
|
if ( (compress.DecompressionStep() && comms_recv) ) {
|
||||||
AddDecompress(&this->u_recv_buf_p[comm_off],
|
AddDecompress(&this->u_recv_buf_p[comm_off],
|
||||||
&recv_buf[comm_off],
|
&recv_buf[comm_off],
|
||||||
words,Decompressions);
|
words,Decompressions);
|
||||||
@ -1265,8 +1395,8 @@ public:
|
|||||||
|
|
||||||
int comms_send = this->_comms_send[point];
|
int comms_send = this->_comms_send[point];
|
||||||
int comms_recv = this->_comms_recv[point];
|
int comms_recv = this->_comms_recv[point];
|
||||||
int comms_partial_send = this->_comms_partial_send[point] ;
|
// int comms_partial_send = this->_comms_partial_send[point] ;
|
||||||
int comms_partial_recv = this->_comms_partial_recv[point] ;
|
// int comms_partial_recv = this->_comms_partial_recv[point] ;
|
||||||
|
|
||||||
int fd = _grid->_fdimensions[dimension];
|
int fd = _grid->_fdimensions[dimension];
|
||||||
int rd = _grid->_rdimensions[dimension];
|
int rd = _grid->_rdimensions[dimension];
|
||||||
@ -1341,18 +1471,20 @@ public:
|
|||||||
|
|
||||||
|
|
||||||
if ( comms_send ) xbytes = bytes;
|
if ( comms_send ) xbytes = bytes;
|
||||||
else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
// else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
||||||
else xbytes = 0;
|
else xbytes = 0;
|
||||||
|
|
||||||
if ( comms_recv ) rbytes = bytes;
|
if ( comms_recv ) rbytes = bytes;
|
||||||
else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
// else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
||||||
else rbytes = 0;
|
else rbytes = 0;
|
||||||
|
|
||||||
// Gathers SIMD lanes for send and merge
|
// Gathers SIMD lanes for send and merge
|
||||||
// Different faces can be full comms or partial comms with multiple ranks per node
|
// Different faces can be full comms or partial comms with multiple ranks per node
|
||||||
if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
|
// if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
|
||||||
|
if ( comms_send || comms_recv ) {
|
||||||
|
|
||||||
int partial = partialDirichlet;
|
// int partial = partialDirichlet;
|
||||||
|
int partial = 0;
|
||||||
compressor::Gather_plane_exchange(face_table[face_idx],rhs,
|
compressor::Gather_plane_exchange(face_table[face_idx],rhs,
|
||||||
spointers,dimension,sx,cbmask,
|
spointers,dimension,sx,cbmask,
|
||||||
compress,permute_type,partial );
|
compress,permute_type,partial );
|
||||||
@ -1418,7 +1550,8 @@ public:
|
|||||||
if ( (bytes != rbytes) && (rbytes!=0) ){
|
if ( (bytes != rbytes) && (rbytes!=0) ){
|
||||||
acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero
|
acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero
|
||||||
}
|
}
|
||||||
int do_send = (comms_send|comms_partial_send) && (!shm_send );
|
// int do_send = (comms_send|comms_partial_send) && (!shm_send );
|
||||||
|
int do_send = (comms_send) && (!shm_send );
|
||||||
AddPacket((void *)sp,(void *)rp,
|
AddPacket((void *)sp,(void *)rp,
|
||||||
xmit_to_rank,do_send,
|
xmit_to_rank,do_send,
|
||||||
recv_from_rank,do_send,
|
recv_from_rank,do_send,
|
||||||
@ -1432,7 +1565,8 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
// rpointer may be doing a remote read in the gather over SHM
|
// rpointer may be doing a remote read in the gather over SHM
|
||||||
if ( comms_recv|comms_partial_recv ) {
|
// if ( comms_recv|comms_partial_recv ) {
|
||||||
|
if ( comms_recv ) {
|
||||||
AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers);
|
AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user