Mirror of https://github.com/paboyle/Grid.git

Remove partial Dirichlet; favour introducing reduced-precision comms options

commit 821358eda7
parent fce6e1f135
Author: Peter Boyle
Date:   2025-06-13 05:08:45 +02:00


@@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid);
 // These can move into a params header and be given MacroMagic serialisation
 struct DefaultImplParams {
   Coordinate dirichlet; // Blocksize of dirichlet BCs
-  int partialDirichlet;
+  //  int partialDirichlet;
   DefaultImplParams() {
     dirichlet.resize(0);
-    partialDirichlet=0;
+    //    partialDirichlet=0;
   };
 };
@@ -113,8 +113,8 @@ class CartesianStencilAccelerator {
   ///////////////////////////////////////////////////
   // If true, this is partially communicated per face
   ///////////////////////////////////////////////////
-  StencilVector _comms_partial_send;
-  StencilVector _comms_partial_recv;
+  //  StencilVector _comms_partial_send;
+  //  StencilVector _comms_partial_recv;
   //
   StencilVector _comm_buf_size;
   StencilVector _permute_type;
@@ -205,6 +205,8 @@ public:
   struct Packet {
     void * send_buf;
     void * recv_buf;
+    void * compressed_send_buf;
+    void * compressed_recv_buf;
 #ifndef ACCELERATOR_AWARE_MPI
     void * host_send_buf; // Allocate this if not MPI_CUDA_AWARE
     void * host_recv_buf; // Allocate this if not MPI_CUDA_AWARE
@@ -215,6 +217,8 @@ public:
     Integer do_recv;
     Integer xbytes;
     Integer rbytes;
+    Integer xbytes_compressed;
+    Integer rbytes_compressed;
   };
   struct Merge {
     static constexpr int Nsimd = vobj::Nsimd();
@@ -223,7 +227,7 @@ public:
     std::vector<cobj *> vpointers;
     Integer buffer_size;
     Integer type;
-    Integer partial; // partial dirichlet BCs
+    //    Integer partial; // partial dirichlet BCs
     Coordinate dims;
   };
   struct Decompress {
@@ -231,7 +235,7 @@ public:
     cobj * kernel_p;
     cobj * mpi_p;
     Integer buffer_size;
-    Integer partial; // partial dirichlet BCs
+    //    Integer partial; // partial dirichlet BCs
     Coordinate dims;
   };
   struct CopyReceiveBuffer {
@@ -255,6 +259,12 @@ protected:
 public:
   GridBase *Grid(void) const { return _grid; }
+  /////////////////////////////////////////////////////////
+  // Control reduced precision comms
+  /////////////////////////////////////////////////////////
+  int SloppyComms;
+  void SetSloppyComms(int sloppy) { SloppyComms = sloppy; };
+
   ////////////////////////////////////////////////////////////////////////
   // Needed to conveniently communicate gparity parameters into GPU memory
   // without adding parameters. Perhaps a template parameter to StenciView is
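
The new SetSloppyComms flag is the only user-facing switch. A minimal usage sketch (hypothetical driver code, not part of this commit; any object exposing the setter above would do):

template<class Stencil>
void run_with_sloppy_halos(Stencil &stencil) {
  stencil.SetSloppyComms(1); // halo faces now travel at half width on the wire
  // ... precision-tolerant inner iterations here ...
  stencil.SetSloppyComms(0); // restore full-precision comms
}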
@@ -268,7 +278,7 @@ public:
   }
   int face_table_computed;
-  int partialDirichlet;
+  //  int partialDirichlet;
   int fullDirichlet;
   std::vector<deviceVector<std::pair<int,int> > > face_table ;
   deviceVector<int> surface_list;
@@ -361,6 +371,118 @@ public:
   ////////////////////////////////////////////////////////////////////////
   // Non blocking send and receive. Necessarily parallel.
   ////////////////////////////////////////////////////////////////////////
+  void DecompressPacket(Packet &packet)
+  {
+    if ( !SloppyComms ) return;
+
+    if ( packet.do_recv ) {
+      typedef typename getPrecision<cobj>::real_scalar_type word;
+      uint64_t words = packet.rbytes/sizeof(word);
+      const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
+      const uint64_t outer = words/nsimd;
+
+      if(sizeof(word)==8) {
+
+        // Can either choose to represent as float vs double and prec change
+        // OR
+        // truncate the mantissa bfp16 style
+        static deviceVector<uint32_t> compression_buffer;
+        if(words > compression_buffer.size() ) compression_buffer.resize(words);
+
+        uint64_t *fbuf =(uint64_t *) packet.recv_buf;
+        uint32_t *fhbuf=(uint32_t *) packet.recv_buf;
+        uint32_t *hbuf =(uint32_t *) &compression_buffer[0];
+
+        accelerator_for(ss,outer,nsimd,{
+            int lane = acceleratorSIMTlane(nsimd);
+            hbuf[ss*nsimd+lane] = fhbuf[ss*nsimd+lane]; // copy at half precision
+          });
+        accelerator_for(ss,outer,nsimd,{
+            int lane = acceleratorSIMTlane(nsimd);
+            fbuf[ss*nsimd+lane] = ((uint64_t)hbuf[ss*nsimd+lane])<<32; // copy back and pad each word with zeroes
+          });
+
+      } else if ( sizeof(word)==4){
+
+        // Can either choose to represent as half vs float and prec change
+        // OR
+        // truncate the mantissa bfp16 style
+        static deviceVector<uint16_t> compression_buffer;
+        if(words > compression_buffer.size() ) compression_buffer.resize(words);
+
+        uint32_t *fbuf =(uint32_t *) packet.recv_buf;
+        uint16_t *fhbuf=(uint16_t *) packet.recv_buf;
+        uint16_t *hbuf =(uint16_t *) &compression_buffer[0];
+
+        accelerator_for(ss,outer,nsimd,{
+            int lane = acceleratorSIMTlane(nsimd);
+            hbuf[ss*nsimd+lane] = fhbuf[ss*nsimd+lane]; // copy at half precision
+          });
+        accelerator_for(ss,outer,nsimd,{
+            int lane = acceleratorSIMTlane(nsimd);
+            fbuf[ss*nsimd+lane] = ((uint32_t)hbuf[ss*nsimd+lane])<<16; // copy back and pad each word with zeroes
+          });
+
+      } else {
+        assert(0 && "unknown floating point precision");
+      }
+    }
+  }
+  void CompressPacket(Packet &packet)
+  {
+    if ( !SloppyComms ) {
+      packet.xbytes_compressed = packet.xbytes;
+      packet.rbytes_compressed = packet.rbytes;
+      return;
+    }
+
+    typedef typename getPrecision<cobj>::real_scalar_type word;
+    uint64_t words = packet.xbytes/sizeof(word);
+    const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
+    const uint64_t outer = words/nsimd;
+
+    if (packet.do_send) {
+      if(sizeof(word)==8) {
+
+        static deviceVector<uint32_t> compression_buffer;
+        if(words > compression_buffer.size() ) compression_buffer.resize(words);
+
+        uint64_t *fbuf =(uint64_t *) packet.send_buf;
+        uint32_t *fhbuf=(uint32_t *) packet.send_buf;
+        uint32_t *hbuf =(uint32_t *) &compression_buffer[0];
+
+        accelerator_for(ss,outer,nsimd,{
+            int lane = acceleratorSIMTlane(nsimd);
+            hbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]>>32; // truncate and copy
+          });
+        accelerator_for(ss,outer,nsimd,{
+            int lane = acceleratorSIMTlane(nsimd);
+            fhbuf[ss*nsimd+lane] = hbuf[ss*nsimd+lane]; // copy back, packed to the front
+          });
+
+      } else if ( sizeof(word)==4){
+
+        static deviceVector<uint16_t> compression_buffer;
+        if(words > compression_buffer.size() ) compression_buffer.resize(words);
+
+        uint32_t *fbuf =(uint32_t *) packet.send_buf;
+        uint16_t *fhbuf=(uint16_t *) packet.send_buf;
+        uint16_t *hbuf =(uint16_t *) &compression_buffer[0];
+
+        accelerator_for(ss,outer,nsimd,{
+            int lane = acceleratorSIMTlane(nsimd);
+            hbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]>>16; // truncate and copy
+          });
+        accelerator_for(ss,outer,nsimd,{
+            int lane = acceleratorSIMTlane(nsimd);
+            fhbuf[ss*nsimd+lane] = hbuf[ss*nsimd+lane]; // copy back, packed to the front
+          });
+
+      } else {
+        assert(0 && "unknown floating point precision");
+      }
+    }
+    packet.xbytes_compressed = packet.xbytes/2;
+    packet.rbytes_compressed = packet.rbytes/2;
+    return;
+  }
   void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
   {
     // std::cout << "Communicate Begin "<<std::endl;
@@ -371,6 +493,9 @@ public:
     // _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
     // But the HaloGather had a barrier too.
     for(int i=0;i<Packets.size();i++){
+
+      this->CompressPacket(Packets[i]);
+
       // std::cout << "Communicate prepare "<<i<<std::endl;
       // _grid->Barrier();
       _grid->StencilSendToRecvFromPrepare(MpiReqs,
@@ -378,7 +503,7 @@ public:
                                          Packets[i].to_rank,Packets[i].do_send,
                                          Packets[i].recv_buf,
                                          Packets[i].from_rank,Packets[i].do_recv,
-                                         Packets[i].xbytes,Packets[i].rbytes,i);
+                                         Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i);
     }
     // std::cout << "Communicate PollDtoH "<<std::endl;
     // _grid->Barrier();
@@ -394,14 +519,14 @@ public:
                                       Packets[i].to_rank,Packets[i].do_send,
                                       Packets[i].recv_buf,
                                       Packets[i].from_rank,Packets[i].do_recv,
-                                      Packets[i].xbytes,Packets[i].rbytes,i);
+                                      Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i);
     }
     FlightRecorder::StepLog("Communicate begin has finished");
     // Get comms started then run checksums
     // Having this PRIOR to the dslash seems to make Sunspot work... (!)
     for(int i=0;i<Packets.size();i++){
       if ( Packets[i].do_send )
-        FlightRecorder::xmitLog(Packets[i].send_buf,Packets[i].xbytes);
+        FlightRecorder::xmitLog(Packets[i].send_buf,Packets[i].xbytes_compressed);
     }
   }
@@ -416,14 +541,15 @@ public:
     // std::cout << "Communicate Complete Complete "<<std::endl;
     // _grid->Barrier();
     _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
-    if ( this->partialDirichlet ) DslashLogPartial();
-    else if ( this->fullDirichlet ) DslashLogDirichlet();
+    //    if ( this->partialDirichlet ) DslashLogPartial();
+    if ( this->fullDirichlet ) DslashLogDirichlet();
     else DslashLogFull();
     // acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
     // accelerator_barrier();
     for(int i=0;i<Packets.size();i++){
       if ( Packets[i].do_recv )
-        FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
+        FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes_compressed,Packets[i].from_rank);
+      this->DecompressPacket(Packets[i]);
     }
     FlightRecorder::StepLog("Finish communicate complete");
   }
@@ -618,7 +744,7 @@ public:
   }
   void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size,std::vector<Decompress> &dv) {
     Decompress d;
-    d.partial = this->partialDirichlet;
+    //    d.partial = this->partialDirichlet;
     d.dims = _grid->_fdimensions;
     d.kernel_p = k_p;
     d.mpi_p = m_p;
@@ -627,7 +753,7 @@ public:
   }
   void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
     Merge m;
-    m.partial = this->partialDirichlet;
+    //    m.partial = this->partialDirichlet;
     m.dims = _grid->_fdimensions;
     m.type = type;
     m.mpointer = merge_p;
@@ -732,8 +858,8 @@ public:
       int block = dirichlet_block[dimension];
       this->_comms_send[ii] = comm_dim;
       this->_comms_recv[ii] = comm_dim;
-      this->_comms_partial_send[ii] = 0;
-      this->_comms_partial_recv[ii] = 0;
+      //      this->_comms_partial_send[ii] = 0;
+      //      this->_comms_partial_recv[ii] = 0;
       if ( block && comm_dim ) {
         assert(abs(displacement) < ld );
         // Quiesce communication across block boundaries
@@ -754,10 +880,10 @@ public:
         if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_send[ii] = 0;
         if ( ( (ld*pc ) % block ) == 0 ) this->_comms_recv[ii] = 0;
       }
-      if ( partialDirichlet ) {
-        this->_comms_partial_send[ii] = !this->_comms_send[ii];
-        this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
-      }
+      //      if ( partialDirichlet ) {
+      //        this->_comms_partial_send[ii] = !this->_comms_send[ii];
+      //        this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
+      //      }
     }
   }
 }
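
The block-boundary test visible in the context lines decides which faces still communicate at all under Dirichlet blocking. A toy version of that rule (standalone sketch; ld, pc, block named as in the hunk above):

#include <cstdio>

// A rank sends across its upper face only if that face is not a
// Dirichlet block boundary: ld = local extent, pc = processor
// coordinate, block = Dirichlet blocksize in this dimension.
int sends_up(int ld, int pc, int block) {
  if (!block) return 1;                   // no Dirichlet blocking here
  return ((ld * (pc + 1)) % block) != 0;  // face on a block edge -> quiesce
}

int main() {
  // e.g. 4 ranks of local extent 8, blocksize 16: ranks 1 and 3 suppress sends
  for (int pc = 0; pc < 4; pc++)
    std::printf("pc=%d sends_up=%d\n", pc, sends_up(8, pc, 16));
  return 0;
}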
@@ -769,6 +895,7 @@ public:
                    Parameters p=Parameters(),
                    bool preserve_shm=false)
   {
+    SloppyComms = 0;
     face_table_computed=0;
     _grid = grid;
     this->parameters=p;
@@ -786,7 +913,7 @@ public:
     this->same_node.resize(npoints);

     if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0);
-    partialDirichlet = p.partialDirichlet;
+    //    partialDirichlet = p.partialDirichlet;
     DirichletBlock(p.dirichlet); // comms send/recv set up
     fullDirichlet=0;
     for(int d=0;d<p.dirichlet.size();d++){
@@ -935,7 +1062,8 @@ public:
     GridBase *grid=_grid;
     const int Nsimd = grid->Nsimd();

-    int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
+    //    int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
+    int comms_recv = this->_comms_recv[point];
     int fd = _grid->_fdimensions[dimension];
     int ld = _grid->_ldimensions[dimension];
     int rd = _grid->_rdimensions[dimension];
@@ -1124,8 +1252,8 @@ public:
     int comms_send = this->_comms_send[point];
     int comms_recv = this->_comms_recv[point];
-    int comms_partial_send = this->_comms_partial_send[point] ;
-    int comms_partial_recv = this->_comms_partial_recv[point] ;
+    //    int comms_partial_send = this->_comms_partial_send[point] ;
+    //    int comms_partial_recv = this->_comms_partial_recv[point] ;

     assert(rhs.Grid()==_grid);
     // conformable(_grid,rhs.Grid());
@@ -1160,11 +1288,11 @@ public:
       int rbytes;

       if ( comms_send ) xbytes = bytes; // Full send
-      else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
+      //      else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
       else xbytes = 0; // full dirichlet

       if ( comms_recv ) rbytes = bytes;
-      else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
+      //      else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
       else rbytes = 0;

       int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
@@ -1191,7 +1319,8 @@ public:
       }

-      if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
+      //      if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
+      if ( compress.DecompressionStep()&&comms_recv) {
         recv_buf=u_simd_recv_buf[0];
       } else {
         recv_buf=this->u_recv_buf_p;
@@ -1225,7 +1354,8 @@ public:
 #endif
       // std::cout << " GatherPlaneSimple partial send "<< comms_partial_send<<std::endl;
-      compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
+      //      compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
+      compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,0);

       int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,xbytes,rbytes,cbmask);
       if ( !duplicate ) { // Force comms for now
@@ -1234,8 +1364,8 @@ public:
         ///////////////////////////////////////////////////////////
         // Build a list of things to do after we synchronise GPUs
         // Start comms now???
         ///////////////////////////////////////////////////////////
-        int do_send = (comms_send|comms_partial_send) && (!shm_send );
-        int do_recv = (comms_send|comms_partial_send) && (!shm_recv );
+        int do_send = (comms_send) && (!shm_send );
+        int do_recv = (comms_send) && (!shm_recv );
         AddPacket((void *)&send_buf[comm_off],
                   (void *)&recv_buf[comm_off],
                   xmit_to_rank, do_send,
@@ -1243,7 +1373,7 @@ public:
                   xbytes,rbytes);
       }

-      if ( (compress.DecompressionStep() && comms_recv) || comms_partial_recv ) {
+      if ( (compress.DecompressionStep() && comms_recv) ) {
         AddDecompress(&this->u_recv_buf_p[comm_off],
                       &recv_buf[comm_off],
                       words,Decompressions);
@@ -1265,8 +1395,8 @@ public:
     int comms_send = this->_comms_send[point];
     int comms_recv = this->_comms_recv[point];
-    int comms_partial_send = this->_comms_partial_send[point] ;
-    int comms_partial_recv = this->_comms_partial_recv[point] ;
+    //    int comms_partial_send = this->_comms_partial_send[point] ;
+    //    int comms_partial_recv = this->_comms_partial_recv[point] ;

     int fd = _grid->_fdimensions[dimension];
     int rd = _grid->_rdimensions[dimension];
@@ -1341,18 +1471,20 @@ public:
         if ( comms_send ) xbytes = bytes;
-        else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
+        //        else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
         else xbytes = 0;

         if ( comms_recv ) rbytes = bytes;
-        else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
+        //        else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
         else rbytes = 0;

         // Gathers SIMD lanes for send and merge
         // Different faces can be full comms or partial comms with multiple ranks per node
-        if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
+        //        if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
+        if ( comms_send || comms_recv ) {

-          int partial = partialDirichlet;
+          //          int partial = partialDirichlet;
+          int partial = 0;
           compressor::Gather_plane_exchange(face_table[face_idx],rhs,
                                             spointers,dimension,sx,cbmask,
                                             compress,permute_type,partial );
@@ -1418,7 +1550,8 @@ public:
           if ( (bytes != rbytes) && (rbytes!=0) ){
             acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero
           }
-          int do_send = (comms_send|comms_partial_send) && (!shm_send );
+          //          int do_send = (comms_send|comms_partial_send) && (!shm_send );
+          int do_send = (comms_send) && (!shm_send );
           AddPacket((void *)sp,(void *)rp,
                     xmit_to_rank,do_send,
                     recv_from_rank,do_send,
@@ -1432,7 +1565,8 @@ public:
         }
       }

       // rpointer may be doing a remote read in the gather over SHM
-      if ( comms_recv|comms_partial_recv ) {
+      //      if ( comms_recv|comms_partial_recv ) {
+      if ( comms_recv ) {
         AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers);
       }