mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-09 21:50:45 +01:00
Clean up stencil with better intranode Dirichlet / DDHMC support.
14TF/s on a Perlmutter node
This commit is contained in:
parent
47b4e91473
commit
e651b9e7ab
@ -502,7 +502,6 @@ public:
|
|||||||
}
|
}
|
||||||
void AddCopy(void *from,void * to, Integer bytes)
|
void AddCopy(void *from,void * to, Integer bytes)
|
||||||
{
|
{
|
||||||
// std::cout << "Adding CopyReceiveBuffer "<<std::hex<<from<<" "<<to<<std::dec<<" "<<bytes<<std::endl;
|
|
||||||
CopyReceiveBuffer obj;
|
CopyReceiveBuffer obj;
|
||||||
obj.from_p = from;
|
obj.from_p = from;
|
||||||
obj.to_p = to;
|
obj.to_p = to;
|
||||||
@ -516,7 +515,7 @@ public:
|
|||||||
cobj *from=(cobj *)CopyReceiveBuffers[i].from_p;
|
cobj *from=(cobj *)CopyReceiveBuffers[i].from_p;
|
||||||
cobj *to =(cobj *)CopyReceiveBuffers[i].to_p;
|
cobj *to =(cobj *)CopyReceiveBuffers[i].to_p;
|
||||||
Integer words = CopyReceiveBuffers[i].bytes/sizeof(cobj);
|
Integer words = CopyReceiveBuffers[i].bytes/sizeof(cobj);
|
||||||
// std::cout << "CopyReceiveBuffer "<<std::hex<<from<<" "<<to<<std::dec<<" "<<words*sizeof(cobj)<<std::endl;
|
|
||||||
accelerator_forNB(j, words, cobj::Nsimd(), {
|
accelerator_forNB(j, words, cobj::Nsimd(), {
|
||||||
coalescedWrite(to[j] ,coalescedRead(from [j]));
|
coalescedWrite(to[j] ,coalescedRead(from [j]));
|
||||||
});
|
});
|
||||||
@ -542,13 +541,12 @@ public:
|
|||||||
&&(CachedTransfers[i].lane ==lane)
|
&&(CachedTransfers[i].lane ==lane)
|
||||||
&&(CachedTransfers[i].cb ==cb)
|
&&(CachedTransfers[i].cb ==cb)
|
||||||
){
|
){
|
||||||
// std::cout << "Found duplicate plane dir "<<direction<<" plane "<< OrthogPlane<< " simd "<<lane << " relproc "<<DestProc<< " bytes "<<bytes <<std::endl;
|
|
||||||
AddCopy(CachedTransfers[i].recv_buf,recv_buf,bytes);
|
AddCopy(CachedTransfers[i].recv_buf,recv_buf,bytes);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// std::cout << "No duplicate plane dir "<<direction<<" plane "<< OrthogPlane<< " simd "<<lane << " relproc "<<DestProc<<" bytes "<<bytes<<std::endl;
|
|
||||||
CachedTransfers.push_back(obj);
|
CachedTransfers.push_back(obj);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@ -744,9 +742,6 @@ public:
|
|||||||
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
|
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
|
||||||
int rotate_dim = _grid->_simd_layout[dimension]>2;
|
int rotate_dim = _grid->_simd_layout[dimension]>2;
|
||||||
|
|
||||||
this->_comms_send[ii] = comm_dim;
|
|
||||||
this->_comms_recv[ii] = comm_dim;
|
|
||||||
|
|
||||||
assert ( (rotate_dim && comm_dim) == false) ; // Do not think spread out is supported
|
assert ( (rotate_dim && comm_dim) == false) ; // Do not think spread out is supported
|
||||||
|
|
||||||
int sshift[2];
|
int sshift[2];
|
||||||
@ -909,25 +904,30 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Wrap locally dirichlet support case OR node local
|
// Wrap locally dirichlet support case OR node local
|
||||||
if ( (offnode==0) || (comms_recv==0) ) {
|
if ( offnode==0 ) {
|
||||||
|
|
||||||
int permute_slice=0;
|
int permute_slice=0;
|
||||||
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
|
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
|
if ( comms_recv==0 ) {
|
||||||
|
|
||||||
|
int permute_slice=1;
|
||||||
|
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
|
ScatterPlane(point,dimension,x,cbmask,_unified_buffer_size,wraparound); // permute/extract/merge is done in comms phase
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( offnode ) {
|
||||||
int words = buffer_size;
|
int words = buffer_size;
|
||||||
if (cbmask != 0x3) words=words>>1;
|
if (cbmask != 0x3) words=words>>1;
|
||||||
|
|
||||||
// int rank = grid->_processor;
|
|
||||||
// int recv_from_rank;
|
|
||||||
// int xmit_to_rank;
|
|
||||||
|
|
||||||
int unified_buffer_offset = _unified_buffer_size;
|
|
||||||
_unified_buffer_size += words;
|
_unified_buffer_size += words;
|
||||||
|
|
||||||
ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1058,8 +1058,6 @@ public:
|
|||||||
int comm_proc = ((x+sshift)/rd)%pd;
|
int comm_proc = ((x+sshift)/rd)%pd;
|
||||||
|
|
||||||
if (comm_proc) {
|
if (comm_proc) {
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
int words = buffer_size;
|
int words = buffer_size;
|
||||||
if (cbmask != 0x3) words=words>>1;
|
if (cbmask != 0x3) words=words>>1;
|
||||||
@ -1067,64 +1065,69 @@ public:
|
|||||||
int bytes = words * compress.CommDatumSize();
|
int bytes = words * compress.CommDatumSize();
|
||||||
|
|
||||||
int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
|
int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
|
||||||
if ( !face_table_computed ) {
|
int comm_off = u_comm_offset;
|
||||||
face_table.resize(face_idx+1);
|
|
||||||
std::vector<std::pair<int,int> > face_table_host ;
|
|
||||||
Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table_host);
|
|
||||||
face_table[face_idx].resize(face_table_host.size());
|
|
||||||
acceleratorCopyToDevice(&face_table_host[0],
|
|
||||||
&face_table[face_idx][0],
|
|
||||||
face_table[face_idx].size()*sizeof(face_table_host[0]));
|
|
||||||
}
|
|
||||||
|
|
||||||
// int rank = _grid->_processor;
|
|
||||||
int recv_from_rank;
|
int recv_from_rank;
|
||||||
int xmit_to_rank;
|
int xmit_to_rank;
|
||||||
|
cobj *recv_buf;
|
||||||
|
cobj *send_buf;
|
||||||
_grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
_grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
|
||||||
assert (xmit_to_rank != _grid->ThisRank());
|
assert (xmit_to_rank != _grid->ThisRank());
|
||||||
assert (recv_from_rank != _grid->ThisRank());
|
assert (recv_from_rank != _grid->ThisRank());
|
||||||
|
|
||||||
cobj *recv_buf;
|
if( comms_send ) {
|
||||||
if ( compress.DecompressionStep() ) {
|
|
||||||
recv_buf=u_simd_recv_buf[0];
|
if ( !face_table_computed ) {
|
||||||
} else {
|
face_table.resize(face_idx+1);
|
||||||
recv_buf=this->u_recv_buf_p;
|
std::vector<std::pair<int,int> > face_table_host ;
|
||||||
|
Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,comm_off,face_table_host);
|
||||||
|
face_table[face_idx].resize(face_table_host.size());
|
||||||
|
acceleratorCopyToDevice(&face_table_host[0],
|
||||||
|
&face_table[face_idx][0],
|
||||||
|
face_table[face_idx].size()*sizeof(face_table_host[0]));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if ( compress.DecompressionStep() ) {
|
||||||
|
recv_buf=u_simd_recv_buf[0];
|
||||||
|
} else {
|
||||||
|
recv_buf=this->u_recv_buf_p;
|
||||||
|
}
|
||||||
|
|
||||||
|
send_buf = this->u_send_buf_p; // Gather locally, must send
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////
|
||||||
|
// Gather locally
|
||||||
|
////////////////////////////////////////////////////////
|
||||||
|
assert(send_buf!=NULL);
|
||||||
|
|
||||||
|
Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,comm_off,so);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,bytes,cbmask);
|
||||||
cobj *send_buf;
|
|
||||||
send_buf = this->u_send_buf_p; // Gather locally, must send
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////
|
|
||||||
// Gather locally
|
|
||||||
////////////////////////////////////////////////////////
|
|
||||||
assert(send_buf!=NULL);
|
|
||||||
if ( comms_send )
|
|
||||||
Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so);
|
|
||||||
face_idx++;
|
|
||||||
|
|
||||||
|
|
||||||
int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[u_comm_offset],0,bytes,cbmask);
|
|
||||||
if ( (!duplicate) ) { // Force comms for now
|
if ( (!duplicate) ) { // Force comms for now
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////
|
||||||
// Build a list of things to do after we synchronise GPUs
|
// Build a list of things to do after we synchronise GPUs
|
||||||
// Start comms now???
|
// Start comms now???
|
||||||
///////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////
|
||||||
AddPacket((void *)&send_buf[u_comm_offset],
|
AddPacket((void *)&send_buf[comm_off],
|
||||||
(void *)&recv_buf[u_comm_offset],
|
(void *)&recv_buf[comm_off],
|
||||||
xmit_to_rank, comms_send,
|
xmit_to_rank, comms_send,
|
||||||
recv_from_rank, comms_recv,
|
recv_from_rank, comms_recv,
|
||||||
bytes);
|
bytes);
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( compress.DecompressionStep() ) {
|
if ( compress.DecompressionStep() && comms_recv ) {
|
||||||
AddDecompress(&this->u_recv_buf_p[u_comm_offset],
|
AddDecompress(&this->u_recv_buf_p[comm_off],
|
||||||
&recv_buf[u_comm_offset],
|
&recv_buf[comm_off],
|
||||||
words,Decompressions);
|
words,Decompressions);
|
||||||
}
|
}
|
||||||
|
|
||||||
u_comm_offset+=words;
|
u_comm_offset+=words;
|
||||||
|
face_idx++;
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
@ -1154,7 +1157,6 @@ public:
|
|||||||
|
|
||||||
|
|
||||||
int permute_type=_grid->PermuteType(dimension);
|
int permute_type=_grid->PermuteType(dimension);
|
||||||
// std::cout << "SimdNew permute type "<<permute_type<<std::endl;
|
|
||||||
|
|
||||||
///////////////////////////////////////////////
|
///////////////////////////////////////////////
|
||||||
// Simd direction uses an extract/merge pair
|
// Simd direction uses an extract/merge pair
|
||||||
@ -1188,8 +1190,9 @@ public:
|
|||||||
|
|
||||||
if ( any_offnode ) {
|
if ( any_offnode ) {
|
||||||
|
|
||||||
|
int comm_off = u_comm_offset;
|
||||||
for(int i=0;i<maxl;i++){
|
for(int i=0;i<maxl;i++){
|
||||||
spointers[i] = (cobj *) &u_simd_send_buf[i][u_comm_offset];
|
spointers[i] = (cobj *) &u_simd_send_buf[i][comm_off];
|
||||||
}
|
}
|
||||||
|
|
||||||
int sx = (x+sshift)%rd;
|
int sx = (x+sshift)%rd;
|
||||||
@ -1198,7 +1201,7 @@ public:
|
|||||||
face_table.resize(face_idx+1);
|
face_table.resize(face_idx+1);
|
||||||
std::vector<std::pair<int,int> > face_table_host ;
|
std::vector<std::pair<int,int> > face_table_host ;
|
||||||
|
|
||||||
Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table_host);
|
Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,comm_off,face_table_host);
|
||||||
face_table[face_idx].resize(face_table_host.size());
|
face_table[face_idx].resize(face_table_host.size());
|
||||||
acceleratorCopyToDevice(&face_table_host[0],
|
acceleratorCopyToDevice(&face_table_host[0],
|
||||||
&face_table[face_idx][0],
|
&face_table[face_idx][0],
|
||||||
@ -1225,8 +1228,8 @@ public:
|
|||||||
int nbr_plane = nbr_ic;
|
int nbr_plane = nbr_ic;
|
||||||
assert (sx == nbr_ox);
|
assert (sx == nbr_ox);
|
||||||
|
|
||||||
auto rp = &u_simd_recv_buf[i ][u_comm_offset];
|
auto rp = &u_simd_recv_buf[i ][comm_off];
|
||||||
auto sp = &u_simd_send_buf[nbr_plane][u_comm_offset];
|
auto sp = &u_simd_send_buf[nbr_plane][comm_off];
|
||||||
|
|
||||||
if(nbr_proc){
|
if(nbr_proc){
|
||||||
|
|
||||||
@ -1252,9 +1255,10 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,Mergers);
|
AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers);
|
||||||
|
|
||||||
u_comm_offset +=buffer_size;
|
u_comm_offset +=buffer_size;
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user