1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-09 21:50:45 +01:00

Clean up stencil with better intranode Dirichlet / DDHMC support.

14TF/s on a Perlmutter node
This commit is contained in:
Peter Boyle 2022-05-24 18:23:39 -07:00
parent 47b4e91473
commit e651b9e7ab

View File

@ -502,7 +502,6 @@ public:
} }
void AddCopy(void *from,void * to, Integer bytes) void AddCopy(void *from,void * to, Integer bytes)
{ {
// std::cout << "Adding CopyReceiveBuffer "<<std::hex<<from<<" "<<to<<std::dec<<" "<<bytes<<std::endl;
CopyReceiveBuffer obj; CopyReceiveBuffer obj;
obj.from_p = from; obj.from_p = from;
obj.to_p = to; obj.to_p = to;
@ -516,7 +515,7 @@ public:
cobj *from=(cobj *)CopyReceiveBuffers[i].from_p; cobj *from=(cobj *)CopyReceiveBuffers[i].from_p;
cobj *to =(cobj *)CopyReceiveBuffers[i].to_p; cobj *to =(cobj *)CopyReceiveBuffers[i].to_p;
Integer words = CopyReceiveBuffers[i].bytes/sizeof(cobj); Integer words = CopyReceiveBuffers[i].bytes/sizeof(cobj);
// std::cout << "CopyReceiveBuffer "<<std::hex<<from<<" "<<to<<std::dec<<" "<<words*sizeof(cobj)<<std::endl;
accelerator_forNB(j, words, cobj::Nsimd(), { accelerator_forNB(j, words, cobj::Nsimd(), {
coalescedWrite(to[j] ,coalescedRead(from [j])); coalescedWrite(to[j] ,coalescedRead(from [j]));
}); });
@ -542,13 +541,12 @@ public:
&&(CachedTransfers[i].lane ==lane) &&(CachedTransfers[i].lane ==lane)
&&(CachedTransfers[i].cb ==cb) &&(CachedTransfers[i].cb ==cb)
){ ){
// std::cout << "Found duplicate plane dir "<<direction<<" plane "<< OrthogPlane<< " simd "<<lane << " relproc "<<DestProc<< " bytes "<<bytes <<std::endl;
AddCopy(CachedTransfers[i].recv_buf,recv_buf,bytes); AddCopy(CachedTransfers[i].recv_buf,recv_buf,bytes);
return 1; return 1;
} }
} }
// std::cout << "No duplicate plane dir "<<direction<<" plane "<< OrthogPlane<< " simd "<<lane << " relproc "<<DestProc<<" bytes "<<bytes<<std::endl;
CachedTransfers.push_back(obj); CachedTransfers.push_back(obj);
return 0; return 0;
} }
@ -744,9 +742,6 @@ public:
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
int rotate_dim = _grid->_simd_layout[dimension]>2; int rotate_dim = _grid->_simd_layout[dimension]>2;
this->_comms_send[ii] = comm_dim;
this->_comms_recv[ii] = comm_dim;
assert ( (rotate_dim && comm_dim) == false) ; // Do not think spread out is supported assert ( (rotate_dim && comm_dim) == false) ; // Do not think spread out is supported
int sshift[2]; int sshift[2];
@ -909,25 +904,30 @@ public:
} }
// Wrap locally dirichlet support case OR node local // Wrap locally dirichlet support case OR node local
if ( (offnode==0) || (comms_recv==0) ) { if ( offnode==0 ) {
int permute_slice=0; int permute_slice=0;
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound); CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
} else { } else {
if ( comms_recv==0 ) {
int permute_slice=1;
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
} else {
ScatterPlane(point,dimension,x,cbmask,_unified_buffer_size,wraparound); // permute/extract/merge is done in comms phase
}
}
if ( offnode ) {
int words = buffer_size; int words = buffer_size;
if (cbmask != 0x3) words=words>>1; if (cbmask != 0x3) words=words>>1;
// int rank = grid->_processor;
// int recv_from_rank;
// int xmit_to_rank;
int unified_buffer_offset = _unified_buffer_size;
_unified_buffer_size += words; _unified_buffer_size += words;
ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase
} }
} }
} }
@ -1058,8 +1058,6 @@ public:
int comm_proc = ((x+sshift)/rd)%pd; int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc) { if (comm_proc) {
int words = buffer_size; int words = buffer_size;
if (cbmask != 0x3) words=words>>1; if (cbmask != 0x3) words=words>>1;
@ -1067,64 +1065,69 @@ public:
int bytes = words * compress.CommDatumSize(); int bytes = words * compress.CommDatumSize();
int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
if ( !face_table_computed ) { int comm_off = u_comm_offset;
face_table.resize(face_idx+1);
std::vector<std::pair<int,int> > face_table_host ;
Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table_host);
face_table[face_idx].resize(face_table_host.size());
acceleratorCopyToDevice(&face_table_host[0],
&face_table[face_idx][0],
face_table[face_idx].size()*sizeof(face_table_host[0]));
}
// int rank = _grid->_processor;
int recv_from_rank; int recv_from_rank;
int xmit_to_rank; int xmit_to_rank;
cobj *recv_buf;
cobj *send_buf;
_grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); _grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
assert (xmit_to_rank != _grid->ThisRank()); assert (xmit_to_rank != _grid->ThisRank());
assert (recv_from_rank != _grid->ThisRank()); assert (recv_from_rank != _grid->ThisRank());
cobj *recv_buf; if( comms_send ) {
if ( compress.DecompressionStep() ) {
recv_buf=u_simd_recv_buf[0]; if ( !face_table_computed ) {
} else { face_table.resize(face_idx+1);
recv_buf=this->u_recv_buf_p; std::vector<std::pair<int,int> > face_table_host ;
Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,comm_off,face_table_host);
face_table[face_idx].resize(face_table_host.size());
acceleratorCopyToDevice(&face_table_host[0],
&face_table[face_idx][0],
face_table[face_idx].size()*sizeof(face_table_host[0]));
}
if ( compress.DecompressionStep() ) {
recv_buf=u_simd_recv_buf[0];
} else {
recv_buf=this->u_recv_buf_p;
}
send_buf = this->u_send_buf_p; // Gather locally, must send
////////////////////////////////////////////////////////
// Gather locally
////////////////////////////////////////////////////////
assert(send_buf!=NULL);
Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,comm_off,so);
} }
int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,bytes,cbmask);
cobj *send_buf;
send_buf = this->u_send_buf_p; // Gather locally, must send
////////////////////////////////////////////////////////
// Gather locally
////////////////////////////////////////////////////////
assert(send_buf!=NULL);
if ( comms_send )
Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so);
face_idx++;
int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[u_comm_offset],0,bytes,cbmask);
if ( (!duplicate) ) { // Force comms for now if ( (!duplicate) ) { // Force comms for now
/////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////
// Build a list of things to do after we synchronise GPUs // Build a list of things to do after we synchronise GPUs
// Start comms now??? // Start comms now???
/////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////
AddPacket((void *)&send_buf[u_comm_offset], AddPacket((void *)&send_buf[comm_off],
(void *)&recv_buf[u_comm_offset], (void *)&recv_buf[comm_off],
xmit_to_rank, comms_send, xmit_to_rank, comms_send,
recv_from_rank, comms_recv, recv_from_rank, comms_recv,
bytes); bytes);
} }
if ( compress.DecompressionStep() ) { if ( compress.DecompressionStep() && comms_recv ) {
AddDecompress(&this->u_recv_buf_p[u_comm_offset], AddDecompress(&this->u_recv_buf_p[comm_off],
&recv_buf[u_comm_offset], &recv_buf[comm_off],
words,Decompressions); words,Decompressions);
} }
u_comm_offset+=words; u_comm_offset+=words;
face_idx++;
} }
} }
return 0; return 0;
@ -1154,7 +1157,6 @@ public:
int permute_type=_grid->PermuteType(dimension); int permute_type=_grid->PermuteType(dimension);
// std::cout << "SimdNew permute type "<<permute_type<<std::endl;
/////////////////////////////////////////////// ///////////////////////////////////////////////
// Simd direction uses an extract/merge pair // Simd direction uses an extract/merge pair
@ -1188,8 +1190,9 @@ public:
if ( any_offnode ) { if ( any_offnode ) {
int comm_off = u_comm_offset;
for(int i=0;i<maxl;i++){ for(int i=0;i<maxl;i++){
spointers[i] = (cobj *) &u_simd_send_buf[i][u_comm_offset]; spointers[i] = (cobj *) &u_simd_send_buf[i][comm_off];
} }
int sx = (x+sshift)%rd; int sx = (x+sshift)%rd;
@ -1198,7 +1201,7 @@ public:
face_table.resize(face_idx+1); face_table.resize(face_idx+1);
std::vector<std::pair<int,int> > face_table_host ; std::vector<std::pair<int,int> > face_table_host ;
Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table_host); Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,comm_off,face_table_host);
face_table[face_idx].resize(face_table_host.size()); face_table[face_idx].resize(face_table_host.size());
acceleratorCopyToDevice(&face_table_host[0], acceleratorCopyToDevice(&face_table_host[0],
&face_table[face_idx][0], &face_table[face_idx][0],
@ -1225,8 +1228,8 @@ public:
int nbr_plane = nbr_ic; int nbr_plane = nbr_ic;
assert (sx == nbr_ox); assert (sx == nbr_ox);
auto rp = &u_simd_recv_buf[i ][u_comm_offset]; auto rp = &u_simd_recv_buf[i ][comm_off];
auto sp = &u_simd_send_buf[nbr_plane][u_comm_offset]; auto sp = &u_simd_send_buf[nbr_plane][comm_off];
if(nbr_proc){ if(nbr_proc){
@ -1252,9 +1255,10 @@ public:
} }
} }
AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,Mergers); AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers);
u_comm_offset +=buffer_size; u_comm_offset +=buffer_size;
} }
} }
return 0; return 0;