1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-03 18:55:56 +01:00

Pipeline mode commit on Aurora: 5+ TF/s per tile with a 16^3x32 local volume at 384 nodes.
More concurrency and finer-grained scheduling are possible.
This commit is contained in:
Peter Boyle 2025-02-04 19:27:26 +00:00
parent b50fb34e71
commit 0baaddbe98
9 changed files with 151 additions and 81 deletions

View File

@ -192,6 +192,11 @@ public:
void *recv,
int recv_from_rank,int do_recv,
int xbytes,int rbytes,int dir);
// Could do a PollHtoD and have a CommsMerge dependence
void StencilSendToRecvFromPollDtoH (std::vector<CommsRequest_t> &list);
void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int do_xmit,

View File

@ -399,6 +399,8 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
#ifdef ACCELERATOR_AWARE_MPI
// ACCELERATOR_AWARE_MPI build: empty body, `list` is not touched.
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list)
{
}
// ACCELERATOR_AWARE_MPI build: empty body, `list` is not touched.
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list)
{
}
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
@ -561,53 +563,105 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
#undef DEVICE_TO_HOST_CONCURRENT // pipeline
#ifdef DEVICE_TO_HOST_CONCURRENT
tag= dir+_processor*32;
host_xmit = this->HostBufferMalloc(xbytes);
acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
CommsRequest_t srq;
srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
// ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
// assert(ierr==0);
// off_node_bytes+=xbytes;
CommsRequest_t srq;
srq.PacketType = InterNodeXmit;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = host_xmit;
srq.device_buf = xmit;
srq.tag = tag;
srq.dest = dest;
srq.commdir = commdir;
list.push_back(srq);
#else
tag= dir+_processor*32;
host_xmit = this->HostBufferMalloc(xbytes);
const int chunks=1;
for(int n=0;n<chunks;n++){
void * host_xmitc = (void *)( (uint64_t) host_xmit + n*xbytes/chunks);
void * xmitc = (void *)( (uint64_t) xmit + n*xbytes/chunks);
acceleratorCopyFromDeviceAsynch(xmitc, host_xmitc,xbytes/chunks); // Make this Asynch
}
acceleratorCopySynchronise(); // Complete all pending copy transfers
ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
off_node_bytes+=xbytes;
CommsRequest_t srq;
srq.PacketType = InterNodeXmit;
srq.bytes = xbytes;
srq.req = xrq;
srq.host_buf = host_xmit;
srq.device_buf = xmit;
list.push_back(srq);
#endif
}
}
return off_node_bytes;
}
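/*
* In the interest of better pipelining, poll for completion rather than block:
*  - PollIRecv: as each pending receive tests complete, start its copy to the device;
*  - PollDtoH : as each DtoH copy event completes, start its MPI_Isend in the meantime.
*/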
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list)
{
int pending = 0;
do {
pending = 0;
for(int idx = 0; idx<list.size();idx++){
if ( list[idx].PacketType==InterNodeRecv ) {
int flag = 0;
MPI_Status status;
int ierr = MPI_Test(&list[idx].req,&flag,&status);
assert(ierr==0);
if ( flag ) {
// std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl;
acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes);
list[idx].PacketType=InterNodeReceiveHtoD;
} else {
pending ++;
}
}
}
// std::cout << " PollIrecv "<<pending<<" pending requests"<<std::endl;
} while ( pending );
}
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list)
{
int pending = 0;
do {
pending = 0;
for(int idx = 0; idx<list.size();idx++){
if ( list[idx].PacketType==InterNodeXmit ) {
if ( acceleratorEventIsComplete(list[idx].ev) ) {
void *host_xmit = list[idx].host_buf;
uint32_t xbytes = list[idx].bytes;
int dest = list[idx].dest;
int tag = list[idx].tag;
int commdir = list[idx].commdir;
///////////////////
// Send packet
///////////////////
// std::cout << " DtoH is complete for index "<<idx<<" calling MPI_Isend "<<std::endl;
MPI_Request xrq;
int ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list[idx].req = xrq; // Update the MPI request in the list
list[idx].PacketType=InterNodeXmitISend;
} else {
// not done, so return to polling loop
pending++;
}
}
}
} while (pending);
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
@ -644,37 +698,10 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
* - complete all copies
* - post MPI send asynch
*/
// static int printed;
// if((printed<8) && this->IsBoss() ) {
// printf("dir %d doX %d doR %d Face size %ld %ld\n",dir,dox,dor,xbytes,rbytes);
// printed++;
// }
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
#ifdef DEVICE_TO_HOST_CONCURRENT
tag= dir+_processor*32;
// Find the send in the prepared list
int list_idx=-1;
for(int idx = 0; idx<list.size();idx++){
if ( (list[idx].device_buf==xmit)
&&(list[idx].PacketType==InterNodeXmit)
&&(list[idx].bytes==xbytes) ) {
list_idx = idx;
host_xmit = list[idx].host_buf;
}
}
assert(list_idx != -1); // found it
ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list[list_idx].req = xrq; // Update the MPI request in the list
off_node_bytes+=xbytes;
#endif
} else {
if ( !( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) ) {
// Intranode
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
@ -686,7 +713,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
{
int nreq=list.size();
if (nreq==0) return;
// if (nreq==0) return;
std::vector<MPI_Status> status(nreq);
std::vector<MPI_Request> MpiRequests(nreq);
@ -694,16 +721,17 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
MpiRequests[r] = list[r].req;
}
int ierr = MPI_Waitall(nreq,&MpiRequests[0],&status[0]);
int ierr = MPI_Waitall(nreq,&MpiRequests[0],&status[0]); // must at least wait for sends
assert(ierr==0);
for(int r=0;r<nreq;r++){
if ( list[r].PacketType==InterNodeRecv ) {
acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
}
}
// for(int r=0;r<nreq;r++){
// if ( list[r].PacketType==InterNodeRecv ) {
// acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
// }
// }
acceleratorCopySynchronise(); // Complete all pending copy transfers D2D
acceleratorCopySynchronise(); // Complete all pending copy transfers
list.resize(0); // Delete the list
this->HostBufferFreeAll(); // Clean up the buffer allocs
this->StencilBarrier();

View File

@ -132,6 +132,8 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
{
return 2.0*bytes;
}
// Empty body in this communicator implementation: `list` is not touched.
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list)
{
}
// Empty body in this communicator implementation: `list` is not touched.
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list)
{
}
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int dox,
@ -139,7 +141,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
int recv_from_rank,int dor,
int xbytes,int rbytes, int dir)
{
return xbytes+rbytes;
return 0.0;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,

View File

@ -50,12 +50,16 @@ typedef MPI_Request MpiCommsRequest_t;
#ifdef ACCELERATOR_AWARE_MPI
typedef MPI_Request CommsRequest_t;
#else
enum PacketType_t { InterNodeXmit, InterNodeRecv, IntraNodeXmit, IntraNodeRecv };
enum PacketType_t { InterNodeXmit, InterNodeRecv, IntraNodeXmit, IntraNodeRecv, InterNodeXmitISend, InterNodeReceiveHtoD };
// One entry in a stencil communication request list, used when
// ACCELERATOR_AWARE_MPI is not defined (otherwise CommsRequest_t is a plain
// MPI_Request).
typedef struct {
PacketType_t PacketType; // current stage of this packet (a PacketType_t value)
void *host_buf; // host-side buffer — presumably the staging buffer used for MPI; confirm against callers
void *device_buf; // device-side buffer paired with host_buf
int dest; // destination rank for the deferred send
int tag; // MPI tag for the deferred send
int commdir; // index used to select the communicator for the deferred send — TODO confirm
unsigned long bytes; // size of the transfer in bytes
acceleratorEvent_t ev; // accelerator event associated with this packet's asynchronous copy
MpiCommsRequest_t req; // MPI request handle associated with this packet
} CommsRequest_t;
#endif

View File

@ -363,12 +363,16 @@ public:
////////////////////////////////////////////////////////////////////////
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
{
// std::cout << "Communicate Begin "<<std::endl;
// _grid->Barrier();
FlightRecorder::StepLog("Communicate begin");
// All GPU kernel tasks must complete
// accelerator_barrier(); // All kernels should ALREADY be complete
// _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
// But the HaloGather had a barrier too.
for(int i=0;i<Packets.size();i++){
// std::cout << "Communicate prepare "<<i<<std::endl;
// _grid->Barrier();
_grid->StencilSendToRecvFromPrepare(MpiReqs,
Packets[i].send_buf,
Packets[i].to_rank,Packets[i].do_send,
@ -376,8 +380,15 @@ public:
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes,Packets[i].rbytes,i);
}
// std::cout << "Communicate PollDtoH "<<std::endl;
// _grid->Barrier();
_grid->StencilSendToRecvFromPollDtoH (MpiReqs); /* Starts MPI*/
// std::cout << "Communicate CopySynch "<<std::endl;
// _grid->Barrier();
acceleratorCopySynchronise();
// Starts intranode
for(int i=0;i<Packets.size();i++){
// std::cout << "Communicate Begin "<<i<<std::endl;
_grid->StencilSendToRecvFromBegin(MpiReqs,
Packets[i].send_buf,
Packets[i].to_rank,Packets[i].do_send,
@ -395,7 +406,14 @@ public:
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
{
// std::cout << "Communicate Complete "<<std::endl;
// _grid->Barrier();
FlightRecorder::StepLog("Start communicate complete");
// std::cout << "Communicate Complete PollIRecv "<<std::endl;
// _grid->Barrier();
_grid->StencilSendToRecvFromPollIRecv(MpiReqs);
// std::cout << "Communicate Complete Complete "<<std::endl;
// _grid->Barrier();
_grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
if ( this->partialDirichlet ) DslashLogPartial();
else if ( this->fullDirichlet ) DslashLogDirichlet();
@ -663,7 +681,6 @@ public:
}
}
}
std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
surface_list.resize(surface_list_size);
std::vector<int> surface_list_host(surface_list_size);
int32_t ss=0;
@ -683,6 +700,7 @@ public:
}
}
acceleratorCopyToDevice(&surface_list_host[0],&surface_list[0],surface_list_size*sizeof(int));
std::cout << GridLogMessage<<"BuildSurfaceList size is "<<surface_list_size<<std::endl;
}
/// Introduce a block structure and switch off comms on boundaries
void DirichletBlock(const Coordinate &dirichlet_block)

View File

@ -343,9 +343,26 @@ inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); }
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes);}
inline void acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); }
inline void acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); }
///////
// Asynch event interface
///////
typedef sycl::event acceleratorEvent_t;
// Waits on the given event by calling its wait() member.
inline void acceleratorEventWait(acceleratorEvent_t ev) { ev.wait(); }
// Queries the event's command execution status.
// Returns 1 when the status equals `complete`, 0 otherwise.
inline int acceleratorEventIsComplete(acceleratorEvent_t ev)
{
  const auto exec_status = ev.get_info<sycl::info::event::command_execution_status>();
  return exec_status == sycl::info::event_command_status::complete;
}
// Asynchronous copy wrappers. Each one forwards to
// theCopyAccelerator->memcpy(to,from,bytes) and returns the event that call
// produces; no wait() is issued here, so the caller decides when to
// synchronise (via the returned event or acceleratorCopySynchronise()).
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)
{
  return theCopyAccelerator->memcpy(to,from,bytes);
}
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes)
{
  return theCopyAccelerator->memcpy(to,from,bytes);
}
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes)
{
  return theCopyAccelerator->memcpy(to,from,bytes);
}
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}
@ -358,8 +375,10 @@ inline int acceleratorIsCommunicable(void *ptr)
else return 0;
#endif
return 1;
}
#endif
//////////////////////////////////////////////

View File

@ -52,7 +52,7 @@ int main (int argc, char ** argv)
int threads = GridThread::GetThreads();
int Ls=8;
int Ls=16;
for(int i=0;i<argc;i++) {
if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls;

View File

@ -32,15 +32,9 @@ export MPICH_OFI_NIC_POLICY=GPU
# Local vol 16.16.16.32
#
VOL 128.64.128.96
MPI 4.4.4.3
NPROC 192
mpiexec -np 192 -ppn 12 -envall ./gpu_tile.sh ./Benchmark_dwf_fp32 --mpi 4.4.4.3 --grid 128.64.128.96 --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap
LX=32
LX=16
LY=16
LZ=32
LZ=16
LT=32
NX=2

View File

@ -19,7 +19,7 @@ export ONEAPI_DEVICE_FILTER=gpu,level_zero
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:3
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:4
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1