diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h index c72fcc79..78385c09 100644 --- a/Grid/communicator/Communicator_base.h +++ b/Grid/communicator/Communicator_base.h @@ -192,6 +192,11 @@ public: void *recv, int recv_from_rank,int do_recv, int xbytes,int rbytes,int dir); + + // Could do a PollHtoD and have a CommsMerge dependence + void StencilSendToRecvFromPollDtoH (std::vector &list); + void StencilSendToRecvFromPollIRecv(std::vector &list); + double StencilSendToRecvFromBegin(std::vector &list, void *xmit, int xmit_to_rank,int do_xmit, diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 7dc706df..85bdd049 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -399,6 +399,8 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, #ifdef ACCELERATOR_AWARE_MPI +void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector &list) {}; +void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector &list) {}; double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector &list, void *xmit, int dest,int dox, @@ -561,53 +563,105 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vectorHostBufferMalloc(xbytes); - acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch + CommsRequest_t srq; + + srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch // ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq); // assert(ierr==0); // off_node_bytes+=xbytes; - CommsRequest_t srq; srq.PacketType = InterNodeXmit; srq.bytes = xbytes; // srq.req = xrq; srq.host_buf = host_xmit; srq.device_buf = xmit; + srq.tag = tag; + srq.dest = dest; + srq.commdir = commdir; list.push_back(srq); -#else - tag= dir+_processor*32; - - host_xmit = this->HostBufferMalloc(xbytes); - const int chunks=1; - for(int n=0;n &list) +{ + int pending = 0; + do { + + pending = 0; + + for(int idx = 0; idx &list) +{ + int pending = 0; + do { + + pending = 0; + + for(int idx = 0; idx &list, void *xmit, @@ -644,37 +698,10 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vectorIsBoss() ) { - // printf("dir %d doX %d doR %d Face size %ld %ld\n",dir,dox,dor,xbytes,rbytes); - // printed++; - // } - if (dox) { - if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) { -#ifdef DEVICE_TO_HOST_CONCURRENT - tag= dir+_processor*32; - // Find the send in the prepared list - int list_idx=-1; - for(int idx = 0; idxShmBufferTranslate(dest,recv); assert(shm!=NULL); acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes); @@ -686,7 +713,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector status(nreq); std::vector MpiRequests(nreq); @@ -694,16 +721,17 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vectorHostBufferFreeAll(); // Clean up the buffer allocs this->StencilBarrier(); diff --git a/Grid/communicator/Communicator_none.cc b/Grid/communicator/Communicator_none.cc index 8e6206ef..f162a903 100644 --- a/Grid/communicator/Communicator_none.cc +++ b/Grid/communicator/Communicator_none.cc @@ -132,6 +132,8 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, { return 2.0*bytes; } +void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector &list) {}; +void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector &list) {}; double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector &list, void *xmit, int xmit_to_rank,int dox, @@ -139,7 +141,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector &list, void *xmit, diff --git a/Grid/communicator/SharedMemory.h b/Grid/communicator/SharedMemory.h index 422be8aa..1a389322 100644 --- a/Grid/communicator/SharedMemory.h +++ b/Grid/communicator/SharedMemory.h @@ -50,12 +50,16 @@ typedef MPI_Request MpiCommsRequest_t; #ifdef ACCELERATOR_AWARE_MPI typedef MPI_Request CommsRequest_t; #else -enum PacketType_t { InterNodeXmit, InterNodeRecv, IntraNodeXmit, IntraNodeRecv }; +enum PacketType_t { InterNodeXmit, InterNodeRecv, IntraNodeXmit, IntraNodeRecv, InterNodeXmitISend, InterNodeReceiveHtoD }; typedef struct { PacketType_t PacketType; void *host_buf; void *device_buf; + int dest; + int tag; + int commdir; unsigned long bytes; + acceleratorEvent_t ev; MpiCommsRequest_t req; } CommsRequest_t; #endif diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 1142891a..fa03183c 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -363,12 +363,16 @@ public: //////////////////////////////////////////////////////////////////////// void CommunicateBegin(std::vector > &reqs) { + // std::cout << "Communicate Begin "<Barrier(); FlightRecorder::StepLog("Communicate begin"); // All GPU kernel tasks must complete // accelerator_barrier(); // All kernels should ALREADY be complete // _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer // But the HaloGather had a barrier too. for(int i=0;iBarrier(); _grid->StencilSendToRecvFromPrepare(MpiReqs, Packets[i].send_buf, Packets[i].to_rank,Packets[i].do_send, @@ -376,8 +380,15 @@ public: Packets[i].from_rank,Packets[i].do_recv, Packets[i].xbytes,Packets[i].rbytes,i); } + // std::cout << "Communicate PollDtoH "<Barrier(); + _grid->StencilSendToRecvFromPollDtoH (MpiReqs); /* Starts MPI*/ + // std::cout << "Communicate CopySynch "<Barrier(); acceleratorCopySynchronise(); + // Starts intranode for(int i=0;iStencilSendToRecvFromBegin(MpiReqs, Packets[i].send_buf, Packets[i].to_rank,Packets[i].do_send, @@ -395,7 +406,14 @@ public: void CommunicateComplete(std::vector > &reqs) { + // std::cout << "Communicate Complete "<Barrier(); FlightRecorder::StepLog("Start communicate complete"); + // std::cout << "Communicate Complete PollIRecv "<Barrier(); + _grid->StencilSendToRecvFromPollIRecv(MpiReqs); + // std::cout << "Communicate Complete Complete "<Barrier(); _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done if ( this->partialDirichlet ) DslashLogPartial(); else if ( this->fullDirichlet ) DslashLogDirichlet(); @@ -663,7 +681,6 @@ public: } } } - std::cout << "BuildSurfaceList size is "< surface_list_host(surface_list_size); int32_t ss=0; @@ -683,6 +700,7 @@ public: } } acceleratorCopyToDevice(&surface_list_host[0],&surface_list[0],surface_list_size*sizeof(int)); + std::cout << GridLogMessage<<"BuildSurfaceList size is "<wait(); } -inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes);} -inline void acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); } -inline void acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); } + +/////// +// Asynch event interface +/////// +typedef sycl::event acceleratorEvent_t; + +inline void acceleratorEventWait(acceleratorEvent_t ev) +{ + ev.wait(); +} + +inline int acceleratorEventIsComplete(acceleratorEvent_t ev) +{ + return (ev.get_info() == sycl::info::event_command_status::complete); +} + +inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes);} +inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); } +inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); } + inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();} @@ -358,8 +375,10 @@ inline int acceleratorIsCommunicable(void *ptr) else return 0; #endif return 1; + } + #endif ////////////////////////////////////////////// diff --git a/benchmarks/Benchmark_dwf_fp32.cc b/benchmarks/Benchmark_dwf_fp32.cc index cbe1ee23..ce4fcfab 100644 --- a/benchmarks/Benchmark_dwf_fp32.cc +++ b/benchmarks/Benchmark_dwf_fp32.cc @@ -52,7 +52,7 @@ int main (int argc, char ** argv) int threads = GridThread::GetThreads(); - int Ls=8; + int Ls=16; for(int i=0;i> Ls; diff --git a/systems/Aurora/benchmarks/bench16.pbs b/systems/Aurora/benchmarks/bench16.pbs index fc4f3c8f..b3d3a461 100644 --- a/systems/Aurora/benchmarks/bench16.pbs +++ b/systems/Aurora/benchmarks/bench16.pbs @@ -32,15 +32,9 @@ export MPICH_OFI_NIC_POLICY=GPU # Local vol 16.16.16.32 # -VOL 128.64.128.96 -MPI 4.4.4.3 -NPROC 192 -mpiexec -np 192 -ppn 12 -envall ./gpu_tile.sh ./Benchmark_dwf_fp32 --mpi 4.4.4.3 --grid 128.64.128.96 --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap - - -LX=32 +LX=16 LY=16 -LZ=32 +LZ=16 LT=32 NX=2 diff --git a/systems/Aurora/benchmarks/gpu_tile.sh b/systems/Aurora/benchmarks/gpu_tile.sh index 5c9cf7be..f8f0ae96 100755 --- a/systems/Aurora/benchmarks/gpu_tile.sh +++ b/systems/Aurora/benchmarks/gpu_tile.sh @@ -19,7 +19,7 @@ export ONEAPI_DEVICE_FILTER=gpu,level_zero export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 -export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:3 +export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:4 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 #export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2 #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1