diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index ef3aa821..80acb4ae 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -70,57 +70,6 @@ struct DefaultImplParams { void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask, int off,std::vector > & table); -/* -template -void Gather_plane_simple_table (commVector >& table,const Lattice &rhs,cobj *buffer,compressor &compress, int off,int so) __attribute__((noinline)); - -template -void Gather_plane_simple_table (commVector >& table,const Lattice &rhs,cobj *buffer,compressor &compress, int off,int so) -{ - int num=table.size(); - std::pair *table_v = & table[0]; - - auto rhs_v = rhs.View(AcceleratorRead); - accelerator_forNB( i,num, vobj::Nsimd(), { - compress.Compress(buffer[off+table_v[i].first],rhs_v[so+table_v[i].second]); - }); - rhs_v.ViewClose(); -} - -/////////////////////////////////////////////////////////////////// -// Gather for when there *is* need to SIMD split with compression -/////////////////////////////////////////////////////////////////// -template -void Gather_plane_exchange_table(const Lattice &rhs, - commVector pointers, - int dimension,int plane, - int cbmask,compressor &compress,int type) __attribute__((noinline)); - -template -void Gather_plane_exchange_table(commVector >& table, - const Lattice &rhs, - std::vector &pointers,int dimension,int plane,int cbmask, - compressor &compress,int type) -{ - assert( (table.size()&0x1)==0); - int num=table.size()/2; - int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane - - auto rhs_v = rhs.View(AcceleratorRead); - auto rhs_p = &rhs_v[0]; - auto p0=&pointers[0][0]; - auto p1=&pointers[1][0]; - auto tp=&table[0]; - accelerator_forNB(j, num, vobj::Nsimd(), { - compress.CompressExchange(p0,p1, rhs_p, j, - so+tp[2*j ].second, - so+tp[2*j+1].second, - type); - }); - rhs_v.ViewClose(); -} -*/ - void DslashResetCounts(void); void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full); void DslashLogFull(void); @@ -258,6 +207,10 @@ public: struct Packet { void * send_buf; void * recv_buf; +#ifndef ACCELERATOR_AWARE_MPI + void * host_send_buf; // Allocate this if not MPI_CUDA_AWARE + void * host_recv_buf; // Allocate this if not MPI_CUDA_AWARE +#endif Integer to_rank; Integer from_rank; Integer do_send; @@ -324,7 +277,7 @@ public: Vector surface_list; stencilVector _entries; // Resident in managed memory - commVector _entries_device; // Resident in managed memory + commVector _entries_device; // Resident in device memory std::vector Packets; std::vector Mergers; std::vector MergersSHM; @@ -408,33 +361,16 @@ public: // Use OpenMP Tasks for cleaner ??? // must be called *inside* parallel region ////////////////////////////////////////// - /* - void CommunicateThreaded() - { -#ifdef GRID_OMP - int mythread = omp_get_thread_num(); - int nthreads = CartesianCommunicator::nCommThreads; -#else - int mythread = 0; - int nthreads = 1; -#endif - if (nthreads == -1) nthreads = 1; - if (mythread < nthreads) { - for (int i = mythread; i < Packets.size(); i += nthreads) { - uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf, - Packets[i].to_rank, - Packets[i].recv_buf, - Packets[i].from_rank, - Packets[i].bytes,i); - } - } - } - */ //////////////////////////////////////////////////////////////////////// // Non blocking send and receive. Necessarily parallel. //////////////////////////////////////////////////////////////////////// void CommunicateBegin(std::vector > &reqs) { + // All GPU kernel tasks must complete + // accelerator_barrier(); // All kernels should ALREADY be complete + // _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer + // But the HaloGather had a barrier too. +#ifdef ACCELERATOR_AWARE_MPI for(int i=0;iStencilSendToRecvFromBegin(MpiReqs, Packets[i].send_buf, @@ -443,16 +379,54 @@ public: Packets[i].from_rank,Packets[i].do_recv, Packets[i].xbytes,Packets[i].rbytes,i); } +#else +#warning "Using COPY VIA HOST BUFFERS IN STENCIL" + for(int i=0;iHostBufferMalloc(Packets[i].xbytes); + Packets[i].host_recv_buf = _grid->HostBufferMalloc(Packets[i].rbytes); + if ( Packets[i].do_send ) { + acceleratorCopyFromDevice(Packets[i].send_buf, Packets[i].host_send_buf,Packets[i].xbytes); + } + _grid->StencilSendToRecvFromBegin(MpiReqs, + Packets[i].host_send_buf, + Packets[i].to_rank,Packets[i].do_send, + Packets[i].host_recv_buf, + Packets[i].from_rank,Packets[i].do_recv, + Packets[i].xbytes,Packets[i].rbytes,i); + } +#endif + // Get comms started then run checksums + // Having this PRIOR to the dslash seems to make Sunspot work... (!) + for(int i=0;i > &reqs) { - _grid->StencilSendToRecvFromComplete(MpiReqs,0); + _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done if ( this->partialDirichlet ) DslashLogPartial(); else if ( this->fullDirichlet ) DslashLogDirichlet(); else DslashLogFull(); - acceleratorCopySynchronise(); + // acceleratorCopySynchronise() is in the StencilSendToRecvFromComplete + // accelerator_barrier(); _grid->StencilBarrier(); +#ifndef ACCELERATOR_AWARE_MPI +#warning "Using COPY VIA HOST BUFFERS IN STENCIL" + for(int i=0;iHostBufferFreeAll(); +#endif + // run any checksums + for(int i=0;i void HaloGather(const Lattice &source,compressor &compress) { + // accelerator_barrier(); _grid->StencilBarrier();// Synch shared memory on a single nodes assert(source.Grid()==_grid); @@ -540,10 +515,9 @@ public: compress.Point(point); HaloGatherDir(source,compress,point,face_idx); } - accelerator_barrier(); + accelerator_barrier(); // All my local gathers are complete face_table_computed=1; assert(u_comm_offset==_unified_buffer_size); - } ///////////////////////// @@ -579,6 +553,7 @@ public: accelerator_forNB(j, words, cobj::Nsimd(), { coalescedWrite(to[j] ,coalescedRead(from [j])); }); + acceleratorFenceComputeStream(); } } @@ -669,6 +644,7 @@ public: for(int i=0;i