diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc
index 6b6c9dec..7dc706df 100644
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -408,8 +408,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
+double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                           void *xmit,
                                                           int dest,int dox,
                                                           void *recv,
@@ -470,6 +469,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
+  this->StencilBarrier();
 }
 #else /* NOT ... ACCELERATOR_AWARE_MPI */
@@ -481,10 +481,10 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
       host_xmit = this->HostBufferMalloc(xbytes);
@@ -577,11 +579,30 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
-      void *shm = (void *) this->ShmBufferTranslate(dest,recv);
-      assert(shm!=NULL);
-      acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
+      host_xmit = this->HostBufferMalloc(xbytes);
+      const int chunks=1;
+      for(int n=0;n<chunks;n++){
@@ ... @@
-  static int printed;
-  if((printed<8) && this->IsBoss() ) {
-    printf("dir %d doX %d doR %d Face size %ld %ld\n",dir,dox,dor,xbytes,rbytes);
-    printed=1;
-  }
+  // static int printed;
+  // if((printed<8) && this->IsBoss() ) {
+  //   printf("dir %d doX %d doR %d Face size %ld %ld\n",dir,dox,dor,xbytes,rbytes);
+  //   printed++;
+  // }
   if (dox) {
     if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
+#ifdef DEVICE_TO_HOST_CONCURRENT
       tag= dir+_processor*32;
       // Find the send in the prepared list
       int list_idx=-1;
@@ -652,7 +673,12 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+      void *shm = (void *) this->ShmBufferTranslate(dest,recv);
+      assert(shm!=NULL);
+      acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
+    }
   }
   return off_node_bytes;
 }
@@ -680,6 +706,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
   this->HostBufferFreeAll(); // Clean up the buffer allocs
+  this->StencilBarrier();
 }
 #endif
 ////////////////////////////////////////////
diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc
index ce11714f..dc22aee0 100644
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -543,6 +543,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 #ifndef ACCELERATOR_AWARE_MPI
   printf("Host buffer allocate for GPU non-aware MPI\n");
+#if 0
+  HostCommBuf= acceleratorAllocHost(bytes);
+#else
   HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
 #ifdef HAVE_NUMAIF_H
 #warning "Moving host buffers to specific NUMA domain"
@@ -569,6 +572,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
   }
 #endif
   acceleratorPin(HostCommBuf,bytes);
+#endif
+
 #endif
   ShmCommBuf = acceleratorAllocDevice(bytes);
   if (ShmCommBuf == (void *)NULL ) {
diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
index 14132cef..3d4e5cc5 100644
--- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
@@ -332,7 +332,8 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
   // std::cout << " WilsonFermion5D Communicate Begin " <<std::endl;
   std::vector<std::vector<CommsRequest_t> > requests;
-#ifndef GRID_ACCELERATED
+
+#if 1
   /////////////////////////////
   // Overlap with comms
   /////////////////////////////
@@ -352,7 +353,8 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
       Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
     }
-#ifdef GRID_ACCELERATED
+  //ifdef GRID_ACCELERATED
+#if 0
   /////////////////////////////
   // Overlap with comms -- on GPU the interior kernel call is nonblocking
   /////////////////////////////
diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h
index 2de50e9c..1142891a 100644
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -376,6 +376,7 @@ public:
                                    Packets[i].from_rank,Packets[i].do_recv,
                                    Packets[i].xbytes,Packets[i].rbytes,i);
     }
+    acceleratorCopySynchronise();
     for(int i=0;i<Packets.size();i++){
       _grid->StencilSendToRecvFromBegin(MpiReqs,
                                         Packets[i].send_buf,
@@ -401,7 +402,6 @@ public:
     else DslashLogFull();
     // acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
     // accelerator_barrier();
-    _grid->StencilBarrier();
     for(int i=0;i<Packets.size();i++){
diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ ... @@ inline void acceleratorCopySynchronise(void) { theGridAccelerator->wait(); }
 inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
+inline void *acceleratorAllocHost(size_t bytes) { return malloc_host(bytes,*theGridAccelerator);};
 inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
+inline void acceleratorFreeHost(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
@@ -441,6 +456,16 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
   }                                                             \
   }
 
+inline void *acceleratorAllocHost(size_t bytes)
+{
+  void *ptr=NULL;
+  auto err = hipMallocHost((void **)&ptr,bytes);
+  if( err != hipSuccess ) {
+    ptr = (void *) NULL;
+    fprintf(stderr," hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
+  }
+  return ptr;
+};
 inline void *acceleratorAllocShared(size_t bytes)
 {
   void *ptr=NULL;
@@ -464,6 +489,7 @@ inline void *acceleratorAllocDevice(size_t bytes)
   return ptr;
 };
 
+inline void acceleratorFreeHost(void *ptr){ auto discard=hipFree(ptr);};
 inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
@@ -546,8 +572,10 @@ inline void acceleratorCopySynchronise(void) {};
 inline int acceleratorIsCommunicable(void *ptr){ return 1; }
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);}
 #ifdef HAVE_MM_MALLOC_H
+inline void *acceleratorAllocHost(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
 inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
 inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
+inline void acceleratorFreeHost(void *ptr){_mm_free(ptr);};
 inline void acceleratorFreeShared(void *ptr){_mm_free(ptr);};
 inline void acceleratorFreeDevice(void *ptr){_mm_free(ptr);};
 #else
diff --git a/benchmarks/Benchmark_dwf_fp32.cc b/benchmarks/Benchmark_dwf_fp32.cc
index ce4fcfab..cbe1ee23 100644
--- a/benchmarks/Benchmark_dwf_fp32.cc
+++ b/benchmarks/Benchmark_dwf_fp32.cc
@@ -52,7 +52,7 @@ int main (int argc, char ** argv)
 
   int threads = GridThread::GetThreads();
 
-  int Ls=16;
+  int Ls=8;
   for(int i=0;i<argc;i++) {
     if(std::string(argv[i]) == "-Ls"){
       std::stringstream ss(argv[i+1]); ss >> Ls;
diff --git a/systems/Aurora/benchmarks/gpu_tile.sh b/systems/Aurora/benchmarks/gpu_tile.sh
index ef64299c..a622ba3e 100755
--- a/systems/Aurora/benchmarks/gpu_tile.sh
+++ b/systems/Aurora/benchmarks/gpu_tile.sh
@@ -19,12 +19,12 @@ export ONEAPI_DEVICE_FILTER=gpu,level_zero
 export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
-export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:7
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:3
 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
 
 #export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
 #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
 
-export MPI_BUF_NUMA=$NUMAH
+#export MPI_BUF_NUMA=$NUMAH
 
 echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
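
A minimal sketch, not part of the patch, of the host-staged exchange that the non-ACCELERATOR_AWARE_MPI path above implements: each face is copied from device memory into a host buffer (the patch obtains it via HostBufferMalloc, with an acceleratorAllocHost variant added under #if 0), plain MPI runs on the host copies, and the received face is copied back to the device. The names staged_exchange, devD2H and devH2D are illustrative stand-ins, not Grid API.

#include <mpi.h>
#include <cstdlib>
#include <cstring>

// Stand-ins for the accelerator device<->host copies; real code would use
// acceleratorCopyFromDeviceAsynch / acceleratorCopyToDevice on pinned memory.
static void devD2H(const void *dev, void *host, size_t bytes) { std::memcpy(host, dev, bytes); }
static void devH2D(const void *host, void *dev, size_t bytes) { std::memcpy(dev, host, bytes); }

// Exchange one face with neighbours 'dest'/'from' when MPI cannot read device
// memory: device -> host buffer -> MPI_Isend/Irecv -> host buffer -> device.
void staged_exchange(void *xmit_dev, void *recv_dev, size_t bytes,
                     int dest, int from, MPI_Comm comm)
{
  void *host_xmit = std::malloc(bytes); // the patch draws these from a pinned host pool
  void *host_recv = std::malloc(bytes);

  MPI_Request reqs[2];
  MPI_Irecv(host_recv, (int)bytes, MPI_CHAR, from, 0, comm, &reqs[0]); // receive lands in host memory
  devD2H(xmit_dev, host_xmit, bytes);                                  // stage the send face to the host
  MPI_Isend(host_xmit, (int)bytes, MPI_CHAR, dest, 0, comm, &reqs[1]);

  MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);
  devH2D(host_recv, recv_dev, bytes);                                  // move the received face to the device

  std::free(host_xmit);
  std::free(host_recv);
}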