From 19f9378b9886f2d0ef2028185ab37311080987c2 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 11 Mar 2025 13:49:53 +0000 Subject: [PATCH 1/2] Should work on Aurora now --- Grid/allocator/AlignedAllocator.h | 2 +- Grid/allocator/MemoryManagerCache.cc | 3 +++ Grid/communicator/Communicator_mpi3.cc | 3 --- Grid/cshift/Cshift_mpi.h | 8 ++++---- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h index a32d69db..316f201c 100644 --- a/Grid/allocator/AlignedAllocator.h +++ b/Grid/allocator/AlignedAllocator.h @@ -69,7 +69,7 @@ public: } // FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop - void construct(pointer __p, const _Tp& __val) { assert(0);}; + void construct(pointer __p, const _Tp& __val) { }; void construct(pointer __p) { }; void destroy(pointer __p) { }; }; diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc index eb8c6d38..09afbcf7 100644 --- a/Grid/allocator/MemoryManagerCache.cc +++ b/Grid/allocator/MemoryManagerCache.cc @@ -234,6 +234,9 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis } void MemoryManager::EvictVictims(uint64_t bytes) { + if(bytes>=DeviceMaxBytes) { + printf("EvictVictims bytes %ld DeviceMaxBytes %ld\n",bytes,DeviceMaxBytes); + } assert(bytes DeviceMaxBytes){ if ( DeviceLRUBytes > 0){ diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 38b9f9c6..8de29669 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -759,9 +759,6 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector0) { status.resize(MpiRequests.size()); int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing. 
diff --git a/Grid/cshift/Cshift_mpi.h b/Grid/cshift/Cshift_mpi.h index 710792ee..4b000035 100644 --- a/Grid/cshift/Cshift_mpi.h +++ b/Grid/cshift/Cshift_mpi.h @@ -126,8 +126,8 @@ template void Cshift_comms(Lattice &ret,const Lattice &r static deviceVector send_buf; send_buf.resize(buffer_size); static deviceVector recv_buf; recv_buf.resize(buffer_size); #ifndef ACCELERATOR_AWARE_MPI - static hostVector hsend_buf; hsend_buf.resize(buffer_size); - static hostVector hrecv_buf; hrecv_buf.resize(buffer_size); + static std::vector hsend_buf; hsend_buf.resize(buffer_size); + static std::vector hrecv_buf; hrecv_buf.resize(buffer_size); #endif int cb= (cbmask==0x2)? Odd : Even; @@ -250,8 +250,8 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice hsend_buf; hsend_buf.resize(buffer_size); - hostVector hrecv_buf; hrecv_buf.resize(buffer_size); + std::vector hsend_buf; hsend_buf.resize(buffer_size); + std::vector hrecv_buf; hrecv_buf.resize(buffer_size); #endif int bytes = buffer_size*sizeof(scalar_object); From 25ab9325e7f319735533bf40ebaa6ba695d3fd52 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 11 Mar 2025 15:02:32 +0000 Subject: [PATCH 2/2] Use hostVector but remove construct resize --- Grid/cshift/Cshift_mpi.h | 9 ++++----- Grid/qcd/action/fermion/WilsonCompressor.h | 1 - Grid/stencil/Stencil.h | 1 - 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/Grid/cshift/Cshift_mpi.h b/Grid/cshift/Cshift_mpi.h index 4b000035..6f2e2699 100644 --- a/Grid/cshift/Cshift_mpi.h +++ b/Grid/cshift/Cshift_mpi.h @@ -126,8 +126,8 @@ template void Cshift_comms(Lattice &ret,const Lattice &r static deviceVector send_buf; send_buf.resize(buffer_size); static deviceVector recv_buf; recv_buf.resize(buffer_size); #ifndef ACCELERATOR_AWARE_MPI - static std::vector hsend_buf; hsend_buf.resize(buffer_size); - static std::vector hrecv_buf; hrecv_buf.resize(buffer_size); + static hostVector hsend_buf; hsend_buf.resize(buffer_size); + static hostVector hrecv_buf; 
hrecv_buf.resize(buffer_size); #endif int cb= (cbmask==0x2)? Odd : Even; @@ -244,14 +244,13 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice hsend_buf; hsend_buf.resize(buffer_size); - std::vector hrecv_buf; hrecv_buf.resize(buffer_size); + hostVector hsend_buf; hsend_buf.resize(buffer_size); + hostVector hrecv_buf; hrecv_buf.resize(buffer_size); #endif int bytes = buffer_size*sizeof(scalar_object); diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h index 1c6571e1..22f2d8e3 100644 --- a/Grid/qcd/action/fermion/WilsonCompressor.h +++ b/Grid/qcd/action/fermion/WilsonCompressor.h @@ -485,7 +485,6 @@ public: assert(this->u_comm_offset==this->_unified_buffer_size); accelerator_barrier(); #ifdef NVLINK_GET - #warning "NVLINK_GET" this->_grid->StencilBarrier(); // He can now get mu local gather, I can get his // Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check // Or issue barrier AFTER the DMA is running diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 2a666a04..3613cdbb 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -518,7 +518,6 @@ public: } accelerator_barrier(); // All my local gathers are complete #ifdef NVLINK_GET - #warning "NVLINK_GET" _grid->StencilBarrier(); // He can now get mu local gather, I can get his // Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check // Or issue barrier AFTER the DMA is running