From 84cab5e6e7c1cd23efe4d42d2d34fd1b31ac66fa Mon Sep 17 00:00:00 2001
From: Christoph Lehner
Date: Sat, 1 Feb 2025 16:37:21 +0100
Subject: [PATCH 1/5] no comms and log cleanup

---
 Grid/algorithms/FFT.h                 | 12 ++++++------
 Grid/communicator/Communicator_base.h |  3 ++-
 Grid/util/Lexicographic.h             |  2 +-
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/Grid/algorithms/FFT.h b/Grid/algorithms/FFT.h
index dc972537..329d1d46 100644
--- a/Grid/algorithms/FFT.h
+++ b/Grid/algorithms/FFT.h
@@ -191,7 +191,7 @@ public:
 
     Lattice<sobj> pgbuf(&pencil_g);
     autoView(pgbuf_v , pgbuf, CpuWrite);
-    std::cout << "CPU view" << std::endl;
+    //std::cout << "CPU view" << std::endl;
 
     typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
     typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
@@ -215,7 +215,7 @@ public:
     else if ( sign == forward ) div = 1.0;
     else assert(0);
 
-    std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
+    //std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
     FFTW_plan p;
     {
       FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
@@ -229,7 +229,7 @@ public:
     }
 
     // Barrel shift and collect global pencil
-    std::cout << GridLogPerformance<<"Making pencil" << std::endl;
+    //std::cout << GridLogPerformance<<"Making pencil" << std::endl;
     Coordinate lcoor(Nd), gcoor(Nd);
     result = source;
     int pc = processor_coor[dim];
@@ -251,7 +251,7 @@ public:
       }
     }
 
-    std::cout <<GridLogPerformance<<"Destroying plan" << std::endl;
+    //std::cout <<GridLogPerformance<<"Destroying plan" << std::endl;
     FFTW<scalar>::fftw_destroy_plan(p);
 #endif
diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h
index c72fcc79..964a523b 100644
--- a/Grid/communicator/Communicator_base.h
+++ b/Grid/communicator/Communicator_base.h
@@ -147,7 +147,8 @@ public:
 			  sizeof(obj),d*100+p);
 
     }
-    CommsComplete(list);
+    if (!list.empty()) // avoid triggering assert in comms == none
+      CommsComplete(list);
     for(int p=1;p<_processors[d];p++){
       accum = accum + column[p];
     }
diff --git a/Grid/util/Lexicographic.h b/Grid/util/Lexicographic.h
index 422e42ee..8636333d 100644
--- a/Grid/util/Lexicographic.h
+++ b/Grid/util/Lexicographic.h
@@ -50,7 +50,7 @@ namespace Grid{
       int64_t index64;
       IndexFromCoorReversed(coor,index64,dims);
       if ( index64>=2*1024*1024*1024LL ){
-	std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
+	//std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
       }
       assert(index64<2*1024*1024*1024LL);
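The Communicator_base.h hunk guards the completion call because a build with comms == none never posts any transfers, and CommsComplete asserts when handed an empty request list. A minimal self-contained sketch of the pattern (the assert body and the caller name are illustrative assumptions, not code from the patch):

    #include <cassert>
    #include <vector>

    struct CommsRequest_t { };                    // stand-in for Grid's request type

    void CommsComplete(std::vector<CommsRequest_t> &list) {
      assert(list.size() > 0);                    // assumed: trips on empty lists under comms == none
      // ... wait on each outstanding request ...
    }

    void ReduceColumn(std::vector<CommsRequest_t> &list) {  // hypothetical caller
      if (!list.empty())                          // single rank, nothing posted: skip completion
        CommsComplete(list);
    }
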
From: Peter Boyle
Date: Tue, 11 Mar 2025 13:49:53 +0000
Subject: [PATCH 2/5] Should work on Aurora now

---
 Grid/allocator/AlignedAllocator.h      | 2 +-
 Grid/allocator/MemoryManagerCache.cc   | 3 +++
 Grid/communicator/Communicator_mpi3.cc | 3 ---
 Grid/cshift/Cshift_mpi.h               | 8 ++++----
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h
index a32d69db..316f201c 100644
--- a/Grid/allocator/AlignedAllocator.h
+++ b/Grid/allocator/AlignedAllocator.h
@@ -69,7 +69,7 @@ public:
   }
 
   // FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
-  void construct(pointer __p, const _Tp& __val) { assert(0);};
+  void construct(pointer __p, const _Tp& __val) { };
   void construct(pointer __p) { };
   void destroy(pointer __p) { };
 };
diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc
index eb8c6d38..09afbcf7 100644
--- a/Grid/allocator/MemoryManagerCache.cc
+++ b/Grid/allocator/MemoryManagerCache.cc
@@ -234,6 +234,9 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
 }
 void MemoryManager::EvictVictims(uint64_t bytes)
 {
+  if(bytes>=DeviceMaxBytes) {
+    printf("EvictVictims bytes %ld DeviceMaxBytes %ld\n",bytes,DeviceMaxBytes);
+  }
   assert(bytes<DeviceMaxBytes);
   while(bytes+DeviceLRUBytes > DeviceMaxBytes){
     if ( DeviceLRUBytes > 0){
diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc
index 38b9f9c6..8de29669 100644
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -759,9 +759,6 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
   if (MpiRequests.size()>0) {
     status.resize(MpiRequests.size());
     int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]);
 // Sends are guaranteed in order. No harm in not completing.
diff --git a/Grid/cshift/Cshift_mpi.h b/Grid/cshift/Cshift_mpi.h
index 710792ee..4b000035 100644
--- a/Grid/cshift/Cshift_mpi.h
+++ b/Grid/cshift/Cshift_mpi.h
@@ -126,8 +126,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
   static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
   static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
 #ifndef ACCELERATOR_AWARE_MPI
-  static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
-  static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
+  static std::vector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
+  static std::vector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
 #endif
 
   int cb= (cbmask==0x2)? Odd : Even;
@@ -250,8 +250,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
 #ifndef ACCELERATOR_AWARE_MPI
-  hostVector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
-  hostVector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
+  std::vector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
+  std::vector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
 #endif
 
   int bytes = buffer_size*sizeof(scalar_object);
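The Cshift hunks above sit on the staging path used when MPI cannot accept device pointers: without ACCELERATOR_AWARE_MPI each shift copies device to host, communicates through the host buffers, and copies back. A sketch of that round trip under stated assumptions (buffer and function names are illustrative; the (from, to, bytes) argument order of the accelerator copy helpers is an assumption, not taken from the patch):

    #include <mpi.h>
    #include <cstddef>

    // Assumed helpers with (from, to, bytes) ordering; declarations only.
    void acceleratorCopyFromDevice(void *from, void *to, std::size_t bytes);
    void acceleratorCopyToDevice(void *from, void *to, std::size_t bytes);

    void staged_shift(void *d_send, void *d_recv, void *h_send, void *h_recv,
                      int bytes, int xmit_to_rank, int recv_from_rank, MPI_Comm comm)
    {
      acceleratorCopyFromDevice(d_send, h_send, bytes);   // stage out of the device
      MPI_Sendrecv(h_send, bytes, MPI_CHAR, xmit_to_rank, 0,
                   h_recv, bytes, MPI_CHAR, recv_from_rank, 0,
                   comm, MPI_STATUS_IGNORE);              // host-pointer-only MPI
      acceleratorCopyToDevice(h_recv, d_recv, bytes);     // stage back onto the device
    }
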
From 25ab9325e7f319735533bf40ebaa6ba695d3fd52 Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Tue, 11 Mar 2025 15:02:32 +0000
Subject: [PATCH 3/5] Use hostVector but remove construct resize

---
 Grid/cshift/Cshift_mpi.h                   | 9 ++++-----
 Grid/qcd/action/fermion/WilsonCompressor.h | 1 -
 Grid/stencil/Stencil.h                     | 1 -
 3 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/Grid/cshift/Cshift_mpi.h b/Grid/cshift/Cshift_mpi.h
index 4b000035..6f2e2699 100644
--- a/Grid/cshift/Cshift_mpi.h
+++ b/Grid/cshift/Cshift_mpi.h
@@ -126,8 +126,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
   static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
   static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
 #ifndef ACCELERATOR_AWARE_MPI
-  static std::vector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
-  static std::vector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
+  static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
+  static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
 #endif
 
   int cb= (cbmask==0x2)? Odd : Even;
@@ -244,14 +244,13 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
 #ifndef ACCELERATOR_AWARE_MPI
-  std::vector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
-  std::vector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
+  hostVector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
+  hostVector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
 #endif
 
   int bytes = buffer_size*sizeof(scalar_object);
diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h
index 1c6571e1..22f2d8e3 100644
--- a/Grid/qcd/action/fermion/WilsonCompressor.h
+++ b/Grid/qcd/action/fermion/WilsonCompressor.h
@@ -485,7 +485,6 @@ public:
   assert(this->u_comm_offset==this->_unified_buffer_size);
   accelerator_barrier();
 #ifdef NVLINK_GET
-#warning "NVLINK_GET"
   this->_grid->StencilBarrier(); // He can now get mu local gather, I can get his
   // Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check
   // Or issue barrier AFTER the DMA is running
diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h
index 2a666a04..3613cdbb 100644
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -518,7 +518,6 @@ public:
   }
   accelerator_barrier(); // All my local gathers are complete
 #ifdef NVLINK_GET
-#warning "NVLINK_GET"
   _grid->StencilBarrier(); // He can now get mu local gather, I can get his
   // Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check
   // Or issue barrier AFTER the DMA is running
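Patch 3 can return to hostVector because of the AlignedAllocator change in Patch 2: construct() no longer asserts, so resize() on the pinned-memory vector no longer dies, and as a no-op it also avoids the single-threaded element-construction loop the FIXME comment mentions. The idea reduced to a minimal sketch (simplified by assumption: Grid's real allocator returns pinned pages so host/device copies run at full bandwidth, which plain std::vector storage does not guarantee):

    #include <cstdlib>
    #include <vector>

    template<typename T>
    struct PinnedAllocator {                 // illustrative stand-in for Grid's allocator
      using value_type = T;
      T *allocate(std::size_t n) { return static_cast<T*>(std::malloc(n*sizeof(T))); } // real one: pinned alloc
      void deallocate(T *p, std::size_t) { std::free(p); }                             // real one: pinned free
      template<class U, class... A> void construct(U*, A&&...) { } // no-op: leave elements uninitialized
      template<class U> void destroy(U*) { }
    };

    // resize() now allocates but never runs a per-element construction loop.
    template<typename T> using hostVector = std::vector<T, PinnedAllocator<T>>;
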
From e9177e4af3acd572666d7f66a944cf0539045d7d Mon Sep 17 00:00:00 2001
From: Christoph Lehner
Date: Thu, 13 Mar 2025 08:48:23 +0000
Subject: [PATCH 4/5] Blas compatibility

---
 Grid/algorithms/blas/BatchedBlas.h | 147 ++++++++++++++++++++++++-----
 1 file changed, 126 insertions(+), 21 deletions(-)

diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h
index f4245319..3a7bbc44 100644
--- a/Grid/algorithms/blas/BatchedBlas.h
+++ b/Grid/algorithms/blas/BatchedBlas.h
@@ -208,8 +208,8 @@ public:
     assert(Bkn.size()==batchCount);
     assert(Cmn.size()==batchCount);
 
-    assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
-    assert(OpB!=GridBLAS_OP_T);
+    //assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
+    //assert(OpB!=GridBLAS_OP_T);
 
     int lda = m; // m x k column major
     int ldb = k; // k x n column major
@@ -367,28 +367,67 @@ public:
       Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
       Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
       Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+      else
+        eCmn = alpha * eAmk * eBkn ;
     });
   } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
     thread_for (p, batchCount, {
       Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
       Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
       Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+      else
+        eCmn = alpha * eAmk.adjoint() * eBkn ;
+    });
+  } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
+    thread_for (p, batchCount, {
+      Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
+      Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
+      Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+      else
+        eCmn = alpha * eAmk.transpose() * eBkn ;
     });
   } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
     thread_for (p, batchCount, {
       Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
       Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
       Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+      else
+        eCmn = alpha * eAmk * eBkn.adjoint() ;
+    });
+  } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
+    thread_for (p, batchCount, {
+      Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
+      Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
+      Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+      eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+    });
   } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
     thread_for (p, batchCount, {
       Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
       Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
       Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+      else
+        eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
+    } );
+  } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
+    thread_for (p, batchCount, {
+      Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
+      Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
+      Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+      else
+        eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
     } );
   } else {
     assert(0);
   }
@@ -414,8 +453,8 @@ public:
     RealD t2=usecond();
     int32_t batchCount = Amk.size();
 
-    assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
-    assert(OpB!=GridBLAS_OP_T);
+    //assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
+    //assert(OpB!=GridBLAS_OP_T);
 
     int lda = m; // m x k column major
     int ldb = k; // k x n column major
@@ -514,28 +553,70 @@ public:
       Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
       Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
       Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+      else
+        eCmn = alpha * eAmk * eBkn ;
     });
   } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
     thread_for (p, batchCount, {
       Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
       Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
       Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+      else
+        eCmn = alpha * eAmk.adjoint() * eBkn ;
+    });
+  } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
+    thread_for (p, batchCount, {
+      Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
+      Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
+      Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+      else
+        eCmn = alpha * eAmk.transpose() * eBkn ;
     });
   } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
     thread_for (p, batchCount, {
       Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
       Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
       Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+      else
+        eCmn = alpha * eAmk * eBkn.adjoint() ;
+    });
+  } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
+    thread_for (p, batchCount, {
+      Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
+      Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
+      Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+      else
+        eCmn = alpha * eAmk * eBkn.transpose() ;
     });
   } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
     thread_for (p, batchCount, {
       Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
       Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
       Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+      else
+        eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
+    } );
+  } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
+    thread_for (p, batchCount, {
+      Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
+      Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
+      Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+      else
+        eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
     } );
   } else {
     assert(0);
   }
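Every branch above now special-cases beta == 0, matching standard BLAS GEMM semantics: when beta is zero the output must be written without reading C, since C may be uninitialized on entry and 0 * NaN would still poison the result. The rule in isolation, as a self-contained Eigen sketch (the function name is illustrative):

    #include <Eigen/Dense>
    #include <complex>

    // C = alpha*A*B + beta*C with BLAS-style handling of beta == 0.
    void gemm_accum(std::complex<double> alpha, std::complex<double> beta,
                    Eigen::Map<Eigen::MatrixXcd> A,
                    Eigen::Map<Eigen::MatrixXcd> B,
                    Eigen::Map<Eigen::MatrixXcd> C)
    {
      if (std::abs(beta) != 0.0)
        C = beta * C + alpha * A * B;   // C holds valid data: accumulate into it
      else
        C = alpha * A * B;              // never read possibly-uninitialized C
    }
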
@@ -661,29 +742,41 @@ public:
       Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
       Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
       Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+      else
+        eCmn = alpha * eAmk * eBkn ;
     });
   } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
     thread_for (p, batchCount, {
       Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
       Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
       Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+      else
+        eCmn = alpha * eAmk.transpose() * eBkn ;
     });
   } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
     thread_for (p, batchCount, {
       Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
       Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
       Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+      else
+        eCmn = alpha * eAmk * eBkn.transpose() ;
     });
   } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
     thread_for (p, batchCount, {
       Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
       Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
       Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
-    } );
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+      else
+        eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
+    });
   } else {
     assert(0);
   }
@@ -809,28 +902,40 @@ public:
       Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
       Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
       Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+      else
+        eCmn = alpha * eAmk * eBkn ;
     });
   } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
     thread_for (p, batchCount, {
       Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
       Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
       Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+      else
+        eCmn = alpha * eAmk.transpose() * eBkn ;
     });
   } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
     thread_for (p, batchCount, {
       Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
       Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
       Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+      else
+        eCmn = alpha * eAmk * eBkn.transpose() ;
     });
   } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
     thread_for (p, batchCount, {
       Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
       Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
       Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+      else
+        eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
     });
   } else {
     assert(0);
   }
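With the OpA/OpB asserts commented out, the complex kernels now accept GridBLAS_OP_T as well: a plain transpose with no conjugation (Eigen's .transpose()), distinct from GridBLAS_OP_C (Eigen's .adjoint()). The two only differ for complex element types; the difference on a 1x1 matrix, as a quick check:

    #include <Eigen/Dense>
    #include <complex>
    #include <iostream>

    int main() {
      Eigen::MatrixXcd A(1,1);
      A(0,0) = std::complex<double>(1.0, 2.0);
      std::cout << A.transpose()(0,0) << std::endl;  // OP_T: (1,2)   no conjugation
      std::cout << A.adjoint()(0,0)   << std::endl;  // OP_C: (1,-2)  conjugate transpose
      return 0;
    }
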
From fe66c7ca304b2726f8d4a2f66dfc006b544190f8 Mon Sep 17 00:00:00 2001
From: Christoph Lehner
Date: Thu, 13 Mar 2025 12:49:36 +0000
Subject: [PATCH 5/5] verbosity

---
 Grid/communicator/SharedMemoryMPI.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc
index dc22aee0..9b82658a 100644
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -542,7 +542,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
   // Each MPI rank should allocate our own buffer
   ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 #ifndef ACCELERATOR_AWARE_MPI
-  printf("Host buffer allocate for GPU non-aware MPI\n");
+  //  printf("Host buffer allocate for GPU non-aware MPI\n");
 #if 0
   HostCommBuf= acceleratorAllocHost(bytes);
 #else