From 2877fb4a2c5717b8ff2100a68547e532de91ce53 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 18 Jun 2024 03:21:03 +0000 Subject: [PATCH 01/15] More verbose if alloc failure --- Grid/allocator/AlignedAllocator.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h index 8a27f527..293ce2fb 100644 --- a/Grid/allocator/AlignedAllocator.h +++ b/Grid/allocator/AlignedAllocator.h @@ -54,6 +54,9 @@ public: size_type bytes = __n*sizeof(_Tp); profilerAllocate(bytes); _Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes); + if ( (_Tp*)ptr == (_Tp *) NULL ) { + printf("Grid CPU Allocator got NULL for %lu bytes\n",(unsigned long) bytes ); + } assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); return ptr; } @@ -100,6 +103,9 @@ public: size_type bytes = __n*sizeof(_Tp); profilerAllocate(bytes); _Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes); + if ( (_Tp*)ptr == (_Tp *) NULL ) { + printf("Grid Shared Allocator got NULL for %lu bytes\n",(unsigned long) bytes ); + } assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); return ptr; } @@ -145,6 +151,9 @@ public: size_type bytes = __n*sizeof(_Tp); profilerAllocate(bytes); _Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes); + if ( (_Tp*)ptr == (_Tp *) NULL ) { + printf("Grid Device Allocator got NULL for %lu bytes\n",(unsigned long) bytes ); + } assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); return ptr; } From 63c223ea5de95dd694c12bb228be82b1e7bfff98 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 18 Jun 2024 03:22:01 +0000 Subject: [PATCH 02/15] Verbose --- Grid/qcd/hmc/GenericHMCrunner.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Grid/qcd/hmc/GenericHMCrunner.h b/Grid/qcd/hmc/GenericHMCrunner.h index 1429d848..b53755aa 100644 --- a/Grid/qcd/hmc/GenericHMCrunner.h +++ b/Grid/qcd/hmc/GenericHMCrunner.h @@ -90,6 +90,7 @@ public: exit(1); } Parameters.StartingType = arg; + std::cout < ivec(0); GridCmdOptionIntVector(arg, ivec); Parameters.StartTrajectory = 
ivec[0]; + std::cout < ivec(0); GridCmdOptionIntVector(arg, ivec); Parameters.Trajectories = ivec[0]; + std::cout << GridLogMessage<<" GenericHMCrunner Command Line --Trajectories "< ivec(0); GridCmdOptionIntVector(arg, ivec); Parameters.NoMetropolisUntil = ivec[0]; + std::cout << GridLogMessage<<" GenericHMCrunner --Thermalizations "< Date: Fri, 28 Jun 2024 16:02:29 +0000 Subject: [PATCH 03/15] Try catch exception wrappers --- Grid/Namespace.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Grid/Namespace.h b/Grid/Namespace.h index 29b229fa..0d93ee9f 100644 --- a/Grid/Namespace.h +++ b/Grid/Namespace.h @@ -30,9 +30,13 @@ directory #include #include +#include #define NAMESPACE_BEGIN(A) namespace A { #define NAMESPACE_END(A) } #define GRID_NAMESPACE_BEGIN NAMESPACE_BEGIN(Grid) #define GRID_NAMESPACE_END NAMESPACE_END(Grid) #define NAMESPACE_CHECK(x) struct namespaceTEST##x {}; static_assert(std::is_same::value,"Not in :: at" ); + +#define EXCEPTION_CHECK_BEGIN(A) try { +#define EXCEPTION_CHECK_END(A) } catch ( std::exception e ) { BACKTRACEFP(stderr); std::cerr << __PRETTY_FUNCTION__ << " : " <<__LINE__<< " Caught exception "< Date: Fri, 28 Jun 2024 16:05:17 +0000 Subject: [PATCH 04/15] Display linux heap info --- Grid/allocator/MemoryManager.cc | 23 ++++++++++++++++++++++- Grid/allocator/MemoryManager.h | 1 + 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/Grid/allocator/MemoryManager.cc b/Grid/allocator/MemoryManager.cc index a9e5c9b4..c71f3512 100644 --- a/Grid/allocator/MemoryManager.cc +++ b/Grid/allocator/MemoryManager.cc @@ -16,6 +16,27 @@ NAMESPACE_BEGIN(Grid); uint64_t total_shared; uint64_t total_device; uint64_t total_host;; + +void MemoryManager::DisplayMallinfo(void) +{ +#ifdef __linux__ + struct mallinfo mi; + + mi = mallinfo(); + + printf("Total non-mmapped bytes (arena): %d\n", mi.arena); + printf("# of free chunks (ordblks): %d\n", mi.ordblks); + printf("# of free fastbin blocks (smblks): %d\n", mi.smblks); + printf("# of mapped 
regions (hblks): %d\n", mi.hblks); + printf("Bytes in mapped regions (hblkhd): %d\n", mi.hblkhd); + printf("Max. total allocated space (usmblks): %d\n", mi.usmblks); + printf("Free bytes held in fastbins (fsmblks): %d\n", mi.fsmblks); + printf("Total allocated space (uordblks): %d\n", mi.uordblks); + printf("Total free space (fordblks): %d\n", mi.fordblks); + printf("Topmost releasable block (keepcost): %d\n", mi.keepcost); +#endif +} + void MemoryManager::PrintBytes(void) { std::cout << " MemoryManager : ------------------------------------ "< Date: Fri, 5 Jul 2024 17:46:43 +0000 Subject: [PATCH 05/15] Sanitizer compile options --- systems/Aurora/config-command-sanitize | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 systems/Aurora/config-command-sanitize diff --git a/systems/Aurora/config-command-sanitize b/systems/Aurora/config-command-sanitize new file mode 100644 index 00000000..d400a103 --- /dev/null +++ b/systems/Aurora/config-command-sanitize @@ -0,0 +1,22 @@ +# -fsycl-targets=spir64_gen -Xs\" -device pvc \" +# -fsycl-targets=intel_gpu_pvc_vg,intel_gpu_pvc +# -fsycl-targets=intel_gpu_pvc + +unset DEVICE +export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -Xarch_host -fsanitize=address" +export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel -fsycl -fno-exceptions -Xarch_host -fsanitize=address -fsycl-targets=spir64_gen -Xs -device -Xs pvc " +../../configure \ + --enable-simd=GPU \ + --enable-gen-simd-width=64 \ + --enable-comms=mpi-auto \ + --enable-debug \ + --disable-gparity \ + --disable-fermion-reps \ + --with-lime=$CLIME \ + --enable-shm=nvlink \ + --enable-accelerator=sycl \ + --enable-accelerator-aware-mpi=yes\ + --enable-unified=no \ + MPICXX=mpicxx \ + CXX=icpx + From 
89fdd7f8dd56843b6e3b022569f6d59f1c09d9e8 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 5 Jul 2024 17:47:56 +0000 Subject: [PATCH 06/15] AOT compilation --- systems/Aurora/config-command | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/systems/Aurora/config-command b/systems/Aurora/config-command index 58eb8a03..f538f319 100644 --- a/systems/Aurora/config-command +++ b/systems/Aurora/config-command @@ -1,16 +1,18 @@ +export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl " +export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel -fsycl -fno-exceptions -fsycl-targets=spir64_gen -Xs -device -Xs pvc " ../../configure \ --enable-simd=GPU \ --enable-gen-simd-width=64 \ --enable-comms=mpi-auto \ + --enable-debug \ --disable-gparity \ --disable-fermion-reps \ + --with-lime=$CLIME \ --enable-shm=nvlink \ --enable-accelerator=sycl \ --enable-accelerator-aware-mpi=yes\ --enable-unified=no \ MPICXX=mpicxx \ - CXX=icpx \ - LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl" \ - CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel" + CXX=icpx From 172c75029ed84bdebc0667380819f3459f38cb54 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 10 Jul 2024 22:03:59 +0000 Subject: [PATCH 07/15] Redblack additional case --- Grid/algorithms/iterative/SchurRedBlack.h | 81 +++++++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/Grid/algorithms/iterative/SchurRedBlack.h b/Grid/algorithms/iterative/SchurRedBlack.h index d97e4993..494aa77b 100644 --- a/Grid/algorithms/iterative/SchurRedBlack.h +++ b/Grid/algorithms/iterative/SchurRedBlack.h @@ -499,6 +499,87 @@ namespace Grid { } }; + 
/////////////////////////////////////////////////////////////////////////////////////////////////////// + // Site diagonal is identity, left preconditioned by Mee^inv + // ( 1 - Mee^inv Meo Moo^inv Moe ) phi = Mee_inv ( Mee - Meo Moo^inv Moe Mee^inv ) phi = Mee_inv eta + // + // Solve: + // ( 1 - Mee^inv Meo Moo^inv Moe )^dag ( 1 - Mee^inv Meo Moo^inv Moe ) phi = ( 1 - Mee^inv Meo Moo^inv Moe )^dag Mee_inv eta + // + // Old notation e<->o + // + // Left precon by Moo^-1 + // b) (Doo^{dag} M_oo^-dag) (Moo^-1 Doo) psi_o = [ (D_oo)^dag M_oo^-dag ] Moo^-1 L^{-1} eta_o + // eta_o' = (D_oo)^dag M_oo^-dag Moo^-1 (eta_o - Moe Mee^{-1} eta_e) + /////////////////////////////////////////////////////////////////////////////////////////////////////// + template class SchurRedBlackDiagOneSolve : public SchurRedBlackBase { + public: + typedef CheckerBoardedSparseMatrixBase Matrix; + + ///////////////////////////////////////////////////// + // Wrap the usual normal equations Schur trick + ///////////////////////////////////////////////////// + SchurRedBlackDiagOneSolve(OperatorFunction &HermitianRBSolver, const bool initSubGuess = false, + const bool _solnAsInitGuess = false) + : SchurRedBlackBase(HermitianRBSolver,initSubGuess,_solnAsInitGuess) {}; + + virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o) + { + GridBase *grid = _Matrix.RedBlackGrid(); + GridBase *fgrid= _Matrix.Grid(); + + SchurDiagOneOperator _HermOpEO(_Matrix); + + Field tmp(grid); + Field Mtmp(grid); + + pickCheckerboard(Even,src_e,src); + pickCheckerboard(Odd ,src_o,src); + + ///////////////////////////////////////////////////// + // src_o = Mpcdag *MooeeInv * (source_o - Moe MeeInv source_e) + ///////////////////////////////////////////////////// + _Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even); + _Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd); + Mtmp=src_o-Mtmp; + _Matrix.MooeeInv(Mtmp,tmp); assert( tmp.Checkerboard() ==Odd); + + // get the 
right MpcDag + _HermOpEO.MpcDag(tmp,src_o); assert(src_o.Checkerboard() ==Odd); + } + + virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol) + { + GridBase *grid = _Matrix.RedBlackGrid(); + GridBase *fgrid= _Matrix.Grid(); + + Field tmp(grid); + Field sol_e(grid); + + + /////////////////////////////////////////////////// + // sol_e = M_ee^-1 * ( src_e - Meo sol_o )... + /////////////////////////////////////////////////// + _Matrix.Meooe(sol_o,tmp); assert( tmp.Checkerboard() ==Even); + tmp = src_e-tmp; assert( src_e.Checkerboard() ==Even); + _Matrix.MooeeInv(tmp,sol_e); assert( sol_e.Checkerboard() ==Even); + + setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even); + setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd ); + }; + + virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o) + { + SchurDiagOneOperator _HermOpEO(_Matrix); + this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); + }; + virtual void RedBlackSolve (Matrix & _Matrix,const std::vector &src_o, std::vector &sol_o) + { + SchurDiagOneOperator _HermOpEO(_Matrix); + this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); + } + }; + /////////////////////////////////////////////////////////////////////////////////////////////////////// // Site diagonal is identity, right preconditioned by Mee^inv // ( 1 - Meo Moo^inv Moe Mee^inv ) phi =( 1 - Meo Moo^inv Moe Mee^inv ) Mee psi = = eta = eta From 7c246606c19957ff9937a6280fd1224bbfca0118 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 10 Jul 2024 22:04:32 +0000 Subject: [PATCH 08/15] Schur additional case --- tests/debug/Test_cayley_cg.cc | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/tests/debug/Test_cayley_cg.cc b/tests/debug/Test_cayley_cg.cc index 74492fd9..068c260f 100644 --- a/tests/debug/Test_cayley_cg.cc +++ b/tests/debug/Test_cayley_cg.cc @@ -392,9 +392,27 @@ void TestCGschur(What & Ddwf, GridParallelRNG *RNG5) { 
LatticeFermion src (FGrid); random(*RNG5,src); - LatticeFermion result(FGrid); result=Zero(); + LatticeFermion result1(FGrid); result1=Zero(); + LatticeFermion result2(FGrid); result2=Zero(); + LatticeFermion result3(FGrid); result3=Zero(); ConjugateGradient CG(1.0e-8,10000); SchurRedBlackDiagMooeeSolve SchurSolver(CG); - SchurSolver(Ddwf,src,result); + SchurSolver(Ddwf,src,result1); + + SchurRedBlackDiagOneSolve SchurSolverSymm1(CG); + SchurSolverSymm1(Ddwf,src,result2); + + SchurRedBlackDiagTwoSolve SchurSolverSymm2(CG); + SchurSolverSymm2(Ddwf,src,result3); + + std::cout << GridLogMessage << " Standard " < Date: Thu, 11 Jul 2024 15:19:19 +0000 Subject: [PATCH 09/15] Namespace addition --- Grid/Namespace.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Grid/Namespace.h b/Grid/Namespace.h index 0d93ee9f..c42b46b3 100644 --- a/Grid/Namespace.h +++ b/Grid/Namespace.h @@ -40,3 +40,4 @@ directory #define EXCEPTION_CHECK_BEGIN(A) try { #define EXCEPTION_CHECK_END(A) } catch ( std::exception e ) { BACKTRACEFP(stderr); std::cerr << __PRETTY_FUNCTION__ << " : " <<__LINE__<< " Caught exception "< Date: Thu, 11 Jul 2024 15:19:49 +0000 Subject: [PATCH 10/15] Batched blas, but not working yet on OneAPI --- Grid/algorithms/blas/BatchedBlas.h | 120 +++++++++++++++++++++++------ 1 file changed, 97 insertions(+), 23 deletions(-) diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h index a7edb485..22353d49 100644 --- a/Grid/algorithms/blas/BatchedBlas.h +++ b/Grid/algorithms/blas/BatchedBlas.h @@ -89,9 +89,10 @@ public: gridblasHandle = theGridAccelerator; #endif #ifdef GRID_ONE_MKL - cl::sycl::cpu_selector selector; + cl::sycl::gpu_selector selector; cl::sycl::device selectedDevice { selector }; - gridblasHandle =new sycl::queue (selectedDevice); + cl::sycl::property_list q_prop{cl::sycl::property::queue::in_order()}; + gridblasHandle =new sycl::queue (selectedDevice,q_prop); #endif gridblasInit=1; } @@ -266,8 +267,46 @@ public: 
assert(err==CUBLAS_STATUS_SUCCESS); #endif #ifdef GRID_SYCL - //MKL’s cblas_gemm_batch & OneAPI -#warning "oneMKL implementation not built " + std::cerr << " Calling SYCL batched ZGEMM "<()); + synchronise(); + std::cerr << " Called SYCL batched ZGEMM "< A(m*k); // pointer list to matrices + std::vector B(k*n); + std::vector C(m*n); + int sda = lda*k; + int sdb = ldb*k; + int sdc = ldc*n; + for (int p = 0; p < 1; ++p) { + acceleratorCopyFromDevice((void *)&Amk[p][0],(void *)&A[0],m*k*sizeof(ComplexD)); + acceleratorCopyFromDevice((void *)&Bkn[p][0],(void *)&B[0],k*n*sizeof(ComplexD)); + acceleratorCopyFromDevice((void *)&Cmn[p][0],(void *)&C[0],m*n*sizeof(ComplexD)); + for (int mm = 0; mm < m; ++mm) { + for (int nn = 0; nn < n; ++nn) { + ComplexD c_mn(0.0); + for (int kk = 0; kk < k; ++kk) + c_mn += A[mm + kk*lda ] * B[kk + nn*ldb]; + std::cout << " beta "<gemm_batch & OneAPI -#warning "oneMKL implementation not built " + int64_t m64=m; + int64_t n64=n; + int64_t k64=k; + int64_t lda64=lda; + int64_t ldb64=ldb; + int64_t ldc64=ldc; + int64_t batchCount64=batchCount; + oneapi::mkl::transpose notransp =oneapi::mkl::transpose::N; + oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle, + &notransp, + &notransp, + &m64,&n64,&k64, + (ComplexF *) &alpha_p[0], + (const ComplexF **)&Amk[0], (const int64_t *)&lda64, + (const ComplexF **)&Bkn[0], (const int64_t *)&ldb64, + (ComplexF *) &beta_p[0], + (ComplexF **)&Cmn[0], (const int64_t *)&ldc64, + (int64_t)1,&batchCount64,std::vector()); + synchronise(); #endif #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) int sda = lda*k; @@ -467,8 +522,25 @@ public: assert(err==CUBLAS_STATUS_SUCCESS); #endif #ifdef GRID_SYCL - //MKL’s cblas_gemm_batch & OneAPI -#warning "oneMKL implementation not built " + int64_t m64=m; + int64_t n64=n; + int64_t k64=k; + int64_t lda64=lda; + int64_t ldb64=ldb; + int64_t ldc64=ldc; + int64_t batchCount64=batchCount; + oneapi::mkl::transpose notransp =oneapi::mkl::transpose::N; + 
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle, + &notransp, + &notransp, + &m64,&n64,&k64, + (float *) &alpha_p[0], + (const float **)&Amk[0], (const int64_t *)&lda64, + (const float **)&Bkn[0], (const int64_t *)&ldb64, + (float *) &beta_p[0], + (float **)&Cmn[0], (const int64_t *)&ldc64, + (int64_t)1,&batchCount64,std::vector()); + synchronise(); #endif #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) int sda = lda*k; @@ -568,24 +640,25 @@ public: assert(err==CUBLAS_STATUS_SUCCESS); #endif #ifdef GRID_SYCL - /* int64_t m64=m; int64_t n64=n; int64_t k64=k; + int64_t lda64=lda; + int64_t ldb64=ldb; + int64_t ldc64=ldc; int64_t batchCount64=batchCount; - oneapi::mkl::blas::column_major::gemm_batch(*theGridAccelerator, - onemkl::transpose::N, - onemkl::transpose::N, - &m64,&n64,&k64, - (double *) &alpha_p[0], - (double **)&Amk[0], lda, - (double **)&Bkn[0], ldb, - (double *) &beta_p[0], - (double **)&Cmn[0], ldc, - 1,&batchCount64); - */ - //MKL’s cblas_gemm_batch & OneAPI -#warning "oneMKL implementation not built " + oneapi::mkl::transpose notransp =oneapi::mkl::transpose::N; + oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle, + &notransp, + &notransp, + &m64,&n64,&k64, + (double *) &alpha_p[0], + (const double **)&Amk[0], (const int64_t *)&lda64, + (const double **)&Bkn[0], (const int64_t *)&ldb64, + (double *) &beta_p[0], + (double **)&Cmn[0], (const int64_t *)&ldc64, + (int64_t)1,&batchCount64,std::vector()); + synchronise(); #endif #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) int sda = lda*k; @@ -673,6 +746,7 @@ public: beta, (ComplexD *)Cmn,ldc,sdc, batchCount); + synchronise(); #endif #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL) // Need a default/reference implementation From 4502a8c8a185cec95ff0ff61c6d9122fb915ee34 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 11 Jul 2024 15:22:18 +0000 Subject: [PATCH 11/15] libc malloc heap info dump on Linux 
--- Grid/allocator/MemoryManager.cc | 37 ++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/Grid/allocator/MemoryManager.cc b/Grid/allocator/MemoryManager.cc index c71f3512..e56af238 100644 --- a/Grid/allocator/MemoryManager.cc +++ b/Grid/allocator/MemoryManager.cc @@ -17,6 +17,21 @@ uint64_t total_shared; uint64_t total_device; uint64_t total_host;; +#if defined(__has_feature) +#if __has_feature(leak_sanitizer) +#define ASAN_LEAK_CHECK +#endif +#endif + +#ifdef ASAN_LEAK_CHECK +#include +#include +#include +#define LEAK_CHECK(A) { __lsan_do_recoverable_leak_check(); } +#else +#define LEAK_CHECK(A) { } +#endif + void MemoryManager::DisplayMallinfo(void) { #ifdef __linux__ @@ -24,17 +39,19 @@ void MemoryManager::DisplayMallinfo(void) mi = mallinfo(); - printf("Total non-mmapped bytes (arena): %d\n", mi.arena); - printf("# of free chunks (ordblks): %d\n", mi.ordblks); - printf("# of free fastbin blocks (smblks): %d\n", mi.smblks); - printf("# of mapped regions (hblks): %d\n", mi.hblks); - printf("Bytes in mapped regions (hblkhd): %d\n", mi.hblkhd); - printf("Max. 
total allocated space (usmblks): %d\n", mi.usmblks); - printf("Free bytes held in fastbins (fsmblks): %d\n", mi.fsmblks); - printf("Total allocated space (uordblks): %d\n", mi.uordblks); - printf("Total free space (fordblks): %d\n", mi.fordblks); - printf("Topmost releasable block (keepcost): %d\n", mi.keepcost); + std::cout << "MemoryManager: Total non-mmapped bytes (arena): "<< (size_t)mi.arena< Date: Thu, 11 Jul 2024 15:24:11 +0000 Subject: [PATCH 12/15] Device vector not UVM --- Grid/lattice/Lattice_reduction.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index 7b66c31d..6ce70232 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -373,7 +373,8 @@ axpby_norm_fast(Lattice &z,sobj a,sobj b,const Lattice &x,const Latt nrm = real(TensorRemove(sum(inner_tmp_v,sites))); #else typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t; - Vector inner_tmp(sites); + deviceVector inner_tmp; + inner_tmp.resize(sites); auto inner_tmp_v = &inner_tmp[0]; accelerator_for( ss, sites, nsimd,{ From cf3584ad15dab0a7b949a1f463ce6b80db6580c6 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 11 Jul 2024 15:30:32 +0000 Subject: [PATCH 13/15] Convenient to monitor memory across an HMC trajectory --- Grid/qcd/hmc/integrators/Integrator.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Grid/qcd/hmc/integrators/Integrator.h b/Grid/qcd/hmc/integrators/Integrator.h index 0276b6fd..549920a0 100644 --- a/Grid/qcd/hmc/integrators/Integrator.h +++ b/Grid/qcd/hmc/integrators/Integrator.h @@ -137,9 +137,11 @@ public: double start_force = usecond(); + MemoryManager::Print(); as[level].actions.at(a)->deriv_timer_start(); as[level].actions.at(a)->deriv(Smearer, force); // deriv should NOT include Ta as[level].actions.at(a)->deriv_timer_stop(); + MemoryManager::Print(); auto name = as[level].actions.at(a)->action_name(); @@ -246,7 +248,11 @@ 
public: } }; - virtual ~Integrator() {} + virtual ~Integrator() + { + // Pain in the ass to clean up the Level pointers + // Guido's design is at fault as per comment above in constructor + } virtual std::string integrator_name() = 0; @@ -460,6 +466,7 @@ public: for (int level = 0; level < as.size(); ++level) { for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) { + MemoryManager::Print(); // get gauge field from the SmearingPolicy and // based on the boolean is_smeared in actionID std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl; @@ -468,6 +475,7 @@ public: as[level].actions.at(actionID)->S_timer_stop(); std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl; H += Hterm; + MemoryManager::Print(); } as[level].apply(S_hireps, Representations, level, H); From b3ee8ded969351f2d9a36d9589c5b3dd458eed23 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 11 Jul 2024 15:34:48 +0000 Subject: [PATCH 14/15] Respect command line --- HMC/Mobius2p1f.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/HMC/Mobius2p1f.cc b/HMC/Mobius2p1f.cc index 4ab1f20f..8042d6e6 100644 --- a/HMC/Mobius2p1f.cc +++ b/HMC/Mobius2p1f.cc @@ -58,7 +58,7 @@ int main(int argc, char **argv) { HMCparameters HMCparams; HMCparams.StartTrajectory = 0; HMCparams.Trajectories = 200; - HMCparams.NoMetropolisUntil= 20; + HMCparams.NoMetropolisUntil= 0; // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; HMCparams.StartingType =std::string("ColdStart"); HMCparams.MD = MD; @@ -70,7 +70,7 @@ int main(int argc, char **argv) { CheckpointerParameters CPparams; CPparams.config_prefix = "ckpoint_EODWF_lat"; CPparams.rng_prefix = "ckpoint_EODWF_rng"; - CPparams.saveInterval = 10; + CPparams.saveInterval = 1; CPparams.format = "IEEE64BIG"; TheHMC.Resources.LoadNerscCheckpointer(CPparams); @@ -186,6 +186,8 @@ int main(int argc, char **argv) { 
///////////////////////////////////////////////////////////// // HMC parameters are serialisable + TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file + TheHMC.initializeGaugeFieldAndRNGs(U); std::cout << GridLogMessage << " Running the HMC "<< std::endl; TheHMC.Run(); // no smearing From 059e8e5bb0b4334ba3e0e1153a5962f78ed50207 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 11 Jul 2024 15:37:30 +0000 Subject: [PATCH 15/15] New compile option --- systems/Aurora/sourceme.sh | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/systems/Aurora/sourceme.sh b/systems/Aurora/sourceme.sh index b43b3b71..d577ae96 100644 --- a/systems/Aurora/sourceme.sh +++ b/systems/Aurora/sourceme.sh @@ -1,28 +1,23 @@ +source ~/spack/share/spack/setup-env.sh +spack load c-lime + +export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' ` +#export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH + +export INTELGT_AUTO_ATTACH_DISABLE=1 + #export ONEAPI_DEVICE_SELECTOR=level_zero:0.0 -module load oneapi/release/2023.12.15.001 - -#module use /soft/modulefiles -#module load intel_compute_runtime/release/agama-devel-682.22 - -export FI_CXI_DEFAULT_CQ_SIZE=131072 -export FI_CXI_CQ_FILL_PERCENT=20 - -export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" -#export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode" - -# # -ftarget-register-alloc-mode=pvc:default # -ftarget-register-alloc-mode=pvc:small # -ftarget-register-alloc-mode=pvc:large # -ftarget-register-alloc-mode=pvc:auto -# +#export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 export HTTP_PROXY=http://proxy.alcf.anl.gov:3128 export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128 export http_proxy=http://proxy.alcf.anl.gov:3128 export https_proxy=http://proxy.alcf.anl.gov:3128 -#export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 git config --global http.proxy http://proxy.alcf.anl.gov:3128 export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"