From 2877fb4a2c5717b8ff2100a68547e532de91ce53 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 18 Jun 2024 03:21:03 +0000 Subject: [PATCH 01/22] More verbose if alloc failure --- Grid/allocator/AlignedAllocator.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h index 8a27f527..293ce2fb 100644 --- a/Grid/allocator/AlignedAllocator.h +++ b/Grid/allocator/AlignedAllocator.h @@ -54,6 +54,9 @@ public: size_type bytes = __n*sizeof(_Tp); profilerAllocate(bytes); _Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes); + if ( (_Tp*)ptr == (_Tp *) NULL ) { + printf("Grid CPU Allocator got NULL for %lu bytes\n",(unsigned long) bytes ); + } assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); return ptr; } @@ -100,6 +103,9 @@ public: size_type bytes = __n*sizeof(_Tp); profilerAllocate(bytes); _Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes); + if ( (_Tp*)ptr == (_Tp *) NULL ) { + printf("Grid Shared Allocator got NULL for %lu bytes\n",(unsigned long) bytes ); + } assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); return ptr; } @@ -145,6 +151,9 @@ public: size_type bytes = __n*sizeof(_Tp); profilerAllocate(bytes); _Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes); + if ( (_Tp*)ptr == (_Tp *) NULL ) { + printf("Grid Device Allocator got NULL for %lu bytes\n",(unsigned long) bytes ); + } assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); return ptr; } From 63c223ea5de95dd694c12bb228be82b1e7bfff98 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 18 Jun 2024 03:22:01 +0000 Subject: [PATCH 02/22] Verbose --- Grid/qcd/hmc/GenericHMCrunner.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Grid/qcd/hmc/GenericHMCrunner.h b/Grid/qcd/hmc/GenericHMCrunner.h index 1429d848..b53755aa 100644 --- a/Grid/qcd/hmc/GenericHMCrunner.h +++ b/Grid/qcd/hmc/GenericHMCrunner.h @@ -90,6 +90,7 @@ public: exit(1); } Parameters.StartingType = arg; + std::cout < ivec(0); GridCmdOptionIntVector(arg, ivec); Parameters.StartTrajectory = ivec[0]; + std::cout < ivec(0); GridCmdOptionIntVector(arg, ivec); Parameters.Trajectories = ivec[0]; + std::cout << GridLogMessage<<" GenericHMCrunner Command Line --Trajectories "< ivec(0); GridCmdOptionIntVector(arg, ivec); Parameters.NoMetropolisUntil = ivec[0]; + std::cout << GridLogMessage<<" GenericHMCrunner --Thermalizations "< Date: Tue, 18 Jun 2024 16:31:37 -0400 Subject: [PATCH 03/22] Best so far on 96^3 350 Evecs converged on 4^4 block --- .../Test_general_coarse_hdcg_phys96_mixed.cc | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/tests/debug/Test_general_coarse_hdcg_phys96_mixed.cc b/tests/debug/Test_general_coarse_hdcg_phys96_mixed.cc index 71ad42ba..cbf96992 100644 --- a/tests/debug/Test_general_coarse_hdcg_phys96_mixed.cc +++ b/tests/debug/Test_general_coarse_hdcg_phys96_mixed.cc @@ -160,7 +160,8 @@ int main (int argc, char ** argv) GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); // Construct a coarsened grid with 4^4 cell - Coordinate Block({4,4,6,6}); + // Coordinate Block({4,4,6,4}); + Coordinate Block({4,4,4,4}); Coordinate clatt = GridDefaultLatt(); for(int d=0;d MrhsHermMatrix; - Chebyshev IRLCheby(0.0012,42.0,301); // 1 iter + // Chebyshev IRLCheby(0.0012,42.0,301); // 4.4.6.4 + Chebyshev IRLCheby(0.0010,42.0,501); // for 4.4.4.4 blocking MrhsHermMatrix MrhsCoarseOp (mrhs); CoarseVector pm_src(CoarseMrhs); pm_src = ComplexD(1.0); PowerMethod cPM; cPM(MrhsCoarseOp,pm_src); - int Nk=nrhs*30; + // int Nk=nrhs*30; // 4.4.6.4 // int Nk=nrhs*80; + int Nk=nrhs*60; int Nm=Nk*4; - int Nstop=Nk; + int Nstop=350; int Nconv_test_interval=1; ImplicitlyRestartedBlockLanczosCoarse IRL(MrhsCoarseOp, @@ -299,7 +302,7 @@ int main (int argc, char ** argv) nrhs, Nk, Nm, - 1e-4,20); + 3e-4,2); std::vector eval(Nm); std::vector evec(Nm,Coarse5d); @@ -368,7 +371,7 @@ int main (int argc, char ** argv) HDCGmrhs(src_mrhs,res_mrhs); // Standard CG -#if 0 +#if 1 { std::cout << "**************************************"< Date: Fri, 28 Jun 2024 16:02:29 +0000 Subject: [PATCH 04/22] Try catch exception wrappers --- Grid/Namespace.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Grid/Namespace.h b/Grid/Namespace.h index 29b229fa..0d93ee9f 100644 --- a/Grid/Namespace.h +++ b/Grid/Namespace.h @@ -30,9 +30,13 @@ directory #include #include +#include #define NAMESPACE_BEGIN(A) namespace A { #define NAMESPACE_END(A) } #define GRID_NAMESPACE_BEGIN NAMESPACE_BEGIN(Grid) #define GRID_NAMESPACE_END NAMESPACE_END(Grid) #define NAMESPACE_CHECK(x) struct namespaceTEST##x {}; static_assert(std::is_same::value,"Not in :: at" ); + +#define EXCEPTION_CHECK_BEGIN(A) try { +#define EXCEPTION_CHECK_END(A) } catch ( std::exception e ) { BACKTRACEFP(stderr); std::cerr << __PRETTY_FUNCTION__ << " : " <<__LINE__<< " Caught exception "< Date: Fri, 28 Jun 2024 16:05:17 +0000 Subject: [PATCH 05/22] Display linux heap info --- Grid/allocator/MemoryManager.cc | 23 ++++++++++++++++++++++- Grid/allocator/MemoryManager.h | 1 + 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/Grid/allocator/MemoryManager.cc b/Grid/allocator/MemoryManager.cc index a9e5c9b4..c71f3512 100644 --- a/Grid/allocator/MemoryManager.cc +++ b/Grid/allocator/MemoryManager.cc @@ -16,6 +16,27 @@ NAMESPACE_BEGIN(Grid); uint64_t total_shared; uint64_t total_device; uint64_t total_host;; + +void MemoryManager::DisplayMallinfo(void) +{ +#ifdef __linux__ + struct mallinfo mi; + + mi = mallinfo(); + + printf("Total non-mmapped bytes (arena): %d\n", mi.arena); + printf("# of free chunks (ordblks): %d\n", mi.ordblks); + printf("# of free fastbin blocks (smblks): %d\n", mi.smblks); + printf("# of mapped regions (hblks): %d\n", mi.hblks); + printf("Bytes in mapped regions (hblkhd): %d\n", mi.hblkhd); + printf("Max. total allocated space (usmblks): %d\n", mi.usmblks); + printf("Free bytes held in fastbins (fsmblks): %d\n", mi.fsmblks); + printf("Total allocated space (uordblks): %d\n", mi.uordblks); + printf("Total free space (fordblks): %d\n", mi.fordblks); + printf("Topmost releasable block (keepcost): %d\n", mi.keepcost); +#endif +} + void MemoryManager::PrintBytes(void) { std::cout << " MemoryManager : ------------------------------------ "< Date: Fri, 5 Jul 2024 17:46:43 +0000 Subject: [PATCH 06/22] Sanitizer compile options --- systems/Aurora/config-command-sanitize | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 systems/Aurora/config-command-sanitize diff --git a/systems/Aurora/config-command-sanitize b/systems/Aurora/config-command-sanitize new file mode 100644 index 00000000..d400a103 --- /dev/null +++ b/systems/Aurora/config-command-sanitize @@ -0,0 +1,22 @@ +# -fsycl-targets=spir64_gen -Xs\" -device pvc \" +# -fsycl-targets=intel_gpu_pvc_vg,intel_gpu_pvc +# -fsycl-targets=intel_gpu_pvc + +unset DEVICE +export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -Xarch_host -fsanitize=address" +export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel -fsycl -fno-exceptions -Xarch_host -fsanitize=address -fsycl-targets=spir64_gen -Xs -device -Xs pvc " +../../configure \ + --enable-simd=GPU \ + --enable-gen-simd-width=64 \ + --enable-comms=mpi-auto \ + --enable-debug \ + --disable-gparity \ + --disable-fermion-reps \ + --with-lime=$CLIME \ + --enable-shm=nvlink \ + --enable-accelerator=sycl \ + --enable-accelerator-aware-mpi=yes\ + --enable-unified=no \ + MPICXX=mpicxx \ + CXX=icpx + From 89fdd7f8dd56843b6e3b022569f6d59f1c09d9e8 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 5 Jul 2024 17:47:56 +0000 Subject: [PATCH 07/22] AOT compilation --- systems/Aurora/config-command | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/systems/Aurora/config-command b/systems/Aurora/config-command index 58eb8a03..f538f319 100644 --- a/systems/Aurora/config-command +++ b/systems/Aurora/config-command @@ -1,16 +1,18 @@ +export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl " +export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel -fsycl -fno-exceptions -fsycl-targets=spir64_gen -Xs -device -Xs pvc " ../../configure \ --enable-simd=GPU \ --enable-gen-simd-width=64 \ --enable-comms=mpi-auto \ + --enable-debug \ --disable-gparity \ --disable-fermion-reps \ + --with-lime=$CLIME \ --enable-shm=nvlink \ --enable-accelerator=sycl \ --enable-accelerator-aware-mpi=yes\ --enable-unified=no \ MPICXX=mpicxx \ - CXX=icpx \ - LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl" \ - CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel" + CXX=icpx From 172c75029ed84bdebc0667380819f3459f38cb54 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 10 Jul 2024 22:03:59 +0000 Subject: [PATCH 08/22] Redblack additional case --- Grid/algorithms/iterative/SchurRedBlack.h | 81 +++++++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/Grid/algorithms/iterative/SchurRedBlack.h b/Grid/algorithms/iterative/SchurRedBlack.h index d97e4993..494aa77b 100644 --- a/Grid/algorithms/iterative/SchurRedBlack.h +++ b/Grid/algorithms/iterative/SchurRedBlack.h @@ -499,6 +499,87 @@ namespace Grid { } }; + /////////////////////////////////////////////////////////////////////////////////////////////////////// + // Site diagonal is identity, left preconditioned by Mee^inv + // ( 1 - Mee^inv Meo Moo^inv Moe ) phi = Mee_inv ( Mee - Meo Moo^inv Moe Mee^inv ) phi = Mee_inv eta + // + // Solve: + // ( 1 - Mee^inv Meo Moo^inv Moe )^dag ( 1 - Mee^inv Meo Moo^inv Moe ) phi = ( 1 - Mee^inv Meo Moo^inv Moe )^dag Mee_inv eta + // + // Old notation e<->o + // + // Left precon by Moo^-1 + // b) (Doo^{dag} M_oo^-dag) (Moo^-1 Doo) psi_o = [ (D_oo)^dag M_oo^-dag ] Moo^-1 L^{-1} eta_o + // eta_o' = (D_oo)^dag M_oo^-dag Moo^-1 (eta_o - Moe Mee^{-1} eta_e) + /////////////////////////////////////////////////////////////////////////////////////////////////////// + template class SchurRedBlackDiagOneSolve : public SchurRedBlackBase { + public: + typedef CheckerBoardedSparseMatrixBase Matrix; + + ///////////////////////////////////////////////////// + // Wrap the usual normal equations Schur trick + ///////////////////////////////////////////////////// + SchurRedBlackDiagOneSolve(OperatorFunction &HermitianRBSolver, const bool initSubGuess = false, + const bool _solnAsInitGuess = false) + : SchurRedBlackBase(HermitianRBSolver,initSubGuess,_solnAsInitGuess) {}; + + virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o) + { + GridBase *grid = _Matrix.RedBlackGrid(); + GridBase *fgrid= _Matrix.Grid(); + + SchurDiagOneOperator _HermOpEO(_Matrix); + + Field tmp(grid); + Field Mtmp(grid); + + pickCheckerboard(Even,src_e,src); + pickCheckerboard(Odd ,src_o,src); + + ///////////////////////////////////////////////////// + // src_o = Mpcdag *MooeeInv * (source_o - Moe MeeInv source_e) + ///////////////////////////////////////////////////// + _Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even); + _Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd); + Mtmp=src_o-Mtmp; + _Matrix.MooeeInv(Mtmp,tmp); assert( tmp.Checkerboard() ==Odd); + + // get the right MpcDag + _HermOpEO.MpcDag(tmp,src_o); assert(src_o.Checkerboard() ==Odd); + } + + virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol) + { + GridBase *grid = _Matrix.RedBlackGrid(); + GridBase *fgrid= _Matrix.Grid(); + + Field tmp(grid); + Field sol_e(grid); + + + /////////////////////////////////////////////////// + // sol_e = M_ee^-1 * ( src_e - Meo sol_o )... + /////////////////////////////////////////////////// + _Matrix.Meooe(sol_o,tmp); assert( tmp.Checkerboard() ==Even); + tmp = src_e-tmp; assert( src_e.Checkerboard() ==Even); + _Matrix.MooeeInv(tmp,sol_e); assert( sol_e.Checkerboard() ==Even); + + setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even); + setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd ); + }; + + virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o) + { + SchurDiagOneOperator _HermOpEO(_Matrix); + this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); + }; + virtual void RedBlackSolve (Matrix & _Matrix,const std::vector &src_o, std::vector &sol_o) + { + SchurDiagOneOperator _HermOpEO(_Matrix); + this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); + } + }; + /////////////////////////////////////////////////////////////////////////////////////////////////////// // Site diagonal is identity, right preconditioned by Mee^inv // ( 1 - Meo Moo^inv Moe Mee^inv ) phi =( 1 - Meo Moo^inv Moe Mee^inv ) Mee psi = = eta = eta From 7c246606c19957ff9937a6280fd1224bbfca0118 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 10 Jul 2024 22:04:32 +0000 Subject: [PATCH 09/22] Schur additional case --- tests/debug/Test_cayley_cg.cc | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/tests/debug/Test_cayley_cg.cc b/tests/debug/Test_cayley_cg.cc index 74492fd9..068c260f 100644 --- a/tests/debug/Test_cayley_cg.cc +++ b/tests/debug/Test_cayley_cg.cc @@ -392,9 +392,27 @@ void TestCGschur(What & Ddwf, GridParallelRNG *RNG5) { LatticeFermion src (FGrid); random(*RNG5,src); - LatticeFermion result(FGrid); result=Zero(); + LatticeFermion result1(FGrid); result1=Zero(); + LatticeFermion result2(FGrid); result2=Zero(); + LatticeFermion result3(FGrid); result3=Zero(); ConjugateGradient CG(1.0e-8,10000); SchurRedBlackDiagMooeeSolve SchurSolver(CG); - SchurSolver(Ddwf,src,result); + SchurSolver(Ddwf,src,result1); + + SchurRedBlackDiagOneSolve SchurSolverSymm1(CG); + SchurSolverSymm1(Ddwf,src,result2); + + SchurRedBlackDiagTwoSolve SchurSolverSymm2(CG); + SchurSolverSymm2(Ddwf,src,result3); + + std::cout << GridLogMessage << " Standard " < Date: Thu, 11 Jul 2024 15:19:19 +0000 Subject: [PATCH 10/22] Namespace addition --- Grid/Namespace.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Grid/Namespace.h b/Grid/Namespace.h index 0d93ee9f..c42b46b3 100644 --- a/Grid/Namespace.h +++ b/Grid/Namespace.h @@ -40,3 +40,4 @@ directory #define EXCEPTION_CHECK_BEGIN(A) try { #define EXCEPTION_CHECK_END(A) } catch ( std::exception e ) { BACKTRACEFP(stderr); std::cerr << __PRETTY_FUNCTION__ << " : " <<__LINE__<< " Caught exception "< Date: Thu, 11 Jul 2024 15:19:49 +0000 Subject: [PATCH 11/22] Batched blas, but not working yet on OneAPI --- Grid/algorithms/blas/BatchedBlas.h | 120 +++++++++++++++++++++++------ 1 file changed, 97 insertions(+), 23 deletions(-) diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h index a7edb485..22353d49 100644 --- a/Grid/algorithms/blas/BatchedBlas.h +++ b/Grid/algorithms/blas/BatchedBlas.h @@ -89,9 +89,10 @@ public: gridblasHandle = theGridAccelerator; #endif #ifdef GRID_ONE_MKL - cl::sycl::cpu_selector selector; + cl::sycl::gpu_selector selector; cl::sycl::device selectedDevice { selector }; - gridblasHandle =new sycl::queue (selectedDevice); + cl::sycl::property_list q_prop{cl::sycl::property::queue::in_order()}; + gridblasHandle =new sycl::queue (selectedDevice,q_prop); #endif gridblasInit=1; } @@ -266,8 +267,46 @@ public: assert(err==CUBLAS_STATUS_SUCCESS); #endif #ifdef GRID_SYCL - //MKL’s cblas_gemm_batch & OneAPI -#warning "oneMKL implementation not built " + std::cerr << " Calling SYCL batched ZGEMM "<()); + synchronise(); + std::cerr << " Called SYCL batched ZGEMM "< A(m*k); // pointer list to matrices + std::vector B(k*n); + std::vector C(m*n); + int sda = lda*k; + int sdb = ldb*k; + int sdc = ldc*n; + for (int p = 0; p < 1; ++p) { + acceleratorCopyFromDevice((void *)&Amk[p][0],(void *)&A[0],m*k*sizeof(ComplexD)); + acceleratorCopyFromDevice((void *)&Bkn[p][0],(void *)&B[0],k*n*sizeof(ComplexD)); + acceleratorCopyFromDevice((void *)&Cmn[p][0],(void *)&C[0],m*n*sizeof(ComplexD)); + for (int mm = 0; mm < m; ++mm) { + for (int nn = 0; nn < n; ++nn) { + ComplexD c_mn(0.0); + for (int kk = 0; kk < k; ++kk) + c_mn += A[mm + kk*lda ] * B[kk + nn*ldb]; + std::cout << " beta "<gemm_batch & OneAPI -#warning "oneMKL implementation not built " + int64_t m64=m; + int64_t n64=n; + int64_t k64=k; + int64_t lda64=lda; + int64_t ldb64=ldb; + int64_t ldc64=ldc; + int64_t batchCount64=batchCount; + oneapi::mkl::transpose notransp =oneapi::mkl::transpose::N; + oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle, + ¬ransp, + ¬ransp, + &m64,&n64,&k64, + (ComplexF *) &alpha_p[0], + (const ComplexF **)&Amk[0], (const int64_t *)&lda64, + (const ComplexF **)&Bkn[0], (const int64_t *)&ldb64, + (ComplexF *) &beta_p[0], + (ComplexF **)&Cmn[0], (const int64_t *)&ldc64, + (int64_t)1,&batchCount64,std::vector()); + synchronise(); #endif #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) int sda = lda*k; @@ -467,8 +522,25 @@ public: assert(err==CUBLAS_STATUS_SUCCESS); #endif #ifdef GRID_SYCL - //MKL’s cblas_gemm_batch & OneAPI -#warning "oneMKL implementation not built " + int64_t m64=m; + int64_t n64=n; + int64_t k64=k; + int64_t lda64=lda; + int64_t ldb64=ldb; + int64_t ldc64=ldc; + int64_t batchCount64=batchCount; + oneapi::mkl::transpose notransp =oneapi::mkl::transpose::N; + oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle, + ¬ransp, + ¬ransp, + &m64,&n64,&k64, + (float *) &alpha_p[0], + (const float **)&Amk[0], (const int64_t *)&lda64, + (const float **)&Bkn[0], (const int64_t *)&ldb64, + (float *) &beta_p[0], + (float **)&Cmn[0], (const int64_t *)&ldc64, + (int64_t)1,&batchCount64,std::vector()); + synchronise(); #endif #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) int sda = lda*k; @@ -568,24 +640,25 @@ public: assert(err==CUBLAS_STATUS_SUCCESS); #endif #ifdef GRID_SYCL - /* int64_t m64=m; int64_t n64=n; int64_t k64=k; + int64_t lda64=lda; + int64_t ldb64=ldb; + int64_t ldc64=ldc; int64_t batchCount64=batchCount; - oneapi::mkl::blas::column_major::gemm_batch(*theGridAccelerator, - onemkl::transpose::N, - onemkl::transpose::N, - &m64,&n64,&k64, - (double *) &alpha_p[0], - (double **)&Amk[0], lda, - (double **)&Bkn[0], ldb, - (double *) &beta_p[0], - (double **)&Cmn[0], ldc, - 1,&batchCount64); - */ - //MKL’s cblas_gemm_batch & OneAPI -#warning "oneMKL implementation not built " + oneapi::mkl::transpose notransp =oneapi::mkl::transpose::N; + oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle, + ¬ransp, + ¬ransp, + &m64,&n64,&k64, + (double *) &alpha_p[0], + (const double **)&Amk[0], (const int64_t *)&lda64, + (const double **)&Bkn[0], (const int64_t *)&ldb64, + (double *) &beta_p[0], + (double **)&Cmn[0], (const int64_t *)&ldc64, + (int64_t)1,&batchCount64,std::vector()); + synchronise(); #endif #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) int sda = lda*k; @@ -673,6 +746,7 @@ public: beta, (ComplexD *)Cmn,ldc,sdc, batchCount); + synchronise(); #endif #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL) // Need a default/reference implementation From 4502a8c8a185cec95ff0ff61c6d9122fb915ee34 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 11 Jul 2024 15:22:18 +0000 Subject: [PATCH 12/22] libc malloc heap info dump on Linux --- Grid/allocator/MemoryManager.cc | 37 ++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/Grid/allocator/MemoryManager.cc b/Grid/allocator/MemoryManager.cc index c71f3512..e56af238 100644 --- a/Grid/allocator/MemoryManager.cc +++ b/Grid/allocator/MemoryManager.cc @@ -17,6 +17,21 @@ uint64_t total_shared; uint64_t total_device; uint64_t total_host;; +#if defined(__has_feature) +#if __has_feature(leak_sanitizer) +#define ASAN_LEAK_CHECK +#endif +#endif + +#ifdef ASAN_LEAK_CHECK +#include +#include +#include +#define LEAK_CHECK(A) { __lsan_do_recoverable_leak_check(); } +#else +#define LEAK_CHECK(A) { } +#endif + void MemoryManager::DisplayMallinfo(void) { #ifdef __linux__ @@ -24,17 +39,19 @@ void MemoryManager::DisplayMallinfo(void) mi = mallinfo(); - printf("Total non-mmapped bytes (arena): %d\n", mi.arena); - printf("# of free chunks (ordblks): %d\n", mi.ordblks); - printf("# of free fastbin blocks (smblks): %d\n", mi.smblks); - printf("# of mapped regions (hblks): %d\n", mi.hblks); - printf("Bytes in mapped regions (hblkhd): %d\n", mi.hblkhd); - printf("Max. total allocated space (usmblks): %d\n", mi.usmblks); - printf("Free bytes held in fastbins (fsmblks): %d\n", mi.fsmblks); - printf("Total allocated space (uordblks): %d\n", mi.uordblks); - printf("Total free space (fordblks): %d\n", mi.fordblks); - printf("Topmost releasable block (keepcost): %d\n", mi.keepcost); + std::cout << "MemoryManager: Total non-mmapped bytes (arena): "<< (size_t)mi.arena< Date: Thu, 11 Jul 2024 15:24:11 +0000 Subject: [PATCH 13/22] Device vector not UVM --- Grid/lattice/Lattice_reduction.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index 7b66c31d..6ce70232 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -373,7 +373,8 @@ axpby_norm_fast(Lattice &z,sobj a,sobj b,const Lattice &x,const Latt nrm = real(TensorRemove(sum(inner_tmp_v,sites))); #else typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t; - Vector inner_tmp(sites); + deviceVector inner_tmp; + inner_tmp.resize(sites); auto inner_tmp_v = &inner_tmp[0]; accelerator_for( ss, sites, nsimd,{ From cf3584ad15dab0a7b949a1f463ce6b80db6580c6 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 11 Jul 2024 15:30:32 +0000 Subject: [PATCH 14/22] Convenient to monitor memory across an HMC trajectory --- Grid/qcd/hmc/integrators/Integrator.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Grid/qcd/hmc/integrators/Integrator.h b/Grid/qcd/hmc/integrators/Integrator.h index 0276b6fd..549920a0 100644 --- a/Grid/qcd/hmc/integrators/Integrator.h +++ b/Grid/qcd/hmc/integrators/Integrator.h @@ -137,9 +137,11 @@ public: double start_force = usecond(); + MemoryManager::Print(); as[level].actions.at(a)->deriv_timer_start(); as[level].actions.at(a)->deriv(Smearer, force); // deriv should NOT include Ta as[level].actions.at(a)->deriv_timer_stop(); + MemoryManager::Print(); auto name = as[level].actions.at(a)->action_name(); @@ -246,7 +248,11 @@ public: } }; - virtual ~Integrator() {} + virtual ~Integrator() + { + // Pain in the ass to clean up the Level pointers + // Guido's design is at fault as per comment above in constructor + } virtual std::string integrator_name() = 0; @@ -460,6 +466,7 @@ public: for (int level = 0; level < as.size(); ++level) { for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) { + MemoryManager::Print(); // get gauge field from the SmearingPolicy and // based on the boolean is_smeared in actionID std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl; @@ -468,6 +475,7 @@ public: as[level].actions.at(actionID)->S_timer_stop(); std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl; H += Hterm; + MemoryManager::Print(); } as[level].apply(S_hireps, Representations, level, H); From b3ee8ded969351f2d9a36d9589c5b3dd458eed23 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 11 Jul 2024 15:34:48 +0000 Subject: [PATCH 15/22] Respect command line --- HMC/Mobius2p1f.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/HMC/Mobius2p1f.cc b/HMC/Mobius2p1f.cc index 4ab1f20f..8042d6e6 100644 --- a/HMC/Mobius2p1f.cc +++ b/HMC/Mobius2p1f.cc @@ -58,7 +58,7 @@ int main(int argc, char **argv) { HMCparameters HMCparams; HMCparams.StartTrajectory = 0; HMCparams.Trajectories = 200; - HMCparams.NoMetropolisUntil= 20; + HMCparams.NoMetropolisUntil= 0; // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; HMCparams.StartingType =std::string("ColdStart"); HMCparams.MD = MD; @@ -70,7 +70,7 @@ int main(int argc, char **argv) { CheckpointerParameters CPparams; CPparams.config_prefix = "ckpoint_EODWF_lat"; CPparams.rng_prefix = "ckpoint_EODWF_rng"; - CPparams.saveInterval = 10; + CPparams.saveInterval = 1; CPparams.format = "IEEE64BIG"; TheHMC.Resources.LoadNerscCheckpointer(CPparams); @@ -186,6 +186,8 @@ int main(int argc, char **argv) { ///////////////////////////////////////////////////////////// // HMC parameters are serialisable + TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file + TheHMC.initializeGaugeFieldAndRNGs(U); std::cout << GridLogMessage << " Running the HMC "<< std::endl; TheHMC.Run(); // no smearing From 059e8e5bb0b4334ba3e0e1153a5962f78ed50207 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 11 Jul 2024 15:37:30 +0000 Subject: [PATCH 16/22] New compile option --- systems/Aurora/sourceme.sh | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/systems/Aurora/sourceme.sh b/systems/Aurora/sourceme.sh index b43b3b71..d577ae96 100644 --- a/systems/Aurora/sourceme.sh +++ b/systems/Aurora/sourceme.sh @@ -1,28 +1,23 @@ +source ~/spack/share/spack/setup-env.sh +spack load c-lime + +export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' ` +#export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH + +export INTELGT_AUTO_ATTACH_DISABLE=1 + #export ONEAPI_DEVICE_SELECTOR=level_zero:0.0 -module load oneapi/release/2023.12.15.001 - -#module use /soft/modulefiles -#module load intel_compute_runtime/release/agama-devel-682.22 - -export FI_CXI_DEFAULT_CQ_SIZE=131072 -export FI_CXI_CQ_FILL_PERCENT=20 - -export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" -#export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode" - -# # -ftarget-register-alloc-mode=pvc:default # -ftarget-register-alloc-mode=pvc:small # -ftarget-register-alloc-mode=pvc:large # -ftarget-register-alloc-mode=pvc:auto -# +#export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 export HTTP_PROXY=http://proxy.alcf.anl.gov:3128 export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128 export http_proxy=http://proxy.alcf.anl.gov:3128 export https_proxy=http://proxy.alcf.anl.gov:3128 -#export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 git config --global http.proxy http://proxy.alcf.anl.gov:3128 export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" From 804d9367d40b93d7d68e6cca7348d1b4d96c3ac1 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 22 Jul 2024 15:23:25 -0400 Subject: [PATCH 17/22] Regressed performance --- tests/debug/Test_general_coarse_hdcg_phys48_mixed.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/debug/Test_general_coarse_hdcg_phys48_mixed.cc b/tests/debug/Test_general_coarse_hdcg_phys48_mixed.cc index c464b25b..054b4c66 100644 --- a/tests/debug/Test_general_coarse_hdcg_phys48_mixed.cc +++ b/tests/debug/Test_general_coarse_hdcg_phys48_mixed.cc @@ -145,7 +145,7 @@ int main (int argc, char ** argv) Grid_init(&argc,&argv); const int Ls=24; - const int nbasis = 60; + const int nbasis = 62; const int cb = 0 ; RealD mass=0.00078; RealD M5=1.8; @@ -160,7 +160,7 @@ int main (int argc, char ** argv) GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); // Construct a coarsened grid with 4^4 cell - Coordinate Block({4,4,4,4}); + Coordinate Block({4,4,6,4}); Coordinate clatt = GridDefaultLatt(); for(int d=0;d Date: Mon, 22 Jul 2024 15:24:04 -0400 Subject: [PATCH 18/22] Regressed performance for paper --- .../Test_general_coarse_hdcg_phys96_mixed.cc | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/tests/debug/Test_general_coarse_hdcg_phys96_mixed.cc b/tests/debug/Test_general_coarse_hdcg_phys96_mixed.cc index cbf96992..c45b2cb1 100644 --- a/tests/debug/Test_general_coarse_hdcg_phys96_mixed.cc +++ b/tests/debug/Test_general_coarse_hdcg_phys96_mixed.cc @@ -278,7 +278,13 @@ int main (int argc, char ** argv) typedef HermitianLinearOperator MrhsHermMatrix; // Chebyshev IRLCheby(0.0012,42.0,301); // 4.4.6.4 - Chebyshev IRLCheby(0.0010,42.0,501); // for 4.4.4.4 blocking + // Chebyshev IRLCheby(0.0012,42.0,501); // for 4.4.4.4 blocking 350 evs + // Chebyshev IRLCheby(0.0014,42.0,501); // for 4.4.4.4 blocking 700 evs + // Chebyshev IRLCheby(0.002,42.0,501); // for 4.4.4.4 blocking 1226 evs + // Chebyshev IRLCheby(0.0025,42.0,501); // for 4.4.4.4 blocking 1059 evs + // 3e-4,2); + Chebyshev IRLCheby(0.0018,42.0,301); // for 4.4.4.4 blocking // 790 evs + MrhsHermMatrix MrhsCoarseOp (mrhs); CoarseVector pm_src(CoarseMrhs); @@ -287,9 +293,9 @@ int main (int argc, char ** argv) // int Nk=nrhs*30; // 4.4.6.4 // int Nk=nrhs*80; - int Nk=nrhs*60; - int Nm=Nk*4; - int Nstop=350; + int Nk=nrhs*60; // 720 + int Nm=Nk*4; // 2880 ; generally finishes at 1440 + int Nstop=512; int Nconv_test_interval=1; ImplicitlyRestartedBlockLanczosCoarse IRL(MrhsCoarseOp, @@ -334,7 +340,7 @@ int main (int argc, char ** argv) // Extra HDCG parameters ////////////////////////// int maxit=3000; - ConjugateGradient CG(5.0e-2,maxit,false); + ConjugateGradient CG(7.5e-2,maxit,false); RealD lo=2.0; int ord = 7; @@ -371,7 +377,7 @@ int main (int argc, char ** argv) HDCGmrhs(src_mrhs,res_mrhs); // Standard CG -#if 1 +#if 0 { std::cout << "**************************************"< Date: Mon, 22 Jul 2024 15:24:56 -0400 Subject: [PATCH 19/22] Force compile temporarily --- tests/debug/Test_general_coarse.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/debug/Test_general_coarse.cc b/tests/debug/Test_general_coarse.cc index 28130a3c..4351a901 100644 --- a/tests/debug/Test_general_coarse.cc +++ b/tests/debug/Test_general_coarse.cc @@ -244,7 +244,7 @@ int main (int argc, char ** argv) GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi); - +#if 0 MultiGeneralCoarsenedMatrix mrhs(LittleDiracOp,CoarseMrhs); typedef decltype(mrhs) MultiGeneralCoarsenedMatrix_t; @@ -307,7 +307,8 @@ int main (int argc, char ** argv) rh_res= Zero(); mrhsCG(MrhsCoarseOp,rh_src,rh_res); } - + +#endif std::cout< Date: Mon, 22 Jul 2024 15:25:17 -0400 Subject: [PATCH 20/22] 8^4 test for PETSc --- tests/debug/Test_8888.cc | 118 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 tests/debug/Test_8888.cc diff --git a/tests/debug/Test_8888.cc b/tests/debug/Test_8888.cc new file mode 100644 index 00000000..279bc8bd --- /dev/null +++ b/tests/debug/Test_8888.cc @@ -0,0 +1,118 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_general_coarse_hdcg.cc + + Copyright (C) 2023 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include +#include +#include + +using namespace std; +using namespace Grid; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + const int Ls=8; + const int nbasis = 40; + const int cb = 0 ; + RealD mass=0.01; + RealD M5=1.8; + RealD b=1.0; + RealD c=0.0; + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), + GridDefaultSimd(Nd,vComplex::Nsimd()), + GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + ///////////////////////// RNGs ///////////////////////////////// + std::vector seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + std::vector cseeds({5,6,7,8}); + + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + + ///////////////////////// Configuration ///////////////////////////////// + LatticeGaugeField Umu(UGrid); + + FieldMetaData header; + std::string file("ckpoint_EODWF_lat.125"); + NerscIO::readConfiguration(Umu,header,file); + + //////////////////////// Fermion action ////////////////////////////////// + MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c); + + MdagMLinearOperator HermOp(Ddwf); + + + std::cout << "**************************************"< fPM; + fPM(HermOp,pm_src); + + + std::cout << "**************************************"< IRLChebyLo(0.2,64.0,201); // 1 iter + Chebyshev IRLChebyLo(0.0,55.0,101); // 1 iter + FunctionHermOp PolyOp(IRLChebyLo,HermOp); + PlainHermOp Op(HermOp); + + ImplicitlyRestartedLanczos IRL(PolyOp, + Op, + Nk, // sought vecs + Nk, // sought vecs + Nm, // spare vecs + 1.0e-8, + 10 // Max iterations + ); + + int Nconv; + std::vector eval(Nm); + std::vector evec(Nm,FGrid); + LatticeFermionD irl_src(FGrid); + + IRL.calc(eval,evec,irl_src,Nconv); + + Grid_finalize(); + return 0; +} From c9d5674d5be268276ed58bddcd4534216a5f1042 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 22 Jul 2024 15:26:45 -0400 Subject: [PATCH 21/22] FInal for paper --- .../ImplicitlyRestartedBlockLanczosCoarse.h | 58 +++++++++++-------- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h b/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h index 8168aa9d..66d2812d 100644 --- a/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h +++ b/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h @@ -279,11 +279,11 @@ public: Qt = Eigen::MatrixXcd::Identity(Nm,Nm); diagonalize(eval2,lmd2,lme2,Nu,Nm,Nm,Qt,grid); _sort.push(eval2,Nm); - // Glog << "#Ritz value before shift: "<< std::endl; + Glog << "#Ritz value before shift: "<< std::endl; for(int i=0; i Btmp(Nstop,grid); // waste of space replicating @@ -642,7 +644,7 @@ private: // for (int u=0; u0) { for (int u=0; u Date: Tue, 23 Jul 2024 09:53:08 -0400 Subject: [PATCH 22/22] New Frontier config --- systems/Frontier/sourceme.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/systems/Frontier/sourceme.sh b/systems/Frontier/sourceme.sh index 652e5b45..a6f49d8c 100644 --- a/systems/Frontier/sourceme.sh +++ b/systems/Frontier/sourceme.sh @@ -3,7 +3,7 @@ spack load c-lime module load emacs module load PrgEnv-gnu module load rocm -module load cray-mpich/8.1.23 +module load cray-mpich module load gmp module load cray-fftw module load craype-accel-amd-gfx90a