diff --git a/Grid/Namespace.h b/Grid/Namespace.h index 29b229fa..c42b46b3 100644 --- a/Grid/Namespace.h +++ b/Grid/Namespace.h @@ -30,9 +30,14 @@ directory #include #include +#include #define NAMESPACE_BEGIN(A) namespace A { #define NAMESPACE_END(A) } #define GRID_NAMESPACE_BEGIN NAMESPACE_BEGIN(Grid) #define GRID_NAMESPACE_END NAMESPACE_END(Grid) #define NAMESPACE_CHECK(x) struct namespaceTEST##x {}; static_assert(std::is_same::value,"Not in :: at" ); + +#define EXCEPTION_CHECK_BEGIN(A) try { +#define EXCEPTION_CHECK_END(A) } catch ( std::exception e ) { BACKTRACEFP(stderr); std::cerr << __PRETTY_FUNCTION__ << " : " <<__LINE__<< " Caught exception "<gemm_batch & OneAPI -#warning "oneMKL implementation not built " + std::cerr << " Calling SYCL batched ZGEMM "<()); + synchronise(); + std::cerr << " Called SYCL batched ZGEMM "< A(m*k); // pointer list to matrices + std::vector B(k*n); + std::vector C(m*n); + int sda = lda*k; + int sdb = ldb*k; + int sdc = ldc*n; + for (int p = 0; p < 1; ++p) { + acceleratorCopyFromDevice((void *)&Amk[p][0],(void *)&A[0],m*k*sizeof(ComplexD)); + acceleratorCopyFromDevice((void *)&Bkn[p][0],(void *)&B[0],k*n*sizeof(ComplexD)); + acceleratorCopyFromDevice((void *)&Cmn[p][0],(void *)&C[0],m*n*sizeof(ComplexD)); + for (int mm = 0; mm < m; ++mm) { + for (int nn = 0; nn < n; ++nn) { + ComplexD c_mn(0.0); + for (int kk = 0; kk < k; ++kk) + c_mn += A[mm + kk*lda ] * B[kk + nn*ldb]; + std::cout << " beta "<gemm_batch & OneAPI -#warning "oneMKL implementation not built " + int64_t m64=m; + int64_t n64=n; + int64_t k64=k; + int64_t lda64=lda; + int64_t ldb64=ldb; + int64_t ldc64=ldc; + int64_t batchCount64=batchCount; + oneapi::mkl::transpose notransp =oneapi::mkl::transpose::N; + oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle, + ¬ransp, + ¬ransp, + &m64,&n64,&k64, + (ComplexF *) &alpha_p[0], + (const ComplexF **)&Amk[0], (const int64_t *)&lda64, + (const ComplexF **)&Bkn[0], (const int64_t *)&ldb64, + (ComplexF *) &beta_p[0], + (ComplexF **)&Cmn[0], (const int64_t *)&ldc64, + (int64_t)1,&batchCount64,std::vector()); + synchronise(); #endif #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) int sda = lda*k; @@ -467,8 +522,25 @@ public: assert(err==CUBLAS_STATUS_SUCCESS); #endif #ifdef GRID_SYCL - //MKL’s cblas_gemm_batch & OneAPI -#warning "oneMKL implementation not built " + int64_t m64=m; + int64_t n64=n; + int64_t k64=k; + int64_t lda64=lda; + int64_t ldb64=ldb; + int64_t ldc64=ldc; + int64_t batchCount64=batchCount; + oneapi::mkl::transpose notransp =oneapi::mkl::transpose::N; + oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle, + ¬ransp, + ¬ransp, + &m64,&n64,&k64, + (float *) &alpha_p[0], + (const float **)&Amk[0], (const int64_t *)&lda64, + (const float **)&Bkn[0], (const int64_t *)&ldb64, + (float *) &beta_p[0], + (float **)&Cmn[0], (const int64_t *)&ldc64, + (int64_t)1,&batchCount64,std::vector()); + synchronise(); #endif #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) int sda = lda*k; @@ -568,24 +640,25 @@ public: assert(err==CUBLAS_STATUS_SUCCESS); #endif #ifdef GRID_SYCL - /* int64_t m64=m; int64_t n64=n; int64_t k64=k; + int64_t lda64=lda; + int64_t ldb64=ldb; + int64_t ldc64=ldc; int64_t batchCount64=batchCount; - oneapi::mkl::blas::column_major::gemm_batch(*theGridAccelerator, - onemkl::transpose::N, - onemkl::transpose::N, - &m64,&n64,&k64, - (double *) &alpha_p[0], - (double **)&Amk[0], lda, - (double **)&Bkn[0], ldb, - (double *) &beta_p[0], - (double **)&Cmn[0], ldc, - 1,&batchCount64); - */ - //MKL’s cblas_gemm_batch & OneAPI -#warning "oneMKL implementation not built " + oneapi::mkl::transpose notransp =oneapi::mkl::transpose::N; + oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle, + ¬ransp, + ¬ransp, + &m64,&n64,&k64, + (double *) &alpha_p[0], + (const double **)&Amk[0], (const int64_t *)&lda64, + (const double **)&Bkn[0], (const int64_t *)&ldb64, + (double *) &beta_p[0], + (double **)&Cmn[0], (const int64_t *)&ldc64, + (int64_t)1,&batchCount64,std::vector()); + synchronise(); #endif #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) int sda = lda*k; @@ -673,6 +746,7 @@ public: beta, (ComplexD *)Cmn,ldc,sdc, batchCount); + synchronise(); #endif #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL) // Need a default/reference implementation diff --git a/Grid/algorithms/iterative/SchurRedBlack.h b/Grid/algorithms/iterative/SchurRedBlack.h index d97e4993..494aa77b 100644 --- a/Grid/algorithms/iterative/SchurRedBlack.h +++ b/Grid/algorithms/iterative/SchurRedBlack.h @@ -499,6 +499,87 @@ namespace Grid { } }; + /////////////////////////////////////////////////////////////////////////////////////////////////////// + // Site diagonal is identity, left preconditioned by Mee^inv + // ( 1 - Mee^inv Meo Moo^inv Moe ) phi = Mee_inv ( Mee - Meo Moo^inv Moe Mee^inv ) phi = Mee_inv eta + // + // Solve: + // ( 1 - Mee^inv Meo Moo^inv Moe )^dag ( 1 - Mee^inv Meo Moo^inv Moe ) phi = ( 1 - Mee^inv Meo Moo^inv Moe )^dag Mee_inv eta + // + // Old notation e<->o + // + // Left precon by Moo^-1 + // b) (Doo^{dag} M_oo^-dag) (Moo^-1 Doo) psi_o = [ (D_oo)^dag M_oo^-dag ] Moo^-1 L^{-1} eta_o + // eta_o' = (D_oo)^dag M_oo^-dag Moo^-1 (eta_o - Moe Mee^{-1} eta_e) + /////////////////////////////////////////////////////////////////////////////////////////////////////// + template class SchurRedBlackDiagOneSolve : public SchurRedBlackBase { + public: + typedef CheckerBoardedSparseMatrixBase Matrix; + + ///////////////////////////////////////////////////// + // Wrap the usual normal equations Schur trick + ///////////////////////////////////////////////////// + SchurRedBlackDiagOneSolve(OperatorFunction &HermitianRBSolver, const bool initSubGuess = false, + const bool _solnAsInitGuess = false) + : SchurRedBlackBase(HermitianRBSolver,initSubGuess,_solnAsInitGuess) {}; + + virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o) + { + GridBase *grid = _Matrix.RedBlackGrid(); + GridBase *fgrid= _Matrix.Grid(); + + SchurDiagOneOperator _HermOpEO(_Matrix); + + Field tmp(grid); + Field Mtmp(grid); + + pickCheckerboard(Even,src_e,src); + pickCheckerboard(Odd ,src_o,src); + + ///////////////////////////////////////////////////// + // src_o = Mpcdag *MooeeInv * (source_o - Moe MeeInv source_e) + ///////////////////////////////////////////////////// + _Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even); + _Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd); + Mtmp=src_o-Mtmp; + _Matrix.MooeeInv(Mtmp,tmp); assert( tmp.Checkerboard() ==Odd); + + // get the right MpcDag + _HermOpEO.MpcDag(tmp,src_o); assert(src_o.Checkerboard() ==Odd); + } + + virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol) + { + GridBase *grid = _Matrix.RedBlackGrid(); + GridBase *fgrid= _Matrix.Grid(); + + Field tmp(grid); + Field sol_e(grid); + + + /////////////////////////////////////////////////// + // sol_e = M_ee^-1 * ( src_e - Meo sol_o )... + /////////////////////////////////////////////////// + _Matrix.Meooe(sol_o,tmp); assert( tmp.Checkerboard() ==Even); + tmp = src_e-tmp; assert( src_e.Checkerboard() ==Even); + _Matrix.MooeeInv(tmp,sol_e); assert( sol_e.Checkerboard() ==Even); + + setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even); + setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd ); + }; + + virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o) + { + SchurDiagOneOperator _HermOpEO(_Matrix); + this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); + }; + virtual void RedBlackSolve (Matrix & _Matrix,const std::vector &src_o, std::vector &sol_o) + { + SchurDiagOneOperator _HermOpEO(_Matrix); + this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); + } + }; + /////////////////////////////////////////////////////////////////////////////////////////////////////// // Site diagonal is identity, right preconditioned by Mee^inv // ( 1 - Meo Moo^inv Moe Mee^inv ) phi =( 1 - Meo Moo^inv Moe Mee^inv ) Mee psi = = eta = eta diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h index 8a27f527..293ce2fb 100644 --- a/Grid/allocator/AlignedAllocator.h +++ b/Grid/allocator/AlignedAllocator.h @@ -54,6 +54,9 @@ public: size_type bytes = __n*sizeof(_Tp); profilerAllocate(bytes); _Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes); + if ( (_Tp*)ptr == (_Tp *) NULL ) { + printf("Grid CPU Allocator got NULL for %lu bytes\n",(unsigned long) bytes ); + } assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); return ptr; } @@ -100,6 +103,9 @@ public: size_type bytes = __n*sizeof(_Tp); profilerAllocate(bytes); _Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes); + if ( (_Tp*)ptr == (_Tp *) NULL ) { + printf("Grid Shared Allocator got NULL for %lu bytes\n",(unsigned long) bytes ); + } assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); return ptr; } @@ -145,6 +151,9 @@ public: size_type bytes = __n*sizeof(_Tp); profilerAllocate(bytes); _Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes); + if ( (_Tp*)ptr == (_Tp *) NULL ) { + printf("Grid Device Allocator got NULL for %lu bytes\n",(unsigned long) bytes ); + } assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); return ptr; } diff --git a/Grid/allocator/MemoryManager.cc b/Grid/allocator/MemoryManager.cc index a9e5c9b4..e56af238 100644 --- a/Grid/allocator/MemoryManager.cc +++ b/Grid/allocator/MemoryManager.cc @@ -16,6 +16,44 @@ NAMESPACE_BEGIN(Grid); uint64_t total_shared; uint64_t total_device; uint64_t total_host;; + +#if defined(__has_feature) +#if __has_feature(leak_sanitizer) +#define ASAN_LEAK_CHECK +#endif +#endif + +#ifdef ASAN_LEAK_CHECK +#include +#include +#include +#define LEAK_CHECK(A) { __lsan_do_recoverable_leak_check(); } +#else +#define LEAK_CHECK(A) { } +#endif + +void MemoryManager::DisplayMallinfo(void) +{ +#ifdef __linux__ + struct mallinfo mi; + + mi = mallinfo(); + + std::cout << "MemoryManager: Total non-mmapped bytes (arena): "<< (size_t)mi.arena< &z,sobj a,sobj b,const Lattice &x,const Latt nrm = real(TensorRemove(sum(inner_tmp_v,sites))); #else typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t; - Vector inner_tmp(sites); + deviceVector inner_tmp; + inner_tmp.resize(sites); auto inner_tmp_v = &inner_tmp[0]; accelerator_for( ss, sites, nsimd,{ diff --git a/Grid/qcd/hmc/GenericHMCrunner.h b/Grid/qcd/hmc/GenericHMCrunner.h index 1429d848..b53755aa 100644 --- a/Grid/qcd/hmc/GenericHMCrunner.h +++ b/Grid/qcd/hmc/GenericHMCrunner.h @@ -90,6 +90,7 @@ public: exit(1); } Parameters.StartingType = arg; + std::cout < ivec(0); GridCmdOptionIntVector(arg, ivec); Parameters.StartTrajectory = ivec[0]; + std::cout < ivec(0); GridCmdOptionIntVector(arg, ivec); Parameters.Trajectories = ivec[0]; + std::cout << GridLogMessage<<" GenericHMCrunner Command Line --Trajectories "< ivec(0); GridCmdOptionIntVector(arg, ivec); Parameters.NoMetropolisUntil = ivec[0]; + std::cout << GridLogMessage<<" GenericHMCrunner --Thermalizations "<deriv_timer_start(); as[level].actions.at(a)->deriv(Smearer, force); // deriv should NOT include Ta as[level].actions.at(a)->deriv_timer_stop(); + MemoryManager::Print(); auto name = as[level].actions.at(a)->action_name(); @@ -246,7 +248,11 @@ public: } }; - virtual ~Integrator() {} + virtual ~Integrator() + { + // Pain in the ass to clean up the Level pointers + // Guido's design is at fault as per comment above in constructor + } virtual std::string integrator_name() = 0; @@ -460,6 +466,7 @@ public: for (int level = 0; level < as.size(); ++level) { for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) { + MemoryManager::Print(); // get gauge field from the SmearingPolicy and // based on the boolean is_smeared in actionID std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl; @@ -468,6 +475,7 @@ public: as[level].actions.at(actionID)->S_timer_stop(); std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl; H += Hterm; + MemoryManager::Print(); } as[level].apply(S_hireps, Representations, level, H); diff --git a/HMC/Mobius2p1f.cc b/HMC/Mobius2p1f.cc index 4ab1f20f..8042d6e6 100644 --- a/HMC/Mobius2p1f.cc +++ b/HMC/Mobius2p1f.cc @@ -58,7 +58,7 @@ int main(int argc, char **argv) { HMCparameters HMCparams; HMCparams.StartTrajectory = 0; HMCparams.Trajectories = 200; - HMCparams.NoMetropolisUntil= 20; + HMCparams.NoMetropolisUntil= 0; // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; HMCparams.StartingType =std::string("ColdStart"); HMCparams.MD = MD; @@ -70,7 +70,7 @@ int main(int argc, char **argv) { CheckpointerParameters CPparams; CPparams.config_prefix = "ckpoint_EODWF_lat"; CPparams.rng_prefix = "ckpoint_EODWF_rng"; - CPparams.saveInterval = 10; + CPparams.saveInterval = 1; CPparams.format = "IEEE64BIG"; TheHMC.Resources.LoadNerscCheckpointer(CPparams); @@ -186,6 +186,8 @@ int main(int argc, char **argv) { ///////////////////////////////////////////////////////////// // HMC parameters are serialisable + TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file + TheHMC.initializeGaugeFieldAndRNGs(U); std::cout << GridLogMessage << " Running the HMC "<< std::endl; TheHMC.Run(); // no smearing diff --git a/systems/Aurora/config-command b/systems/Aurora/config-command index 58eb8a03..f538f319 100644 --- a/systems/Aurora/config-command +++ b/systems/Aurora/config-command @@ -1,16 +1,18 @@ +export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl " +export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel -fsycl -fno-exceptions -fsycl-targets=spir64_gen -Xs -device -Xs pvc " ../../configure \ --enable-simd=GPU \ --enable-gen-simd-width=64 \ --enable-comms=mpi-auto \ + --enable-debug \ --disable-gparity \ --disable-fermion-reps \ + --with-lime=$CLIME \ --enable-shm=nvlink \ --enable-accelerator=sycl \ --enable-accelerator-aware-mpi=yes\ --enable-unified=no \ MPICXX=mpicxx \ - CXX=icpx \ - LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl" \ - CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel" + CXX=icpx diff --git a/systems/Aurora/config-command-sanitize b/systems/Aurora/config-command-sanitize new file mode 100644 index 00000000..d400a103 --- /dev/null +++ b/systems/Aurora/config-command-sanitize @@ -0,0 +1,22 @@ +# -fsycl-targets=spir64_gen -Xs\" -device pvc \" +# -fsycl-targets=intel_gpu_pvc_vg,intel_gpu_pvc +# -fsycl-targets=intel_gpu_pvc + +unset DEVICE +export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -Xarch_host -fsanitize=address" +export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel -fsycl -fno-exceptions -Xarch_host -fsanitize=address -fsycl-targets=spir64_gen -Xs -device -Xs pvc " +../../configure \ + --enable-simd=GPU \ + --enable-gen-simd-width=64 \ + --enable-comms=mpi-auto \ + --enable-debug \ + --disable-gparity \ + --disable-fermion-reps \ + --with-lime=$CLIME \ + --enable-shm=nvlink \ + --enable-accelerator=sycl \ + --enable-accelerator-aware-mpi=yes\ + --enable-unified=no \ + MPICXX=mpicxx \ + CXX=icpx + diff --git a/systems/Aurora/sourceme.sh b/systems/Aurora/sourceme.sh index b43b3b71..d577ae96 100644 --- a/systems/Aurora/sourceme.sh +++ b/systems/Aurora/sourceme.sh @@ -1,28 +1,23 @@ +source ~/spack/share/spack/setup-env.sh +spack load c-lime + +export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' ` +#export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH + +export INTELGT_AUTO_ATTACH_DISABLE=1 + #export ONEAPI_DEVICE_SELECTOR=level_zero:0.0 -module load oneapi/release/2023.12.15.001 - -#module use /soft/modulefiles -#module load intel_compute_runtime/release/agama-devel-682.22 - -export FI_CXI_DEFAULT_CQ_SIZE=131072 -export FI_CXI_CQ_FILL_PERCENT=20 - -export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" -#export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode" - -# # -ftarget-register-alloc-mode=pvc:default # -ftarget-register-alloc-mode=pvc:small # -ftarget-register-alloc-mode=pvc:large # -ftarget-register-alloc-mode=pvc:auto -# +#export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 export HTTP_PROXY=http://proxy.alcf.anl.gov:3128 export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128 export http_proxy=http://proxy.alcf.anl.gov:3128 export https_proxy=http://proxy.alcf.anl.gov:3128 -#export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 git config --global http.proxy http://proxy.alcf.anl.gov:3128 export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" diff --git a/tests/debug/Test_cayley_cg.cc b/tests/debug/Test_cayley_cg.cc index 74492fd9..068c260f 100644 --- a/tests/debug/Test_cayley_cg.cc +++ b/tests/debug/Test_cayley_cg.cc @@ -392,9 +392,27 @@ void TestCGschur(What & Ddwf, GridParallelRNG *RNG5) { LatticeFermion src (FGrid); random(*RNG5,src); - LatticeFermion result(FGrid); result=Zero(); + LatticeFermion result1(FGrid); result1=Zero(); + LatticeFermion result2(FGrid); result2=Zero(); + LatticeFermion result3(FGrid); result3=Zero(); ConjugateGradient CG(1.0e-8,10000); SchurRedBlackDiagMooeeSolve SchurSolver(CG); - SchurSolver(Ddwf,src,result); + SchurSolver(Ddwf,src,result1); + + SchurRedBlackDiagOneSolve SchurSolverSymm1(CG); + SchurSolverSymm1(Ddwf,src,result2); + + SchurRedBlackDiagTwoSolve SchurSolverSymm2(CG); + SchurSolverSymm2(Ddwf,src,result3); + + std::cout << GridLogMessage << " Standard " <