From d4dc5e0f4327f6af88f8172a908e09e366a4f3d4 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 27 Aug 2024 11:08:33 -0400 Subject: [PATCH] BlockCG linalg acceleratoin with BLAS --- .../deflation/MultiRHSBlockCGLinalg.h | 131 +++++++++++++++++- .../deflation/MultiRHSBlockProject.h | 8 +- Grid/algorithms/deflation/MultiRHSDeflation.h | 8 +- 3 files changed, 137 insertions(+), 10 deletions(-) diff --git a/Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h b/Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h index 9db5313d..edb4bbda 100644 --- a/Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h +++ b/Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h @@ -38,16 +38,25 @@ public: typedef typename Field::scalar_type scalar; typedef typename Field::scalar_object scalar_object; + typedef typename Field::vector_object vector_object; deviceVector BLAS_X; // nrhs x vol -- the sources deviceVector BLAS_Y; // nrhs x vol -- the result deviceVector BLAS_C; // nrhs x nrhs -- the coefficients + deviceVector BLAS_Cred; // nrhs x nrhs x oSites -- reduction buffer + deviceVector Xdip; + deviceVector Ydip; + deviceVector Cdip; MultiRHSBlockCGLinalg() {}; ~MultiRHSBlockCGLinalg(){ Deallocate(); }; void Deallocate(void) { + Xdip.resize(0); + Ydip.resize(0); + Cdip.resize(0); + BLAS_Cred.resize(0); BLAS_C.resize(0); BLAS_X.resize(0); BLAS_Y.resize(0); @@ -120,10 +129,10 @@ public: ///////////////////////////////////////// BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N, vw,nrhs,nrhs, - ComplexD(1.0), + scalar(1.0), Xd, Cd, - ComplexD(0.0), // wipe out Y + scalar(0.0), // wipe out Y Yd); BLAS.synchronise(); RealD t3 = usecond(); @@ -144,6 +153,7 @@ public: void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector &X, const std::vector &Y) { +#if 0 int nrhs; GridBase *grid; uint64_t vol; @@ -242,7 +252,124 @@ public: std::cout << "InnerProductMatrix gsum t5 "<< t5-t4<<" us"<_rdimensions[0] * grid->_rdimensions[1]; + vol = grid->oSites()/rd0; + words = rd0*sizeof(vector_object)/sizeof(scalar); + int64_t vw = vol * words; + assert(vw == grid->lSites()*sizeof(scalar_object)/sizeof(scalar)); + + RealD t0 = usecond(); + BLAS_X.resize(nrhs * vw); // cost free if size doesn't change + BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change + BLAS_Cred.resize(nrhs * nrhs * vol);// cost free if size doesn't change + RealD t1 = usecond(); + + ///////////////////////////////////////////// + // Copy in the multi-rhs sources -- layout batched BLAS ready + ///////////////////////////////////////////// + for(int r=0;r Xh(vol); + std::vector Yh(vol); + std::vector Ch(vol); + for(uint64_t ss=0;ss HOST_C(BLAS_Cred.size()); // nrhs . nrhs -- the coefficients + acceleratorCopyFromDevice(&BLAS_Cred[0],&HOST_C[0],BLAS_Cred.size()*sizeof(scalar)); + + RealD t5 = usecond(); + m = Eigen::MatrixXcd::Zero(nrhs,nrhs); + for(int ss=0;ss eC((std::complex *)&HOST_C[ss*nrhs*nrhs],nrhs,nrhs); + m = m + eC; + } + RealD t6l = usecond(); + grid->GlobalSumVector((scalar *) &m(0,0),nrhs*nrhs); + RealD t6 = usecond(); + uint64_t M=nrhs; + uint64_t N=nrhs; + uint64_t K=vw; + RealD xybytes = grid->lSites()*sizeof(scalar_object); + RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K); + RealD flops = 8.0*M*N*K; + flops = flops/(t4-t3)/1.e3; + bytes = bytes/(t4-t3)/1.e3; + xybytes = 4*xybytes/(t2-t1)/1.e3; + std::cout << "InnerProductMatrix m,n,k "<< M<<","<