diff --git a/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h b/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h index 7cc11653..cd6bd682 100644 --- a/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h +++ b/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h @@ -110,7 +110,7 @@ private: std::string cname = std::string("ImplicitlyRestartedBlockLanczos"); int MaxIter; // Max iterations int Nstop; // Number of evecs checked for convergence - int Nu; // Numbeer of vecs in the unit block + int Nu; // Number of vecs in the unit block int Nk; // Number of converged sought int Nm; // total number of vectors int Nblock_k; // Nk/Nu @@ -129,7 +129,15 @@ private: GridRedBlackCartesian * f_grid; GridRedBlackCartesian * sf_grid; int mrhs; - + ///////////////////////// + // BLAS objects + ///////////////////////// +#ifdef GRID_NVCC + cudaError_t cudaStat; + cuDoubleComplex *w_acc, *evec_acc, *c_acc; +#endif + int Nevec_acc; // Number of eigenvectors stored in the buffer evec_acc + ///////////////////////// // Constructor ///////////////////////// @@ -154,7 +162,8 @@ public: Nblock_m(_Nm/_Nu), Nblock_k(_Nk/_Nu), //eresid(_eresid), MaxIter(10), eresid(_eresid), MaxIter(_MaxIter), - diagonalisation(_diagonalisation),split_test(0) + diagonalisation(_diagonalisation),split_test(0), + Nevec_acc(_Nu) { assert( (Nk%Nu==0) && (Nm%Nu==0) ); }; //////////////////////////////// @@ -225,7 +234,7 @@ public: } - void orthogonalize_blas(std::vector& w, int _Nu, std::vector& evec, int _R, int _print=0) + void orthogonalize_blas(std::vector& w, std::vector& evec, int R, int do_print=0) { #ifdef GRID_NVCC Glog << "cuBLAS orthogonalize" << std::endl; @@ -237,105 +246,86 @@ public: typedef typename Field::scalar_type MyComplex; GridBase *grid = w[0].Grid(); - //grid->show_decomposition(); - //const uint64_t nsimd = grid->Nsimd(); const uint64_t sites = grid->lSites(); - - //auto w_v = w[0].View(); - //cuDoubleComplex *z = reinterpret_cast(&w_v._odata[0]); - //cuDoubleComplex *z = w_v._odata._internal; - //thread_for(ss,w_v.size(),{ - // Glog << w_v[ss] << std::endl; - //}); - //w_v[0] - //exit(0); - //scalar_type *z = (scalar_type *)&w_v[0]; // OK - //cuDoubleComplex *z = reinterpret_cast(&w_v[0]); // OK - - cudaError_t cudaStat; + + int Nbatch = R/Nevec_acc; + assert( R%Nevec_acc == 0 ); + Glog << "nBatch, Nevec_acc, R, Nu = " + << Nbatch << "," << Nevec_acc << "," << R << "," << Nu << std::endl; - cuDoubleComplex *w_acc, *evec_acc, *c_acc; - - cudaStat = cudaMallocManaged((void **)&w_acc, _Nu*sites*12*sizeof(cuDoubleComplex)); - Glog << cudaStat << std::endl; - cudaStat = cudaMallocManaged((void **)&evec_acc, _R*sites*12*sizeof(cuDoubleComplex)); - Glog << cudaStat << std::endl; - cudaStat = cudaMallocManaged((void **)&c_acc, _Nu*_R*12*sizeof(cuDoubleComplex)); - Glog << cudaStat << std::endl; - - Glog << "cuBLAS prepare array"<< std::endl; #if 0 // a trivial test - for (int col=0; col<_Nu; ++col) { + for (int col=0; col(&w_v[0]); for (size_t row=0; row(&evec_v[0]); - for (size_t row=0; rowGlobalSumVector((double*)c_acc,2*_Nu*_R); - cublasDestroy(handle); + Glog << "cuBLAS Zgemm"<< std::endl; + + for (int b=0; b(&evec_v[0]); + for (size_t row=0; rowGlobalSumVector((double*)c_acc,2*Nu*Nevec_acc); + + for (int i=0; i[" << j << "," << i << "] = " + << z.x << " + i " << z.y << std::endl; + } + w[i] = w[i] - ip * evec[b*Nevec_acc+j]; + } + //assert(normalize(w[i],do_print)!=0); + } + } + + for (int i=0; i[" << j << "," << i << "] = " - << z.x << " + i " << z.y << std::endl; - } - w[i] = w[i] - ip * evec[j]; - } - assert(normalize(w[i],_print)!=0); - } + cublasDestroy(handle); - cudaFree(w_acc); - cudaFree(evec_acc); - cudaFree(c_acc); - Glog << "cuBLAS orthogonalize done" << std::endl; #else Glog << "BLAS wrapper is not implemented" << std::endl; @@ -411,6 +401,21 @@ for( int i =0;i& evec, const std::vector& src, int& Nconv, LanczosType Impl) { +#ifdef GRID_NVCC + GridBase *grid = src[0].Grid(); + grid->show_decomposition(); + + // set eigenvector buffers for the cuBLAS calls + //const uint64_t nsimd = grid->Nsimd(); + const uint64_t sites = grid->lSites(); + + cudaStat = cudaMallocManaged((void **)&w_acc, Nu*sites*12*sizeof(cuDoubleComplex)); + //Glog << cudaStat << std::endl; + cudaStat = cudaMallocManaged((void **)&evec_acc, Nevec_acc*sites*12*sizeof(cuDoubleComplex)); + //Glog << cudaStat << std::endl; + cudaStat = cudaMallocManaged((void **)&c_acc, Nu*Nevec_acc*sizeof(cuDoubleComplex)); + //Glog << cudaStat << std::endl; +#endif switch (Impl) { case LanczosType::irbl: calc_irbl(eval,evec,src,Nconv); @@ -420,6 +425,12 @@ for( int i =0;i& eval, @@ -919,8 +930,8 @@ if(!split_test){ orthogonalize(w[u],evec,R); } #else - //orthogonalize(w,Nu,evec,R); - orthogonalize_blas(w,Nu,evec,R); + //orthogonalize(w,Nu,evec,R); + orthogonalize_blas(w,evec,R); #endif // QR part for (int u=1; u