From dc50190b8f1d824757b8956001f0802ef9291132 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 10 Apr 2020 11:06:04 -0400 Subject: [PATCH] Faster GPU basis rotation May need to later include Regensburg optimised CPU variant --- .../iterative/ImplicitlyRestartedLanczos.h | 84 ++++++++++++++++--- 1 file changed, 74 insertions(+), 10 deletions(-) diff --git a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h index d9c90085..8bee43cc 100644 --- a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h +++ b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h @@ -59,16 +59,15 @@ void basisRotate(std::vector &basis,Eigen::MatrixXd& Qt,int j0, int j1, i { typedef decltype(basis[0].View()) View; auto tmp_v = basis[0].View(); - std::vector basis_v(basis.size(),tmp_v); + Vector basis_v(basis.size(),tmp_v); typedef typename Field::vector_object vobj; GridBase* grid = basis[0].Grid(); - + for(int k=0;k > Bt(thread_max() * Nm); // Thread private - thread_region { vobj* B = Bt.data() + Nm * thread_num(); @@ -86,24 +85,89 @@ void basisRotate(std::vector &basis,Eigen::MatrixXd& Qt,int j0, int j1, i } }); } +#else + + int nrot = j1-j0; + + + uint64_t oSites =grid->oSites(); + uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead + + // printf("BasisRotate %d %d nrot %d siteBlock %d\n",j0,j1,nrot,siteBlock); + + Vector Bt(siteBlock * nrot); + auto Bp=&Bt[0]; + + // GPU readable copy of Eigen matrix + Vector Qt_jv(Nm*Nm); + double *Qt_p = & Qt_jv[0]; + for(int k=0;k void basisRotateJ(Field &result,std::vector &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm) { + typedef decltype(basis[0].View()) View; typedef typename Field::vector_object vobj; GridBase* grid = basis[0].Grid(); result.Checkerboard() = basis[0].Checkerboard(); auto result_v=result.View(); - thread_for(ss, grid->oSites(),{ - vobj B = Zero(); + Vector basis_v(basis.size(),result_v); + for(int k=0;k Qt_jv(Nm); + double * Qt_j = & Qt_jv[0]; + for(int k=0;koSites(),vobj::Nsimd(),{ + auto B=coalescedRead(zz); for(int k=k0; k