1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-24 12:45:56 +01:00

first working version of Gram Schmidt using cublas gemm; explicit data type and site vector size has to be removed

This commit is contained in:
Yong-Chull Jang 2020-03-30 18:36:21 -04:00
parent 9266b89ad8
commit 02edbe624f
3 changed files with 144 additions and 12 deletions

2
.gitignore vendored
View File

@ -10,6 +10,8 @@
*~ *~
*# *#
*.sublime-* *.sublime-*
.ctags
tags
# Precompiled Headers # # Precompiled Headers #
####################### #######################

View File

@ -39,6 +39,10 @@ Author: Guido Cossu
#undef USE_LAPACK #undef USE_LAPACK
#define Glog std::cout << GridLogMessage #define Glog std::cout << GridLogMessage
#ifdef GRID_NVCC
#include "cublas_v2.h"
#endif
namespace Grid { namespace Grid {
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
@ -89,6 +93,12 @@ class SortEigen {
enum class LanczosType { irbl, rbl }; enum class LanczosType { irbl, rbl };
enum IRBLdiagonalisation {
IRBLdiagonaliseWithDSTEGR,
IRBLdiagonaliseWithQR,
IRBLdiagonaliseWithEigen
};
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
// Implicitly restarted block lanczos // Implicitly restarted block lanczos
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
@ -107,7 +117,7 @@ private:
int Nblock_m; // Nm/Nu int Nblock_m; // Nm/Nu
int Nconv_test_interval; // Number of skipped vectors when checking a convergence int Nconv_test_interval; // Number of skipped vectors when checking a convergence
RealD eresid; RealD eresid;
IRLdiagonalisation diagonalisation; IRBLdiagonalisation diagonalisation;
int split_test; //test split in the first iteration int split_test; //test split in the first iteration
//////////////////////////////////// ////////////////////////////////////
// Embedded objects // Embedded objects
@ -137,7 +147,7 @@ public:
int _Nm, // total vecs int _Nm, // total vecs
RealD _eresid, // resid in lmd deficit RealD _eresid, // resid in lmd deficit
int _MaxIter, // Max iterations int _MaxIter, // Max iterations
IRLdiagonalisation _diagonalisation = IRLdiagonaliseWithEigen) IRBLdiagonalisation _diagonalisation = IRBLdiagonaliseWithEigen)
: _Linop(Linop), _SLinop(SLinop), _poly(poly),sf_grid(SFrbGrid),f_grid(FrbGrid), : _Linop(Linop), _SLinop(SLinop), _poly(poly),sf_grid(SFrbGrid),f_grid(FrbGrid),
Nstop(_Nstop), Nconv_test_interval(_Nconv_test_interval), mrhs(_mrhs), Nstop(_Nstop), Nconv_test_interval(_Nconv_test_interval), mrhs(_mrhs),
Nu(_Nu), Nk(_Nk), Nm(_Nm), Nu(_Nu), Nk(_Nk), Nm(_Nm),
@ -211,7 +221,126 @@ public:
#endif #endif
}} }}
for(int i=0; i<_Nu; ++i) for(int i=0; i<_Nu; ++i)
normalize(w[i],if_print); assert(normalize(w[i],if_print) !=0);
}
void orthogonalize_blas(std::vector<Field>& w, int _Nu, std::vector<Field>& evec, int _R, int _print=0)
{
#ifdef GRID_NVCC
Glog << "cuBLAS orthogonalize" << std::endl;
typedef typename Field::vector_object vobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
typedef typename Field::scalar_type MyComplex;
GridBase *grid = w[0].Grid();
//grid->show_decomposition();
//const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->lSites();
//auto w_v = w[0].View();
//cuDoubleComplex *z = reinterpret_cast<cuDoubleComplex*>(&w_v._odata[0]);
//cuDoubleComplex *z = w_v._odata._internal;
//thread_for(ss,w_v.size(),{
// Glog << w_v[ss] << std::endl;
//});
//w_v[0]
//exit(0);
//scalar_type *z = (scalar_type *)&w_v[0]; // OK
//cuDoubleComplex *z = reinterpret_cast<cuDoubleComplex *>(&w_v[0]); // OK
cudaError_t cudaStat;
cuDoubleComplex *w_acc, *evec_acc, *c_acc;
cudaStat = cudaMallocManaged((void **)&w_acc, _Nu*sites*12*sizeof(cuDoubleComplex));
Glog << cudaStat << std::endl;
cudaStat = cudaMallocManaged((void **)&evec_acc, _R*sites*12*sizeof(cuDoubleComplex));
Glog << cudaStat << std::endl;
cudaStat = cudaMallocManaged((void **)&c_acc, _Nu*_R*12*sizeof(cuDoubleComplex));
Glog << cudaStat << std::endl;
Glog << "cuBLAS prepare array"<< std::endl;
#if 0 // a trivial test
for (int col=0; col<_Nu; ++col) {
for (size_t row=0; row<sites*12; ++row) {
w_acc[col*sites*12+row].x = 1.0;
w_acc[col*sites*12+row].y = 0.0;
}
}
for (int col=0; col<_R; ++col) {
for (size_t row=0; row<sites*12; ++row) {
evec_acc[col*sites*12+row].x = 1.0;
evec_acc[col*sites*12+row].y = 0.0;
}
}
#else
for (int col=0; col<_Nu; ++col) {
auto w_v = w[col].View();
cuDoubleComplex *z = reinterpret_cast<cuDoubleComplex*>(&w_v[0]);
for (size_t row=0; row<sites*12; ++row) {
//w_acc[col*sites*12+row].x = z[2*row];
//w_acc[col*sites*12+row].y = z[2*row+1];
w_acc[col*sites*12+row] = z[row];
}
}
for (int col=0; col<_R; ++col) {
auto evec_v = evec[col].View();
cuDoubleComplex *z = reinterpret_cast<cuDoubleComplex*>(&evec_v[0]);
for (size_t row=0; row<sites*12; ++row) {
//evec_acc[col*sites*12+row].x = z[2*row];
//evec_acc[col*sites*12+row].y = z[2*row+1];
evec_acc[col*sites*12+row] = z[row];
}
}
#endif
Glog << "cuBLAS prepare array done"<< std::endl;
Glog << "cuBLAS Zgemm"<< std::endl;
cublasHandle_t handle;
cublasStatus_t stat;
stat = cublasCreate(&handle);
cuDoubleComplex alpha = make_cuDoubleComplex(1.0,0.0);
cuDoubleComplex beta = make_cuDoubleComplex(0.0,0.0);
stat = cublasZgemm(handle, CUBLAS_OP_C, CUBLAS_OP_N, _R, _Nu, 12*sites,
&alpha, evec_acc, 12*sites, w_acc, 12*sites, &beta, c_acc, _R);
Glog << stat << std::endl;
grid->GlobalSumVector((double*)c_acc,2*_Nu*_R);
cublasDestroy(handle);
Glog << "cuBLAS Zgemm done"<< std::endl;
for (int i=0; i<_Nu; ++i) {
for (size_t j=0; j<_R; ++j) {
cuDoubleComplex z = c_acc[i*_R+j];
MyComplex ip(z.x,z.y);
if (_print) {
Glog << "<evec,w>[" << j << "," << i << "] = "
<< z.x << " + i " << z.y << std::endl;
}
w[i] = w[i] - ip * evec[j];
}
assert(normalize(w[i],_print)!=0);
}
cudaFree(w_acc);
cudaFree(evec_acc);
cudaFree(c_acc);
Glog << "cuBLAS orthogonalize done" << std::endl;
#else
Glog << "BLAS wrapper is not implemented" << std::endl;
exit(1);
#endif
} }
@ -310,10 +439,10 @@ for( int i =0;i<total;i++){
Glog <<" -- total Nm = "<< Nm <<" vectors"<< std::endl; Glog <<" -- total Nm = "<< Nm <<" vectors"<< std::endl;
Glog <<" -- size of eval = "<< eval.size() << std::endl; Glog <<" -- size of eval = "<< eval.size() << std::endl;
Glog <<" -- size of evec = "<< evec.size() << std::endl; Glog <<" -- size of evec = "<< evec.size() << std::endl;
if ( diagonalisation == IRLdiagonaliseWithEigen ) { if ( diagonalisation == IRBLdiagonaliseWithEigen ) {
Glog << "Diagonalisation is Eigen "<< std::endl; Glog << "Diagonalisation is Eigen "<< std::endl;
#ifdef USE_LAPACK #ifdef USE_LAPACK
} else if ( diagonalisation == IRLdiagonaliseWithLAPACK ) { } else if ( diagonalisation == IRBLdiagonaliseWithLAPACK ) {
Glog << "Diagonalisation is LAPACK "<< std::endl; Glog << "Diagonalisation is LAPACK "<< std::endl;
#endif #endif
} else { } else {
@ -520,10 +649,10 @@ for( int i =0;i<total;i++){
Glog <<" -- accept Nstop = "<< Nstop <<" vectors"<< std::endl; Glog <<" -- accept Nstop = "<< Nstop <<" vectors"<< std::endl;
Glog <<" -- size of eval = "<< eval.size() << std::endl; Glog <<" -- size of eval = "<< eval.size() << std::endl;
Glog <<" -- size of evec = "<< evec.size() << std::endl; Glog <<" -- size of evec = "<< evec.size() << std::endl;
if ( diagonalisation == IRLdiagonaliseWithEigen ) { if ( diagonalisation == IRBLdiagonaliseWithEigen ) {
Glog << "Diagonalisation is Eigen "<< std::endl; Glog << "Diagonalisation is Eigen "<< std::endl;
#ifdef USE_LAPACK #ifdef USE_LAPACK
} else if ( diagonalisation == IRLdiagonaliseWithLAPACK ) { } else if ( diagonalisation == IRBLdiagonaliseWithLAPACK ) {
Glog << "Diagonalisation is LAPACK "<< std::endl; Glog << "Diagonalisation is LAPACK "<< std::endl;
#endif #endif
} else { } else {
@ -790,7 +919,8 @@ if(!split_test){
orthogonalize(w[u],evec,R); orthogonalize(w[u],evec,R);
} }
#else #else
orthogonalize(w,Nu,evec,R); //orthogonalize(w,Nu,evec,R);
orthogonalize_blas(w,Nu,evec,R);
#endif #endif
// QR part // QR part
for (int u=1; u<Nu; ++u) { for (int u=1; u<Nu; ++u) {
@ -1052,10 +1182,10 @@ if (1){
GridBase *grid) GridBase *grid)
{ {
Qt = Eigen::MatrixXcd::Identity(Nm,Nm); Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
if ( diagonalisation == IRLdiagonaliseWithEigen ) { if ( diagonalisation == IRBLdiagonaliseWithEigen ) {
diagonalize_Eigen(eval,lmd,lme,Nu,Nk,Nm,Qt,grid); diagonalize_Eigen(eval,lmd,lme,Nu,Nk,Nm,Qt,grid);
#ifdef USE_LAPACK #ifdef USE_LAPACK
} else if ( diagonalisation == IRLdiagonaliseWithLAPACK ) { } else if ( diagonalisation == IRBLdiagonaliseWithLAPACK ) {
diagonalize_lapack(eval,lmd,lme,Nu,Nk,Nm,Qt,grid); diagonalize_lapack(eval,lmd,lme,Nu,Nk,Nm,Qt,grid);
#endif #endif
} else { } else {

View File

@ -357,8 +357,8 @@ int main (int argc, char ** argv)
JP.Nu, JP.Nk, JP.Nm, JP.Nu, JP.Nk, JP.Nm,
JP.resid, JP.resid,
JP.MaxIter, JP.MaxIter,
IRLdiagonaliseWithEigen); IRBLdiagonaliseWithEigen);
// IRLdiagonaliseWithLAPACK); // IRBLdiagonaliseWithLAPACK);
std::vector<RealD> eval(JP.Nm); std::vector<RealD> eval(JP.Nm);