mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-24 12:45:56 +01:00
first working version of Gram Schmidt using cublas gemm; explicit data type and site vector size has to be removed
This commit is contained in:
parent
9266b89ad8
commit
02edbe624f
2
.gitignore
vendored
2
.gitignore
vendored
@ -10,6 +10,8 @@
|
|||||||
*~
|
*~
|
||||||
*#
|
*#
|
||||||
*.sublime-*
|
*.sublime-*
|
||||||
|
.ctags
|
||||||
|
tags
|
||||||
|
|
||||||
# Precompiled Headers #
|
# Precompiled Headers #
|
||||||
#######################
|
#######################
|
||||||
|
@ -39,6 +39,10 @@ Author: Guido Cossu
|
|||||||
#undef USE_LAPACK
|
#undef USE_LAPACK
|
||||||
#define Glog std::cout << GridLogMessage
|
#define Glog std::cout << GridLogMessage
|
||||||
|
|
||||||
|
#ifdef GRID_NVCC
|
||||||
|
#include "cublas_v2.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
@ -89,6 +93,12 @@ class SortEigen {
|
|||||||
|
|
||||||
enum class LanczosType { irbl, rbl };
|
enum class LanczosType { irbl, rbl };
|
||||||
|
|
||||||
|
enum IRBLdiagonalisation {
|
||||||
|
IRBLdiagonaliseWithDSTEGR,
|
||||||
|
IRBLdiagonaliseWithQR,
|
||||||
|
IRBLdiagonaliseWithEigen
|
||||||
|
};
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////
|
||||||
// Implicitly restarted block lanczos
|
// Implicitly restarted block lanczos
|
||||||
/////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////
|
||||||
@ -107,7 +117,7 @@ private:
|
|||||||
int Nblock_m; // Nm/Nu
|
int Nblock_m; // Nm/Nu
|
||||||
int Nconv_test_interval; // Number of skipped vectors when checking a convergence
|
int Nconv_test_interval; // Number of skipped vectors when checking a convergence
|
||||||
RealD eresid;
|
RealD eresid;
|
||||||
IRLdiagonalisation diagonalisation;
|
IRBLdiagonalisation diagonalisation;
|
||||||
int split_test; //test split in the first iteration
|
int split_test; //test split in the first iteration
|
||||||
////////////////////////////////////
|
////////////////////////////////////
|
||||||
// Embedded objects
|
// Embedded objects
|
||||||
@ -137,7 +147,7 @@ public:
|
|||||||
int _Nm, // total vecs
|
int _Nm, // total vecs
|
||||||
RealD _eresid, // resid in lmd deficit
|
RealD _eresid, // resid in lmd deficit
|
||||||
int _MaxIter, // Max iterations
|
int _MaxIter, // Max iterations
|
||||||
IRLdiagonalisation _diagonalisation = IRLdiagonaliseWithEigen)
|
IRBLdiagonalisation _diagonalisation = IRBLdiagonaliseWithEigen)
|
||||||
: _Linop(Linop), _SLinop(SLinop), _poly(poly),sf_grid(SFrbGrid),f_grid(FrbGrid),
|
: _Linop(Linop), _SLinop(SLinop), _poly(poly),sf_grid(SFrbGrid),f_grid(FrbGrid),
|
||||||
Nstop(_Nstop), Nconv_test_interval(_Nconv_test_interval), mrhs(_mrhs),
|
Nstop(_Nstop), Nconv_test_interval(_Nconv_test_interval), mrhs(_mrhs),
|
||||||
Nu(_Nu), Nk(_Nk), Nm(_Nm),
|
Nu(_Nu), Nk(_Nk), Nm(_Nm),
|
||||||
@ -211,7 +221,126 @@ public:
|
|||||||
#endif
|
#endif
|
||||||
}}
|
}}
|
||||||
for(int i=0; i<_Nu; ++i)
|
for(int i=0; i<_Nu; ++i)
|
||||||
normalize(w[i],if_print);
|
assert(normalize(w[i],if_print) !=0);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void orthogonalize_blas(std::vector<Field>& w, int _Nu, std::vector<Field>& evec, int _R, int _print=0)
|
||||||
|
{
|
||||||
|
#ifdef GRID_NVCC
|
||||||
|
Glog << "cuBLAS orthogonalize" << std::endl;
|
||||||
|
|
||||||
|
typedef typename Field::vector_object vobj;
|
||||||
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
|
typedef typename vobj::vector_type vector_type;
|
||||||
|
|
||||||
|
typedef typename Field::scalar_type MyComplex;
|
||||||
|
|
||||||
|
GridBase *grid = w[0].Grid();
|
||||||
|
//grid->show_decomposition();
|
||||||
|
//const uint64_t nsimd = grid->Nsimd();
|
||||||
|
const uint64_t sites = grid->lSites();
|
||||||
|
|
||||||
|
//auto w_v = w[0].View();
|
||||||
|
//cuDoubleComplex *z = reinterpret_cast<cuDoubleComplex*>(&w_v._odata[0]);
|
||||||
|
//cuDoubleComplex *z = w_v._odata._internal;
|
||||||
|
//thread_for(ss,w_v.size(),{
|
||||||
|
// Glog << w_v[ss] << std::endl;
|
||||||
|
//});
|
||||||
|
//w_v[0]
|
||||||
|
//exit(0);
|
||||||
|
//scalar_type *z = (scalar_type *)&w_v[0]; // OK
|
||||||
|
//cuDoubleComplex *z = reinterpret_cast<cuDoubleComplex *>(&w_v[0]); // OK
|
||||||
|
|
||||||
|
cudaError_t cudaStat;
|
||||||
|
|
||||||
|
cuDoubleComplex *w_acc, *evec_acc, *c_acc;
|
||||||
|
|
||||||
|
cudaStat = cudaMallocManaged((void **)&w_acc, _Nu*sites*12*sizeof(cuDoubleComplex));
|
||||||
|
Glog << cudaStat << std::endl;
|
||||||
|
cudaStat = cudaMallocManaged((void **)&evec_acc, _R*sites*12*sizeof(cuDoubleComplex));
|
||||||
|
Glog << cudaStat << std::endl;
|
||||||
|
cudaStat = cudaMallocManaged((void **)&c_acc, _Nu*_R*12*sizeof(cuDoubleComplex));
|
||||||
|
Glog << cudaStat << std::endl;
|
||||||
|
|
||||||
|
Glog << "cuBLAS prepare array"<< std::endl;
|
||||||
|
#if 0 // a trivial test
|
||||||
|
for (int col=0; col<_Nu; ++col) {
|
||||||
|
for (size_t row=0; row<sites*12; ++row) {
|
||||||
|
w_acc[col*sites*12+row].x = 1.0;
|
||||||
|
w_acc[col*sites*12+row].y = 0.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int col=0; col<_R; ++col) {
|
||||||
|
for (size_t row=0; row<sites*12; ++row) {
|
||||||
|
evec_acc[col*sites*12+row].x = 1.0;
|
||||||
|
evec_acc[col*sites*12+row].y = 0.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
for (int col=0; col<_Nu; ++col) {
|
||||||
|
auto w_v = w[col].View();
|
||||||
|
cuDoubleComplex *z = reinterpret_cast<cuDoubleComplex*>(&w_v[0]);
|
||||||
|
for (size_t row=0; row<sites*12; ++row) {
|
||||||
|
//w_acc[col*sites*12+row].x = z[2*row];
|
||||||
|
//w_acc[col*sites*12+row].y = z[2*row+1];
|
||||||
|
w_acc[col*sites*12+row] = z[row];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int col=0; col<_R; ++col) {
|
||||||
|
auto evec_v = evec[col].View();
|
||||||
|
cuDoubleComplex *z = reinterpret_cast<cuDoubleComplex*>(&evec_v[0]);
|
||||||
|
for (size_t row=0; row<sites*12; ++row) {
|
||||||
|
//evec_acc[col*sites*12+row].x = z[2*row];
|
||||||
|
//evec_acc[col*sites*12+row].y = z[2*row+1];
|
||||||
|
evec_acc[col*sites*12+row] = z[row];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
Glog << "cuBLAS prepare array done"<< std::endl;
|
||||||
|
|
||||||
|
Glog << "cuBLAS Zgemm"<< std::endl;
|
||||||
|
|
||||||
|
cublasHandle_t handle;
|
||||||
|
cublasStatus_t stat;
|
||||||
|
|
||||||
|
stat = cublasCreate(&handle);
|
||||||
|
cuDoubleComplex alpha = make_cuDoubleComplex(1.0,0.0);
|
||||||
|
cuDoubleComplex beta = make_cuDoubleComplex(0.0,0.0);
|
||||||
|
stat = cublasZgemm(handle, CUBLAS_OP_C, CUBLAS_OP_N, _R, _Nu, 12*sites,
|
||||||
|
&alpha, evec_acc, 12*sites, w_acc, 12*sites, &beta, c_acc, _R);
|
||||||
|
Glog << stat << std::endl;
|
||||||
|
|
||||||
|
grid->GlobalSumVector((double*)c_acc,2*_Nu*_R);
|
||||||
|
|
||||||
|
cublasDestroy(handle);
|
||||||
|
|
||||||
|
Glog << "cuBLAS Zgemm done"<< std::endl;
|
||||||
|
|
||||||
|
for (int i=0; i<_Nu; ++i) {
|
||||||
|
for (size_t j=0; j<_R; ++j) {
|
||||||
|
cuDoubleComplex z = c_acc[i*_R+j];
|
||||||
|
MyComplex ip(z.x,z.y);
|
||||||
|
if (_print) {
|
||||||
|
Glog << "<evec,w>[" << j << "," << i << "] = "
|
||||||
|
<< z.x << " + i " << z.y << std::endl;
|
||||||
|
}
|
||||||
|
w[i] = w[i] - ip * evec[j];
|
||||||
|
}
|
||||||
|
assert(normalize(w[i],_print)!=0);
|
||||||
|
}
|
||||||
|
|
||||||
|
cudaFree(w_acc);
|
||||||
|
cudaFree(evec_acc);
|
||||||
|
cudaFree(c_acc);
|
||||||
|
|
||||||
|
Glog << "cuBLAS orthogonalize done" << std::endl;
|
||||||
|
#else
|
||||||
|
Glog << "BLAS wrapper is not implemented" << std::endl;
|
||||||
|
exit(1);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -310,10 +439,10 @@ for( int i =0;i<total;i++){
|
|||||||
Glog <<" -- total Nm = "<< Nm <<" vectors"<< std::endl;
|
Glog <<" -- total Nm = "<< Nm <<" vectors"<< std::endl;
|
||||||
Glog <<" -- size of eval = "<< eval.size() << std::endl;
|
Glog <<" -- size of eval = "<< eval.size() << std::endl;
|
||||||
Glog <<" -- size of evec = "<< evec.size() << std::endl;
|
Glog <<" -- size of evec = "<< evec.size() << std::endl;
|
||||||
if ( diagonalisation == IRLdiagonaliseWithEigen ) {
|
if ( diagonalisation == IRBLdiagonaliseWithEigen ) {
|
||||||
Glog << "Diagonalisation is Eigen "<< std::endl;
|
Glog << "Diagonalisation is Eigen "<< std::endl;
|
||||||
#ifdef USE_LAPACK
|
#ifdef USE_LAPACK
|
||||||
} else if ( diagonalisation == IRLdiagonaliseWithLAPACK ) {
|
} else if ( diagonalisation == IRBLdiagonaliseWithLAPACK ) {
|
||||||
Glog << "Diagonalisation is LAPACK "<< std::endl;
|
Glog << "Diagonalisation is LAPACK "<< std::endl;
|
||||||
#endif
|
#endif
|
||||||
} else {
|
} else {
|
||||||
@ -520,10 +649,10 @@ for( int i =0;i<total;i++){
|
|||||||
Glog <<" -- accept Nstop = "<< Nstop <<" vectors"<< std::endl;
|
Glog <<" -- accept Nstop = "<< Nstop <<" vectors"<< std::endl;
|
||||||
Glog <<" -- size of eval = "<< eval.size() << std::endl;
|
Glog <<" -- size of eval = "<< eval.size() << std::endl;
|
||||||
Glog <<" -- size of evec = "<< evec.size() << std::endl;
|
Glog <<" -- size of evec = "<< evec.size() << std::endl;
|
||||||
if ( diagonalisation == IRLdiagonaliseWithEigen ) {
|
if ( diagonalisation == IRBLdiagonaliseWithEigen ) {
|
||||||
Glog << "Diagonalisation is Eigen "<< std::endl;
|
Glog << "Diagonalisation is Eigen "<< std::endl;
|
||||||
#ifdef USE_LAPACK
|
#ifdef USE_LAPACK
|
||||||
} else if ( diagonalisation == IRLdiagonaliseWithLAPACK ) {
|
} else if ( diagonalisation == IRBLdiagonaliseWithLAPACK ) {
|
||||||
Glog << "Diagonalisation is LAPACK "<< std::endl;
|
Glog << "Diagonalisation is LAPACK "<< std::endl;
|
||||||
#endif
|
#endif
|
||||||
} else {
|
} else {
|
||||||
@ -790,7 +919,8 @@ if(!split_test){
|
|||||||
orthogonalize(w[u],evec,R);
|
orthogonalize(w[u],evec,R);
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
orthogonalize(w,Nu,evec,R);
|
//orthogonalize(w,Nu,evec,R);
|
||||||
|
orthogonalize_blas(w,Nu,evec,R);
|
||||||
#endif
|
#endif
|
||||||
// QR part
|
// QR part
|
||||||
for (int u=1; u<Nu; ++u) {
|
for (int u=1; u<Nu; ++u) {
|
||||||
@ -1052,10 +1182,10 @@ if (1){
|
|||||||
GridBase *grid)
|
GridBase *grid)
|
||||||
{
|
{
|
||||||
Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
|
Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
|
||||||
if ( diagonalisation == IRLdiagonaliseWithEigen ) {
|
if ( diagonalisation == IRBLdiagonaliseWithEigen ) {
|
||||||
diagonalize_Eigen(eval,lmd,lme,Nu,Nk,Nm,Qt,grid);
|
diagonalize_Eigen(eval,lmd,lme,Nu,Nk,Nm,Qt,grid);
|
||||||
#ifdef USE_LAPACK
|
#ifdef USE_LAPACK
|
||||||
} else if ( diagonalisation == IRLdiagonaliseWithLAPACK ) {
|
} else if ( diagonalisation == IRBLdiagonaliseWithLAPACK ) {
|
||||||
diagonalize_lapack(eval,lmd,lme,Nu,Nk,Nm,Qt,grid);
|
diagonalize_lapack(eval,lmd,lme,Nu,Nk,Nm,Qt,grid);
|
||||||
#endif
|
#endif
|
||||||
} else {
|
} else {
|
||||||
|
@ -357,8 +357,8 @@ int main (int argc, char ** argv)
|
|||||||
JP.Nu, JP.Nk, JP.Nm,
|
JP.Nu, JP.Nk, JP.Nm,
|
||||||
JP.resid,
|
JP.resid,
|
||||||
JP.MaxIter,
|
JP.MaxIter,
|
||||||
IRLdiagonaliseWithEigen);
|
IRBLdiagonaliseWithEigen);
|
||||||
// IRLdiagonaliseWithLAPACK);
|
// IRBLdiagonaliseWithLAPACK);
|
||||||
|
|
||||||
std::vector<RealD> eval(JP.Nm);
|
std::vector<RealD> eval(JP.Nm);
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user