Mirror of https://github.com/paboyle/Grid.git

Make precision switchable for cublas routines

Chulwoo Jung 2020-07-22 14:49:32 -04:00
parent 7e70df27e4
commit 43298ef681


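The diff below turns the file's hard-coded double-precision cuBLAS usage (cuDoubleComplex, make_cuDoubleComplex, cublasZgemm) into macros, so the precision of the accelerator projection GEMMs can be switched at compile time by flipping a single preprocessor condition. A minimal sketch of the pattern follows, for illustration only: the committed code keys the switch on a literal "#if 0" (currently selecting the single-precision branch), whereas a named toggle GRID_CUBLAS_DOUBLE is assumed here.

    // Illustrative sketch, not the committed code: GRID_CUBLAS_DOUBLE is a
    // hypothetical toggle standing in for the bare "#if 0" used in the diff.
    #ifdef GRID_CUBLAS_DOUBLE
    #define CUDA_COMPLEX      cuDoubleComplex
    #define CUDA_FLOAT        double
    #define MAKE_CUDA_COMPLEX make_cuDoubleComplex
    #define CUDA_GEMM         cublasZgemm      // complex double-precision GEMM
    #else
    #define CUDA_COMPLEX      cuComplex
    #define CUDA_FLOAT        float
    #define MAKE_CUDA_COMPLEX make_cuComplex
    #define CUDA_GEMM         cublasCgemm      // complex single-precision GEMM
    #endif

    // Every call site then goes through the macros instead of a fixed type:
    CUDA_COMPLEX alpha = MAKE_CUDA_COMPLEX(1.0, 0.0);
    CUDA_COMPLEX beta  = MAKE_CUDA_COMPLEX(0.0, 0.0);
    cublasStatus_t stat = CUDA_GEMM(handle, CUBLAS_OP_C, CUBLAS_OP_N,
                                    Nevec_acc, Nu, 12*sites,
                                    &alpha, evec_acc, 12*sites, w_acc, 12*sites,
                                    &beta,  c_acc, Nevec_acc);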
@@ -43,6 +43,18 @@ Author: Guido Cossu
 #include "cublas_v2.h"
 #endif
+#if 0
+#define CUDA_COMPLEX cuDoubleComplex
+#define CUDA_FLOAT double
+#define MAKE_CUDA_COMPLEX make_cuDoubleComplex
+#define CUDA_GEMM cublasZgemm
+#else
+#define CUDA_COMPLEX cuComplex
+#define CUDA_FLOAT float
+#define MAKE_CUDA_COMPLEX make_cuComplex
+#define CUDA_GEMM cublasCgemm
+#endif
 namespace Grid {
 ////////////////////////////////////////////////////////////////////////////////
@@ -134,7 +146,7 @@ private:
 /////////////////////////
 #ifdef GRID_CUDA
 cudaError_t cudaStat;
-cuDoubleComplex *w_acc, *evec_acc, *c_acc;
+CUDA_COMPLEX *w_acc, *evec_acc, *c_acc;
 #endif
 int Nevec_acc; // Number of eigenvectors stored in the buffer evec_acc
@@ -264,7 +276,7 @@ public:
 for (int col=0; col<Nu; ++col) {
 // auto w_v = w[col].View();
 autoView( w_v,w[col], AcceleratorWrite);
-cuDoubleComplex *z = reinterpret_cast<cuDoubleComplex*>(&w_v[0]);
+CUDA_COMPLEX *z = reinterpret_cast<CUDA_COMPLEX*>(&w_v[0]);
 // Glog << "col= "<<col <<" z= "<<z <<std::endl;
 for (size_t row=0; row<sites*12; ++row) {
 w_acc[col*sites*12+row] = z[row];
@@ -290,27 +302,27 @@ public:
 for (int col=0; col<Nevec_acc; ++col) {
 // auto evec_v = evec[b*Nevec_acc+col].View();
 autoView( evec_v,evec[b*Nevec_acc+col], AcceleratorWrite);
-cuDoubleComplex *z = reinterpret_cast<cuDoubleComplex*>(&evec_v[0]);
+CUDA_COMPLEX *z = reinterpret_cast<CUDA_COMPLEX*>(&evec_v[0]);
 // Glog << "col= "<<col <<" z= "<<z <<std::endl;
 for (size_t row=0; row<sites*12; ++row) {
 evec_acc[col*sites*12+row] = z[row];
 }
 }
 #endif
-cuDoubleComplex alpha = make_cuDoubleComplex(1.0,0.0);
-cuDoubleComplex beta = make_cuDoubleComplex(0.0,0.0);
-stat = cublasZgemm(handle, CUBLAS_OP_C, CUBLAS_OP_N, Nevec_acc, Nu, 12*sites,
+CUDA_COMPLEX alpha = MAKE_CUDA_COMPLEX(1.0,0.0);
+CUDA_COMPLEX beta = MAKE_CUDA_COMPLEX(0.0,0.0);
+stat = CUDA_GEMM(handle, CUBLAS_OP_C, CUBLAS_OP_N, Nevec_acc, Nu, 12*sites,
 &alpha,
 evec_acc, 12*sites, w_acc, 12*sites,
 &beta,
 c_acc, Nevec_acc);
 //Glog << stat << std::endl;
-grid->GlobalSumVector((double*)c_acc,2*Nu*Nevec_acc);
+grid->GlobalSumVector((CUDA_FLOAT*)c_acc,2*Nu*Nevec_acc);
 #if 0
 for (int i=0; i<Nu; ++i) {
 for (size_t j=0; j<Nevec_acc; ++j) {
-cuDoubleComplex z = c_acc[i*Nevec_acc+j];
+CUDA_COMPLEX z = c_acc[i*Nevec_acc+j];
 MyComplex ip(z.x,z.y);
 if (do_print) {
 Glog << "<evec,w>[" << j << "," << i << "] = "
@@ -320,9 +332,9 @@ public:
 }
 }
 #else
-alpha = make_cuDoubleComplex(-1.0,0.0);
-beta = make_cuDoubleComplex(1.0,0.0);
-stat = cublasZgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, 12*sites, Nu, Nevec_acc,
+alpha = MAKE_CUDA_COMPLEX(-1.0,0.0);
+beta = MAKE_CUDA_COMPLEX(1.0,0.0);
+stat = CUDA_GEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, 12*sites, Nu, Nevec_acc,
 &alpha,
 evec_acc, 12*sites, c_acc, Nevec_acc,
 &beta,
@@ -334,7 +346,7 @@ public:
 for (int col=0; col<Nu; ++col) {
 // auto w_v = w[col].View();
 autoView( w_v,w[col], AcceleratorWrite);
-cuDoubleComplex *z = reinterpret_cast<cuDoubleComplex*>(&w_v[0]);
+CUDA_COMPLEX *z = reinterpret_cast<CUDA_COMPLEX*>(&w_v[0]);
 for (size_t row=0; row<sites*12; ++row) {
 z[row] = w_acc[col*sites*12+row];
 }
@@ -433,11 +445,11 @@ for( int i =0;i<total;i++){
 //const uint64_t nsimd = grid->Nsimd();
 const uint64_t sites = grid->lSites();
-cudaStat = cudaMallocManaged((void **)&w_acc, Nu*sites*12*sizeof(cuDoubleComplex));
+cudaStat = cudaMallocManaged((void **)&w_acc, Nu*sites*12*sizeof(CUDA_COMPLEX));
 Glog << "w_acc= "<<w_acc << " "<< cudaStat << std::endl;
-cudaStat = cudaMallocManaged((void **)&evec_acc, Nevec_acc*sites*12*sizeof(cuDoubleComplex));
+cudaStat = cudaMallocManaged((void **)&evec_acc, Nevec_acc*sites*12*sizeof(CUDA_COMPLEX));
 Glog << "evec_acc= "<<evec_acc << " "<< cudaStat << std::endl;
-cudaStat = cudaMallocManaged((void **)&c_acc, Nu*Nevec_acc*sizeof(cuDoubleComplex));
+cudaStat = cudaMallocManaged((void **)&c_acc, Nu*Nevec_acc*sizeof(CUDA_COMPLEX));
 Glog << "c_acc= "<<c_acc << " "<< cudaStat << std::endl;
 // exit(-42);
 #endif
@@ -1508,4 +1520,8 @@ if (1){
 }
 #undef Glog
 #undef USE_LAPACK
+#undef CUDA_COMPLEX
+#undef CUDA_FLOAT
+#undef MAKE_CUDA_COMPLEX
+#undef CUDA_GEMM
 #endif
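Two consequences of the switch are visible in the later hunks: GlobalSumVector now reduces c_acc through a CUDA_FLOAT cast rather than a fixed double cast, so the reduction element type always matches the selected precision, and the cudaMallocManaged buffer sizes use sizeof(CUDA_COMPLEX), which halves the managed-memory footprint of w_acc, evec_acc and c_acc when the single-precision branch is active. The new macros are #undef'd at the end of the file, alongside Glog and USE_LAPACK, so they do not leak into other code that includes it.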