diff --git a/Grid/algorithms/multigrid/BatchedBlas.h b/Grid/algorithms/multigrid/BatchedBlas.h
index 2ed55663..82da2d5d 100644
--- a/Grid/algorithms/multigrid/BatchedBlas.h
+++ b/Grid/algorithms/multigrid/BatchedBlas.h
@@ -55,9 +55,12 @@ NAMESPACE_BEGIN(Grid);
   typedef int32_t gridblasHandle_t;
 #endif
 
+enum GridBLASOperation_t { GridBLAS_OP_N, GridBLAS_OP_T, GridBLAS_OP_C } ;
+
 class GridBLAS {
 public:
+
   static gridblasHandle_t gridblasHandle;
   static int gridblasInit;
@@ -109,37 +112,71 @@ public:
     accelerator_barrier();
 #endif
   }
-  void benchmark(int nbasis, int nrhs, int coarseVol, int nstencil)
+
+  void gemmBatched(int m,int n, int k,
+                   ComplexD alpha,
+                   deviceVector<ComplexD *> &Amk,  // pointer list to matrices
+                   deviceVector<ComplexD *> &Bkn,
+                   ComplexD beta,
+                   deviceVector<ComplexD *> &Cmn)
   {
-    int32_t N_A = nbasis*nbasis*coarseVol*nstencil;
-    int32_t N_B = nbasis*nrhs*coarseVol*nstencil; // One leg of stencil at a time
-    int32_t N_C = nbasis*nrhs*coarseVol*nstencil;
-    deviceVector<ComplexD> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD));
-    deviceVector<ComplexD> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD));
-    deviceVector<ComplexD> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD));
-    ComplexD alpha(1.0);
-    ComplexD beta (1.0);
-    for(int i=0;i<10;i++){
-      RealD t0 = usecond();
-      for(int s=0;s
+  void gemmBatched(int m,int n, int k,
+                   ComplexF alpha,
+                   deviceVector<ComplexF *> &Amk,  // pointer list to matrices
+                   deviceVector<ComplexF *> &Bkn,
+                   ComplexF beta,
+                   deviceVector<ComplexF *> &Cmn)
+  {
+    gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
+                m,n,k,
+                alpha,
+                Amk,
+                Bkn,
+                beta,
+                Cmn);
+  }
+  void gemmBatched(int m,int n, int k,
+                   RealD alpha,
+                   deviceVector<RealD *> &Amk,  // pointer list to matrices
+                   deviceVector<RealD *> &Bkn,
+                   RealD beta,
+                   deviceVector<RealD *> &Cmn)
+  {
+    gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
+                m,n,k,
+                alpha,
+                Amk,
+                Bkn,
+                beta,
+                Cmn);
+  }
+  void gemmBatched(int m,int n, int k,
+                   RealF alpha,
+                   deviceVector<RealF *> &Amk,  // pointer list to matrices
+                   deviceVector<RealF *> &Bkn,
+                   RealF beta,
+                   deviceVector<RealF *> &Cmn)
+  {
+    gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
+                m,n,k,
+                alpha,
+                Amk,
+                Bkn,
+                beta,
+                Cmn);
   }
-  void gemmBatched(int m,int n, int k,
+  void gemmBatched(GridBLASOperation_t OpA,
+                   GridBLASOperation_t OpB,
+                   int m,int n, int k,
                    ComplexD alpha,
                    deviceVector<ComplexD *> &Amk,  // pointer list to matrices
                    deviceVector<ComplexD *> &Bkn,
@@ -148,23 +185,36 @@ public:
   {
     RealD t2=usecond();
     int32_t batchCount = Amk.size();
-    // Use C-row major storage, so transpose calls
+    assert(Bkn.size()==batchCount);
+    assert(Cmn.size()==batchCount);
+
     int lda = m; // m x k column major
     int ldb = k; // k x n column major
     int ldc = m; // m x b column major
+    if(OpA!=GridBLAS_OP_N)
+      lda = k;
+    if(OpB!=GridBLAS_OP_N)
+      ldb = n;
+
     static deviceVector<ComplexD> alpha_p(1);
     static deviceVector<ComplexD> beta_p(1);
     // can prestore the 1 and the zero on device
     acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
     acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
     RealD t0=usecond();
-    // std::cout << "hipblasZgemmBatched mnk "<
                    &Amk,  // pointer list to matrices
                    deviceVector &Bkn,
@@ -222,23 +283,35 @@
   {
     RealD t2=usecond();
     int32_t batchCount = Amk.size();
-    // Use C-row major storage, so transpose calls
+
     int lda = m; // m x k column major
     int ldb = k; // k x n column major
     int ldc = m; // m x b column major
+    if(OpA!=GridBLAS_OP_N)
+      lda = k;
+    if(OpB!=GridBLAS_OP_N)
+      ldb = n;
     static deviceVector<ComplexF> alpha_p(1);
     static deviceVector<ComplexF> beta_p(1);
     // can prestore the 1 and the zero on device
     acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexF));
     acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF));
     RealD t0=usecond();
-    // std::cout << "hipblasZgemmBatched mnk "<
                    &Amk,  // pointer list to matrices
                    deviceVector &Bkn,
@@ -300,23 +380,35 @@
   {
     RealD t2=usecond();
     int32_t batchCount = Amk.size();
-    // Use C-row major storage, so transpose calls
+
     int lda = m; // m x k column major
     int ldb = k; // k x n column major
     int ldc = m; // m x b column major
+    if(OpA!=GridBLAS_OP_N)
+      lda = k;
+    if(OpB!=GridBLAS_OP_N)
+      ldb = n;
     static deviceVector<RealF> alpha_p(1);
     static deviceVector<RealF> beta_p(1);
     // can prestore the 1 and the zero on device
     acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealF));
     acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealF));
     RealD t0=usecond();
-    // std::cout << "hipblasZgemmBatched mnk "<
                    &Amk,  // pointer list to matrices
                    deviceVector &Bkn,
@@ -378,20 +477,33 @@
   {
     RealD t2=usecond();
     int32_t batchCount = Amk.size();
-    // Use C-row major storage, so transpose calls
+
     int lda = m; // m x k column major
     int ldb = k; // k x n column major
     int ldc = m; // m x b column major
+    if(OpA!=GridBLAS_OP_N)
+      lda = k;
+    if(OpB!=GridBLAS_OP_N)
+      ldb = n;
+
     static deviceVector<RealD> alpha_p(1);
     static deviceVector<RealD> beta_p(1);
     // can prestore the 1 and the zero on device
     acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealD));
     acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealD));
     RealD t0=usecond();
-    // std::cout << "hipblasZgemmBatched mnk "<
+  void benchmark(int nbasis, int nrhs, int coarseVol, int nstencil)
+  {
+    int32_t N_A = nbasis*nbasis*coarseVol*nstencil;
+    int32_t N_B = nbasis*nrhs*coarseVol*nstencil; // One leg of stencil at a time
+    int32_t N_C = nbasis*nrhs*coarseVol*nstencil;
+    deviceVector<ComplexD> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD));
+    deviceVector<ComplexD> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD));
+    deviceVector<ComplexD> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD));
+    ComplexD alpha(1.0);
+    ComplexD beta (1.0);
+    for(int i=0;i<10;i++){
+      RealD t0 = usecond();
+      for(int s=0;s
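As a reader aid, a minimal usage sketch (not part of the patch) driving the op-aware gemmBatched interface introduced above for a batch of dagger(A) * B products. The function name, the extents m, n, k, the batch size, and the pointer-list setup are illustrative assumptions; only the GridBLAS_OP_C / GridBLAS_OP_N call shape and the column-major leading-dimension handling come from the patch itself.

// Usage sketch, assuming the patched BatchedBlas.h and Grid's deviceVector /
// acceleratorCopyToDevice helpers as used elsewhere in this file.
#include <Grid/Grid.h>
using namespace Grid;

void ExampleDaggerGemmBatched(void)
{
  const int m=16, n=8, k=16, batchCount=64;   // illustrative sizes

  // Flat device buffers holding all blocks back to back, column major
  deviceVector<ComplexD> A(k*m*batchCount); // each block stored k x m, used daggered as m x k
  deviceVector<ComplexD> B(k*n*batchCount); // each block k x n
  deviceVector<ComplexD> C(m*n*batchCount); // each block m x n

  // Build per-block device pointers on the host, then copy the lists to device
  std::vector<ComplexD *> hA(batchCount), hB(batchCount), hC(batchCount);
  for(int s=0;s<batchCount;s++){
    hA[s] = &A[0] + s*k*m;
    hB[s] = &B[0] + s*k*n;
    hC[s] = &C[0] + s*m*n;
  }
  deviceVector<ComplexD *> Amk(batchCount), Bkn(batchCount), Cmn(batchCount);
  acceleratorCopyToDevice((void *)&hA[0],(void *)&Amk[0],batchCount*sizeof(ComplexD *));
  acceleratorCopyToDevice((void *)&hB[0],(void *)&Bkn[0],batchCount*sizeof(ComplexD *));
  acceleratorCopyToDevice((void *)&hC[0],(void *)&Cmn[0],batchCount*sizeof(ComplexD *));

  GridBLAS BLAS;
  ComplexD one(1.0), zero(0.0);
  // OpA = GridBLAS_OP_C applies the conjugate transpose to each A block;
  // the patched overload then uses lda = k for A, ldb = k for B, ldc = m for C.
  BLAS.gemmBatched(GridBLAS_OP_C, GridBLAS_OP_N,
                   m,n,k,
                   one,
                   Amk,
                   Bkn,
                   zero,
                   Cmn);
}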