diff --git a/Grid/algorithms/multigrid/BatchedBlas.h b/Grid/algorithms/multigrid/BatchedBlas.h
index 0663a715..5250b2a2 100644
--- a/Grid/algorithms/multigrid/BatchedBlas.h
+++ b/Grid/algorithms/multigrid/BatchedBlas.h
@@ -58,43 +58,42 @@ NAMESPACE_BEGIN(Grid);
 class GridBLAS {
 public:
-  static gridblasHandle_t gridblasHandle;
-  static int gridblasInit;
+  static gridblasHandle_t gridblasHandle;
+  static int gridblasInit;
-  static void Init(void)
-  {
-    if ( ! gridblasInit ) {
+  static void Init(void)
+  {
+    if ( ! gridblasInit ) {
 #ifdef GRID_CUDA
-      std::cout << "cublasCreate"<
-    deviceVector<ComplexD> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD));
-    deviceVector<ComplexD> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD));
-    deviceVector<ComplexD> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD));
-    ComplexD alpha(1.0);
-    ComplexD beta (1.0);
-    for(int i=0;i<10;i++){
-      RealD t0 = usecond();
-      for(int s=0;s
+    deviceVector<ComplexD> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD));
+    deviceVector<ComplexD> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD));
+    deviceVector<ComplexD> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD));
+    ComplexD alpha(1.0);
+    ComplexD beta (1.0);
+    for(int i=0;i<10;i++){
+      RealD t0 = usecond();
+      for(int s=0;s
-  void gemmBatched(int m,int n, int k,
-                   ComplexD alpha,
-                   deviceVector<ComplexD*> &Amk,  // pointer list to matrices
-                   deviceVector<ComplexD*> &Bkn,
-                   ComplexD beta,
-                   deviceVector<ComplexD*> &Cmn)
-  {
-    RealD t2=usecond();
-    int32_t batchCount = Amk.size();
-    // Use C-row major storage, so transpose calls
-    int lda = m; // m x k column major
-    int ldb = k; // k x n column major
-    int ldc = m; // m x n column major
-    static deviceVector<ComplexD> alpha_p(1);
-    static deviceVector<ComplexD> beta_p(1);
-    // can prestore the 1 and the zero on device
-    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
-    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
-    RealD t0=usecond();
+  void gemmBatched(int m,int n, int k,
+                   ComplexD alpha,
+                   deviceVector<ComplexD*> &Amk,  // pointer list to matrices
+                   deviceVector<ComplexD*> &Bkn,
+                   ComplexD beta,
+                   deviceVector<ComplexD*> &Cmn)
+  {
+    RealD t2=usecond();
+    int32_t batchCount = Amk.size();
+    // Use C-row major storage, so transpose calls
+    int lda = m; // m x k column major
+    int ldb = k; // k x n column major
+    int ldc = m; // m x n column major
+    static deviceVector<ComplexD> alpha_p(1);
+    static deviceVector<ComplexD> beta_p(1);
+    // can prestore the 1 and the zero on device
+    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
+    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
+    RealD t0=usecond();
+    // std::cout << "hipblasZgemmBatched mnk "<
+    // gemm_batch & OneAPI
+#warning "oneMKL implementation not built "
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
-    // Need a default/reference implementation
-    for (int p = 0; p < batchCount; ++p) {
-      for (int mm = 0; mm < m; ++mm) {
-        for (int nn = 0; nn < n; ++nn) {
-          ComplexD c_mn(0.0);
-          for (int kk = 0; kk < k; ++kk)
-            c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
-          Cmn[mm + nn*ldc + p*sdc] = (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc];
-        }
-      }
-    }
+    // Need a default/reference implementation
+    for (int p = 0; p < batchCount; ++p) {
+      for (int mm = 0; mm < m; ++mm) {
+        for (int nn = 0; nn < n; ++nn) {
+          ComplexD c_mn(0.0);
+          for (int kk = 0; kk < k; ++kk)
+            c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
+          Cmn[mm + nn*ldc + p*sdc] = (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc];
+        }
+      }
+    }
 #endif
+    synchronise();
     RealD t1=usecond();
-    // std::cout << " hipblas synchronised " <
-    deviceVector<ComplexD> alpha_p(1);
-    deviceVector<ComplexD> beta_p(1);
-    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
-    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
+  void gemmBatched(int m,int n, int k,
+                   ComplexF alpha,
+                   deviceVector<ComplexF*> &Amk,  // pointer list to matrices
+                   deviceVector<ComplexF*> &Bkn,
+                   ComplexF beta,
+                   deviceVector<ComplexF*> &Cmn)
+  {
+    RealD t2=usecond();
+    int32_t batchCount = Amk.size();
+    // Use C-row major storage, so transpose calls
+    int lda = m; // m x k column major
+    int ldb = k; // k x n column major
+    int ldc = m; // m x n column major
+    static deviceVector<ComplexF> alpha_p(1);
+    static deviceVector<ComplexF> beta_p(1);
+    // can prestore the 1 and the zero on device
+    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexF));
+    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF));
+    RealD t0=usecond();
+    // std::cout << "hipblasZgemmBatched mnk "<
+    // gemm_batch & OneAPI
+#warning "oneMKL implementation not built "
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+    // Need a default/reference implementation
+    for (int p = 0; p < batchCount; ++p) {
+      for (int mm = 0; mm < m; ++mm) {
+        for (int nn = 0; nn < n; ++nn) {
+          ComplexD c_mn(0.0);
+          for (int kk = 0; kk < k; ++kk)
+            c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
+          Cmn[mm + nn*ldc + p*sdc] = (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc];
+        }
+      }
+    }
+#endif
+    synchronise();
+    RealD t1=usecond();
+    RealD flops = 8.0*m*n*k*batchCount;
+    RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n)*batchCount;
+    std::cout <
+  void gemmBatched(int m,int n, int k,
+                   RealF alpha,
+                   deviceVector<RealF*> &Amk,  // pointer list to matrices
+                   deviceVector<RealF*> &Bkn,
+                   RealF beta,
+                   deviceVector<RealF*> &Cmn)
+  {
+    RealD t2=usecond();
+    int32_t batchCount = Amk.size();
+    // Use C-row major storage, so transpose calls
+    int lda = m; // m x k column major
+    int ldb = k; // k x n column major
+    int ldc = m; // m x n column major
+    static deviceVector<RealF> alpha_p(1);
+    static deviceVector<RealF> beta_p(1);
+    // can prestore the 1 and the zero on device
+    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealF));
+    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealF));
+    RealD t0=usecond();
+    // std::cout << "hipblasZgemmBatched mnk "<
+    // gemm_batch & OneAPI
+#warning "oneMKL implementation not built "
+#endif
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+    // Need a default/reference implementation
+    for (int p = 0; p < batchCount; ++p) {
+      for (int mm = 0; mm < m; ++mm) {
+        for (int nn = 0; nn < n; ++nn) {
+          RealD c_mn(0.0);
+          for (int kk = 0; kk < k; ++kk)
+            c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
+          Cmn[mm + nn*ldc + p*sdc] = (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc];
+        }
+      }
+    }
+#endif
+    synchronise();
+    RealD t1=usecond();
+    RealD flops = 8.0*m*n*k*batchCount;
+    RealD bytes = 1.0*sizeof(RealF)*(m*k+k*n+m*n)*batchCount;
+    std::cout <
+  void gemmBatched(int m,int n, int k,
+                   RealD alpha,
+                   deviceVector<RealD*> &Amk,  // pointer list to matrices
+                   deviceVector<RealD*> &Bkn,
+                   RealD beta,
+                   deviceVector<RealD*> &Cmn)
+  {
+    RealD t2=usecond();
+    int32_t batchCount = Amk.size();
+    // Use C-row major storage, so transpose calls
+    int lda = m; // m x k column major
+    int ldb = k; // k x n column major
+    int ldc = m; // m x n column major
+    static deviceVector<RealD> alpha_p(1);
+    static deviceVector<RealD> beta_p(1);
+    // can prestore the 1 and the zero on device
+    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealD));
+    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealD));
+    RealD t0=usecond();
+    // std::cout << "hipblasZgemmBatched mnk "<
+    // gemm_batch & OneAPI
+#warning "oneMKL implementation not built "
+#endif
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+    // Need a default/reference implementation
+    for (int p = 0; p < batchCount; ++p) {
+      for (int mm = 0; mm < m; ++mm) {
+        for (int nn = 0; nn < n; ++nn) {
+          RealD c_mn(0.0);
+          for (int kk = 0; kk < k; ++kk)
+            c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
+          Cmn[mm + nn*ldc + p*sdc] = (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc];
+        }
+      }
+    }
+#endif
+    synchronise();
+    RealD t1=usecond();
+    RealD flops = 8.0*m*n*k*batchCount;
+    RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
+    std::cout <
+    deviceVector<ComplexD> alpha_p(1);
+    deviceVector<ComplexD> beta_p(1);
+    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
+    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
+    std::cout << "blasZgemmStridedBatched mnk "<
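
Note on the fallback path: the reference implementation guarded by #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) in the hunks above addresses the batch with flat p*sda, p*sdb, p*sdc offsets even though Amk, Bkn and Cmn are pointer lists, and no sda/sdb/sdc are declared in the gemmBatched functions. The sketch below is a minimal, host-only illustration of the same convention (column-major matrices with lda = m, ldb = k, ldc = m), but with per-batch pointer indexing. It is not Grid's code: std::complex<double> and std::vector stand in for Grid's ComplexD and deviceVector, and the name gemmBatchedReference is invented for the example.

#include <complex>
#include <vector>

using ComplexD = std::complex<double>;

// Reference batched GEMM: C_p = alpha * A_p * B_p + beta * C_p for every
// matrix p in the batch. A_p is m x k, B_p is k x n, C_p is m x n, all
// stored column major with leading dimensions lda = m, ldb = k, ldc = m.
void gemmBatchedReference(int m, int n, int k,
                          ComplexD alpha,
                          const std::vector<const ComplexD*> &Amk, // pointer list to A matrices
                          const std::vector<const ComplexD*> &Bkn, // pointer list to B matrices
                          ComplexD beta,
                          const std::vector<ComplexD*> &Cmn)       // pointer list to C matrices
{
  const int lda = m, ldb = k, ldc = m;                 // column-major leading dimensions
  const int batchCount = static_cast<int>(Amk.size());
  for (int p = 0; p < batchCount; ++p) {
    for (int nn = 0; nn < n; ++nn) {
      for (int mm = 0; mm < m; ++mm) {
        ComplexD c_mn(0.0);
        for (int kk = 0; kk < k; ++kk)
          c_mn += Amk[p][mm + kk*lda] * Bkn[p][kk + nn*ldb];
        Cmn[p][mm + nn*ldc] = alpha*c_mn + beta*Cmn[p][mm + nn*ldc];
      }
    }
  }
}

A complex multiply-add costs 8 real floating point operations (6 for the multiply, 2 for the add), which is where the flops = 8.0*m*n*k*batchCount estimate in these hunks comes from; a purely real multiply-add counts 2.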