diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h
new file mode 100644
index 00000000..82da2d5d
--- /dev/null
+++ b/Grid/algorithms/blas/BatchedBlas.h
@@ -0,0 +1,685 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: BatchedBlas.h
+
+    Copyright (C) 2023
+
+Author: Peter Boyle
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/* END LEGAL */
+#pragma once
+
+#ifdef GRID_HIP
+#include <hipblas/hipblas.h>
+#endif
+#ifdef GRID_CUDA
+#include <cublas_v2.h>
+#endif
+#ifdef GRID_SYCL
+#error // need oneMKL version
+#endif
+
+///////////////////////////////////////////////////////////////////////
+// Need to rearrange lattice data to be in the right format for a
+// batched multiply. Might as well make these static, dense packed
+///////////////////////////////////////////////////////////////////////
+NAMESPACE_BEGIN(Grid);
+#ifdef GRID_HIP
+  typedef hipblasHandle_t gridblasHandle_t;
+#endif
+#ifdef GRID_CUDA
+  typedef cublasHandle_t gridblasHandle_t;
+#endif
+#ifdef GRID_SYCL
+  typedef int32_t gridblasHandle_t;
+#endif
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+  typedef int32_t gridblasHandle_t;
+#endif
+
+enum GridBLASOperation_t { GridBLAS_OP_N, GridBLAS_OP_T, GridBLAS_OP_C } ;
+
+class GridBLAS {
+public:
+
+  static gridblasHandle_t gridblasHandle;
+  static int              gridblasInit;
+
+  static void Init(void)
+  {
+    if ( ! gridblasInit ) {
+#ifdef GRID_CUDA
+      std::cout << "cublasCreate"<<std::endl;
+      cublasCreate(&gridblasHandle);
+#endif
+#ifdef GRID_HIP
+      std::cout << "hipblasCreate"<<std::endl;
+      hipblasCreate(&gridblasHandle);
+#endif
+      gridblasInit=1;
+    }
+  }
+
+  void gemmBatched(int m,int n, int k,
+                   ComplexD alpha,
+                   deviceVector<ComplexD *> &Amk,  // pointer list to matrices
+                   deviceVector<ComplexD *> &Bkn,
+                   ComplexD beta,
+                   deviceVector<ComplexD *> &Cmn)
+  {
+    gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
+                m,n,k,
+                alpha,
+                Amk,
+                Bkn,
+                beta,
+                Cmn);
+  }
+  void gemmBatched(int m,int n, int k,
+                   ComplexF alpha,
+                   deviceVector<ComplexF *> &Amk,  // pointer list to matrices
+                   deviceVector<ComplexF *> &Bkn,
+                   ComplexF beta,
+                   deviceVector<ComplexF *> &Cmn)
+  {
+    gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
+                m,n,k,
+                alpha,
+                Amk,
+                Bkn,
+                beta,
+                Cmn);
+  }
+  void gemmBatched(int m,int n, int k,
+                   RealD alpha,
+                   deviceVector<RealD *> &Amk,  // pointer list to matrices
+                   deviceVector<RealD *> &Bkn,
+                   RealD beta,
+                   deviceVector<RealD *> &Cmn)
+  {
+    gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
+                m,n,k,
+                alpha,
+                Amk,
+                Bkn,
+                beta,
+                Cmn);
+  }
+  void gemmBatched(int m,int n, int k,
+                   RealF alpha,
+                   deviceVector<RealF *> &Amk,  // pointer list to matrices
+                   deviceVector<RealF *> &Bkn,
+                   RealF beta,
+                   deviceVector<RealF *> &Cmn)
+  {
+    gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
+                m,n,k,
+                alpha,
+                Amk,
+                Bkn,
+                beta,
+                Cmn);
+  }
+
+  void gemmBatched(GridBLASOperation_t OpA,
+                   GridBLASOperation_t OpB,
+                   int m,int n, int k,
+                   ComplexD alpha,
+                   deviceVector<ComplexD *> &Amk,  // pointer list to matrices
+                   deviceVector<ComplexD *> &Bkn,
+                   ComplexD beta,
+                   deviceVector<ComplexD *> &Cmn)
+  {
+    RealD t2=usecond();
+    int32_t batchCount = Amk.size();
+    assert(Bkn.size()==batchCount);
+    assert(Cmn.size()==batchCount);
+
+    int lda = m; // m x k column major
+    int ldb = k; // k x n column major
+    int ldc = m; // m x n column major
+    if(OpA!=GridBLAS_OP_N)
+      lda = k;
+    if(OpB!=GridBLAS_OP_N)
+      ldb = n;
+
+    static deviceVector<ComplexD> alpha_p(1);
+    static deviceVector<ComplexD> beta_p(1);
+    // can prestore the 1 and the zero on device
+    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
+    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
+    RealD t0=usecond();
+    //    std::cout << "ZgemmBatched mnk "<<m<<" "<<n<<" "<<k<<" "<<batchCount<<std::endl;
+#ifdef GRID_HIP
+    hipblasOperation_t hOpA;
+    hipblasOperation_t hOpB;
+    if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
+    if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
+    if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
+    if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
+    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
+    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
+    auto err = hipblasZgemmBatched(gridblasHandle,
+                                   hOpA,
+                                   hOpB,
+                                   m,n,k,
+                                   (hipblasDoubleComplex *) &alpha_p[0],
+                                   (hipblasDoubleComplex **)&Amk[0], lda,
+                                   (hipblasDoubleComplex **)&Bkn[0], ldb,
+                                   (hipblasDoubleComplex *) &beta_p[0],
+                                   (hipblasDoubleComplex **)&Cmn[0], ldc,
+                                   batchCount);
+    assert(err==HIPBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_CUDA
+    cublasOperation_t hOpA;
+    cublasOperation_t hOpB;
+    if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
+    if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
+    if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
+    if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
+    if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
+    if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
+    auto err = cublasZgemmBatched(gridblasHandle,
+                                  hOpA,
+                                  hOpB,
+                                  m,n,k,
+                                  (cuDoubleComplex *) &alpha_p[0],
+                                  (cuDoubleComplex **)&Amk[0], lda,
+                                  (cuDoubleComplex **)&Bkn[0], ldb,
+                                  (cuDoubleComplex *) &beta_p[0],
+                                  (cuDoubleComplex **)&Cmn[0], ldc,
+                                  batchCount);
+    assert(err==CUBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_SYCL
+    //MKL's cblas_zgemm_batch & OneAPI
+#warning "oneMKL implementation not built "
+#endif
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+    // Need a default/reference implementation; only the OP_N/OP_N case is handled
+    for (int p = 0; p < batchCount; ++p) {
+      for (int mm = 0; mm < m; ++mm) {
+        for (int nn = 0; nn < n; ++nn) {
+          ComplexD c_mn(0.0);
+          for (int kk = 0; kk < k; ++kk)
+            c_mn += Amk[p][mm + kk*lda] * Bkn[p][kk + nn*ldb];
+          Cmn[p][mm + nn*ldc] = alpha*c_mn + beta*Cmn[p][mm + nn*ldc];
+        }
+      }
+    }
+#endif
+    // synchronise();
+    RealD t1=usecond();
+    RealD flops = 8.0*m*n*k*batchCount;
+    RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount;
+    //    std::cout << "ZgemmBatched "<<flops/(t1-t0)/1.e3<<" GF/s "<<bytes/(t1-t0)/1.e3<<" GB/s"<<std::endl;
+  }
+
+  ///////////////////////////////////////////////////////////////////////////
+  // Single precision complex GEMM
+  ///////////////////////////////////////////////////////////////////////////
+
+  void gemmBatched(GridBLASOperation_t OpA,
+                   GridBLASOperation_t OpB,
+                   int m,int n, int k,
+                   ComplexF alpha,
+                   deviceVector<ComplexF *> &Amk,  // pointer list to matrices
+                   deviceVector<ComplexF *> &Bkn,
+                   ComplexF beta,
+                   deviceVector<ComplexF *> &Cmn)
+  {
+    RealD t2=usecond();
+    int32_t batchCount = Amk.size();
+
+    int lda = m; // m x k column major
+    int ldb = k; // k x n column major
+    int ldc = m; // m x n column major
+    if(OpA!=GridBLAS_OP_N)
+      lda = k;
+    if(OpB!=GridBLAS_OP_N)
+      ldb = n;
+    static deviceVector<ComplexF> alpha_p(1);
+    static deviceVector<ComplexF> beta_p(1);
+    // can prestore the 1 and the zero on device
+    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexF));
+    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF));
+    RealD t0=usecond();
+
+    assert(Bkn.size()==batchCount);
+    assert(Cmn.size()==batchCount);
+#ifdef GRID_HIP
+    hipblasOperation_t hOpA;
+    hipblasOperation_t hOpB;
+    if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
+    if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
+    if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
+    if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
+    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
+    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
+    auto err = hipblasCgemmBatched(gridblasHandle,
+                                   hOpA,
+                                   hOpB,
+                                   m,n,k,
+                                   (hipblasComplex *) &alpha_p[0],
+                                   (hipblasComplex **)&Amk[0], lda,
+                                   (hipblasComplex **)&Bkn[0], ldb,
+                                   (hipblasComplex *) &beta_p[0],
+                                   (hipblasComplex **)&Cmn[0], ldc,
+                                   batchCount);
+    assert(err==HIPBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_CUDA
+    cublasOperation_t hOpA;
+    cublasOperation_t hOpB;
+    if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
+    if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
+    if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
+    if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
+    if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
+    if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
+    auto err = cublasCgemmBatched(gridblasHandle,
+                                  hOpA,
+                                  hOpB,
+                                  m,n,k,
+                                  (cuComplex *) &alpha_p[0],
+                                  (cuComplex **)&Amk[0], lda,
+                                  (cuComplex **)&Bkn[0], ldb,
+                                  (cuComplex *) &beta_p[0],
+                                  (cuComplex **)&Cmn[0], ldc,
+                                  batchCount);
+    assert(err==CUBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_SYCL
+    //MKL's cblas_cgemm_batch & OneAPI
+#warning "oneMKL implementation not built "
+#endif
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+    // Need a default/reference implementation; only the OP_N/OP_N case is handled
+    for (int p = 0; p < batchCount; ++p) {
+      for (int mm = 0; mm < m; ++mm) {
+        for (int nn = 0; nn < n; ++nn) {
+          ComplexF c_mn(0.0);
+          for (int kk = 0; kk < k; ++kk)
+            c_mn += Amk[p][mm + kk*lda] * Bkn[p][kk + nn*ldb];
+          Cmn[p][mm + nn*ldc] = alpha*c_mn + beta*Cmn[p][mm + nn*ldc];
+        }
+      }
+    }
+#endif
+    RealD t1=usecond();
+    RealD flops = 8.0*m*n*k*batchCount;
+    RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n)*batchCount;
+  }
+
+  ///////////////////////////////////////////////////////////////////////////
+  // Single precision real GEMM
+  ///////////////////////////////////////////////////////////////////////////
+
+  void gemmBatched(GridBLASOperation_t OpA,
+                   GridBLASOperation_t OpB,
+                   int m,int n, int k,
+                   RealF alpha,
+                   deviceVector<RealF *> &Amk,  // pointer list to matrices
+                   deviceVector<RealF *> &Bkn,
+                   RealF beta,
+                   deviceVector<RealF *> &Cmn)
+  {
+    RealD t2=usecond();
+    int32_t batchCount = Amk.size();
+
+    int lda = m; // m x k column major
+    int ldb = k; // k x n column major
+    int ldc = m; // m x n column major
+    if(OpA!=GridBLAS_OP_N)
+      lda = k;
+    if(OpB!=GridBLAS_OP_N)
+      ldb = n;
+    static deviceVector<RealF> alpha_p(1);
+    static deviceVector<RealF> beta_p(1);
+    // can prestore the 1 and the zero on device
+    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealF));
+    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealF));
+    RealD t0=usecond();
+
+    assert(Bkn.size()==batchCount);
+    assert(Cmn.size()==batchCount);
+#ifdef GRID_HIP
+    hipblasOperation_t hOpA;
+    hipblasOperation_t hOpB;
+    if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
+    if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
+    if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
+    if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
+    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
+    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
+    auto err = hipblasSgemmBatched(gridblasHandle,
+                                   hOpA,
+                                   hOpB,
+                                   m,n,k,
+                                   (float *) &alpha_p[0],
+                                   (float **)&Amk[0], lda,
+                                   (float **)&Bkn[0], ldb,
+                                   (float *) &beta_p[0],
+                                   (float **)&Cmn[0], ldc,
+                                   batchCount);
+    assert(err==HIPBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_CUDA
+    cublasOperation_t hOpA;
+    cublasOperation_t hOpB;
+    if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
+    if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
+    if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
+    if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
+    if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
+    if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
+    auto err = cublasSgemmBatched(gridblasHandle,
+                                  hOpA,
+                                  hOpB,
+                                  m,n,k,
+                                  (float *) &alpha_p[0],
+                                  (float **)&Amk[0], lda,
+                                  (float **)&Bkn[0], ldb,
+                                  (float *) &beta_p[0],
+                                  (float **)&Cmn[0], ldc,
+                                  batchCount);
+    assert(err==CUBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_SYCL
+    //MKL's cblas_sgemm_batch & OneAPI
+#warning "oneMKL implementation not built "
+#endif
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+    // Need a default/reference implementation; only the OP_N/OP_N case is handled
+    for (int p = 0; p < batchCount; ++p) {
+      for (int mm = 0; mm < m; ++mm) {
+        for (int nn = 0; nn < n; ++nn) {
+          RealD c_mn(0.0);
+          for (int kk = 0; kk < k; ++kk)
+            c_mn += Amk[p][mm + kk*lda] * Bkn[p][kk + nn*ldb];
+          Cmn[p][mm + nn*ldc] = alpha*c_mn + beta*Cmn[p][mm + nn*ldc];
+        }
+      }
+    }
+#endif
+    RealD t1=usecond();
+    RealD flops = 2.0*m*n*k*batchCount;
+    RealD bytes = 1.0*sizeof(RealF)*(m*k+k*n+m*n)*batchCount;
+  }
+
+  ///////////////////////////////////////////////////////////////////////////
+  // Double precision real GEMM
+  ///////////////////////////////////////////////////////////////////////////
+
+  void gemmBatched(GridBLASOperation_t OpA,
+                   GridBLASOperation_t OpB,
+                   int m,int n, int k,
+                   RealD alpha,
+                   deviceVector<RealD *> &Amk,  // pointer list to matrices
+                   deviceVector<RealD *> &Bkn,
+                   RealD beta,
+                   deviceVector<RealD *> &Cmn)
+  {
+    RealD t2=usecond();
+    int32_t batchCount = Amk.size();
+
+    int lda = m; // m x k column major
+    int ldb = k; // k x n column major
+    int ldc = m; // m x n column major
+    if(OpA!=GridBLAS_OP_N)
+      lda = k;
+    if(OpB!=GridBLAS_OP_N)
+      ldb = n;
+
+    static deviceVector<RealD> alpha_p(1);
+    static deviceVector<RealD> beta_p(1);
+    // can prestore the 1 and the zero on device
+    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealD));
+    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealD));
+    RealD t0=usecond();
+
+    assert(Bkn.size()==batchCount);
+    assert(Cmn.size()==batchCount);
+#ifdef GRID_HIP
+    hipblasOperation_t hOpA;
+    hipblasOperation_t hOpB;
+    if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
+    if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
+    if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
+    if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
+    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
+    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
+    auto err = hipblasDgemmBatched(gridblasHandle,
+                                   hOpA,
+                                   hOpB,
+                                   m,n,k,
+                                   (double *) &alpha_p[0],
+                                   (double **)&Amk[0], lda,
+                                   (double **)&Bkn[0], ldb,
+                                   (double *) &beta_p[0],
+                                   (double **)&Cmn[0], ldc,
+                                   batchCount);
+    assert(err==HIPBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_CUDA
+    cublasOperation_t hOpA;
+    cublasOperation_t hOpB;
+    if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
+    if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
+    if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
+    if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
+    if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
+    if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
+    auto err = cublasDgemmBatched(gridblasHandle,
+                                  hOpA,
+                                  hOpB,
+                                  m,n,k,
+                                  (double *) &alpha_p[0],
+                                  (double **)&Amk[0], lda,
+                                  (double **)&Bkn[0], ldb,
+                                  (double *) &beta_p[0],
+                                  (double **)&Cmn[0], ldc,
+                                  batchCount);
+    assert(err==CUBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_SYCL
+    /*
+      int64_t m64=m;
+      int64_t n64=n;
+      int64_t k64=k;
+      int64_t batchCount64=batchCount;
+      oneapi::mkl::blas::column_major::gemm_batch(*theGridAccelerator,
+                                                  onemkl::transpose::N,
+                                                  onemkl::transpose::N,
+                                                  &m64,&n64,&k64,
+                                                  (double *) &alpha_p[0],
+                                                  (double **)&Amk[0], lda,
+                                                  (double **)&Bkn[0], ldb,
+                                                  (double *) &beta_p[0],
+                                                  (double **)&Cmn[0], ldc,
+                                                  1,&batchCount64);
+    */
+    //MKL's cblas_dgemm_batch & OneAPI
+#warning "oneMKL implementation not built "
+#endif
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+    // Need a default/reference implementation; only the OP_N/OP_N case is handled
+    for (int p = 0; p < batchCount; ++p) {
+      for (int mm = 0; mm < m; ++mm) {
+        for (int nn = 0; nn < n; ++nn) {
+          RealD c_mn(0.0);
+          for (int kk = 0; kk < k; ++kk)
+            c_mn += Amk[p][mm + kk*lda] * Bkn[p][kk + nn*ldb];
+          Cmn[p][mm + nn*ldc] = alpha*c_mn + beta*Cmn[p][mm + nn*ldc];
+        }
+      }
+    }
+#endif
+    RealD t1=usecond();
+    RealD flops = 2.0*m*n*k*batchCount;
+    RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////////////////////
+  // Strided case used by benchmark, but generally unused in Grid
+  // Keep a code example in double complex, but don't generate the single and real variants for now
+  ////////////////////////////////////////////////////////////////////////////////////////////////
+
+  void gemmStridedBatched(int m,int n, int k,
+                          ComplexD alpha,
+                          ComplexD* Amk,  // dense batch of matrices, stride sda apart
+                          ComplexD* Bkn,
+                          ComplexD beta,
+                          ComplexD* Cmn,
+                          int batchCount)
+  {
+    // Use C-row major storage, so transpose calls
+    int lda = m; // m x k column major
+    int ldb = k; // k x n column major
+    int ldc = m; // m x n column major
+    int sda = m*k;
+    int sdb = k*n;
+    int sdc = m*n;
+    deviceVector<ComplexD> alpha_p(1);
+    deviceVector<ComplexD> beta_p(1);
+    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
+    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
+    std::cout << "blasZgemmStridedBatched mnk "<<m<<" "<<n<<" "<<k<<" "<<batchCount<<std::endl;
+#ifdef GRID_HIP
+    auto err = hipblasZgemmStridedBatched(gridblasHandle,
+                                          HIPBLAS_OP_N,
+                                          HIPBLAS_OP_N,
+                                          m,n,k,
+                                          (hipblasDoubleComplex *) &alpha_p[0],
+                                          (hipblasDoubleComplex *) Amk, lda, sda,
+                                          (hipblasDoubleComplex *) Bkn, ldb, sdb,
+                                          (hipblasDoubleComplex *) &beta_p[0],
+                                          (hipblasDoubleComplex *) Cmn, ldc, sdc,
+                                          batchCount);
+    assert(err==HIPBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_CUDA
+    auto err = cublasZgemmStridedBatched(gridblasHandle,
+                                         CUBLAS_OP_N,
+                                         CUBLAS_OP_N,
+                                         m,n,k,
+                                         (cuDoubleComplex *) &alpha_p[0],
+                                         (cuDoubleComplex *) Amk, lda, sda,
+                                         (cuDoubleComplex *) Bkn, ldb, sdb,
+                                         (cuDoubleComplex *) &beta_p[0],
+                                         (cuDoubleComplex *) Cmn, ldc, sdc,
+                                         batchCount);
+    assert(err==CUBLAS_STATUS_SUCCESS);
+#endif
+  }
+
+  void benchmark(int M, int N, int K, int BATCH)
+  {
+    int32_t N_A = M*K*BATCH;
+    int32_t N_B = K*N*BATCH;
+    int32_t N_C = M*N*BATCH;
+    deviceVector<ComplexD> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD));
+    deviceVector<ComplexD> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD));
+    deviceVector<ComplexD> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD));
+    ComplexD alpha(1.0);
+    ComplexD beta (1.0);
+    for(int i=0;i<10;i++){
+      RealD t0 = usecond();
+      for(int s=0;s
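
Usage note (not part of the diff): below is a minimal host-side sketch of how the pointer-list gemmBatched interface added in this header might be driven. The function name exampleBatchedMultiply and the dense packing layout are illustrative assumptions, not part of Grid; the deviceVector and acceleratorCopyToDevice usage simply mirrors what the header itself does.

// Illustrative sketch only: multiply BATCH independent (m x k)*(k x n) ComplexD
// matrices, column major, using the batched interface of this header.
#include <Grid/Grid.h>
using namespace Grid;

void exampleBatchedMultiply(int m,int n,int k,int BATCH)
{
  // Dense device storage for all matrices in the batch (assumed layout)
  deviceVector<ComplexD>   Abuf(m*k*BATCH), Bbuf(k*n*BATCH), Cbuf(m*n*BATCH);
  // Pointer lists expected by gemmBatched: one pointer per matrix in the batch
  deviceVector<ComplexD *> Aptr(BATCH), Bptr(BATCH), Cptr(BATCH);
  for(int b=0;b<BATCH;b++){
    ComplexD *ap = &Abuf[b*m*k];
    ComplexD *bp = &Bbuf[b*k*n];
    ComplexD *cp = &Cbuf[b*m*n];
    // Store the device pointers in the device-resident pointer lists,
    // the same way the header stages alpha and beta
    acceleratorCopyToDevice((void *)&ap,(void *)&Aptr[b],sizeof(ComplexD *));
    acceleratorCopyToDevice((void *)&bp,(void *)&Bptr[b],sizeof(ComplexD *));
    acceleratorCopyToDevice((void *)&cp,(void *)&Cptr[b],sizeof(ComplexD *));
  }
  GridBLAS::Init();   // create the cuBLAS/hipBLAS handle once
  GridBLAS BLAS;
  // C_b = 1.0 * A_b * B_b + 0.0 * C_b for every matrix b in the batch
  BLAS.gemmBatched(m,n,k, ComplexD(1.0), Aptr, Bptr, ComplexD(0.0), Cptr);
}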