From 557fa483ffbfac8eabd4011cbd2da0dabdadfe79 Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Tue, 20 Aug 2024 16:18:43 +0000
Subject: [PATCH] Blas benchmark committed stand alone

---
 BLAS_benchmark/BatchBlasBench.cc | 1052 ++++++++++++++++++++++++++++++
 BLAS_benchmark/compile-command   |    2 +
 2 files changed, 1054 insertions(+)
 create mode 100644 BLAS_benchmark/BatchBlasBench.cc
 create mode 100644 BLAS_benchmark/compile-command

diff --git a/BLAS_benchmark/BatchBlasBench.cc b/BLAS_benchmark/BatchBlasBench.cc
new file mode 100644
index 00000000..325fed2e
--- /dev/null
+++ b/BLAS_benchmark/BatchBlasBench.cc
@@ -0,0 +1,1052 @@
+#include <cassert>
+#include <complex>
+#include <memory>
+#include <vector>
+#include <array>
+#include <string>
+#include <algorithm>
+#include <iostream>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <sys/time.h>
+#include <sys/types.h>
+
+#define GRID_SYCL
+#undef  GRID_HIP
+#undef  GRID_CUDA
+
+#ifdef GRID_HIP
+#include <hipblas/hipblas.h>
+#endif
+#ifdef GRID_CUDA
+#include <cublas_v2.h>
+#endif
+#ifdef GRID_SYCL
+#include <oneapi/mkl.hpp>
+#endif
+
+#ifdef GRID_SYCL
+#include <sycl/CL/sycl.hpp>
+#include <sycl/usm.hpp>
+cl::sycl::queue *theAccelerator;
+void acceleratorInit(void)
+{
+  int nDevices = 1;
+  cl::sycl::gpu_selector selector;
+  cl::sycl::device selectedDevice { selector };
+  theAccelerator = new sycl::queue (selectedDevice);
+  auto name = theAccelerator->get_device().get_info<cl::sycl::info::device::name>();
+  printf("AcceleratorSyclInit: Selected device is %s\n",name.c_str()); fflush(stdout);
+}
+inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theAccelerator);};
+inline void acceleratorFreeDevice(void *ptr){free(ptr,*theAccelerator);};
+inline void acceleratorFreeDevice(void *ptr,size_t bytes){free(ptr,*theAccelerator);};
+inline void acceleratorMemSet(void *base,int value,size_t bytes) { theAccelerator->memset(base,value,bytes); theAccelerator->wait();}
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theAccelerator->memcpy(to,from,bytes); theAccelerator->wait();}
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theAccelerator->memcpy(to,from,bytes); theAccelerator->wait();}
+template<class T> void acceleratorPut(T& dev,T&host)
+{
+  acceleratorCopyToDevice(&host,&dev,sizeof(T));
+}
+template<class T> T acceleratorGet(T& dev)
+{
+  T host;
+  acceleratorCopyFromDevice(&dev,&host,sizeof(T));
+  return host;
+}
+#define accelerator_barrier(dummy) { theAccelerator->wait(); }
+#endif
+
+
+/**************************************************************
+ * Allocator
+ **************************************************************
+ */
+template<typename _Tp>
+class devAllocator {
+public:
+  typedef std::size_t     size_type;
+  typedef std::ptrdiff_t  difference_type;
+  typedef _Tp*       pointer;
+  typedef const _Tp* const_pointer;
+  typedef _Tp&       reference;
+  typedef const _Tp& const_reference;
+  typedef _Tp        value_type;
+
+  template<typename _Tp1>  struct rebind { typedef devAllocator<_Tp1> other; };
+  devAllocator() throw() { }
+  devAllocator(const devAllocator&) throw() { }
+  template<typename _Tp1> devAllocator(const devAllocator<_Tp1>&) throw() { }
+  ~devAllocator() throw() { }
+  pointer address(reference __x) const { return &__x; }
+  size_type  max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
+
+  pointer allocate(size_type __n, const void* _p= 0)
+  {
+    size_type bytes = __n*sizeof(_Tp);
+    _Tp *ptr = (_Tp*) acceleratorAllocDevice(bytes);
+    if ( (_Tp*)ptr == (_Tp *) NULL ) {
+      printf("Grid Device Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
+    }
+    assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
+    return ptr;
+  }
+
+  void deallocate(pointer __p, size_type __n)
+  {
+    size_type bytes = __n * sizeof(_Tp);
+    acceleratorFreeDevice((void *)__p,bytes);
+  }
+  void
construct(pointer __p, const _Tp& __val) { }; + void construct(pointer __p) { }; + void destroy(pointer __p) { }; +}; + +template using deviceVector = std::vector >; + +/************************************************************** + * Microsecond timer + ************************************************************** + */ +inline double usecond(void) { + struct timeval tv; + gettimeofday(&tv,NULL); + return 1.0e6*tv.tv_sec + 1.0*tv.tv_usec; +} + + +typedef float RealF; +typedef double RealD; +typedef std::complex ComplexF; +typedef std::complex ComplexD; + +/////////////////////////////////////////////////////////////////////// +// Need to rearrange lattice data to be in the right format for a +// batched multiply. Might as well make these static, dense packed +/////////////////////////////////////////////////////////////////////// + +#ifdef GRID_HIP + typedef hipblasHandle_t gridblasHandle_t; +#endif +#ifdef GRID_CUDA + typedef cublasHandle_t gridblasHandle_t; +#endif +#ifdef GRID_SYCL + typedef cl::sycl::queue *gridblasHandle_t; +#endif +#ifdef GRID_ONE_MKL + typedef cl::sycl::queue *gridblasHandle_t; +#endif +#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL) + typedef int32_t gridblasHandle_t; +#endif + +enum GridBLASOperation_t { GridBLAS_OP_N, GridBLAS_OP_T, GridBLAS_OP_C } ; + +class GridBLAS { +public: + + + static gridblasHandle_t gridblasHandle; + static int gridblasInit; + + static void Init(void) + { + if ( ! gridblasInit ) { +#ifdef GRID_CUDA + std::cout << "cublasCreate"<wait(); +#endif + } + + void gemmBatched(int m,int n, int k, + ComplexD alpha, + deviceVector &Amk, // pointer list to matrices + deviceVector &Bkn, + ComplexD beta, + deviceVector &Cmn) + { + gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N, + m,n,k, + alpha, + Amk, + Bkn, + beta, + Cmn); + } + void gemmBatched(int m,int n, int k, + ComplexF alpha, + deviceVector &Amk, // pointer list to matrices + deviceVector &Bkn, + ComplexF beta, + deviceVector &Cmn) + { + gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N, + m,n,k, + alpha, + Amk, + Bkn, + beta, + Cmn); + } + void gemmBatched(int m,int n, int k, + RealD alpha, + deviceVector &Amk, // pointer list to matrices + deviceVector &Bkn, + RealD beta, + deviceVector &Cmn) + { + gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N, + m,n,k, + alpha, + Amk, + Bkn, + beta, + Cmn); + } + void gemmBatched(int m,int n, int k, + RealF alpha, + deviceVector &Amk, // pointer list to matrices + deviceVector &Bkn, + RealF beta, + deviceVector &Cmn) + { + gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N, + m,n,k, + alpha, + Amk, + Bkn, + beta, + Cmn); + } + + void gemmBatched(GridBLASOperation_t OpA, + GridBLASOperation_t OpB, + int m,int n, int k, + ComplexD alpha, + deviceVector &Amk, // pointer list to matrices + deviceVector &Bkn, + ComplexD beta, + deviceVector &Cmn) + { + RealD t2=usecond(); + int32_t batchCount = Amk.size(); + assert(Bkn.size()==batchCount); + assert(Cmn.size()==batchCount); + + assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose + assert(OpB!=GridBLAS_OP_T); + + int lda = m; // m x k column major + int ldb = k; // k x n column major + int ldc = m; // m x b column major + if(OpA!=GridBLAS_OP_N) + lda = k; + if(OpB!=GridBLAS_OP_N) + ldb = n; + + static deviceVector alpha_p(1); + static deviceVector beta_p(1); + // can prestore the 1 and the zero on device + acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD)); + acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD)); + RealD t0=usecond(); + 
// std::cout << "ZgemmBatched mnk "<()); + synchronise(); +#if 0 + // This code was used to check the mat mul on Sunspot/OneMKL + std::cerr << " Called SYCL batched ZGEMM OpA "<< OpA << " OpB "< A(m*k); // pointer list to matrices + std::vector B(k*n); + std::vector C(m*n); + // int sda = lda*k; + // int sdb = ldb*k; + // int sdc = ldc*n; + std::cerr << " Checking the GEMM results "< eAmk(Amk[p],m,k); + Eigen::Map eBkn(Bkn[p],k,n); + Eigen::Map eCmn(Cmn[p],m,n); + eCmn = beta * eCmn + alpha * eAmk * eBkn ; + }); + } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) { + thread_for (p, batchCount, { + Eigen::Map eAmk(Amk[p],k,m); + Eigen::Map eBkn(Bkn[p],k,n); + Eigen::Map eCmn(Cmn[p],m,n); + eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ; + }); + } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) { + thread_for (p, batchCount, { + Eigen::Map eAmk(Amk[p],m,k); + Eigen::Map eBkn(Bkn[p],n,k); + Eigen::Map eCmn(Cmn[p],m,n); + eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ; + }); + } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) { + thread_for (p, batchCount, { + Eigen::Map eAmk(Amk[p],k,m); + Eigen::Map eBkn(Bkn[p],n,k); + Eigen::Map eCmn(Cmn[p],m,n); + eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ; + } ); + } else { + assert(0); + } +#endif + RealD t1=usecond(); + RealD flops = 8.0*m*n*k*batchCount; + RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount; + // std::cout < &Amk, // pointer list to matrices + deviceVector &Bkn, + ComplexF beta, + deviceVector &Cmn) + { + RealD t2=usecond(); + int32_t batchCount = Amk.size(); + + assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose + assert(OpB!=GridBLAS_OP_T); + + int lda = m; // m x k column major + int ldb = k; // k x n column major + int ldc = m; // m x b column major + if(OpA!=GridBLAS_OP_N) + lda = k; + if(OpB!=GridBLAS_OP_N) + ldb = n; + static deviceVector alpha_p(1); + static deviceVector beta_p(1); + // can prestore the 1 and the zero on device + acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexF)); + acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF)); + RealD t0=usecond(); + + assert(Bkn.size()==batchCount); + assert(Cmn.size()==batchCount); +#ifdef GRID_HIP + hipblasOperation_t hOpA; + hipblasOperation_t hOpB; + if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N; + if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T; + if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C; + if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N; + if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T; + if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C; + auto err = hipblasCgemmBatched(gridblasHandle, + hOpA, + hOpB, + m,n,k, + (hipblasComplex *) &alpha_p[0], + (hipblasComplex **)&Amk[0], lda, + (hipblasComplex **)&Bkn[0], ldb, + (hipblasComplex *) &beta_p[0], + (hipblasComplex **)&Cmn[0], ldc, + batchCount); + + assert(err==HIPBLAS_STATUS_SUCCESS); +#endif +#ifdef GRID_CUDA + cublasOperation_t hOpA; + cublasOperation_t hOpB; + if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N; + if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T; + if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C; + if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N; + if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T; + if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C; + auto err = cublasCgemmBatched(gridblasHandle, + hOpA, + hOpB, + m,n,k, + (cuComplex *) &alpha_p[0], + (cuComplex **)&Amk[0], lda, + (cuComplex **)&Bkn[0], ldb, + (cuComplex *) &beta_p[0], + (cuComplex **)&Cmn[0], ldc, + 
batchCount); + assert(err==CUBLAS_STATUS_SUCCESS); +#endif +#ifdef GRID_SYCL + int64_t m64=m; + int64_t n64=n; + int64_t k64=k; + int64_t lda64=lda; + int64_t ldb64=ldb; + int64_t ldc64=ldc; + int64_t batchCount64=batchCount; + + oneapi::mkl::transpose iOpA; + oneapi::mkl::transpose iOpB; + + if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N; + if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T; + if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C; + if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N; + if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T; + if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C; + + oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle, + &iOpA, + &iOpB, + &m64,&n64,&k64, + (ComplexF *) &alpha_p[0], + (const ComplexF **)&Amk[0], (const int64_t *)&lda64, + (const ComplexF **)&Bkn[0], (const int64_t *)&ldb64, + (ComplexF *) &beta_p[0], + (ComplexF **)&Cmn[0], (const int64_t *)&ldc64, + (int64_t)1,&batchCount64,std::vector()); + synchronise(); +#endif +#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) + // Need a default/reference implementation; use Eigen + if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) { + thread_for (p, batchCount, { + Eigen::Map eAmk(Amk[p],m,k); + Eigen::Map eBkn(Bkn[p],k,n); + Eigen::Map eCmn(Cmn[p],m,n); + eCmn = beta * eCmn + alpha * eAmk * eBkn ; + }); + } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) { + thread_for (p, batchCount, { + Eigen::Map eAmk(Amk[p],k,m); + Eigen::Map eBkn(Bkn[p],k,n); + Eigen::Map eCmn(Cmn[p],m,n); + eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ; + }); + } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) { + thread_for (p, batchCount, { + Eigen::Map eAmk(Amk[p],m,k); + Eigen::Map eBkn(Bkn[p],n,k); + Eigen::Map eCmn(Cmn[p],m,n); + eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ; + }); + } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) { + thread_for (p, batchCount, { + Eigen::Map eAmk(Amk[p],k,m); + Eigen::Map eBkn(Bkn[p],n,k); + Eigen::Map eCmn(Cmn[p],m,n); + eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ; + } ); + } else { + assert(0); + } +#endif + RealD t1=usecond(); + RealD flops = 8.0*m*n*k*batchCount; + RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n)*batchCount; + } + + /////////////////////////////////////////////////////////////////////////// + // Single precision real GEMM + /////////////////////////////////////////////////////////////////////////// + + void gemmBatched(GridBLASOperation_t OpA, + GridBLASOperation_t OpB, + int m,int n, int k, + RealF alpha, + deviceVector &Amk, // pointer list to matrices + deviceVector &Bkn, + RealF beta, + deviceVector &Cmn) + { + RealD t2=usecond(); + int32_t batchCount = Amk.size(); + + assert(OpA!=GridBLAS_OP_C); // Real case no conjugate + assert(OpB!=GridBLAS_OP_C); + + int lda = m; // m x k column major + int ldb = k; // k x n column major + int ldc = m; // m x b column major + if(OpA!=GridBLAS_OP_N) + lda = k; + if(OpB!=GridBLAS_OP_N) + ldb = n; + static deviceVector alpha_p(1); + static deviceVector beta_p(1); + // can prestore the 1 and the zero on device + acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealF)); + acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealF)); + RealD t0=usecond(); + + assert(Bkn.size()==batchCount); + assert(Cmn.size()==batchCount); +#ifdef GRID_HIP + hipblasOperation_t hOpA; + hipblasOperation_t hOpB; + if ( OpA == GridBLAS_OP_N ) 
hOpA = HIPBLAS_OP_N; + if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T; + if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C; + if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N; + if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T; + if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C; + auto err = hipblasSgemmBatched(gridblasHandle, + hOpA, + hOpB, + m,n,k, + (float *) &alpha_p[0], + (float **)&Amk[0], lda, + (float **)&Bkn[0], ldb, + (float *) &beta_p[0], + (float **)&Cmn[0], ldc, + batchCount); + assert(err==HIPBLAS_STATUS_SUCCESS); +#endif +#ifdef GRID_CUDA + cublasOperation_t hOpA; + cublasOperation_t hOpB; + if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N; + if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T; + if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C; + if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N; + if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T; + if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C; + auto err = cublasSgemmBatched(gridblasHandle, + hOpA, + hOpB, + m,n,k, + (float *) &alpha_p[0], + (float **)&Amk[0], lda, + (float **)&Bkn[0], ldb, + (float *) &beta_p[0], + (float **)&Cmn[0], ldc, + batchCount); + assert(err==CUBLAS_STATUS_SUCCESS); +#endif +#ifdef GRID_SYCL + int64_t m64=m; + int64_t n64=n; + int64_t k64=k; + int64_t lda64=lda; + int64_t ldb64=ldb; + int64_t ldc64=ldc; + int64_t batchCount64=batchCount; + + oneapi::mkl::transpose iOpA; + oneapi::mkl::transpose iOpB; + + if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N; + if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T; + if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C; + if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N; + if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T; + if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C; + + oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle, + &iOpA, + &iOpB, + &m64,&n64,&k64, + (float *) &alpha_p[0], + (const float **)&Amk[0], (const int64_t *)&lda64, + (const float **)&Bkn[0], (const int64_t *)&ldb64, + (float *) &beta_p[0], + (float **)&Cmn[0], (const int64_t *)&ldc64, + (int64_t)1,&batchCount64,std::vector()); + synchronise(); +#endif +#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) + // Need a default/reference implementation; use Eigen + if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) { + thread_for (p, batchCount, { + Eigen::Map eAmk(Amk[p],m,k); + Eigen::Map eBkn(Bkn[p],k,n); + Eigen::Map eCmn(Cmn[p],m,n); + eCmn = beta * eCmn + alpha * eAmk * eBkn ; + }); + } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) { + thread_for (p, batchCount, { + Eigen::Map eAmk(Amk[p],k,m); + Eigen::Map eBkn(Bkn[p],k,n); + Eigen::Map eCmn(Cmn[p],m,n); + eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ; + }); + } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) { + thread_for (p, batchCount, { + Eigen::Map eAmk(Amk[p],m,k); + Eigen::Map eBkn(Bkn[p],n,k); + Eigen::Map eCmn(Cmn[p],m,n); + eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ; + }); + } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) { + thread_for (p, batchCount, { + Eigen::Map eAmk(Amk[p],k,m); + Eigen::Map eBkn(Bkn[p],n,k); + Eigen::Map eCmn(Cmn[p],m,n); + eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ; + } ); + } else { + assert(0); + } +#endif + RealD t1=usecond(); + RealD flops = 2.0*m*n*k*batchCount; + RealD bytes = 1.0*sizeof(RealF)*(m*k+k*n+m*n)*batchCount; + } + + + /////////////////////////////////////////////////////////////////////////// + // Double 
precision real GEMM + /////////////////////////////////////////////////////////////////////////// + void gemmBatched(GridBLASOperation_t OpA, + GridBLASOperation_t OpB, + int m,int n, int k, + RealD alpha, + deviceVector &Amk, // pointer list to matrices + deviceVector &Bkn, + RealD beta, + deviceVector &Cmn) + { + RealD t2=usecond(); + int32_t batchCount = Amk.size(); + + assert(OpA!=GridBLAS_OP_C); // Real case no conjugate + assert(OpB!=GridBLAS_OP_C); + + int lda = m; // m x k column major + int ldb = k; // k x n column major + int ldc = m; // m x b column major + if(OpA!=GridBLAS_OP_N) + lda = k; + if(OpB!=GridBLAS_OP_N) + ldb = n; + + static deviceVector alpha_p(1); + static deviceVector beta_p(1); + // can prestore the 1 and the zero on device + acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealD)); + acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealD)); + RealD t0=usecond(); + + assert(Bkn.size()==batchCount); + assert(Cmn.size()==batchCount); +#ifdef GRID_HIP + hipblasOperation_t hOpA; + hipblasOperation_t hOpB; + if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N; + if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T; + if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C; + if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N; + if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T; + if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C; + auto err = hipblasDgemmBatched(gridblasHandle, + HIPBLAS_OP_N, + HIPBLAS_OP_N, + m,n,k, + (double *) &alpha_p[0], + (double **)&Amk[0], lda, + (double **)&Bkn[0], ldb, + (double *) &beta_p[0], + (double **)&Cmn[0], ldc, + batchCount); + assert(err==HIPBLAS_STATUS_SUCCESS); +#endif +#ifdef GRID_CUDA + cublasOperation_t hOpA; + cublasOperation_t hOpB; + if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N; + if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T; + if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C; + if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N; + if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T; + if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C; + auto err = cublasDgemmBatched(gridblasHandle, + hOpA, + hOpB, + m,n,k, + (double *) &alpha_p[0], + (double **)&Amk[0], lda, + (double **)&Bkn[0], ldb, + (double *) &beta_p[0], + (double **)&Cmn[0], ldc, + batchCount); + assert(err==CUBLAS_STATUS_SUCCESS); +#endif +#ifdef GRID_SYCL + int64_t m64=m; + int64_t n64=n; + int64_t k64=k; + int64_t lda64=lda; + int64_t ldb64=ldb; + int64_t ldc64=ldc; + int64_t batchCount64=batchCount; + + oneapi::mkl::transpose iOpA; + oneapi::mkl::transpose iOpB; + + if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N; + if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T; + if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C; + if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N; + if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T; + if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C; + + oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle, + &iOpA, + &iOpB, + &m64,&n64,&k64, + (double *) &alpha_p[0], + (const double **)&Amk[0], (const int64_t *)&lda64, + (const double **)&Bkn[0], (const int64_t *)&ldb64, + (double *) &beta_p[0], + (double **)&Cmn[0], (const int64_t *)&ldc64, + (int64_t)1,&batchCount64,std::vector()); + synchronise(); +#endif +#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) + // Need a default/reference implementation; use Eigen + if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) { + thread_for (p, batchCount, { + Eigen::Map eAmk(Amk[p],m,k); + Eigen::Map 
eBkn(Bkn[p],k,n); + Eigen::Map eCmn(Cmn[p],m,n); + eCmn = beta * eCmn + alpha * eAmk * eBkn ; + }); + } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) { + thread_for (p, batchCount, { + Eigen::Map eAmk(Amk[p],k,m); + Eigen::Map eBkn(Bkn[p],k,n); + Eigen::Map eCmn(Cmn[p],m,n); + eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ; + }); + } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) { + thread_for (p, batchCount, { + Eigen::Map eAmk(Amk[p],m,k); + Eigen::Map eBkn(Bkn[p],n,k); + Eigen::Map eCmn(Cmn[p],m,n); + eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ; + }); + } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) { + thread_for (p, batchCount, { + Eigen::Map eAmk(Amk[p],k,m); + Eigen::Map eBkn(Bkn[p],n,k); + Eigen::Map eCmn(Cmn[p],m,n); + eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ; + }); + } else { + assert(0); + } +#endif + RealD t1=usecond(); + RealD flops = 2.0*m*n*k*batchCount; + RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount; + } + + template + double benchmark(int M, int N, int K, int BATCH) + { + int32_t N_A = M*K*BATCH; + int32_t N_B = K*N*BATCH; + int32_t N_C = M*N*BATCH; + deviceVector A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(CComplex)); + deviceVector B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(CComplex)); + deviceVector C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(CComplex)); + CComplex alpha(1.0); + CComplex beta (1.0); + RealD flops = 8.0*M*N*K*BATCH; + int ncall=10; + deviceVector As(BATCH); + deviceVector Bs(BATCH); + deviceVector Cs(BATCH); + for(int b = 0 ; b < BATCH;b++) { + CComplex *ptr; + ptr = &A[b*M*K]; acceleratorPut(As[b],ptr); + ptr = &B[b*K*N]; acceleratorPut(Bs[b],ptr); + ptr = &C[b*M*N]; acceleratorPut(Cs[b],ptr); + } + + gemmBatched(M,N,K, + alpha, + As, // m x k + Bs, // k x n + beta, + Cs); + synchronise(); + + RealD t0 = usecond(); + for(int i=0;i +static void BLAS(void) +{ + //int nbasis, int nrhs, int coarseVol + int basis[] = { 16,32,64 }; + int rhs[] = { 8,12,16 }; + int vol = 8*8*8*8; + int blk = 4*4*4*4; + + GridBLAS blas; + + int fpbits = sizeof(CComplex)*4; + std::cout<< "=================================================================================="<(M,N,K,BATCH); + + fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p); + + std::cout<< M<<"\t\t"<(M,N,K,BATCH); + + fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p); + std::cout<< M<<"\t\t"<(M,N,K,BATCH); + + fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p); + std::cout<< M<<"\t\t"<(); + BLAS(); + fclose(FP); +} diff --git a/BLAS_benchmark/compile-command b/BLAS_benchmark/compile-command new file mode 100644 index 00000000..254d603e --- /dev/null +++ b/BLAS_benchmark/compile-command @@ -0,0 +1,2 @@ + +mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench \ No newline at end of file
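The compile-command file covers only the SYCL/oneMKL build selected by the #define GRID_SYCL block at the top of BatchBlasBench.cc. Exercising the cuBLAS or hipBLAS paths requires editing that block (define GRID_CUDA or GRID_HIP and undef the other two) and linking the matching library. The lines below are a plausible sketch of equivalent build commands, not part of this patch; the exact compilers, standard flags, and library names depend on the local CUDA/ROCm toolchain.

nvcc -x cu -O3 -std=c++17 BatchBlasBench.cc -lcublas -o BatchBlasBench    # hypothetical CUDA build (GRID_CUDA defined)
hipcc -O3 -std=c++17 BatchBlasBench.cc -lhipblas -o BatchBlasBench       # hypothetical HIP build (GRID_HIP defined)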
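The benchmark() routine above also documents how the batched interface is meant to be driven: dense, packed, column-major matrices in device memory plus a device-resident list of per-batch pointers. The fragment below is a minimal sketch of that calling sequence for the double-complex overload, reusing only names defined in BatchBlasBench.cc (deviceVector, acceleratorMemSet, acceleratorPut, GridBLAS); the function name exampleBatchedZgemm and the tiny problem size are illustrative, and acceleratorInit() is assumed to have been called beforehand, as main() must do before any device allocation.

// Sketch: a batch of two M x K times K x N double-complex GEMMs, C = A*B.
void exampleBatchedZgemm(void)
{
  const int M=4, N=4, K=4, BATCH=2;
  deviceVector<ComplexD> A(M*K*BATCH);   // dense packed, column major
  deviceVector<ComplexD> B(K*N*BATCH);
  deviceVector<ComplexD> C(M*N*BATCH);
  acceleratorMemSet(&A[0],0,A.size()*sizeof(ComplexD)); // fill with real data in practice
  acceleratorMemSet(&B[0],0,B.size()*sizeof(ComplexD));
  acceleratorMemSet(&C[0],0,C.size()*sizeof(ComplexD));

  // Device-resident pointer lists, one entry per batch element
  deviceVector<ComplexD *> As(BATCH), Bs(BATCH), Cs(BATCH);
  for(int b=0;b<BATCH;b++){
    ComplexD *ptr;
    ptr = &A[b*M*K]; acceleratorPut(As[b],ptr);
    ptr = &B[b*K*N]; acceleratorPut(Bs[b],ptr);
    ptr = &C[b*M*N]; acceleratorPut(Cs[b],ptr);
  }

  GridBLAS blas;
  blas.gemmBatched(M,N,K, ComplexD(1.0,0.0), As, Bs, ComplexD(0.0,0.0), Cs); // OP_N, OP_N
  blas.synchronise();
}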