diff --git a/Grid/algorithms/multigrid/BatchedBlas.h b/Grid/algorithms/multigrid/BatchedBlas.h
deleted file mode 100644
index 82da2d5d..00000000
--- a/Grid/algorithms/multigrid/BatchedBlas.h
+++ /dev/null
@@ -1,685 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: BatchedBlas.h
-
-    Copyright (C) 2023
-
-Author: Peter Boyle
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/* END LEGAL */
-#pragma once
-
-#ifdef GRID_HIP
-#include <hipblas/hipblas.h>
-#endif
-#ifdef GRID_CUDA
-#include <cublas_v2.h>
-#endif
-#ifdef GRID_SYCL
-#error // need oneMKL version
-#endif
-
-///////////////////////////////////////////////////////////////////////
-// Need to rearrange lattice data to be in the right format for a
-// batched multiply. Might as well make these static, dense packed
-///////////////////////////////////////////////////////////////////////
-NAMESPACE_BEGIN(Grid);
-#ifdef GRID_HIP
-  typedef hipblasHandle_t gridblasHandle_t;
-#endif
-#ifdef GRID_CUDA
-  typedef cublasHandle_t gridblasHandle_t;
-#endif
-#ifdef GRID_SYCL
-  typedef int32_t gridblasHandle_t;
-#endif
-#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
-  typedef int32_t gridblasHandle_t;
-#endif
-
-enum GridBLASOperation_t { GridBLAS_OP_N, GridBLAS_OP_T, GridBLAS_OP_C } ;
-
-class GridBLAS {
-public:
-
-  static gridblasHandle_t gridblasHandle;
-  static int              gridblasInit;
-
-  static void Init(void)
-  {
-    if ( ! gridblasInit ) {
-#ifdef GRID_CUDA
-      std::cout << "cublasCreate"<<std::endl;
-      cublasCreate(&gridblasHandle);
-#endif
-#ifdef GRID_HIP
-      std::cout << "hipblasCreate"<<std::endl;
-      hipblasCreate(&gridblasHandle);
-#endif
-      gridblasInit=1;
-    }
-  }
-
-  void synchronise(void)
-  {
-#ifdef GRID_HIP
-    auto err = hipDeviceSynchronize();
-    assert(err==hipSuccess);
-#endif
-#ifdef GRID_CUDA
-    auto err = cudaDeviceSynchronize();
-    assert(err==cudaSuccess);
-#endif
-  }
-
-  void gemmBatched(int m,int n, int k,
-                   ComplexD alpha,
-                   deviceVector<ComplexD*> &Amk,  // pointer list to matrices
-                   deviceVector<ComplexD*> &Bkn,
-                   ComplexD beta,
-                   deviceVector<ComplexD*> &Cmn)
-  {
-    gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
-                m,n,k,
-                alpha,
-                Amk,
-                Bkn,
-                beta,
-                Cmn);
-  }
-  void gemmBatched(int m,int n, int k,
-                   ComplexF alpha,
-                   deviceVector<ComplexF*> &Amk,  // pointer list to matrices
-                   deviceVector<ComplexF*> &Bkn,
-                   ComplexF beta,
-                   deviceVector<ComplexF*> &Cmn)
-  {
-    gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
-                m,n,k,
-                alpha,
-                Amk,
-                Bkn,
-                beta,
-                Cmn);
-  }
-  void gemmBatched(int m,int n, int k,
-                   RealD alpha,
-                   deviceVector<RealD*> &Amk,  // pointer list to matrices
-                   deviceVector<RealD*> &Bkn,
-                   RealD beta,
-                   deviceVector<RealD*> &Cmn)
-  {
-    gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
-                m,n,k,
-                alpha,
-                Amk,
-                Bkn,
-                beta,
-                Cmn);
-  }
-  void gemmBatched(int m,int n, int k,
-                   RealF alpha,
-                   deviceVector<RealF*> &Amk,  // pointer list to matrices
-                   deviceVector<RealF*> &Bkn,
-                   RealF beta,
-                   deviceVector<RealF*> &Cmn)
-  {
-    gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
-                m,n,k,
-                alpha,
-                Amk,
-                Bkn,
-                beta,
-                Cmn);
-  }
-
-  ///////////////////////////////////////////////////////////////////////////
-  // Double precision complex GEMM
-  ///////////////////////////////////////////////////////////////////////////
-
-  void gemmBatched(GridBLASOperation_t OpA,
-                   GridBLASOperation_t OpB,
-                   int m,int n, int k,
-                   ComplexD alpha,
-                   deviceVector<ComplexD*> &Amk,  // pointer list to matrices
-                   deviceVector<ComplexD*> &Bkn,
-                   ComplexD beta,
-                   deviceVector<ComplexD*> &Cmn)
-  {
-    RealD t2=usecond();
-    int32_t batchCount = Amk.size();
-    assert(Bkn.size()==batchCount);
-    assert(Cmn.size()==batchCount);
-
-    int lda = m; // m x k column major
-    int ldb = k; // k x n column major
-    int ldc = m; // m x n column major
-    if(OpA!=GridBLAS_OP_N)
-      lda = k;
-    if(OpB!=GridBLAS_OP_N)
-      ldb = n;
-
-    static deviceVector<ComplexD> alpha_p(1);
-    static deviceVector<ComplexD> beta_p(1);
-    // can prestore the 1 and the zero on device
-    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
-    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
-    RealD t0=usecond();
-    // std::cout << "ZgemmBatched mnk "<<m<<" "<<n<<" "<<k<<" batch "<<batchCount<<std::endl;
-#ifdef GRID_HIP
-    hipblasOperation_t hOpA;
-    hipblasOperation_t hOpB;
-    if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
-    if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
-    if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
-    if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
-    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
-    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
-    auto err = hipblasZgemmBatched(gridblasHandle,
-                                   hOpA,
-                                   hOpB,
-                                   m,n,k,
-                                   (hipblasDoubleComplex *) &alpha_p[0],
-                                   (hipblasDoubleComplex **)&Amk[0], lda,
-                                   (hipblasDoubleComplex **)&Bkn[0], ldb,
-                                   (hipblasDoubleComplex *) &beta_p[0],
-                                   (hipblasDoubleComplex **)&Cmn[0], ldc,
-                                   batchCount);
-    assert(err==HIPBLAS_STATUS_SUCCESS);
-#endif
-#ifdef GRID_CUDA
-    cublasOperation_t hOpA;
-    cublasOperation_t hOpB;
-    if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
-    if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
-    if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
-    if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
-    if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
-    if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
-    auto err = cublasZgemmBatched(gridblasHandle,
-                                  hOpA,
-                                  hOpB,
-                                  m,n,k,
-                                  (cuDoubleComplex *) &alpha_p[0],
-                                  (cuDoubleComplex **)&Amk[0], lda,
-                                  (cuDoubleComplex **)&Bkn[0], ldb,
-                                  (cuDoubleComplex *) &beta_p[0],
-                                  (cuDoubleComplex **)&Cmn[0], ldc,
-                                  batchCount);
-    assert(err==CUBLAS_STATUS_SUCCESS);
-#endif
-#ifdef GRID_SYCL
-    //MKL's cblas_gemm_batch & OneAPI
-#warning "oneMKL implementation not built "
-#endif
-#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
-    // Need a default/reference implementation; assumes GridBLAS_OP_N operands
-    for (int p = 0; p < batchCount; ++p) {
-      for (int mm = 0; mm < m; ++mm) {
-        for (int nn = 0; nn < n; ++nn) {
-          ComplexD c_mn(0.0);
-          for (int kk = 0; kk < k; ++kk)
-            c_mn += Amk[p][mm + kk*lda] * Bkn[p][kk + nn*ldb];
-          Cmn[p][mm + nn*ldc] = alpha*c_mn + beta*Cmn[p][mm + nn*ldc];
-        }
-      }
-    }
-#endif
-    // synchronise();
-    RealD t1=usecond();
-    RealD flops = 8.0*m*n*k*batchCount;
-    RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount;
-    // std::cout << "ZgemmBatched rate "<<flops/(t1-t0)<<" Mflop/s"<<std::endl;
-  }
-
-  ///////////////////////////////////////////////////////////////////////////
-  // Single precision complex GEMM
-  ///////////////////////////////////////////////////////////////////////////
-
-  void gemmBatched(GridBLASOperation_t OpA,
-                   GridBLASOperation_t OpB,
-                   int m,int n, int k,
-                   ComplexF alpha,
-                   deviceVector<ComplexF*> &Amk,  // pointer list to matrices
-                   deviceVector<ComplexF*> &Bkn,
-                   ComplexF beta,
-                   deviceVector<ComplexF*> &Cmn)
-  {
-    RealD t2=usecond();
-    int32_t batchCount = Amk.size();
-
-    int lda = m; // m x k column major
-    int ldb = k; // k x n column major
-    int ldc = m; // m x n column major
-    if(OpA!=GridBLAS_OP_N)
-      lda = k;
-    if(OpB!=GridBLAS_OP_N)
-      ldb = n;
-    static deviceVector<ComplexF> alpha_p(1);
-    static deviceVector<ComplexF> beta_p(1);
-    // can prestore the 1 and the zero on device
-    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexF));
-    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF));
-    RealD t0=usecond();
-
-    assert(Bkn.size()==batchCount);
-    assert(Cmn.size()==batchCount);
-#ifdef GRID_HIP
-    hipblasOperation_t hOpA;
-    hipblasOperation_t hOpB;
-    if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
-    if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
-    if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
-    if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
-    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
-    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
-    auto err = hipblasCgemmBatched(gridblasHandle,
-                                   hOpA,
-                                   hOpB,
-                                   m,n,k,
-                                   (hipblasComplex *) &alpha_p[0],
-                                   (hipblasComplex **)&Amk[0], lda,
-                                   (hipblasComplex **)&Bkn[0], ldb,
-                                   (hipblasComplex *) &beta_p[0],
-                                   (hipblasComplex **)&Cmn[0], ldc,
-                                   batchCount);
-
-    assert(err==HIPBLAS_STATUS_SUCCESS);
-#endif
-#ifdef GRID_CUDA
-    cublasOperation_t hOpA;
-    cublasOperation_t hOpB;
-    if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
-    if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
-    if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
-    if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
-    if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
-    if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
-    auto err = cublasCgemmBatched(gridblasHandle,
-                                  hOpA,
-                                  hOpB,
-                                  m,n,k,
-                                  (cuComplex *) &alpha_p[0],
-                                  (cuComplex **)&Amk[0], lda,
-                                  (cuComplex **)&Bkn[0], ldb,
-                                  (cuComplex *) &beta_p[0],
-                                  (cuComplex **)&Cmn[0], ldc,
-                                  batchCount);
-    assert(err==CUBLAS_STATUS_SUCCESS);
-#endif
-#ifdef GRID_SYCL
-    //MKL's cblas_gemm_batch & OneAPI
-#warning "oneMKL implementation not built "
-#endif
-#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
-    // Need a default/reference implementation; assumes GridBLAS_OP_N operands
-    for (int p = 0; p < batchCount; ++p) {
-      for (int mm = 0; mm < m; ++mm) {
-        for (int nn = 0; nn < n; ++nn) {
-          ComplexF c_mn(0.0);
-          for (int kk = 0; kk < k; ++kk)
-            c_mn += Amk[p][mm + kk*lda] * Bkn[p][kk + nn*ldb];
-          Cmn[p][mm + nn*ldc] = alpha*c_mn + beta*Cmn[p][mm + nn*ldc];
-        }
-      }
-    }
-#endif
-    RealD t1=usecond();
-    RealD flops = 8.0*m*n*k*batchCount;
-    RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n)*batchCount;
-  }
-
-  ///////////////////////////////////////////////////////////////////////////
-  // Single precision real GEMM
-  ///////////////////////////////////////////////////////////////////////////
-
-  void gemmBatched(GridBLASOperation_t OpA,
-                   GridBLASOperation_t OpB,
-                   int m,int n, int k,
-                   RealF alpha,
-                   deviceVector<RealF*> &Amk,  // pointer list to matrices
-                   deviceVector<RealF*> &Bkn,
-                   RealF beta,
-                   deviceVector<RealF*> &Cmn)
-  {
-    RealD t2=usecond();
-    int32_t batchCount = Amk.size();
-
-    int lda = m; // m x k column major
-    int ldb = k; // k x n column major
-    int ldc = m; // m x n column major
-    if(OpA!=GridBLAS_OP_N)
-      lda = k;
-    if(OpB!=GridBLAS_OP_N)
-      ldb = n;
-    static deviceVector<RealF> alpha_p(1);
-    static deviceVector<RealF> beta_p(1);
-    // can prestore the 1 and the zero on device
-    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealF));
-    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealF));
-    RealD t0=usecond();
-
-    assert(Bkn.size()==batchCount);
-    assert(Cmn.size()==batchCount);
-#ifdef GRID_HIP
-    hipblasOperation_t hOpA;
-    hipblasOperation_t hOpB;
-    if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
-    if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
-    if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
-    if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
-    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
-    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
-    auto err = hipblasSgemmBatched(gridblasHandle,
-                                   hOpA,
-                                   hOpB,
-                                   m,n,k,
-                                   (float *) &alpha_p[0],
-                                   (float **)&Amk[0], lda,
-                                   (float **)&Bkn[0], ldb,
-                                   (float *) &beta_p[0],
-                                   (float **)&Cmn[0], ldc,
-                                   batchCount);
-    assert(err==HIPBLAS_STATUS_SUCCESS);
-#endif
-#ifdef GRID_CUDA
-    cublasOperation_t hOpA;
-    cublasOperation_t hOpB;
-    if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
-    if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
-    if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
-    if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
-    if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
-    if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
-    auto err = cublasSgemmBatched(gridblasHandle,
-                                  hOpA,
-                                  hOpB,
-                                  m,n,k,
-                                  (float *) &alpha_p[0],
-                                  (float **)&Amk[0], lda,
-                                  (float **)&Bkn[0], ldb,
-                                  (float *) &beta_p[0],
-                                  (float **)&Cmn[0], ldc,
-                                  batchCount);
-    assert(err==CUBLAS_STATUS_SUCCESS);
-#endif
-#ifdef GRID_SYCL
-    //MKL's cblas_gemm_batch & OneAPI
-#warning "oneMKL implementation not built "
-#endif
-#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
-    // Need a default/reference implementation; assumes GridBLAS_OP_N operands
-    for (int p = 0; p < batchCount; ++p) {
-      for (int mm = 0; mm < m; ++mm) {
-        for (int nn = 0; nn < n; ++nn) {
-          RealD c_mn(0.0);
-          for (int kk = 0; kk < k; ++kk)
-            c_mn += Amk[p][mm + kk*lda] * Bkn[p][kk + nn*ldb];
-          Cmn[p][mm + nn*ldc] = alpha*c_mn + beta*Cmn[p][mm + nn*ldc];
-        }
-      }
-    }
-#endif
-    RealD t1=usecond();
-    RealD flops = 2.0*m*n*k*batchCount;
-    RealD bytes = 1.0*sizeof(RealF)*(m*k+k*n+m*n)*batchCount;
-  }
-
-  ///////////////////////////////////////////////////////////////////////////
-  // Double precision real GEMM
-  ///////////////////////////////////////////////////////////////////////////
-
-  void gemmBatched(GridBLASOperation_t OpA,
-                   GridBLASOperation_t OpB,
-                   int m,int n, int k,
-                   RealD alpha,
-                   deviceVector<RealD*> &Amk,  // pointer list to matrices
-                   deviceVector<RealD*> &Bkn,
-                   RealD beta,
-                   deviceVector<RealD*> &Cmn)
-  {
-    RealD t2=usecond();
-    int32_t batchCount = Amk.size();
-
-    int lda = m; // m x k column major
-    int ldb = k; // k x n column major
-    int ldc = m; // m x n column major
-    if(OpA!=GridBLAS_OP_N)
-      lda = k;
-    if(OpB!=GridBLAS_OP_N)
-      ldb = n;
-
-    static deviceVector<RealD> alpha_p(1);
-    static deviceVector<RealD> beta_p(1);
-    // can prestore the 1 and the zero on device
-    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealD));
-    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealD));
-    RealD t0=usecond();
-
-    assert(Bkn.size()==batchCount);
-    assert(Cmn.size()==batchCount);
-#ifdef GRID_HIP
-    hipblasOperation_t hOpA;
-    hipblasOperation_t hOpB;
-    if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
-    if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
-    if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
-    if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
-    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
-    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
-    auto err = hipblasDgemmBatched(gridblasHandle,
-                                   hOpA,
-                                   hOpB,
-                                   m,n,k,
-                                   (double *) &alpha_p[0],
-                                   (double **)&Amk[0], lda,
-                                   (double **)&Bkn[0], ldb,
-                                   (double *) &beta_p[0],
-                                   (double **)&Cmn[0], ldc,
-                                   batchCount);
-    assert(err==HIPBLAS_STATUS_SUCCESS);
-#endif
-#ifdef GRID_CUDA
-    cublasOperation_t hOpA;
-    cublasOperation_t hOpB;
-    if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
-    if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
-    if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
-    if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
-    if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
-    if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
-    auto err = cublasDgemmBatched(gridblasHandle,
-                                  hOpA,
-                                  hOpB,
-                                  m,n,k,
-                                  (double *) &alpha_p[0],
-                                  (double **)&Amk[0], lda,
-                                  (double **)&Bkn[0], ldb,
-                                  (double *) &beta_p[0],
-                                  (double **)&Cmn[0], ldc,
-                                  batchCount);
-    assert(err==CUBLAS_STATUS_SUCCESS);
-#endif
-#ifdef GRID_SYCL
-    /*
-    int64_t m64=m;
-    int64_t n64=n;
-    int64_t k64=k;
-    int64_t batchCount64=batchCount;
-    oneapi::mkl::blas::column_major::gemm_batch(*theGridAccelerator,
-                                                onemkl::transpose::N,
-                                                onemkl::transpose::N,
-                                                &m64,&n64,&k64,
-                                                (double *) &alpha_p[0],
-                                                (double **)&Amk[0], lda,
-                                                (double **)&Bkn[0], ldb,
-                                                (double *) &beta_p[0],
-                                                (double **)&Cmn[0], ldc,
-                                                1,&batchCount64);
-    */
-    //MKL's cblas_gemm_batch & OneAPI
-#warning "oneMKL implementation not built "
-#endif
-#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
-    // Need a default/reference implementation; assumes GridBLAS_OP_N operands
-    for (int p = 0; p < batchCount; ++p) {
-      for (int mm = 0; mm < m; ++mm) {
-        for (int nn = 0; nn < n; ++nn) {
-          RealD c_mn(0.0);
-          for (int kk = 0; kk < k; ++kk)
-            c_mn += Amk[p][mm + kk*lda] * Bkn[p][kk + nn*ldb];
-          Cmn[p][mm + nn*ldc] = alpha*c_mn + beta*Cmn[p][mm + nn*ldc];
-        }
-      }
-    }
-#endif
-    RealD t1=usecond();
-    RealD flops = 2.0*m*n*k*batchCount;
-    RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////////////////////
-  // Strided case used by benchmark, but generally unused in Grid
-  // Keep a code example in double complex, but don't generate the single and real variants for now
-  ////////////////////////////////////////////////////////////////////////////////////////////////
-
-  void gemmStridedBatched(int m,int n, int k,
-                          ComplexD alpha,
-                          ComplexD* Amk,  // pointer list to matrices
-                          ComplexD* Bkn,
-                          ComplexD beta,
-                          ComplexD* Cmn,
-                          int batchCount)
-  {
-    // Use C-row major storage, so transpose calls
-    int lda = m; // m x k column major
-    int ldb = k; // k x n column major
-    int ldc = m; // m x n column major
-    int sda = m*k;
-    int sdb = k*n;
-    int sdc = m*n;
-    deviceVector<ComplexD> alpha_p(1);
-    deviceVector<ComplexD> beta_p(1);
-    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
-    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
-    std::cout << "blasZgemmStridedBatched mnk "<<m<<" "<<n<<" "<<k<<" batch "<<batchCount<<std::endl;
-#ifdef GRID_HIP
-    auto err = hipblasZgemmStridedBatched(gridblasHandle,
-                                          HIPBLAS_OP_N,
-                                          HIPBLAS_OP_N,
-                                          m,n,k,
-                                          (hipblasDoubleComplex *) &alpha_p[0],
-                                          (hipblasDoubleComplex *) Amk, lda, sda,
-                                          (hipblasDoubleComplex *) Bkn, ldb, sdb,
-                                          (hipblasDoubleComplex *) &beta_p[0],
-                                          (hipblasDoubleComplex *) Cmn, ldc, sdc,
-                                          batchCount);
-    assert(err==HIPBLAS_STATUS_SUCCESS);
-#endif
-#ifdef GRID_CUDA
-    auto err = cublasZgemmStridedBatched(gridblasHandle,
-                                         CUBLAS_OP_N,
-                                         CUBLAS_OP_N,
-                                         m,n,k,
-                                         (cuDoubleComplex *) &alpha_p[0],
-                                         (cuDoubleComplex *) Amk, lda, sda,
-                                         (cuDoubleComplex *) Bkn, ldb, sdb,
-                                         (cuDoubleComplex *) &beta_p[0],
-                                         (cuDoubleComplex *) Cmn, ldc, sdc,
-                                         batchCount);
-    assert(err==CUBLAS_STATUS_SUCCESS);
-#endif
-  }
-
-  double benchmark(int M, int N, int K, int BATCH)
-  {
-    int32_t N_A = M*K*BATCH;
-    int32_t N_B = K*N*BATCH;
-    int32_t N_C = M*N*BATCH;
-    deviceVector<ComplexD> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD));
-    deviceVector<ComplexD> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD));
-    deviceVector<ComplexD> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD));
-    ComplexD alpha(1.0);
-    ComplexD beta (1.0);
-    RealD flops = 8.0*M*N*K*BATCH;
-    RealD rate  = 0.0;
-    for(int i=0;i<10;i++){
-      RealD t0 = usecond();
-      for(int s=0;s<1;s++){
-        gemmStridedBatched(M,N,K,
-                           alpha,
-                           &A[0], // m x k
-                           &B[0], // k x n
-                           beta,
-                           &C[0], // m x n
-                           BATCH);
-      }
-      synchronise();
-      RealD t1 = usecond();
-      rate = flops/(t1-t0); // Mflop/s, since usecond() returns microseconds
-    }
-    return rate;
-  }
-};
-
-NAMESPACE_END(Grid);
diff --git a/Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h b/Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h
--- a/Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h
+++ b/Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h
@@ -26,7 +26,6 @@ Author: Peter Boyle
 /* END LEGAL */
 #pragma once
 
-#include <Grid/algorithms/multigrid/BatchedBlas.h>
 
 NAMESPACE_BEGIN(Grid);
 
diff --git a/Grid/algorithms/multigrid/MultiGrid.h b/Grid/algorithms/multigrid/MultiGrid.h
index 6a896424..c7e67f39 100644
--- a/Grid/algorithms/multigrid/MultiGrid.h
+++ b/Grid/algorithms/multigrid/MultiGrid.h
@@ -29,7 +29,6 @@ Author: Peter Boyle
 
 #include <Grid/algorithms/multigrid/Aggregates.h>
 #include <Grid/algorithms/multigrid/Geometry.h>
-#include <Grid/algorithms/multigrid/BatchedBlas.h>
 #include <Grid/algorithms/multigrid/CoarsenedMatrix.h>
 #include <Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h>
 #include <Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h>
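
Note: the `gemmBatched` interface deleted above takes "array of pointers" arguments, one device pointer per matrix in the batch, each matrix dense packed in column-major layout, with the scalars staged to device once. A minimal sketch of driving it, assuming the `GridBLAS`, `deviceVector` and `acceleratorCopyToDevice` facilities from the header above; the batch shape (`nbatch`, `m`, `n`, `k`) and the flat buffers are hypothetical, chosen only for illustration:

  // Sketch only: batch shape and buffers are illustrative, not from the source.
  GridBLAS::Init();                          // create the cuBLAS/hipBLAS handle once
  GridBLAS blas;
  const int nbatch = 32, m = 24, n = 12, k = 24;
  deviceVector<ComplexD>  Abuf(nbatch*m*k);  // dense packed, column major
  deviceVector<ComplexD>  Bbuf(nbatch*k*n);
  deviceVector<ComplexD>  Cbuf(nbatch*m*n);
  deviceVector<ComplexD*> Amk(nbatch);       // pointer tables, one entry per batch element
  deviceVector<ComplexD*> Bkn(nbatch);
  deviceVector<ComplexD*> Cmn(nbatch);
  for(int p=0;p<nbatch;p++){
    ComplexD *a = &Abuf[p*m*k];
    ComplexD *b = &Bbuf[p*k*n];
    ComplexD *c = &Cbuf[p*m*n];
    // The pointer tables live in device memory, so each entry is filled with an
    // explicit host-to-device copy, mirroring how alpha_p/beta_p are staged above.
    acceleratorCopyToDevice((void *)&a,(void *)&Amk[p],sizeof(ComplexD*));
    acceleratorCopyToDevice((void *)&b,(void *)&Bkn[p],sizeof(ComplexD*));
    acceleratorCopyToDevice((void *)&c,(void *)&Cmn[p],sizeof(ComplexD*));
  }
  blas.gemmBatched(m,n,k, ComplexD(1.0), Amk, Bkn, ComplexD(0.0), Cmn); // C = A*B per batch entry
  blas.synchronise();                        // the batched call is enqueued asynchronously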
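
The `GRID_SYCL` branches above stop at `#warning "oneMKL implementation not built "`, and the commented-out attempt uses a nonexistent `onemkl::` namespace plus 32-bit leading dimensions where the oneMKL group API expects `int64_t` arrays. For reference, a hedged sketch of what the double-precision branch could look like with oneMKL's USM `gemm_batch` group API: it assumes `theGridAccelerator` is Grid's global `sycl::queue *` and that `Amk`, `Bkn`, `Cmn` and `alpha`, `beta` are the arguments of the surrounding `gemmBatched`; one group of `batchCount` identical GEMM shapes matches the semantics of the cuBLAS/hipBLAS calls.

  // Sketch under the assumptions stated above; not the committed implementation.
  int64_t m64 = m, n64 = n, k64 = k;
  int64_t lda64 = lda, ldb64 = ldb, ldc64 = ldc;
  int64_t batchCount64 = batchCount;
  oneapi::mkl::transpose opN = oneapi::mkl::transpose::N;
  // Per-group parameter arrays (length == group_count == 1); the scalar arrays
  // are read from host-accessible memory, so the host-side alpha/beta suffice.
  oneapi::mkl::blas::column_major::gemm_batch(*theGridAccelerator,
                                              &opN, &opN,               // per-group transposes
                                              &m64, &n64, &k64,
                                              &alpha,
                                              (const double **)&Amk[0], &lda64,
                                              (const double **)&Bkn[0], &ldb64,
                                              &beta,
                                              (double **)&Cmn[0], &ldc64,
                                              1, &batchCount64);        // one group of batchCount GEMMs
  theGridAccelerator->wait();               // gemm_batch returns a sycl::event; block for simplicity

The group API generalises the cuBLAS batched call: each group may carry its own shape, transpose flags and scalars, and the strided variant used by `benchmark` above can likewise be expressed as a single group with fixed strides.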