From b75cb7a12c04cedfe4b106caec71373ddbf1aac0 Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Thu, 21 Dec 2023 12:31:33 -0500
Subject: [PATCH] Blas batched partial implementation on Frontier only for now

---
 Grid/algorithms/multigrid/BatchedBlas.h | 278 ++++++++++++++++++++++++
 1 file changed, 278 insertions(+)
 create mode 100644 Grid/algorithms/multigrid/BatchedBlas.h

diff --git a/Grid/algorithms/multigrid/BatchedBlas.h b/Grid/algorithms/multigrid/BatchedBlas.h
new file mode 100644
index 00000000..0663a715
--- /dev/null
+++ b/Grid/algorithms/multigrid/BatchedBlas.h
@@ -0,0 +1,278 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: BatchedBlas.h
+
+    Copyright (C) 2023
+
+Author: Peter Boyle
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+#ifdef GRID_HIP
+#include <hipblas/hipblas.h>
+#endif
+#ifdef GRID_CUDA
+#include <cublas_v2.h>
+#endif
+#ifdef GRID_SYCL
+#error // need oneMKL version
+#endif
+
+///////////////////////////////////////////////////////////////////////
+// Need to rearrange lattice data to be in the right format for a
+// batched multiply. Might as well make these static, dense packed
+///////////////////////////////////////////////////////////////////////
+NAMESPACE_BEGIN(Grid);
+#ifdef GRID_HIP
+  typedef hipblasHandle_t gridblasHandle_t;
+#endif
+#ifdef GRID_CUDA
+  typedef cublasHandle_t gridblasHandle_t;
+#endif
+#ifdef GRID_SYCL
+  typedef int32_t gridblasHandle_t;
+#endif
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+  typedef int32_t gridblasHandle_t;
+#endif
+
+class GridBLAS {
+public:
+
+  static gridblasHandle_t gridblasHandle;
+  static int              gridblasInit;
+
+  static void Init(void)
+  {
+    if ( ! gridblasInit ) {
gridblasInit ) { +#ifdef GRID_CUDA + std::cout << "cublasCreate"< A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD)); + deviceVector B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD)); + deviceVector C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD)); + ComplexD alpha(1.0); + ComplexD beta (1.0); + for(int i=0;i<10;i++){ + RealD t0 = usecond(); + for(int s=0;s &Amk, // pointer list to matrices + deviceVector &Bkn, + ComplexD beta, + deviceVector &Cmn) + { + RealD t2=usecond(); + int32_t batchCount = Amk.size(); + // Use C-row major storage, so transpose calls + int lda = m; // m x k column major + int ldb = k; // k x n column major + int ldc = m; // m x b column major + static deviceVector alpha_p(1); + static deviceVector beta_p(1); + // can prestore the 1 and the zero on device + acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD)); + acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD)); + RealD t0=usecond(); +#ifdef GRID_HIP + std::cout << "hipblasZgemmBatched mnk "< alpha_p(1); + deviceVector beta_p(1); + acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD)); + acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD)); +#ifdef GRID_HIP + std::cout << "hipblasZgemmStridedBatched mnk "<