1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-14 01:35:36 +00:00

Batched blas, but not working yet on OneAPI

This commit is contained in:
Peter Boyle 2024-07-11 15:19:49 +00:00
parent f3eb36adcf
commit 9c902e4c2d

View File

@ -89,9 +89,10 @@ public:
gridblasHandle = theGridAccelerator; gridblasHandle = theGridAccelerator;
#endif #endif
#ifdef GRID_ONE_MKL #ifdef GRID_ONE_MKL
cl::sycl::cpu_selector selector; cl::sycl::gpu_selector selector;
cl::sycl::device selectedDevice { selector }; cl::sycl::device selectedDevice { selector };
gridblasHandle =new sycl::queue (selectedDevice); cl::sycl::property_list q_prop{cl::sycl::property::queue::in_order()};
gridblasHandle =new sycl::queue (selectedDevice,q_prop);
#endif #endif
gridblasInit=1; gridblasInit=1;
} }
@ -266,8 +267,46 @@ public:
assert(err==CUBLAS_STATUS_SUCCESS); assert(err==CUBLAS_STATUS_SUCCESS);
#endif #endif
#ifdef GRID_SYCL #ifdef GRID_SYCL
//MKLs cblas_<T>gemm_batch & OneAPI std::cerr << " Calling SYCL batched ZGEMM "<<std::endl;
#warning "oneMKL implementation not built " int64_t m64=m;
int64_t n64=n;
int64_t k64=k;
int64_t lda64=lda;
int64_t ldb64=ldb;
int64_t ldc64=ldc;
int64_t batchCount64=batchCount;
oneapi::mkl::transpose notransp =oneapi::mkl::transpose::N;
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
&notransp,
&notransp,
&m64,&n64,&k64,
(ComplexD *) &alpha_p[0],
(const ComplexD **)&Amk[0], (const int64_t *)&lda64,
(const ComplexD **)&Bkn[0], (const int64_t *)&ldb64,
(ComplexD *) &beta_p[0],
(ComplexD **)&Cmn[0], (const int64_t *)&ldc64,
(int64_t)1,&batchCount64,std::vector<sycl::event>());
synchronise();
std::cerr << " Called SYCL batched ZGEMM "<<std::endl;
std::vector<ComplexD> A(m*k); // pointer list to matrices
std::vector<ComplexD> B(k*n);
std::vector<ComplexD> C(m*n);
int sda = lda*k;
int sdb = ldb*k;
int sdc = ldc*n;
for (int p = 0; p < 1; ++p) {
acceleratorCopyFromDevice((void *)&Amk[p][0],(void *)&A[0],m*k*sizeof(ComplexD));
acceleratorCopyFromDevice((void *)&Bkn[p][0],(void *)&B[0],k*n*sizeof(ComplexD));
acceleratorCopyFromDevice((void *)&Cmn[p][0],(void *)&C[0],m*n*sizeof(ComplexD));
for (int mm = 0; mm < m; ++mm) {
for (int nn = 0; nn < n; ++nn) {
ComplexD c_mn(0.0);
for (int kk = 0; kk < k; ++kk)
c_mn += A[mm + kk*lda ] * B[kk + nn*ldb];
std::cout << " beta "<<beta<<" C_"<<mm<<","<<nn<<" "<<c_mn<<" "<<C[mm + nn*ldc]<<std::endl;
}
}
}
#endif #endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
// Need a default/reference implementation // Need a default/reference implementation
@ -285,7 +324,6 @@ public:
} }
} }
#endif #endif
// synchronise();
RealD t1=usecond(); RealD t1=usecond();
RealD flops = 8.0*m*n*k*batchCount; RealD flops = 8.0*m*n*k*batchCount;
RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount; RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount;
@ -366,8 +404,25 @@ public:
assert(err==CUBLAS_STATUS_SUCCESS); assert(err==CUBLAS_STATUS_SUCCESS);
#endif #endif
#ifdef GRID_SYCL #ifdef GRID_SYCL
//MKLs cblas_<T>gemm_batch & OneAPI int64_t m64=m;
#warning "oneMKL implementation not built " int64_t n64=n;
int64_t k64=k;
int64_t lda64=lda;
int64_t ldb64=ldb;
int64_t ldc64=ldc;
int64_t batchCount64=batchCount;
oneapi::mkl::transpose notransp =oneapi::mkl::transpose::N;
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
&notransp,
&notransp,
&m64,&n64,&k64,
(ComplexF *) &alpha_p[0],
(const ComplexF **)&Amk[0], (const int64_t *)&lda64,
(const ComplexF **)&Bkn[0], (const int64_t *)&ldb64,
(ComplexF *) &beta_p[0],
(ComplexF **)&Cmn[0], (const int64_t *)&ldc64,
(int64_t)1,&batchCount64,std::vector<sycl::event>());
synchronise();
#endif #endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
int sda = lda*k; int sda = lda*k;
@ -467,8 +522,25 @@ public:
assert(err==CUBLAS_STATUS_SUCCESS); assert(err==CUBLAS_STATUS_SUCCESS);
#endif #endif
#ifdef GRID_SYCL #ifdef GRID_SYCL
//MKLs cblas_<T>gemm_batch & OneAPI int64_t m64=m;
#warning "oneMKL implementation not built " int64_t n64=n;
int64_t k64=k;
int64_t lda64=lda;
int64_t ldb64=ldb;
int64_t ldc64=ldc;
int64_t batchCount64=batchCount;
oneapi::mkl::transpose notransp =oneapi::mkl::transpose::N;
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
&notransp,
&notransp,
&m64,&n64,&k64,
(float *) &alpha_p[0],
(const float **)&Amk[0], (const int64_t *)&lda64,
(const float **)&Bkn[0], (const int64_t *)&ldb64,
(float *) &beta_p[0],
(float **)&Cmn[0], (const int64_t *)&ldc64,
(int64_t)1,&batchCount64,std::vector<sycl::event>());
synchronise();
#endif #endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
int sda = lda*k; int sda = lda*k;
@ -568,24 +640,25 @@ public:
assert(err==CUBLAS_STATUS_SUCCESS); assert(err==CUBLAS_STATUS_SUCCESS);
#endif #endif
#ifdef GRID_SYCL #ifdef GRID_SYCL
/*
int64_t m64=m; int64_t m64=m;
int64_t n64=n; int64_t n64=n;
int64_t k64=k; int64_t k64=k;
int64_t lda64=lda;
int64_t ldb64=ldb;
int64_t ldc64=ldc;
int64_t batchCount64=batchCount; int64_t batchCount64=batchCount;
oneapi::mkl::blas::column_major::gemm_batch(*theGridAccelerator, oneapi::mkl::transpose notransp =oneapi::mkl::transpose::N;
onemkl::transpose::N, oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
onemkl::transpose::N, &notransp,
&m64,&n64,&k64, &notransp,
(double *) &alpha_p[0], &m64,&n64,&k64,
(double **)&Amk[0], lda, (double *) &alpha_p[0],
(double **)&Bkn[0], ldb, (const double **)&Amk[0], (const int64_t *)&lda64,
(double *) &beta_p[0], (const double **)&Bkn[0], (const int64_t *)&ldb64,
(double **)&Cmn[0], ldc, (double *) &beta_p[0],
1,&batchCount64); (double **)&Cmn[0], (const int64_t *)&ldc64,
*/ (int64_t)1,&batchCount64,std::vector<sycl::event>());
//MKLs cblas_<T>gemm_batch & OneAPI synchronise();
#warning "oneMKL implementation not built "
#endif #endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
int sda = lda*k; int sda = lda*k;
@ -673,6 +746,7 @@ public:
beta, beta,
(ComplexD *)Cmn,ldc,sdc, (ComplexD *)Cmn,ldc,sdc,
batchCount); batchCount);
synchronise();
#endif #endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL) #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
// Need a default/reference implementation // Need a default/reference implementation