Mirror of https://github.com/paboyle/Grid.git (synced 2024-11-12 16:55:37 +00:00)

Compare commits: 2c54be651c ... 66a1b63aa9 (20 commits)

66a1b63aa9  22c611bd1a  c9bb1bf8ea  9e489887cf  9feb801bb9
c00b495933  d22eebe553  8bcbd82680  dfa617c439  48d1f0df89
b75cb7a12c  332563e037  0cce97a4fe  95a8e4be64  abcd6b8cb6
e8f21c9b6d  e054078b11  6835a7f208  f59993b979  2290b8f680
Grid/algorithms/multigrid/BatchedBlas.h (new file, 541 lines)
@@ -0,0 +1,541 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: BatchedBlas.h

    Copyright (C) 2023

Author: Peter Boyle <pboyle@bnl.gov>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory

*************************************************************************************/
/*  END LEGAL */
#pragma once

#ifdef GRID_HIP
#include <hipblas/hipblas.h>
#endif
#ifdef GRID_CUDA
#include <cublas_v2.h>  // was <hipblas/hipblas.h>, a copy-paste slip: the CUDA build needs cuBLAS
#endif
#ifdef GRID_SYCL
#error // need oneMKL version
#endif

///////////////////////////////////////////////////////////////////////
// Need to rearrange lattice data to be in the right format for a
// batched multiply. Might as well make these static, dense packed
///////////////////////////////////////////////////////////////////////
NAMESPACE_BEGIN(Grid);
#ifdef GRID_HIP
  typedef hipblasHandle_t gridblasHandle_t;
#endif
#ifdef GRID_CUDA
  typedef cublasHandle_t gridblasHandle_t;  // was "cudablasHandle_t", which does not exist in cuBLAS
#endif
#ifdef GRID_SYCL
  typedef int32_t gridblasHandle_t;
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
  typedef int32_t gridblasHandle_t;
#endif
class GridBLAS {
public:

  static gridblasHandle_t gridblasHandle;
  static int              gridblasInit;

  static void Init(void)
  {
    if ( ! gridblasInit ) {
#ifdef GRID_CUDA
      std::cout << "cublasCreate"<<std::endl;
      cublasCreate(&gridblasHandle);
#endif
#ifdef GRID_HIP
      std::cout << "hipblasCreate"<<std::endl;
      hipblasCreate(&gridblasHandle);
#endif
#ifdef GRID_SYCL
#endif
    }
  }

  // Force construct once
  GridBLAS() { Init(); };
  ~GridBLAS() { };

  /////////////////////////////////////////////////////////////////////////////////////
  // BLAS GEMM conventions:
  /////////////////////////////////////////////////////////////////////////////////////
  // - C = alpha A * B + beta C
  // Dimensions:
  // - C_m.n
  // - A_m.k
  // - B_k.n
  // - Flops = 8 M N K
  // - Bytes = 2*sizeof(word) * (MN+MK+KN)
  // With M=K=60 (nbasis) and N=12 (nrhs):
  // Flop/Byte = 8*60*12*60 / (16*(60*12+60*60+60*12)) ~ 4, so expect about 4 TF/s on a GCD
  /////////////////////////////////////////////////////////////////////////////////////
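  // A hedged illustration (not part of the committed file) of the arithmetic-intensity
  // estimate the comment above performs, assuming 16-byte ComplexD words:
  static double gemmFlopPerByte(int M,int N,int K,double wordBytes=2.0*sizeof(double))
  {
    double flops = 8.0*M*N*K;                                // complex multiply-add = 8 real flops
    double bytes = wordBytes*(1.0*M*N + 1.0*M*K + 1.0*K*N);  // one pass over C, A and B
    return flops/bytes;                                      // M=K=60, N=12 gives ~4.3
  }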
  void synchronise(void)
  {
#ifdef GRID_HIP
    auto err = hipDeviceSynchronize();
    assert(err==hipSuccess);
#endif
#ifdef GRID_CUDA
    auto err = cudaDeviceSynchronize();
    assert(err==cudaSuccess);
#endif
#ifdef GRID_SYCL
    accelerator_barrier();
#endif
  }
  void benchmark(int nbasis, int nrhs, int coarseVol, int nstencil)
  {
    int32_t N_A = nbasis*nbasis*coarseVol*nstencil;
    int32_t N_B = nbasis*nrhs*coarseVol*nstencil; // One leg of stencil at a time
    int32_t N_C = nbasis*nrhs*coarseVol*nstencil;
    deviceVector<ComplexD> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD));
    deviceVector<ComplexD> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD));
    deviceVector<ComplexD> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD));
    ComplexD alpha(1.0);
    ComplexD beta (1.0);
    for(int i=0;i<10;i++){
      RealD t0 = usecond();
      for(int s=0;s<nstencil;s++){
        gemmStridedBatched(nbasis,nrhs,nbasis,
                           alpha,
                           &A[0], // m x k
                           &B[0], // k x n
                           beta,
                           &C[0], // m x n
                           coarseVol);
      }
      synchronise();
      RealD t1 = usecond();
      RealD flops = 8.0*nbasis*nbasis*nrhs*coarseVol*nstencil;
      RealD bytes = 1.0*sizeof(ComplexD)*(nbasis*nbasis+nbasis*nrhs*3)*coarseVol*nstencil;
      std::cout << " batched Blas call "<<i<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
      std::cout << " batched Blas call "<<i<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
    }
  }
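  // Hedged usage sketch for the benchmark above (sizes illustrative only):
  //   GridBLAS blas;
  //   blas.benchmark(60,12,16*16*16*16,9); // nbasis=60, nrhs=12, 16^4 coarse volume, 9-point stencil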
  void gemmBatched(int m,int n, int k,
                   ComplexD alpha,
                   deviceVector<ComplexD*> &Amk,  // pointer list to matrices
                   deviceVector<ComplexD*> &Bkn,
                   ComplexD beta,
                   deviceVector<ComplexD*> &Cmn)
  {
    RealD t2=usecond();
    int32_t batchCount = Amk.size();
    // Use C-row major storage, so transpose calls
    int lda = m; // m x k column major
    int ldb = k; // k x n column major
    int ldc = m; // m x n column major (comment previously said "m x b", a typo)
    static deviceVector<ComplexD> alpha_p(1);
    static deviceVector<ComplexD> beta_p(1);
    // can prestore the 1 and the zero on device
    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
    RealD t0=usecond();
    //    std::cout << "hipblasZgemmBatched mnk "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
    assert(Bkn.size()==batchCount);
    assert(Cmn.size()==batchCount);
#ifdef GRID_HIP
    auto err = hipblasZgemmBatched(gridblasHandle,
                                   HIPBLAS_OP_N,
                                   HIPBLAS_OP_N,
                                   m,n,k,
                                   (hipblasDoubleComplex *) &alpha_p[0],
                                   (hipblasDoubleComplex **)&Amk[0], lda,
                                   (hipblasDoubleComplex **)&Bkn[0], ldb,
                                   (hipblasDoubleComplex *) &beta_p[0],
                                   (hipblasDoubleComplex **)&Cmn[0], ldc,
                                   batchCount);
    //    std::cout << " hipblas return code " <<(int)err<<std::endl;
    assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
    auto err = cublasZgemmBatched(gridblasHandle,
                                  CUBLAS_OP_N,
                                  CUBLAS_OP_N,
                                  m,n,k,
                                  (cuDoubleComplex *) &alpha_p[0],
                                  (cuDoubleComplex **)&Amk[0], lda,
                                  (cuDoubleComplex **)&Bkn[0], ldb,
                                  (cuDoubleComplex *) &beta_p[0],
                                  (cuDoubleComplex **)&Cmn[0], ldc,
                                  batchCount);
    assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
    //MKL's cblas_<T>gemm_batch & OneAPI
#warning "oneMKL implementation not built "
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
    // Need a default/reference implementation; index through the per-batch pointer
    // list with the host-side alpha/beta (the original dereferenced the device
    // vectors and used undefined strides sda/sdb/sdc, which could not compile)
    for (int p = 0; p < batchCount; ++p) {
      for (int mm = 0; mm < m; ++mm) {
        for (int nn = 0; nn < n; ++nn) {
          ComplexD c_mn(0.0);
          for (int kk = 0; kk < k; ++kk)   // was "kk < k, ++kk": comma-operator typo
            c_mn += Amk[p][mm + kk*lda] * Bkn[p][kk + nn*ldb];
          Cmn[p][mm + nn*ldc] = alpha*c_mn + beta*Cmn[p][mm + nn*ldc];
        }
      }
    }
#endif
    synchronise();
    RealD t1=usecond();
    RealD flops = 8.0*m*n*k*batchCount;
    RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount;
    std::cout <<GridLogPerformance<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
    std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
    std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
  }
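  // Hedged usage sketch of the pointer-list interface above: each batch entry is an
  // independent column-major block addressed through a device-resident pointer table
  // (populated with deviceSet-style copies; names here are illustrative only):
  //   deviceVector<ComplexD*> Ap(batch), Bp(batch), Cp(batch); // device addresses of m x k, k x n, m x n blocks
  //   GridBLAS blas;
  //   blas.gemmBatched(m,n,k, ComplexD(1.0), Ap, Bp, ComplexD(0.0), Cp); // C = A B for every batch entry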
  void gemmBatched(int m,int n, int k,
                   ComplexF alpha,
                   deviceVector<ComplexF*> &Amk,  // pointer list to matrices
                   deviceVector<ComplexF*> &Bkn,
                   ComplexF beta,
                   deviceVector<ComplexF*> &Cmn)
  {
    RealD t2=usecond();
    int32_t batchCount = Amk.size();
    // Use C-row major storage, so transpose calls
    int lda = m; // m x k column major
    int ldb = k; // k x n column major
    int ldc = m; // m x n column major
    static deviceVector<ComplexF> alpha_p(1);
    static deviceVector<ComplexF> beta_p(1);
    // can prestore the 1 and the zero on device
    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexF));
    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF));
    RealD t0=usecond();
    //    std::cout << "hipblasCgemmBatched mnk "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
    assert(Bkn.size()==batchCount);
    assert(Cmn.size()==batchCount);
#ifdef GRID_HIP
    auto err = hipblasCgemmBatched(gridblasHandle,
                                   HIPBLAS_OP_N,
                                   HIPBLAS_OP_N,
                                   m,n,k,
                                   (hipblasComplex *) &alpha_p[0],
                                   (hipblasComplex **)&Amk[0], lda,
                                   (hipblasComplex **)&Bkn[0], ldb,
                                   (hipblasComplex *) &beta_p[0],
                                   (hipblasComplex **)&Cmn[0], ldc,
                                   batchCount);
    //    std::cout << " hipblas return code " <<(int)err<<std::endl;
    assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
    auto err = cublasCgemmBatched(gridblasHandle,
                                  CUBLAS_OP_N,
                                  CUBLAS_OP_N,
                                  m,n,k,
                                  (cuComplex *) &alpha_p[0],
                                  (cuComplex **)&Amk[0], lda,
                                  (cuComplex **)&Bkn[0], ldb,
                                  (cuComplex *) &beta_p[0],
                                  (cuComplex **)&Cmn[0], ldc,
                                  batchCount);
    assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
    //MKL's cblas_<T>gemm_batch & OneAPI
#warning "oneMKL implementation not built "
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
    // Need a default/reference implementation; accumulate in double as the real
    // variants do (the original declared a ComplexD accumulator but then mixed
    // types), with the same comma-operator and stride fixes as the ComplexD variant
    for (int p = 0; p < batchCount; ++p) {
      for (int mm = 0; mm < m; ++mm) {
        for (int nn = 0; nn < n; ++nn) {
          ComplexD c_mn(0.0);
          for (int kk = 0; kk < k; ++kk)
            c_mn += ComplexD(Amk[p][mm + kk*lda]) * ComplexD(Bkn[p][kk + nn*ldb]);
          Cmn[p][mm + nn*ldc] = alpha*ComplexF(c_mn) + beta*Cmn[p][mm + nn*ldc];
        }
      }
    }
#endif
    synchronise();
    RealD t1=usecond();
    RealD flops = 8.0*m*n*k*batchCount;
    RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n)*batchCount;
    std::cout <<GridLogPerformance<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
    std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
    std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
  }
  ///////////////////////////////////////////////////////////////////////////
  // Single precision real GEMM
  ///////////////////////////////////////////////////////////////////////////

  void gemmBatched(int m,int n, int k,
                   RealF alpha,
                   deviceVector<RealF*> &Amk,  // pointer list to matrices
                   deviceVector<RealF*> &Bkn,
                   RealF beta,
                   deviceVector<RealF*> &Cmn)
  {
    RealD t2=usecond();
    int32_t batchCount = Amk.size();
    // Use C-row major storage, so transpose calls
    int lda = m; // m x k column major
    int ldb = k; // k x n column major
    int ldc = m; // m x n column major
    static deviceVector<RealF> alpha_p(1);
    static deviceVector<RealF> beta_p(1);
    // can prestore the 1 and the zero on device
    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealF));
    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealF));
    RealD t0=usecond();
    //    std::cout << "hipblasSgemmBatched mnk "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
    assert(Bkn.size()==batchCount);
    assert(Cmn.size()==batchCount);
#ifdef GRID_HIP
    auto err = hipblasSgemmBatched(gridblasHandle,
                                   HIPBLAS_OP_N,
                                   HIPBLAS_OP_N,
                                   m,n,k,
                                   (float *) &alpha_p[0],
                                   (float **)&Amk[0], lda,
                                   (float **)&Bkn[0], ldb,
                                   (float *) &beta_p[0],
                                   (float **)&Cmn[0], ldc,
                                   batchCount);
    assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
    auto err = cublasSgemmBatched(gridblasHandle,
                                  CUBLAS_OP_N,
                                  CUBLAS_OP_N,
                                  m,n,k,
                                  (float *) &alpha_p[0],
                                  (float **)&Amk[0], lda,
                                  (float **)&Bkn[0], ldb,
                                  (float *) &beta_p[0],
                                  (float **)&Cmn[0], ldc,
                                  batchCount);
    assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
    //MKL's cblas_<T>gemm_batch & OneAPI
#warning "oneMKL implementation not built "
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
    // Need a default/reference implementation; same fixes as the complex variants,
    // keeping the double accumulator for single-precision products
    for (int p = 0; p < batchCount; ++p) {
      for (int mm = 0; mm < m; ++mm) {
        for (int nn = 0; nn < n; ++nn) {
          RealD c_mn(0.0);
          for (int kk = 0; kk < k; ++kk)
            c_mn += Amk[p][mm + kk*lda] * Bkn[p][kk + nn*ldb];
          Cmn[p][mm + nn*ldc] = alpha*RealF(c_mn) + beta*Cmn[p][mm + nn*ldc];
        }
      }
    }
#endif
    synchronise();
    RealD t1=usecond();
    RealD flops = 8.0*m*n*k*batchCount;
    RealD bytes = 1.0*sizeof(RealF)*(m*k+k*n+m*n)*batchCount;
    std::cout <<GridLogPerformance<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
    std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
    std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
  }
  ///////////////////////////////////////////////////////////////////////////
  // Double precision real GEMM
  ///////////////////////////////////////////////////////////////////////////

  void gemmBatched(int m,int n, int k,
                   RealD alpha,
                   deviceVector<RealD*> &Amk,  // pointer list to matrices
                   deviceVector<RealD*> &Bkn,
                   RealD beta,
                   deviceVector<RealD*> &Cmn)
  {
    RealD t2=usecond();
    int32_t batchCount = Amk.size();
    // Use C-row major storage, so transpose calls
    int lda = m; // m x k column major
    int ldb = k; // k x n column major
    int ldc = m; // m x n column major
    static deviceVector<RealD> alpha_p(1);
    static deviceVector<RealD> beta_p(1);
    // can prestore the 1 and the zero on device
    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealD));
    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealD));
    RealD t0=usecond();
    //    std::cout << "hipblasDgemmBatched mnk "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
    assert(Bkn.size()==batchCount);
    assert(Cmn.size()==batchCount);
#ifdef GRID_HIP
    auto err = hipblasDgemmBatched(gridblasHandle,
                                   HIPBLAS_OP_N,
                                   HIPBLAS_OP_N,
                                   m,n,k,
                                   (double *) &alpha_p[0],
                                   (double **)&Amk[0], lda,
                                   (double **)&Bkn[0], ldb,
                                   (double *) &beta_p[0],
                                   (double **)&Cmn[0], ldc,
                                   batchCount);
    assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
    auto err = cublasDgemmBatched(gridblasHandle,
                                  CUBLAS_OP_N,
                                  CUBLAS_OP_N,
                                  m,n,k,
                                  (double *) &alpha_p[0],
                                  (double **)&Amk[0], lda,
                                  (double **)&Bkn[0], ldb,
                                  (double *) &beta_p[0],
                                  (double **)&Cmn[0], ldc,
                                  batchCount);
    assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
    /*
    int64_t m64=m;
    int64_t n64=n;
    int64_t k64=k;
    int64_t batchCount64=batchCount;
    oneapi::mkl::blas::column_major::gemm_batch(*theGridAccelerator,
                                                oneapi::mkl::transpose::nontrans,  // was "onemkl::transpose::N", not a real namespace
                                                oneapi::mkl::transpose::nontrans,
                                                &m64,&n64,&k64,
                                                (double *) &alpha_p[0],
                                                (double **)&Amk[0], lda,
                                                (double **)&Bkn[0], ldb,
                                                (double *) &beta_p[0],
                                                (double **)&Cmn[0], ldc,
                                                1,&batchCount64);
    */
    //MKL's cblas_<T>gemm_batch & OneAPI
#warning "oneMKL implementation not built "
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
    // Need a default/reference implementation; same fixes as the complex variants
    for (int p = 0; p < batchCount; ++p) {
      for (int mm = 0; mm < m; ++mm) {
        for (int nn = 0; nn < n; ++nn) {
          RealD c_mn(0.0);
          for (int kk = 0; kk < k; ++kk)
            c_mn += Amk[p][mm + kk*lda] * Bkn[p][kk + nn*ldb];
          Cmn[p][mm + nn*ldc] = alpha*c_mn + beta*Cmn[p][mm + nn*ldc];
        }
      }
    }
#endif
    synchronise();
    RealD t1=usecond();
    RealD flops = 8.0*m*n*k*batchCount;
    RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
    std::cout <<GridLogPerformance<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
    std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
    std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
  }
  ////////////////////////////////////////////////////////////////////////////////////////////////
  // Strided case used by benchmark, but generally unused in Grid
  // Keep a code example in double complex, but don't generate the single and real variants for now
  ////////////////////////////////////////////////////////////////////////////////////////////////

  void gemmStridedBatched(int m,int n, int k,
                          ComplexD alpha,
                          ComplexD* Amk,  // pointer list to matrices
                          ComplexD* Bkn,
                          ComplexD beta,
                          ComplexD* Cmn,
                          int batchCount)
  {
    // Use C-row major storage, so transpose calls
    int lda = m; // m x k column major
    int ldb = k; // k x n column major
    int ldc = m; // m x n column major
    int sda = m*k;
    int sdb = k*n;
    int sdc = m*n;
    deviceVector<ComplexD> alpha_p(1);
    deviceVector<ComplexD> beta_p(1);
    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
    std::cout << "blasZgemmStridedBatched mnk "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
    std::cout << "blasZgemmStridedBatched ld  "<<lda<<","<<ldb<<","<<ldc<<std::endl;
    std::cout << "blasZgemmStridedBatched sd  "<<sda<<","<<sdb<<","<<sdc<<std::endl;
#ifdef GRID_HIP
    auto err = hipblasZgemmStridedBatched(gridblasHandle,
                                          HIPBLAS_OP_N,
                                          HIPBLAS_OP_N,
                                          m,n,k,
                                          (hipblasDoubleComplex *) &alpha_p[0],
                                          (hipblasDoubleComplex *) Amk, lda, sda,
                                          (hipblasDoubleComplex *) Bkn, ldb, sdb,
                                          (hipblasDoubleComplex *) &beta_p[0],
                                          (hipblasDoubleComplex *) Cmn, ldc, sdc,
                                          batchCount);
    assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
    auto err = cublasZgemmStridedBatched(gridblasHandle,
                                         CUBLAS_OP_N,
                                         CUBLAS_OP_N,
                                         m,n,k,
                                         (cuDoubleComplex *) &alpha_p[0],
                                         (cuDoubleComplex *) Amk, lda, sda,
                                         (cuDoubleComplex *) Bkn, ldb, sdb,
                                         (cuDoubleComplex *) &beta_p[0],
                                         (cuDoubleComplex *) Cmn, ldc, sdc,
                                         batchCount);
    assert(err==CUBLAS_STATUS_SUCCESS);  // return code was previously ignored on the CUDA path
#endif
#ifdef GRID_SYCL
#warning "oneMKL implementation not made "
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
    // Need a default/reference implementation; use the host alpha/beta directly
    for (int p = 0; p < batchCount; ++p) {
      for (int mm = 0; mm < m; ++mm) {
        for (int nn = 0; nn < n; ++nn) {
          ComplexD c_mn(0.0);
          for (int kk = 0; kk < k; ++kk)   // comma-operator typo fixed
            c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
          Cmn[mm + nn*ldc + p*sdc] = alpha*c_mn + beta*Cmn[mm + nn*ldc + p*sdc];
        }
      }
    }
#endif
  }
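  // Hedged sketch of the dense-packed layout gemmStridedBatched assumes: batch p sits
  // at a fixed offset from a single base pointer, so no pointer table is needed:
  //   deviceVector<ComplexD> A(batch*m*k), B(batch*k*n), C(batch*m*n);
  //   // batch p occupies A[p*m*k ...], B[p*k*n ...], C[p*m*n ...]
  //   blas.gemmStridedBatched(m,n,k, ComplexD(1.0), &A[0], &B[0], ComplexD(0.0), &C[0], batch);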
};

NAMESPACE_END(Grid);
@@ -223,7 +223,7 @@ public:
     text+=usecond();
     ttot+=usecond();

-    std::cout << GridLogPerformance<<"Coarse Mult Aviews "<<tviews<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse 1rhs Mult Aviews "<<tviews<<" us"<<std::endl;
     std::cout << GridLogPerformance<<"Coarse Mult exch "<<texch<<" us"<<std::endl;
     std::cout << GridLogPerformance<<"Coarse Mult mult "<<tmult<<" us"<<std::endl;
     std::cout << GridLogPerformance<<" of which mult2 "<<tmult2<<" us"<<std::endl;
@@ -232,8 +232,9 @@ public:
     std::cout << GridLogPerformance<<"Coarse Mult copy "<<tcopy<<" us"<<std::endl;
     std::cout << GridLogPerformance<<"Coarse Mult tot  "<<ttot<<" us"<<std::endl;
     //  std::cout << GridLogPerformance<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Kernel flops "<< flops<<std::endl;
     std::cout << GridLogPerformance<<"Coarse Kernel flop/s "<< flops/tmult<<" mflop/s"<<std::endl;
-    std::cout << GridLogPerformance<<"Coarse Kernel bytes/s"<< bytes/tmult<<" MB/s"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Kernel bytes/s "<< bytes/tmult<<" MB/s"<<std::endl;
     std::cout << GridLogPerformance<<"Coarse overall flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
     std::cout << GridLogPerformance<<"Coarse total bytes   "<< bytes/1e6<<" MB"<<std::endl;
@@ -420,11 +421,6 @@ public:
       tinv+=usecond();
     }

-    for(int p=0;p<geom.npoint;p++){
-      Coordinate coor({0,0,0,0,0});
-      auto sval = peekSite(_A[p],coor);
-    }
-
     // Only needed if nonhermitian
     if ( ! hermitian ) {
       std::cout << GridLogMessage<<"PopulateAdag "<<std::endl;
@@ -27,8 +27,26 @@ Author: Peter Boyle <pboyle@bnl.gov>
 /*  END LEGAL */
 #pragma once

+#include <Grid/algorithms/multigrid/BatchedBlas.h>
+
 NAMESPACE_BEGIN(Grid);

+
+// Move this to accelerator.h
+// Also give a copy device.
+// Rename acceleratorPut
+// Rename acceleratorGet
+template<class T> void deviceSet(T& dev,T&host)
+{
+  acceleratorCopyToDevice(&host,&dev,sizeof(T));
+}
+template<class T> T deviceGet(T& dev)
+{
+  T host;
+  acceleratorCopyFromDevice(&dev,&host,sizeof(T));
+  return host;
+}
+
 // Fine Object == (per site) type of fine field
 // nbasis       == number of deflation vectors
 template<class Fobj,class CComplex,int nbasis>
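// Hedged usage sketch for deviceSet/deviceGet above: they move a single POD value
// between host and a device-resident slot, which is how the pointer tables below
// are populated (names illustrative only):
//   deviceVector<ComplexD *> table(n);
//   ComplexD *p = /* some device address */ nullptr;
//   deviceSet(table[0],p);              // write host pointer value into device slot
//   ComplexD *q = deviceGet(table[0]);  // read it back, e.g. for debugging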
@@ -40,6 +58,7 @@ public:

   typedef iVector<CComplex,nbasis > siteVector;
   typedef iMatrix<CComplex,nbasis > siteMatrix;
+  typedef iVector<SComplex,nbasis > calcVector;
   typedef iMatrix<SComplex,nbasis > calcMatrix;
   typedef Lattice<iScalar<CComplex> > CoarseComplexField;
   typedef Lattice<siteVector> CoarseVector;
@@ -59,10 +78,14 @@ public:
   NonLocalStencilGeometry geom;
   PaddedCell Cell;
   GeneralLocalStencil Stencil;

   std::vector<deviceVector<calcMatrix> > _A;
   std::vector<CoarseVector> MultTemporaries;
+  deviceVector<GeneralStencilEntryReordered> StencilMasked;

+  deviceVector<calcVector> BLAS_B;
+  deviceVector<calcVector> BLAS_C;
+  std::vector<deviceVector<calcMatrix> > BLAS_A;

+  std::vector<deviceVector<ComplexD *> > BLAS_AP;
+  std::vector<deviceVector<ComplexD *> > BLAS_BP;
+  deviceVector<ComplexD *> BLAS_CP;

   ///////////////////////
   // Interface
@@ -76,58 +99,209 @@ public:
     _CoarseGridMulti(CoarseGridMulti),
     geom(_CoarseGridMulti,Op.geom.hops,Op.geom.skip+1),
     Cell(Op.geom.Depth(),_CoarseGridMulti),
-    Stencil(Cell.grids.back(),geom.shifts)
+    Stencil(Cell.grids.back(),geom.shifts) // padded cell stencil
   {
     _A.resize(geom.npoint);
     int32_t padded_sites   = _Op._A[0].Grid()->lSites();
     int32_t unpadded_sites = _CoarseGrid->lSites();

     int32_t nrhs  = CoarseGridMulti->FullDimensions()[0];  // # RHS
     int32_t orhs  = nrhs/CComplex::Nsimd();

     /////////////////////////////////////////////////
     // Device data vector storage
     /////////////////////////////////////////////////
     BLAS_A.resize(geom.npoint);
     for(int p=0;p<geom.npoint;p++){
       _A[p].resize(padded_sites);
       BLAS_A[p].resize (unpadded_sites); // no ghost zone, npoint elements
     }
     std::cout << GridLogMessage<<"MultiGeneralCoarsenedMatrix "<<_CoarseGrid->lSites()<<" coarse sites "<<_Op._A[0].Grid()->lSites() <<std::endl;
     BLAS_B.resize(nrhs *padded_sites);   // includes ghost zone
     BLAS_C.resize(nrhs *unpadded_sites); // no ghost zone

     StencilMasked.resize(_CoarseGridMulti->oSites()*geom.npoint);
     std::vector<GeneralStencilEntryReordered> StencilTmp;
     BLAS_AP.resize(geom.npoint);
     BLAS_BP.resize(geom.npoint);
     for(int p=0;p<geom.npoint;p++){
       BLAS_AP[p].resize(unpadded_sites);
       BLAS_BP[p].resize(unpadded_sites);
     }
     BLAS_CP.resize(unpadded_sites);

-    int32_t j=0;
-    int32_t sites = Stencil._entries.size()/geom.npoint;
-    for(int32_t s=0;s<sites;s++){
     /////////////////////////////////////////////////
     // Pointers to data
     /////////////////////////////////////////////////

     // Site identity mapping for A, C
     for(int p=0;p<geom.npoint;p++){
       for(int ss=0;ss<unpadded_sites;ss++){
         ComplexD *ptr = (ComplexD *)&BLAS_A[p][ss];
         //ComplexD *ptr = (ComplexD *)&BLAS_A[p][0]; std::cout << " A ptr "<<std::hex<<ptr<<std::dec<<" "<<ss<<"/"<<BLAS_A[p].size()<<std::endl;
         deviceSet(BLAS_AP[p][ss],ptr);
       }
     }
     for(int ss=0;ss<unpadded_sites;ss++){
       ComplexD *ptr = (ComplexD *)&BLAS_C[ss*nrhs];
       //ComplexD *ptr = (ComplexD *)&BLAS_C[0]; std::cout << " C ptr "<<std::hex<<ptr<<std::dec<<" "<<ss<<"/"<<BLAS_C.size()<<std::endl;
       deviceSet(BLAS_CP[ss],ptr);
     }

     /////////////////////////////////////////////////
     // Neighbour table is more complicated
     /////////////////////////////////////////////////
     int32_t j=0; // Interior point counter (unpadded)
     for(int32_t s=0;s<padded_sites;s++){ // 4 volume, padded
       int ghost_zone=0;
       for(int32_t point = 0 ; point < geom.npoint; point++){
-        int i=s*geom.npoint+point;
-        if( Stencil._entries[i]._wrap ) {
-          ghost_zone=1;
+        int i=s*orhs*geom.npoint+point;
+        if( Stencil._entries[i]._wrap ) { // stencil is indexed by the oSite of the CoarseGridMulti, hence orhs factor
+          ghost_zone=1; // If general stencil wrapped in any direction, wrap=1
         }
       }
-      // std::cout << "site " <<s<<"/"<<sites <<" ghost_zone "<<ghost_zone<<std::endl;
-      GeneralStencilEntryReordered tmp;
+      //      GeneralStencilEntryReordered tmp;
       if( ghost_zone==0) {
        for(int32_t point = 0 ; point < geom.npoint; point++){
-         int i=s*geom.npoint+point;
-         tmp._offset = Stencil._entries[i]._offset;
-         tmp._wrap= Stencil._entries[i]._wrap; // Should be no premute and j=site
-         tmp._input = s;
-         StencilTmp.push_back(tmp);
+         int i=s*orhs*geom.npoint+point;
+         int32_t nbr = Stencil._entries[i]._offset*CComplex::Nsimd(); // oSite -> lSite
+         // std::cout << " B ptr "<< nbr<<"/"<<BLAS_B.size()<<std::endl;
+         assert(nbr<BLAS_B.size());
+         ComplexD * ptr = (ComplexD *)&BLAS_B[nbr];
+         // ComplexD * ptr = (ComplexD *)&BLAS_B[0];
+         // std::cout << " B ptr unpadded "<<std::hex<<ptr<<std::dec<<" "<<s<<"/"<<padded_sites<<std::endl;
+         // std::cout << " B ptr   padded "<<std::hex<<ptr<<std::dec<<" "<<j<<"/"<<unpadded_sites<<std::endl;
+         deviceSet(BLAS_BP[point][j],ptr); // neighbour indexing in ghost zone volume
+         // auto tmp = deviceGet(*BLAS_BP[point][j]); // debug trigger SEGV if bad ptr
        }
        j++;
       }
     }
     std::cout << " oSites     " << _CoarseGridMulti->oSites()<<std::endl;
     std::cout << " npoint     " << geom.npoint<<std::endl;
     std::cout << " StencilTmp "<<StencilTmp.size()<<std::endl;

     assert(_CoarseGridMulti->oSites()*geom.npoint==StencilTmp.size());
     acceleratorCopyToDevice(&StencilTmp[0],&StencilMasked[0],sizeof(GeneralStencilEntryReordered)*StencilTmp.size());
     assert(j==unpadded_sites);
     CopyMatrix();
   }
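  // Hedged summary sketch of the pointer tables the constructor just built, per
  // unpadded coarse site ss and stencil point p (consistent with the loops above):
  //   BLAS_AP[p][ss] -> &BLAS_A[p][ss]      // nbasis x nbasis link matrix, unpadded volume
  //   BLAS_BP[p][ss] -> &BLAS_B[nbr(ss,p)]  // neighbour's nbasis x nrhs block, padded volume
  //   BLAS_CP[ss]    -> &BLAS_C[ss*nrhs]    // nbasis x nrhs accumulator, unpadded volume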
  template<class vobj> void GridtoBLAS(const Lattice<vobj> &from,deviceVector<typename vobj::scalar_object> &to)
  {
#if 0
    std::vector<typename vobj::scalar_object> tmp;
    unvectorizeToLexOrdArray(tmp,from);
    assert(tmp.size()==from.Grid()->lSites());
    assert(tmp.size()==to.size());
    to.resize(tmp.size());
    acceleratorCopyToDevice(&tmp[0],&to[0],sizeof(typename vobj::scalar_object)*tmp.size());
#else
    typedef typename vobj::scalar_object sobj;
    typedef typename vobj::scalar_type scalar_type;
    typedef typename vobj::vector_type vector_type;

    GridBase *Fg = from.Grid();
    assert(!Fg->_isCheckerBoarded);
    int nd = Fg->_ndimension;

    to.resize(Fg->lSites());

    Coordinate LocalLatt = Fg->LocalDimensions();
    size_t nsite = 1;
    for(int i=0;i<nd;i++) nsite *= LocalLatt[i];

    ////////////////////////////////////////////////////////////////////////////////////////////////
    // do the index calc on the GPU
    ////////////////////////////////////////////////////////////////////////////////////////////////
    Coordinate f_ostride = Fg->_ostride;
    Coordinate f_istride = Fg->_istride;
    Coordinate f_rdimensions = Fg->_rdimensions;

    autoView(from_v,from,AcceleratorRead);
    auto to_v = &to[0];

    const int words=sizeof(vobj)/sizeof(vector_type);
    accelerator_for(idx,nsite,1,{

        Coordinate from_coor, base;
        Lexicographic::CoorFromIndex(base,idx,LocalLatt);
        for(int i=0;i<nd;i++){
          from_coor[i] = base[i];
        }
        int from_oidx = 0; for(int d=0;d<nd;d++) from_oidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
        int from_lane = 0; for(int d=0;d<nd;d++) from_lane+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);

        const vector_type* from = (const vector_type *)&from_v[from_oidx];
        scalar_type* to = (scalar_type *)&to_v[idx];

        scalar_type stmp;
        for(int w=0;w<words;w++){
          stmp = getlane(from[w], from_lane);
          to[w] = stmp;
        }
    });
#endif
  }
  template<class vobj> void BLAStoGrid(Lattice<vobj> &grid,deviceVector<typename vobj::scalar_object> &in)
  {
#if 0
    std::vector<typename vobj::scalar_object> tmp;
    tmp.resize(in.size());
    //    std::cout << "BLAStoGrid volume " <<tmp.size()<<" "<< grid.Grid()->lSites()<<std::endl;
    assert(in.size()==grid.Grid()->lSites());
    acceleratorCopyFromDevice(&in[0],&tmp[0],sizeof(typename vobj::scalar_object)*in.size());
    vectorizeFromLexOrdArray(tmp,grid);
#else
    typedef typename vobj::scalar_object sobj;
    typedef typename vobj::scalar_type scalar_type;
    typedef typename vobj::vector_type vector_type;

    GridBase *Tg = grid.Grid();
    assert(!Tg->_isCheckerBoarded);
    int nd = Tg->_ndimension;

    assert(in.size()==Tg->lSites());

    Coordinate LocalLatt = Tg->LocalDimensions();
    size_t nsite = 1;
    for(int i=0;i<nd;i++) nsite *= LocalLatt[i];

    ////////////////////////////////////////////////////////////////////////////////////////////////
    // do the index calc on the GPU
    ////////////////////////////////////////////////////////////////////////////////////////////////
    Coordinate t_ostride = Tg->_ostride;
    Coordinate t_istride = Tg->_istride;
    Coordinate t_rdimensions = Tg->_rdimensions;

    autoView(to_v,grid,AcceleratorWrite);
    auto from_v = &in[0];

    const int words=sizeof(vobj)/sizeof(vector_type);
    accelerator_for(idx,nsite,1,{

        Coordinate to_coor, base;
        Lexicographic::CoorFromIndex(base,idx,LocalLatt);
        for(int i=0;i<nd;i++){
          to_coor[i] = base[i];
        }
        int to_oidx = 0; for(int d=0;d<nd;d++) to_oidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
        int to_lane = 0; for(int d=0;d<nd;d++) to_lane+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);

        vector_type* to = (vector_type *)&to_v[to_oidx];
        scalar_type* from = (scalar_type *)&from_v[idx];

        scalar_type stmp;
        for(int w=0;w<words;w++){
          stmp=from[w];
          putlane(to[w], stmp, to_lane);
        }
    });
#endif
  }
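  // Hedged round-trip sketch for the two converters above (v is a CoarseVector):
  //   GridtoBLAS(v,BLAS_C);  // SIMD-vectorised lattice -> dense lexicographic device array
  //   BLAStoGrid(v,BLAS_C);  // dense device array -> lattice; v is reconstructed unchanged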
  void CopyMatrix (void)
  {
    // Clone "A" to be lexicographic in the physics coords
    // Use unvectorisetolexordarray
    // Copy to device
    std::vector<calcMatrix> tmp;
    for(int p=0;p<geom.npoint;p++){
      unvectorizeToLexOrdArray(tmp,_Op._A[p]);
      acceleratorCopyToDevice(&tmp[0],&_A[p][0],sizeof(calcMatrix)*tmp.size());
      //Unpadded
      auto Aup = _Op.Cell.Extract(_Op._A[p]);
      //      Coordinate coor({0,0,0,0,0});
      //      auto sval = peekSite(Aup,coor);
      //      std::cout << "CopyMatrix: p "<<p<<" Aup[0] :"<<sval<<std::endl;
      //      sval = peekSite(_Op._A[p],coor);
      //      std::cout << "CopyMatrix: p "<<p<<" _Op._Ap[0] :"<<sval<<std::endl;
      GridtoBLAS(Aup,BLAS_A[p]);
      //      std::cout << "Copy Matrix p "<<p<<" "<< deviceGet(BLAS_A[p][0])<<std::endl;
    }
  }
  void Mdag(const CoarseVector &in, CoarseVector &out)
@@ -136,18 +310,23 @@ public:
   }
   void M (const CoarseVector &in, CoarseVector &out)
   {
-    RealD tviews=0; RealD ttot=0; RealD tmult=0; RealD texch=0; RealD text=0; RealD ttemps=0; RealD tcopy=0;
-    RealD tmult2=0;
-
-    ttot=-usecond();
     std::cout << GridLogMessage << "New Mrhs coarse"<<std::endl;
     conformable(CoarseGrid(),in.Grid());
     conformable(in.Grid(),out.Grid());
     out.Checkerboard() = in.Checkerboard();
-    CoarseVector tin=in;
-
-    texch-=usecond();
-    CoarseVector pin = Cell.ExchangePeriodic(tin);
-    texch+=usecond();
+    RealD t_tot;
+    RealD t_exch;
+    RealD t_GtoB;
+    RealD t_BtoG;
+    RealD t_mult;
+
+    t_tot=-usecond();
+    CoarseVector tin=in;
+    t_exch=-usecond();
+    CoarseVector pin = Cell.ExchangePeriodic(tin); //padded input
+    t_exch+=usecond();

     CoarseVector pout(pin.Grid());

     int npoint = geom.npoint;
@@ -157,192 +336,61 @@ public:
     const int Nsimd = CComplex::Nsimd();

     RealD flops,bytes;
-    int64_t nrhs  =pin.Grid()->GlobalDimensions()[0]/Nsimd;
     int64_t osites=in.Grid()->oSites(); // unpadded
     int64_t unpadded_vol = _CoarseGrid->lSites();

     flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd();
     bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0]
           + 2.0*osites*sizeof(siteVector)*npoint;

+    int64_t nrhs  =pin.Grid()->GlobalDimensions()[0];
+    assert(nrhs>=1);

-#if 0
-    {
-      tviews-=usecond();
-      autoView( in_v , pin, AcceleratorRead);
-      autoView( out_v , pout, AcceleratorWriteDiscard);
-      tviews+=usecond();
+    std::cout << GridLogMessage << "New Mrhs GridtoBLAS in sizes "<<in.Grid()->lSites()<<" "<<pin.Grid()->lSites()<<std::endl;
+    t_GtoB=-usecond();
+    GridtoBLAS(pin,BLAS_B);
+    //    out = Zero();
+    //    GridtoBLAS(out,BLAS_C);
+    t_GtoB+=usecond();

-      // Static and prereserve to keep UVM region live and not resized across multiple calls
-      ttemps-=usecond();
-      MultTemporaries.resize(npoint,in.Grid());
-      ttemps+=usecond();
+    GridBLAS BLAS;

-      std::vector<Aview> AcceleratorViewContainer_h;
-      std::vector<Vview> AcceleratorVecViewContainer_h;
-
-      tviews-=usecond();
-      for(int p=0;p<npoint;p++) {
-        AcceleratorViewContainer_h.push_back( &_A[p][0]);
-        AcceleratorVecViewContainer_h.push_back(MultTemporaries[p].View(AcceleratorWrite));
-      }
-      tviews+=usecond();
-
-      static deviceVector<Aview> AcceleratorViewContainer;    AcceleratorViewContainer.resize(npoint);
-      static deviceVector<Vview> AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(npoint);
-
-      auto Aview_p = &AcceleratorViewContainer[0];
-      auto Vview_p = &AcceleratorVecViewContainer[0];
-
-      tcopy-=usecond();
-      acceleratorCopyToDevice(&AcceleratorViewContainer_h[0],&AcceleratorViewContainer[0],npoint *sizeof(Aview));
-      acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],npoint *sizeof(Vview));
-      tcopy+=usecond();
-
-      int32_t bound = _A[0].size();
-      int64_t osites=pin.Grid()->oSites();
-      flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd();
-      bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0]
-            + 2.0*osites*sizeof(siteVector)*npoint;
-
-      // std::cout << " osites "<<osites <<" bound "<<bound<<std::endl;
-      // std::cout << " padded local dims "<<pin.Grid()->LocalDimensions()<<std::endl;
-      // std::cout << " unpadded local dims "<<in.Grid()->LocalDimensions()<<std::endl;
-      tmult-=usecond();
-      autoView( Stencil_v , Stencil, AcceleratorRead);
-      accelerator_for(rspb, osites*nbasis*npoint, Nsimd, {
-          typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
-          int32_t ss   = rspb/(nbasis*npoint);
-          int32_t bp   = rspb%(nbasis*npoint);
-          int32_t point= bp/nbasis;
-          int32_t b    = bp%nbasis;
-          assert(ss<bound);
-          auto SE  = Stencil_v.GetEntry(point,ss);
-          if ( SE->_permute == 0 ) {
-            int32_t snbr= SE->_offset;
-            auto nbr = coalescedReadGeneralPermute(in_v[snbr],SE->_permute,Nd);
-            auto res = Aview_p[point][ss](0,b)*nbr(0);
-            for(int bb=1;bb<nbasis;bb++) {
-              res = res + Aview_p[point][ss](bb,b)*nbr(bb);
-            }
-            coalescedWrite(Vview_p[point][ss](b),res);
-          }
-      });
-      tmult2-=usecond();
-      accelerator_for(sb, osites*nbasis, Nsimd, {
-          int ss = sb/nbasis;
-          int b  = sb%nbasis;
-          auto res = coalescedRead(Vview_p[0][ss](b));
-          for(int point=1;point<npoint;point++){
-            res = res + coalescedRead(Vview_p[point][ss](b));
-          }
-          coalescedWrite(out_v[ss](b),res);
-      });
-      tmult2+=usecond();
-      tmult+=usecond();
-
-      for(int p=0;p<npoint;p++) {
-        AcceleratorVecViewContainer_h[p].ViewClose();
-      }
+    t_mult=-usecond();
+    for(int p=0;p<geom.npoint;p++){
+      RealD c = 1.0;
+      if (p==0) c = 0.0;
+      ComplexD beta(c);
+      // std::cout << GridLogMessage << "New Mrhs coarse gemmBatched "<<p<<std::endl;
+      BLAS.gemmBatched(nbasis,nrhs,nbasis,
+                       ComplexD(1.0),
+                       BLAS_AP[p],
+                       BLAS_BP[p],
+                       ComplexD(c),
+                       BLAS_CP);
+    }

-      text-=usecond();
-      out = Cell.Extract(pout);
-      text+=usecond();
-      ttot+=usecond();
-#else
-    {
-      tviews-=usecond();
-      autoView( in_v , pin, AcceleratorRead);
-      autoView( out_v , out, AcceleratorWriteDiscard);
-      tviews+=usecond();
-
-      // Static and prereserve to keep UVM region live and not resized across multiple calls
-      ttemps-=usecond();
-      MultTemporaries.resize(npoint,in.Grid());
-      ttemps+=usecond();
-
-      std::vector<Aview> AcceleratorViewContainer_h;
-      std::vector<Vview> AcceleratorVecViewContainer_h;
-
-      tviews-=usecond();
-      for(int p=0;p<npoint;p++) {
-        AcceleratorViewContainer_h.push_back( &_A[p][0]);
-        AcceleratorVecViewContainer_h.push_back(MultTemporaries[p].View(AcceleratorWrite));
-      }
-      tviews+=usecond();
-
-      static deviceVector<Aview> AcceleratorViewContainer;    AcceleratorViewContainer.resize(npoint);
-      static deviceVector<Vview> AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(npoint);
-
-      auto Aview_p = &AcceleratorViewContainer[0];
-      auto Vview_p = &AcceleratorVecViewContainer[0];
-
-      tcopy-=usecond();
-      acceleratorCopyToDevice(&AcceleratorViewContainer_h[0],&AcceleratorViewContainer[0],npoint *sizeof(Aview));
-      acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],npoint *sizeof(Vview));
-      tcopy+=usecond();
-
-      int32_t bound = _A[0].size();
-      int64_t osites=in.Grid()->oSites();
-      flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd();
-      bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0]
-            + 2.0*osites*sizeof(siteVector)*npoint;
-
-      // std::cout << " osites "<<osites <<" bound "<<bound<< " stencilsize "<<StencilMasked.size()<<std::endl;
-      // std::cout << " padded local dims "<<pin.Grid()->LocalDimensions()<<std::endl;
-      // std::cout << " unpadded local dims "<<in.Grid()->LocalDimensions()<<std::endl;
-      tmult-=usecond();
-      auto Stencil_v = &StencilMasked[0];
-      accelerator_for(rspb, StencilMasked.size()*nbasis, Nsimd, {
-          typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
-          int32_t ss   = rspb/(nbasis*npoint); // site of unpadded
-          int32_t bp   = rspb%(nbasis*npoint);
-          int32_t point= bp/nbasis;
-          int32_t b    = bp%nbasis;
-          auto SE  = &Stencil_v[ss*npoint+point];
-          int32_t s   = SE->_input;  // site of padded
-          int32_t snbr= SE->_offset;
-          auto nbr = coalescedRead(in_v[snbr]);
-          auto res = Aview_p[point][s](0,b)*nbr(0);
-          for(int bb=1;bb<nbasis;bb++) {
-            res = res + Aview_p[point][s](bb,b)*nbr(bb);
-          }
-          // std::cout << " unpadded " << ss<<" padded " << s<< " point "<<point <<" row " <<b<<" "<< innerProduct(res,res) <<std::endl;
-          // std::cout << " unpadded " << ss<<" point "<<point <<" row " <<b<<" res "<< innerProduct(res,res) <<std::endl;
-          // std::cout << " unpadded " << ss<<" point "<<point <<" row " <<b<<" nbrIP "<< innerProduct(nbr,nbr) <<std::endl;
-          // std::cout << " unpadded " << ss<<" point "<<point <<" row " <<b<<" nbr "<< nbr <<std::endl;
-          // std::cout << " unpadded " << ss<<" point "<<point <<" row " <<b<<" nbr "<< in_v[snbr] <<std::endl;
-          // std::cout << " unpadded " << ss<<" point "<<point <<" row " <<b<<" A "<< innerProduct(Aview_p[point][s],Aview_p[point][s]) <<std::endl;
-          coalescedWrite(Vview_p[point][ss](b),res);
-      });
-      tmult2-=usecond();
-      accelerator_for(sb, osites*nbasis, Nsimd, {
-          int ss = sb/nbasis;
-          int b  = sb%nbasis;
-          auto res = coalescedRead(Vview_p[0][ss](b));
-          for(int point=1;point<npoint;point++){
-            res = res + coalescedRead(Vview_p[point][ss](b));
-          }
-          coalescedWrite(out_v[ss](b),res);
-      });
-      tmult2+=usecond();
-      tmult+=usecond();
-      for(int p=0;p<npoint;p++) {
-        AcceleratorVecViewContainer_h[p].ViewClose();
-      }
-    }
-    ttot+=usecond();
-#endif
-
-    std::cout << GridLogMessage<<"Coarse Mult Aviews "<<tviews<<" us"<<std::endl;
-    std::cout << GridLogMessage<<"Coarse Mult exch "<<texch<<" us"<<std::endl;
-    std::cout << GridLogMessage<<"Coarse Mult mult "<<tmult<<" us"<<std::endl;
-    std::cout << GridLogMessage<<" of which mult2 "<<tmult2<<" us"<<std::endl;
-    std::cout << GridLogMessage<<"Coarse Mult ext  "<<text<<" us"<<std::endl;
-    std::cout << GridLogMessage<<"Coarse Mult temps "<<ttemps<<" us"<<std::endl;
-    std::cout << GridLogMessage<<"Coarse Mult copy "<<tcopy<<" us"<<std::endl;
-    std::cout << GridLogMessage<<"Coarse Mult tot  "<<ttot<<" us"<<std::endl;
-    // std::cout << GridLogMessage<<std::endl;
-    // std::cout << GridLogMessage<<"Coarse Kernel flop/s "<< flops/tmult<<" mflop/s"<<std::endl;
-    // std::cout << GridLogMessage<<"Coarse Kernel bytes/s"<< bytes/tmult<<" MB/s"<<std::endl;
-    // std::cout << GridLogMessage<<"Coarse overall flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
-    // std::cout << GridLogMessage<<"Coarse total bytes    "<< bytes/1e6<<" MB"<<std::endl;
-
+    t_mult+=usecond();
+    // std::cout << GridLogMessage << "New Mrhs coarse BLAStoGrid "<<std::endl;
+    t_BtoG=-usecond();
+    BLAStoGrid(out,BLAS_C);
+    t_BtoG+=usecond();
+    t_tot+=usecond();
+    //  auto check =deviceGet(BLAS_C[0]);
+    //  std::cout << "C[0] "<<check<<std::endl;
+    //  Coordinate coor({0,0,0,0,0,0});
+    //  peekLocalSite(check,out,coor);
+    //  std::cout << "C[0] "<< check<<std::endl;
+    std::cout << GridLogMessage << "New Mrhs coarse DONE "<<std::endl;
+    std::cout << GridLogMessage<<"Coarse Mult exch "<<t_exch<<" us"<<std::endl;
+    std::cout << GridLogMessage<<"Coarse Mult mult "<<t_mult<<" us"<<std::endl;
+    std::cout << GridLogMessage<<"Coarse Mult GtoB "<<t_GtoB<<" us"<<std::endl;
+    std::cout << GridLogMessage<<"Coarse Mult BtoG "<<t_BtoG<<" us"<<std::endl;
+    std::cout << GridLogMessage<<"Coarse Mult tot  "<<t_tot<<" us"<<std::endl;
+    std::cout << GridLogMessage<<std::endl;
+    std::cout << GridLogMessage<<"Coarse Kernel flops "<< flops<<std::endl;
+    std::cout << GridLogMessage<<"Coarse Kernel flop/s "<< flops/t_mult<<" mflop/s"<<std::endl;
+    std::cout << GridLogMessage<<"Coarse Kernel bytes/s "<< bytes/t_mult/1000<<" GB/s"<<std::endl;
+    std::cout << GridLogMessage<<"Coarse overall flops/s "<< flops/t_tot<<" mflop/s"<<std::endl;
+    std::cout << GridLogMessage<<"Coarse total bytes   "<< bytes/1e6<<" MB"<<std::endl;
   };
   virtual void Mdiag    (const Field &in, Field &out){ assert(0);};
   virtual void Mdir     (const Field &in, Field &out,int dir, int disp){assert(0);};
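  // Hedged sketch of the accumulation trick in the new M() above: the first stencil
  // point writes C with beta=0, every later point accumulates with beta=1, so no
  // separate reduction over stencil points is needed:
  //   BLAS.gemmBatched(nbasis,nrhs,nbasis, ComplexD(1.0), BLAS_AP[0],BLAS_BP[0], ComplexD(0.0), BLAS_CP); // C  = A_0 B_0
  //   for(int p=1;p<npoint;p++)
  //     BLAS.gemmBatched(nbasis,nrhs,nbasis, ComplexD(1.0), BLAS_AP[p],BLAS_BP[p], ComplexD(1.0), BLAS_CP); // C += A_p B_p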
@@ -29,6 +29,7 @@ Author: Peter Boyle <pboyle@bnl.gov>

 #include <Grid/algorithms/multigrid/Aggregates.h>
 #include <Grid/algorithms/multigrid/Geometry.h>
+#include <Grid/algorithms/multigrid/BatchedBlas.h>
 #include <Grid/algorithms/multigrid/CoarsenedMatrix.h>
 #include <Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h>
 #include <Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h>
@@ -745,6 +745,9 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
   typedef typename vobj::scalar_type scalar_type;
   typedef typename vobj::vector_type vector_type;

+  ////////////////////////////////////////////////////////////////////////////////////////////////
+  // the checks should guarantee that the operations are local
+  ////////////////////////////////////////////////////////////////////////////////////////////////
   GridBase *Fg = From.Grid();
   GridBase *Tg = To.Grid();
   assert(!Fg->_isCheckerBoarded);
@@ -758,41 +761,12 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
   for(int d=0;d<nd;d++){
     assert(Fg->_processors[d]  == Tg->_processors[d]);
   }
-  // the above should guarantee that the operations are local
-
-#if 1
   size_t nsite = 1;
   for(int i=0;i<nd;i++) nsite *= RegionSize[i];

-  size_t tbytes = 4*nsite*sizeof(int);
-  int *table = (int*)malloc(tbytes);
-
-  RealD t_cpu=-usecond();
-#if 0
-  thread_for(idx, nsite, {
-      Coordinate from_coor, to_coor;
-      size_t rem = idx;
-      for(int i=0;i<nd;i++){
-        size_t base_i  = rem % RegionSize[i]; rem /= RegionSize[i];
-        from_coor[i] = base_i + FromLowerLeft[i];
-        to_coor[i] = base_i + ToLowerLeft[i];
-      }
-
-      int foidx = Fg->oIndex(from_coor);
-      int fiidx = Fg->iIndex(from_coor);
-      int toidx = Tg->oIndex(to_coor);
-      int tiidx = Tg->iIndex(to_coor);
-      int* tt = table + 4*idx;
-      tt[0] = foidx;
-      tt[1] = fiidx;
-      tt[2] = toidx;
-      tt[3] = tiidx;
-  });
-
-  int* table_d = (int*)acceleratorAllocDevice(tbytes);
-  acceleratorCopyToDevice(table,table_d,tbytes);
-#else
-  int* table_d = (int*)acceleratorAllocDevice(tbytes);
   ////////////////////////////////////////////////////////////////////////////////////////////////
   // do the index calc on the GPU
   ////////////////////////////////////////////////////////////////////////////////////////////////
   Coordinate f_ostride = Fg->_ostride;
   Coordinate f_istride = Fg->_istride;
   Coordinate f_rdimensions = Fg->_rdimensions;
@@ -800,105 +774,35 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
   Coordinate t_ostride = Tg->_ostride;
   Coordinate t_istride = Tg->_istride;
   Coordinate t_rdimensions = Tg->_rdimensions;

-  accelerator_for(idx, nsite, 1, {
-      Coordinate from_coor, to_coor;
-      size_t rem = idx;
-      for(int i=0;i<nd;i++){
-        size_t base_i  = rem % RegionSize[i]; rem /= RegionSize[i];
-        from_coor[i] = base_i + FromLowerLeft[i];
-        to_coor[i] = base_i + ToLowerLeft[i];
-      }
-      int foidx = 0; for(int d=0;d<nd;d++) foidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
-      int fiidx = 0; for(int d=0;d<nd;d++) fiidx+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
-      int toidx = 0; for(int d=0;d<nd;d++) toidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
-      int tiidx = 0; for(int d=0;d<nd;d++) tiidx+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);
-      int* tt = table_d + 4*idx;
-      tt[0] = foidx;
-      tt[1] = fiidx;
-      tt[2] = toidx;
-      tt[3] = tiidx;
-  });
-#endif
-  t_cpu+=usecond();
-
-  typedef typename vobj::vector_type vector_type;
-  typedef typename vobj::scalar_type scalar_type;
-
   autoView(from_v,From,AcceleratorRead);
   autoView(to_v,To,AcceleratorWrite);
-  RealD t_acc=-usecond();

   const int words=sizeof(vobj)/sizeof(vector_type);
-  accelerator_for(idx,nsite,words,{
-      int* tt = table_d + 4*idx;
-      int from_oidx = *tt++;
-      int from_lane = *tt++;
-      int to_oidx   = *tt++;
-      int to_lane   = *tt;
+  accelerator_for(idx,nsite,1,{

+      Coordinate from_coor, to_coor, base;
+      Lexicographic::CoorFromIndex(base,idx,RegionSize);
+      for(int i=0;i<nd;i++){
+        from_coor[i] = base[i] + FromLowerLeft[i];
+        to_coor[i] = base[i] + ToLowerLeft[i];
+      }
+      int from_oidx = 0; for(int d=0;d<nd;d++) from_oidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
+      int from_lane = 0; for(int d=0;d<nd;d++) from_lane+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
+      int to_oidx = 0; for(int d=0;d<nd;d++) to_oidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
+      int to_lane = 0; for(int d=0;d<nd;d++) to_lane+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);

       const vector_type* from = (const vector_type *)&from_v[from_oidx];
       vector_type* to = (vector_type *)&to_v[to_oidx];

       scalar_type stmp;
-#ifdef GRID_SIMT
-      int w = acceleratorSIMTlane(words);
-      stmp = getlane(from[w], from_lane);
-      putlane(to[w], stmp, to_lane);
-#else
       for(int w=0;w<words;w++){
        stmp = getlane(from[w], from_lane);
        putlane(to[w], stmp, to_lane);
       }
-#endif
  });
-  t_acc+=usecond();
-  // std::cout << " localCopyRegion cpu " <<t_cpu/1000<<" ms"<<std::endl;
-  // std::cout << " localCopyRegion acc " <<t_acc/1000<<" ms"<<std::endl;
-  acceleratorFreeDevice(table_d);
-  free(table);
-
-#else
-  Coordinate ldf = Fg->_ldimensions;
-  Coordinate rdf = Fg->_rdimensions;
-  Coordinate isf = Fg->_istride;
-  Coordinate osf = Fg->_ostride;
-  Coordinate rdt = Tg->_rdimensions;
-  Coordinate ist = Tg->_istride;
-  Coordinate ost = Tg->_ostride;
-
-  autoView( t_v , To, CpuWrite);
-  autoView( f_v , From, CpuRead);
-  thread_for(idx,Fg->lSites(),{
-    sobj s;
-    Coordinate Fcoor(nd);
-    Coordinate Tcoor(nd);
-    Lexicographic::CoorFromIndex(Fcoor,idx,ldf);
-    int in_region=1;
-    for(int d=0;d<nd;d++){
-      if ( (Fcoor[d] < FromLowerLeft[d]) || (Fcoor[d]>=FromLowerLeft[d]+RegionSize[d]) ){
-        in_region=0;
-      }
-      Tcoor[d] = ToLowerLeft[d]+ Fcoor[d]-FromLowerLeft[d];
-    }
-    if (in_region) {
-#if 0
-      Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]); // inner index from
-      Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]); // inner index to
-      Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]); // outer index from
-      Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]); // outer index to
-      scalar_type * fp = (scalar_type *)&f_v[odx_f];
-      scalar_type * tp = (scalar_type *)&t_v[odx_t];
-      for(int w=0;w<words;w++){
-        tp[w].putlane(fp[w].getlane(idx_f),idx_t);
-      }
-#else
-      peekLocalSite(s,f_v,Fcoor);
-      pokeLocalSite(s,t_v,Tcoor);
-#endif
-    }
-  });
-
-#endif
 }
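// Hedged note on the index split used by the kernel above: a local lattice coordinate
// decomposes into an outer (vectorised) site index and a SIMD lane,
//   oidx = sum_d ostride[d]*(coor[d] % rdimensions[d]);
//   lane = sum_d istride[d]*(coor[d] / rdimensions[d]);
// so getlane/putlane move one scalar word between vector sites, e.g.
//   stmp = getlane(from[w], from_lane);  putlane(to[w], stmp, to_lane);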
@@ -990,7 +894,7 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic

}

//FIXME: make this run entirely on GPU
//Insert subvolume orthogonal to direction 'orthog' with slice index 'slice_lo' from 'lowDim' onto slice index 'slice_hi' of higherDim
//The local dimensions of both 'lowDim' and 'higherDim' orthogonal to 'orthog' should be the same
template<class vobj>
@@ -91,7 +91,7 @@ template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
   //for cross platform
   // FIXME -- can put internal indices into thread loop
   auto buf_p = & buf[0];
-  autoView(lat_v, lat, AcceleratorRead);
+  autoView(lat_v, lat, AcceleratorWrite);
   accelerator_for(ss, face_ovol/simd[dim],Nsimd,{

   // scalar layout won't coalesce
@@ -329,8 +329,6 @@ public:
     if(dim==0) conformable(old_grid,unpadded_grid);
     else       conformable(old_grid,grids[dim-1]);

-    //  std::cout << " dim "<<dim<<" local "<<local << " padding to "<<plocal<<std::endl;
-
     double tins=0, tshift=0;

     int islocal = 0 ;
@@ -339,6 +337,7 @@ public:
     if ( islocal ) {

       // replace with a copy and maybe grid swizzle
+      // return in;??
       double t = usecond();
       padded = in;
      tins += usecond() - t;
@ -396,7 +395,7 @@ public:
|
||||
GridBase *old_grid = in.Grid();
|
||||
GridCartesian *new_grid = grids[dim];//These are new grids
|
||||
Lattice<vobj> padded(new_grid);
|
||||
Lattice<vobj> shifted(old_grid);
|
||||
// Lattice<vobj> shifted(old_grid);
|
||||
Coordinate local =old_grid->LocalDimensions();
|
||||
Coordinate plocal =new_grid->LocalDimensions();
|
||||
if(dim==0) conformable(old_grid,unpadded_grid);
|
||||
@ -409,14 +408,10 @@ public:
|
||||
if ( processors[dim] == 1 ) islocal = 1;
|
||||
|
||||
if ( islocal ) {
|
||||
|
||||
// replace with a copy and maybe grid swizzle
|
||||
double t = usecond();
|
||||
padded = in;
|
||||
tins += usecond() - t;
|
||||
// return in; ?
|
||||
padded=in; // slightly different interface could avoid a copy operation
|
||||
} else {
|
||||
Face_exchange(in,padded,dim,depth);
|
||||
return padded;
|
||||
}
|
||||
return padded;
|
||||
}
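For readability, the branch logic above reduces to: a dimension that is not decomposed across MPI ranks needs no communication, so padding is a plain copy; otherwise the halos travel through Face_exchange. A condensed sketch, reusing the names from the code above:

// Sketch of the control flow in the padding routine above.
int islocal = (processors[dim] == 1) ? 1 : 0;
if ( islocal ) padded = in;                         // local: copy only
else           Face_exchange(in,padded,dim,depth);  // gather, comms, scatter
return padded;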
@@ -527,8 +522,6 @@ public:
    ////////////////////////////////////////////////////////////////////////////
    // Scatter all faces
    ////////////////////////////////////////////////////////////////////////////
    // DumpSliceNorm(std::string("Face_exchange to before scatter"),to,dimension);

    plane=0;

    t=usecond();
@@ -550,18 +543,16 @@ public:
      ScatterSlice(recv_buf,to,d,dimension,plane*buffer_size); plane++;
    }
    t_scatter+= usecond() - t;
    // DumpSliceNorm(std::string("Face_exchange to scatter 1st "),to,dimension);
    t_tot+=usecond();

    //DumpSliceNorm(std::string("Face_exchange to done"),to,dimension);
    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << t_gather/1000 << "ms"<<std::endl;
    // std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << 2.0*bytes/t_gather << "MB/s"<<std::endl;
    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << t_scatter/1000 << "ms"<<std::endl;
    // std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << 2.0*bytes/t_scatter<< "MB/s"<<std::endl;
    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: copy :" << t_copy/1000 << "ms"<<std::endl;
    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms :" << t_comms/1000 << "ms"<<std::endl;
    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: total :" << t_tot/1000 << "ms"<<std::endl;
    // std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms :" << (RealD)4.0*bytes/t_comms << "MB/s"<<std::endl;
    std::cout << GridLogDebug << "PaddedCell::Expand new timings: gather :" << t_gather/1000 << "ms"<<std::endl;
    std::cout << GridLogDebug << "PaddedCell::Expand new timings: gather :" << 2.0*bytes/t_gather << "MB/s"<<std::endl;
    std::cout << GridLogDebug << "PaddedCell::Expand new timings: scatter:" << t_scatter/1000 << "ms"<<std::endl;
    std::cout << GridLogDebug << "PaddedCell::Expand new timings: scatter:" << 2.0*bytes/t_scatter<< "MB/s"<<std::endl;
    std::cout << GridLogDebug << "PaddedCell::Expand new timings: copy :" << t_copy/1000 << "ms"<<std::endl;
    std::cout << GridLogDebug << "PaddedCell::Expand new timings: comms :" << t_comms/1000 << "ms"<<std::endl;
    std::cout << GridLogDebug << "PaddedCell::Expand new timings: total :" << t_tot/1000 << "ms"<<std::endl;
    std::cout << GridLogDebug << "PaddedCell::Expand new timings: comms :" << (RealD)4.0*bytes/t_comms << "MB/s"<<std::endl;
  }

};
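The MB/s figures printed above fall straight out of the microsecond timers: usecond() counts microseconds, so bytes divided by microseconds is already MB/s (10^6 bytes over 10^6 us). A small standalone sketch of the conversion; reading the factor 2.0 in the gather/scatter lines as counting both the read and the write of the face payload is an interpretation, not something the patch states:

// Sketch: bandwidth from Grid's microsecond timers.
inline double MBps(double bytes, double t_usec) { return bytes / t_usec; }
// e.g. gather: MBps(2.0*bytes, t_gather) if each face byte is read and written once.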
@@ -90,7 +90,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {

  for (int i = 0; i < logstreams.size(); i++) {
    if (logstreams[i] == std::string("Tracing")) GridLogTracing.Active(1);
    if (logstreams[i] == std::string("Memory")) GridLogMemory.Active(0);
    if (logstreams[i] == std::string("Memory")) GridLogMemory.Active(1);
    if (logstreams[i] == std::string("Warning")) GridLogWarning.Active(1);
    if (logstreams[i] == std::string("NoMessage")) GridLogMessage.Active(0);
    if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1);
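Given the signature in the hunk header, each stream name maps onto one Grid logger, and this change makes "Memory" actually enable GridLogMemory instead of disabling it. A minimal usage sketch (illustrative, not part of the patch):

// Sketch: activate the Memory and Tracing streams at startup.
std::vector<std::string> streams = {"Memory", "Tracing"};
GridLogConfigure(streams); // with this fix, "Memory" now calls GridLogMemory.Active(1)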
@@ -6,7 +6,6 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
--enable-tracing=timer \
--enable-accelerator=hip \
--enable-gen-simd-width=64 \
--enable-tracing=roctx \
--disable-gparity \
--disable-fermion-reps \
--enable-simd=GPU \
@@ -17,7 +16,7 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
--disable-fermion-reps \
CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 "
LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas"

@@ -36,6 +36,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
using namespace std;
using namespace Grid;

gridblasHandle_t GridBLAS::gridblasHandle;
int GridBLAS::gridblasInit;
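The two definitions above give the GridBLAS statics their single program-wide storage; every static data member needs exactly one such out-of-class definition. A generic sketch of the pattern (names illustrative):

// Sketch: static data members are declared in the class but defined once
// at namespace scope in a single translation unit.
struct Handles {
  static int initialised;   // declaration only
};
int Handles::initialised;   // the one definition (zero-initialised)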

///////////////////////
// Tells little dirac op to use MdagM as the .Op()
///////////////////////
@@ -107,7 +110,7 @@ int main (int argc, char ** argv)

  DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);

  const int nbasis = 8;
  const int nbasis = 62;
  const int cb = 0 ;
  LatticeFermion prom(FGrid);
@@ -136,13 +139,12 @@ int main (int argc, char ** argv)
  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
  typedef LittleDiracOperator::CoarseVector CoarseVector;

  NextToNearestStencilGeometry5D geom(Coarse5d);
  NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d);
  LittleDiracOperator LittleDiracOp(geom,FGrid,Coarse5d);
  LittleDiracOperator LittleDiracOpCol(geom,FGrid,Coarse5d);

  HermOpAdaptor<LatticeFermionD> HOA(HermDefOp);

  int pp=16;
  LittleDiracOp.CoarsenOperator(HOA,Aggregates);

  ///////////////////////////////////////////////////
@@ -229,17 +231,19 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;

  //////////////////////////////////////////////////////////////////////////////////////
  // Create a higher dim coarse grid
  const int nrhs=vComplex::Nsimd();
  //////////////////////////////////////////////////////////////////////////////////////

  const int nrhs=vComplex::Nsimd()*3;

  Coordinate mpi=GridDefaultMpi();
  Coordinate rhMpi ({1,1,mpi[0],mpi[1],mpi[2],mpi[3]});
  Coordinate rhLatt({nrhs,1,clatt[0],clatt[2],clatt[2],clatt[3]});
  Coordinate rhSimd({nrhs,1, 1,1,1,1});
  Coordinate rhLatt({nrhs,1,clatt[0],clatt[1],clatt[2],clatt[3]});
  Coordinate rhSimd({vComplex::Nsimd(),1, 1,1,1,1});

  GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi);

  MultiGeneralCoarsenedMatrix mrhs(LittleDiracOp,CoarseMrhs);

  {
@@ -248,31 +252,48 @@ int main (int argc, char ** argv)
    CoarseVector rh_res(CoarseMrhs);
    random(rh_CRNG,rh_phi);

    std::cout << "Warmup"<<std::endl;
    mrhs.M(rh_phi,rh_res);
    const int ncall=100;
    const int ncall=5;
    RealD t0=-usecond();
    for(int i=0;i<ncall;i++){
      std::cout << "Call "<<i<<"/"<<ncall<<std::endl;
      mrhs.M(rh_phi,rh_res);
    }
    t0+=usecond();
    RealD t1=0;
    for(int r=0;r<nrhs;r++){
      std::cout << " compare to single RHS "<<r<<"/"<<nrhs<<std::endl;
      ExtractSlice(phi,rh_phi,r,0);
      ExtractSlice(chi,rh_res,r,0);
      LittleDiracOp.M(phi,Aphi);
      t1-=usecond();
      for(int i=0;i<ncall;i++){
        std::cout << "Call "<<i<<"/"<<ncall<<std::endl;
        LittleDiracOp.M(phi,Aphi);
      }
      t1+=usecond();
      Coordinate site({0,0,0,0,0});
      auto bad = peekSite(chi,site);
      auto good = peekSite(Aphi,site);
      std::cout << " mrhs [" <<r <<"] "<< norm2(chi)<<std::endl;
      std::cout << " srhs [" <<r <<"] "<< norm2(Aphi)<<std::endl;
      chi=chi-Aphi;
      std::cout << r << " diff " << norm2(chi)<<std::endl;
      RealD diff =norm2(chi);
      std::cout << r << " diff " << diff<<std::endl;
      assert(diff < 1.0e-10);
    }
    std::cout << nrhs<< " mrhs " << t0/ncall/nrhs <<" us"<<std::endl;
    std::cout << nrhs<< " srhs " << t1/ncall/nrhs <<" us"<<std::endl;
  }
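The two summary lines above are already normalised per call and per right-hand side, so the batched speedup is simply their ratio. A small sketch (hypothetical addition, not in the patch):

// Sketch: per-rhs cost and batched speedup from the timers above.
RealD us_mrhs = t0/ncall/nrhs; // multi-RHS path, us per rhs per call
RealD us_srhs = t1/ncall/nrhs; // single-RHS path, us per rhs per call
std::cout << " multi-RHS speedup x" << us_srhs/us_mrhs << std::endl;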

  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;

  Grid_finalize();
  return 0;
}