mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-14 09:45:36 +00:00
Compare commits
20 Commits
2c54be651c
...
66a1b63aa9
Author | SHA1 | Date | |
---|---|---|---|
|
66a1b63aa9 | ||
|
22c611bd1a | ||
|
c9bb1bf8ea | ||
|
9e489887cf | ||
|
9feb801bb9 | ||
|
c00b495933 | ||
|
d22eebe553 | ||
|
8bcbd82680 | ||
|
dfa617c439 | ||
|
48d1f0df89 | ||
|
b75cb7a12c | ||
|
332563e037 | ||
|
0cce97a4fe | ||
|
95a8e4be64 | ||
|
abcd6b8cb6 | ||
|
e8f21c9b6d | ||
|
e054078b11 | ||
|
6835a7f208 | ||
|
f59993b979 | ||
|
2290b8f680 |
541
Grid/algorithms/multigrid/BatchedBlas.h
Normal file
541
Grid/algorithms/multigrid/BatchedBlas.h
Normal file
@ -0,0 +1,541 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: BatchedBlas.h
|
||||||
|
|
||||||
|
Copyright (C) 2023
|
||||||
|
|
||||||
|
Author: Peter Boyle <pboyle@bnl.gov>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#ifdef GRID_HIP
|
||||||
|
#include <hipblas/hipblas.h>
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_CUDA
|
||||||
|
#include <hipblas/hipblas.h>
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_SYCL
|
||||||
|
#error // need oneMKL version
|
||||||
|
#endif
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////
|
||||||
|
// Need to rearrange lattice data to be in the right format for a
|
||||||
|
// batched multiply. Might as well make these static, dense packed
|
||||||
|
///////////////////////////////////////////////////////////////////////
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
#ifdef GRID_HIP
|
||||||
|
typedef hipblasHandle_t gridblasHandle_t;
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_CUDA
|
||||||
|
typedef cudablasHandle_t gridblasHandle_t;
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_SYCL
|
||||||
|
typedef int32_t gridblasHandle_t;
|
||||||
|
#endif
|
||||||
|
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
||||||
|
typedef int32_t gridblasHandle_t;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
class GridBLAS {
|
||||||
|
public:
|
||||||
|
|
||||||
|
static gridblasHandle_t gridblasHandle;
|
||||||
|
static int gridblasInit;
|
||||||
|
|
||||||
|
static void Init(void)
|
||||||
|
{
|
||||||
|
if ( ! gridblasInit ) {
|
||||||
|
#ifdef GRID_CUDA
|
||||||
|
std::cout << "cublasCreate"<<std::endl;
|
||||||
|
cublasCreate(&gridblasHandle);
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_HIP
|
||||||
|
std::cout << "hipblasCreate"<<std::endl;
|
||||||
|
hipblasCreate(&gridblasHandle);
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_SYCL
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Force construct once
|
||||||
|
GridBLAS() { Init(); };
|
||||||
|
~GridBLAS() { };
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// BLAS GEMM conventions:
|
||||||
|
/////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// - C = alpha A * B + beta C
|
||||||
|
// Dimensions:
|
||||||
|
// - C_m.n
|
||||||
|
// - A_m.k
|
||||||
|
// - B_k.n
|
||||||
|
// - Flops = 8 M N K
|
||||||
|
// - Bytes = 2*sizeof(word) * (MN+MK+KN)
|
||||||
|
// M=60, N=12
|
||||||
|
// Flop/Byte = 8 . 60.60.12 / (60.12+60.60+60.12)/16 = 4 so expect about 4 TF/s on a GCD
|
||||||
|
/////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
void synchronise(void)
|
||||||
|
{
|
||||||
|
#ifdef GRID_HIP
|
||||||
|
auto err = hipDeviceSynchronize();
|
||||||
|
assert(err==hipSuccess);
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_CUDA
|
||||||
|
auto err = cudaDeviceSynchronize();
|
||||||
|
assert(err==cudaSuccess);
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_SYCL
|
||||||
|
accelerator_barrier();
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
void benchmark(int nbasis, int nrhs, int coarseVol, int nstencil)
|
||||||
|
{
|
||||||
|
int32_t N_A = nbasis*nbasis*coarseVol*nstencil;
|
||||||
|
int32_t N_B = nbasis*nrhs*coarseVol*nstencil; // One leg of stencil at a time
|
||||||
|
int32_t N_C = nbasis*nrhs*coarseVol*nstencil;
|
||||||
|
deviceVector<ComplexD> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD));
|
||||||
|
deviceVector<ComplexD> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD));
|
||||||
|
deviceVector<ComplexD> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD));
|
||||||
|
ComplexD alpha(1.0);
|
||||||
|
ComplexD beta (1.0);
|
||||||
|
for(int i=0;i<10;i++){
|
||||||
|
RealD t0 = usecond();
|
||||||
|
for(int s=0;s<nstencil;s++){
|
||||||
|
gemmStridedBatched(nbasis,nrhs,nbasis,
|
||||||
|
alpha,
|
||||||
|
&A[0], // m x k
|
||||||
|
&B[0], // k x n
|
||||||
|
beta,
|
||||||
|
&C[0], // m x n
|
||||||
|
coarseVol);
|
||||||
|
}
|
||||||
|
synchronise();
|
||||||
|
RealD t1 = usecond();
|
||||||
|
RealD flops = 8.0*nbasis*nbasis*nrhs*coarseVol*nstencil;
|
||||||
|
RealD bytes = 1.0*sizeof(ComplexD)*(nbasis*nbasis+nbasis*nrhs*3)*coarseVol*nstencil;
|
||||||
|
std::cout << " batched Blas call "<<i<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
|
||||||
|
std::cout << " batched Blas call "<<i<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void gemmBatched(int m,int n, int k,
|
||||||
|
ComplexD alpha,
|
||||||
|
deviceVector<ComplexD*> &Amk, // pointer list to matrices
|
||||||
|
deviceVector<ComplexD*> &Bkn,
|
||||||
|
ComplexD beta,
|
||||||
|
deviceVector<ComplexD*> &Cmn)
|
||||||
|
{
|
||||||
|
RealD t2=usecond();
|
||||||
|
int32_t batchCount = Amk.size();
|
||||||
|
// Use C-row major storage, so transpose calls
|
||||||
|
int lda = m; // m x k column major
|
||||||
|
int ldb = k; // k x n column major
|
||||||
|
int ldc = m; // m x b column major
|
||||||
|
static deviceVector<ComplexD> alpha_p(1);
|
||||||
|
static deviceVector<ComplexD> beta_p(1);
|
||||||
|
// can prestore the 1 and the zero on device
|
||||||
|
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
|
||||||
|
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
|
||||||
|
RealD t0=usecond();
|
||||||
|
// std::cout << "hipblasZgemmBatched mnk "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
|
||||||
|
assert(Bkn.size()==batchCount);
|
||||||
|
assert(Cmn.size()==batchCount);
|
||||||
|
#ifdef GRID_HIP
|
||||||
|
auto err = hipblasZgemmBatched(gridblasHandle,
|
||||||
|
HIPBLAS_OP_N,
|
||||||
|
HIPBLAS_OP_N,
|
||||||
|
m,n,k,
|
||||||
|
(hipblasDoubleComplex *) &alpha_p[0],
|
||||||
|
(hipblasDoubleComplex **)&Amk[0], lda,
|
||||||
|
(hipblasDoubleComplex **)&Bkn[0], ldb,
|
||||||
|
(hipblasDoubleComplex *) &beta_p[0],
|
||||||
|
(hipblasDoubleComplex **)&Cmn[0], ldc,
|
||||||
|
batchCount);
|
||||||
|
// std::cout << " hipblas return code " <<(int)err<<std::endl;
|
||||||
|
assert(err==HIPBLAS_STATUS_SUCCESS);
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_CUDA
|
||||||
|
auto err = cublasZgemmBatched(gridblasHandle,
|
||||||
|
CUBLAS_OP_N,
|
||||||
|
CUBLAS_OP_N,
|
||||||
|
m,n,k,
|
||||||
|
(cuDoubleComplex *) &alpha_p[0],
|
||||||
|
(cuDoubleComplex **)&Amk[0], lda,
|
||||||
|
(cuDoubleComplex **)&Bkn[0], ldb,
|
||||||
|
(cuDoubleComplex *) &beta_p[0],
|
||||||
|
(cuDoubleComplex **)&Cmn[0], ldc,
|
||||||
|
batchCount);
|
||||||
|
assert(err==CUBLAS_STATUS_SUCCESS);
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_SYCL
|
||||||
|
//MKL’s cblas_<T>gemm_batch & OneAPI
|
||||||
|
#warning "oneMKL implementation not built "
|
||||||
|
#endif
|
||||||
|
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
||||||
|
// Need a default/reference implementation
|
||||||
|
for (int p = 0; p < batchCount; ++p) {
|
||||||
|
for (int mm = 0; mm < m; ++mm) {
|
||||||
|
for (int nn = 0; nn < n; ++nn) {
|
||||||
|
ComplexD c_mn(0.0);
|
||||||
|
for (int kk = 0; kk < k, ++kk)
|
||||||
|
c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
|
||||||
|
Cmn[mm + nn*ldc + p*sdc] = (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
synchronise();
|
||||||
|
RealD t1=usecond();
|
||||||
|
RealD flops = 8.0*m*n*k*batchCount;
|
||||||
|
RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount;
|
||||||
|
std::cout <<GridLogPerformance<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
|
||||||
|
std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
|
||||||
|
std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
void gemmBatched(int m,int n, int k,
|
||||||
|
ComplexF alpha,
|
||||||
|
deviceVector<ComplexF*> &Amk, // pointer list to matrices
|
||||||
|
deviceVector<ComplexF*> &Bkn,
|
||||||
|
ComplexF beta,
|
||||||
|
deviceVector<ComplexF*> &Cmn)
|
||||||
|
{
|
||||||
|
RealD t2=usecond();
|
||||||
|
int32_t batchCount = Amk.size();
|
||||||
|
// Use C-row major storage, so transpose calls
|
||||||
|
int lda = m; // m x k column major
|
||||||
|
int ldb = k; // k x n column major
|
||||||
|
int ldc = m; // m x b column major
|
||||||
|
static deviceVector<ComplexF> alpha_p(1);
|
||||||
|
static deviceVector<ComplexF> beta_p(1);
|
||||||
|
// can prestore the 1 and the zero on device
|
||||||
|
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexF));
|
||||||
|
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF));
|
||||||
|
RealD t0=usecond();
|
||||||
|
// std::cout << "hipblasZgemmBatched mnk "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
|
||||||
|
assert(Bkn.size()==batchCount);
|
||||||
|
assert(Cmn.size()==batchCount);
|
||||||
|
#ifdef GRID_HIP
|
||||||
|
auto err = hipblasCgemmBatched(gridblasHandle,
|
||||||
|
HIPBLAS_OP_N,
|
||||||
|
HIPBLAS_OP_N,
|
||||||
|
m,n,k,
|
||||||
|
(hipblasComplex *) &alpha_p[0],
|
||||||
|
(hipblasComplex **)&Amk[0], lda,
|
||||||
|
(hipblasComplex **)&Bkn[0], ldb,
|
||||||
|
(hipblasComplex *) &beta_p[0],
|
||||||
|
(hipblasComplex **)&Cmn[0], ldc,
|
||||||
|
batchCount);
|
||||||
|
// std::cout << " hipblas return code " <<(int)err<<std::endl;
|
||||||
|
assert(err==HIPBLAS_STATUS_SUCCESS);
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_CUDA
|
||||||
|
auto err = cublasCgemmBatched(gridblasHandle,
|
||||||
|
CUBLAS_OP_N,
|
||||||
|
CUBLAS_OP_N,
|
||||||
|
m,n,k,
|
||||||
|
(cuComplex *) &alpha_p[0],
|
||||||
|
(cuComplex **)&Amk[0], lda,
|
||||||
|
(cuComplex **)&Bkn[0], ldb,
|
||||||
|
(cuComplex *) &beta_p[0],
|
||||||
|
(cuComplex **)&Cmn[0], ldc,
|
||||||
|
batchCount);
|
||||||
|
assert(err==CUBLAS_STATUS_SUCCESS);
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_SYCL
|
||||||
|
//MKL’s cblas_<T>gemm_batch & OneAPI
|
||||||
|
#warning "oneMKL implementation not built "
|
||||||
|
#endif
|
||||||
|
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
||||||
|
// Need a default/reference implementation
|
||||||
|
for (int p = 0; p < batchCount; ++p) {
|
||||||
|
for (int mm = 0; mm < m; ++mm) {
|
||||||
|
for (int nn = 0; nn < n; ++nn) {
|
||||||
|
ComplexD c_mn(0.0);
|
||||||
|
for (int kk = 0; kk < k, ++kk)
|
||||||
|
c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
|
||||||
|
Cmn[mm + nn*ldc + p*sdc] = (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
synchronise();
|
||||||
|
RealD t1=usecond();
|
||||||
|
RealD flops = 8.0*m*n*k*batchCount;
|
||||||
|
RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n)*batchCount;
|
||||||
|
std::cout <<GridLogPerformance<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
|
||||||
|
std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
|
||||||
|
std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
// Single precision real GEMM
|
||||||
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
void gemmBatched(int m,int n, int k,
|
||||||
|
RealF alpha,
|
||||||
|
deviceVector<RealF*> &Amk, // pointer list to matrices
|
||||||
|
deviceVector<RealF*> &Bkn,
|
||||||
|
RealF beta,
|
||||||
|
deviceVector<RealF*> &Cmn)
|
||||||
|
{
|
||||||
|
RealD t2=usecond();
|
||||||
|
int32_t batchCount = Amk.size();
|
||||||
|
// Use C-row major storage, so transpose calls
|
||||||
|
int lda = m; // m x k column major
|
||||||
|
int ldb = k; // k x n column major
|
||||||
|
int ldc = m; // m x b column major
|
||||||
|
static deviceVector<RealF> alpha_p(1);
|
||||||
|
static deviceVector<RealF> beta_p(1);
|
||||||
|
// can prestore the 1 and the zero on device
|
||||||
|
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealF));
|
||||||
|
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealF));
|
||||||
|
RealD t0=usecond();
|
||||||
|
// std::cout << "hipblasZgemmBatched mnk "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
|
||||||
|
assert(Bkn.size()==batchCount);
|
||||||
|
assert(Cmn.size()==batchCount);
|
||||||
|
#ifdef GRID_HIP
|
||||||
|
auto err = hipblasSgemmBatched(gridblasHandle,
|
||||||
|
HIPBLAS_OP_N,
|
||||||
|
HIPBLAS_OP_N,
|
||||||
|
m,n,k,
|
||||||
|
(float *) &alpha_p[0],
|
||||||
|
(float **)&Amk[0], lda,
|
||||||
|
(float **)&Bkn[0], ldb,
|
||||||
|
(float *) &beta_p[0],
|
||||||
|
(float **)&Cmn[0], ldc,
|
||||||
|
batchCount);
|
||||||
|
assert(err==HIPBLAS_STATUS_SUCCESS);
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_CUDA
|
||||||
|
auto err = cublasSgemmBatched(gridblasHandle,
|
||||||
|
CUBLAS_OP_N,
|
||||||
|
CUBLAS_OP_N,
|
||||||
|
m,n,k,
|
||||||
|
(float *) &alpha_p[0],
|
||||||
|
(float **)&Amk[0], lda,
|
||||||
|
(float **)&Bkn[0], ldb,
|
||||||
|
(float *) &beta_p[0],
|
||||||
|
(float **)&Cmn[0], ldc,
|
||||||
|
batchCount);
|
||||||
|
assert(err==CUBLAS_STATUS_SUCCESS);
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_SYCL
|
||||||
|
//MKL’s cblas_<T>gemm_batch & OneAPI
|
||||||
|
#warning "oneMKL implementation not built "
|
||||||
|
#endif
|
||||||
|
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
||||||
|
// Need a default/reference implementation
|
||||||
|
for (int p = 0; p < batchCount; ++p) {
|
||||||
|
for (int mm = 0; mm < m; ++mm) {
|
||||||
|
for (int nn = 0; nn < n; ++nn) {
|
||||||
|
RealD c_mn(0.0);
|
||||||
|
for (int kk = 0; kk < k, ++kk)
|
||||||
|
c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
|
||||||
|
Cmn[mm + nn*ldc + p*sdc] = (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
synchronise();
|
||||||
|
RealD t1=usecond();
|
||||||
|
RealD flops = 8.0*m*n*k*batchCount;
|
||||||
|
RealD bytes = 1.0*sizeof(RealF)*(m*k+k*n+m*n)*batchCount;
|
||||||
|
std::cout <<GridLogPerformance<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
|
||||||
|
std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
|
||||||
|
std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
// Double precision real GEMM
|
||||||
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
void gemmBatched(int m,int n, int k,
|
||||||
|
RealD alpha,
|
||||||
|
deviceVector<RealD*> &Amk, // pointer list to matrices
|
||||||
|
deviceVector<RealD*> &Bkn,
|
||||||
|
RealD beta,
|
||||||
|
deviceVector<RealD*> &Cmn)
|
||||||
|
{
|
||||||
|
RealD t2=usecond();
|
||||||
|
int32_t batchCount = Amk.size();
|
||||||
|
// Use C-row major storage, so transpose calls
|
||||||
|
int lda = m; // m x k column major
|
||||||
|
int ldb = k; // k x n column major
|
||||||
|
int ldc = m; // m x b column major
|
||||||
|
static deviceVector<RealD> alpha_p(1);
|
||||||
|
static deviceVector<RealD> beta_p(1);
|
||||||
|
// can prestore the 1 and the zero on device
|
||||||
|
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealD));
|
||||||
|
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealD));
|
||||||
|
RealD t0=usecond();
|
||||||
|
// std::cout << "hipblasZgemmBatched mnk "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
|
||||||
|
assert(Bkn.size()==batchCount);
|
||||||
|
assert(Cmn.size()==batchCount);
|
||||||
|
#ifdef GRID_HIP
|
||||||
|
auto err = hipblasDgemmBatched(gridblasHandle,
|
||||||
|
HIPBLAS_OP_N,
|
||||||
|
HIPBLAS_OP_N,
|
||||||
|
m,n,k,
|
||||||
|
(double *) &alpha_p[0],
|
||||||
|
(double **)&Amk[0], lda,
|
||||||
|
(double **)&Bkn[0], ldb,
|
||||||
|
(double *) &beta_p[0],
|
||||||
|
(double **)&Cmn[0], ldc,
|
||||||
|
batchCount);
|
||||||
|
assert(err==HIPBLAS_STATUS_SUCCESS);
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_CUDA
|
||||||
|
auto err = cublasDgemmBatched(gridblasHandle,
|
||||||
|
CUBLAS_OP_N,
|
||||||
|
CUBLAS_OP_N,
|
||||||
|
m,n,k,
|
||||||
|
(double *) &alpha_p[0],
|
||||||
|
(double **)&Amk[0], lda,
|
||||||
|
(double **)&Bkn[0], ldb,
|
||||||
|
(double *) &beta_p[0],
|
||||||
|
(double **)&Cmn[0], ldc,
|
||||||
|
batchCount);
|
||||||
|
assert(err==CUBLAS_STATUS_SUCCESS);
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_SYCL
|
||||||
|
/*
|
||||||
|
int64_t m64=m;
|
||||||
|
int64_t n64=n;
|
||||||
|
int64_t k64=k;
|
||||||
|
int64_t batchCount64=batchCount;
|
||||||
|
oneapi::mkl::blas::column_major::gemm_batch(*theGridAccelerator,
|
||||||
|
onemkl::transpose::N,
|
||||||
|
onemkl::transpose::N,
|
||||||
|
&m64,&n64,&k64,
|
||||||
|
(double *) &alpha_p[0],
|
||||||
|
(double **)&Amk[0], lda,
|
||||||
|
(double **)&Bkn[0], ldb,
|
||||||
|
(double *) &beta_p[0],
|
||||||
|
(double **)&Cmn[0], ldc,
|
||||||
|
1,&batchCount64);
|
||||||
|
*/
|
||||||
|
//MKL’s cblas_<T>gemm_batch & OneAPI
|
||||||
|
#warning "oneMKL implementation not built "
|
||||||
|
#endif
|
||||||
|
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
||||||
|
// Need a default/reference implementation
|
||||||
|
for (int p = 0; p < batchCount; ++p) {
|
||||||
|
for (int mm = 0; mm < m; ++mm) {
|
||||||
|
for (int nn = 0; nn < n; ++nn) {
|
||||||
|
RealD c_mn(0.0);
|
||||||
|
for (int kk = 0; kk < k, ++kk)
|
||||||
|
c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
|
||||||
|
Cmn[mm + nn*ldc + p*sdc] = (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
synchronise();
|
||||||
|
RealD t1=usecond();
|
||||||
|
RealD flops = 8.0*m*n*k*batchCount;
|
||||||
|
RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
|
||||||
|
std::cout <<GridLogPerformance<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
|
||||||
|
std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
|
||||||
|
std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Strided case used by benchmark, but generally unused in Grid
|
||||||
|
// Keep a code example in double complex, but don't generate the single and real variants for now
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
void gemmStridedBatched(int m,int n, int k,
|
||||||
|
ComplexD alpha,
|
||||||
|
ComplexD* Amk, // pointer list to matrices
|
||||||
|
ComplexD* Bkn,
|
||||||
|
ComplexD beta,
|
||||||
|
ComplexD* Cmn,
|
||||||
|
int batchCount)
|
||||||
|
{
|
||||||
|
// Use C-row major storage, so transpose calls
|
||||||
|
int lda = m; // m x k column major
|
||||||
|
int ldb = k; // k x n column major
|
||||||
|
int ldc = m; // m x b column major
|
||||||
|
int sda = m*k;
|
||||||
|
int sdb = k*n;
|
||||||
|
int sdc = m*n;
|
||||||
|
deviceVector<ComplexD> alpha_p(1);
|
||||||
|
deviceVector<ComplexD> beta_p(1);
|
||||||
|
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
|
||||||
|
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
|
||||||
|
std::cout << "blasZgemmStridedBatched mnk "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
|
||||||
|
std::cout << "blasZgemmStridedBatched ld "<<lda<<","<<ldb<<","<<ldc<<std::endl;
|
||||||
|
std::cout << "blasZgemmStridedBatched sd "<<sda<<","<<sdb<<","<<sdc<<std::endl;
|
||||||
|
#ifdef GRID_HIP
|
||||||
|
auto err = hipblasZgemmStridedBatched(gridblasHandle,
|
||||||
|
HIPBLAS_OP_N,
|
||||||
|
HIPBLAS_OP_N,
|
||||||
|
m,n,k,
|
||||||
|
(hipblasDoubleComplex *) &alpha_p[0],
|
||||||
|
(hipblasDoubleComplex *) Amk, lda, sda,
|
||||||
|
(hipblasDoubleComplex *) Bkn, ldb, sdb,
|
||||||
|
(hipblasDoubleComplex *) &beta_p[0],
|
||||||
|
(hipblasDoubleComplex *) Cmn, ldc, sdc,
|
||||||
|
batchCount);
|
||||||
|
assert(err==HIPBLAS_STATUS_SUCCESS);
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_CUDA
|
||||||
|
cublasZgemmStridedBatched(gridblasHandle,
|
||||||
|
CUBLAS_OP_N,
|
||||||
|
CUBLAS_OP_N,
|
||||||
|
m,n,k,
|
||||||
|
(cuDoubleComplex *) &alpha_p[0],
|
||||||
|
(cuDoubleComplex *) Amk, lda, sda,
|
||||||
|
(cuDoubleComplex *) Bkn, ldb, sdb,
|
||||||
|
(cuDoubleComplex *) &beta_p[0],
|
||||||
|
(cuDoubleComplex *) Cmn, ldc, sdc,
|
||||||
|
batchCount);
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_SYCL
|
||||||
|
#warning "oneMKL implementation not made "
|
||||||
|
#endif
|
||||||
|
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
||||||
|
// Need a default/reference implementation
|
||||||
|
for (int p = 0; p < batchCount; ++p) {
|
||||||
|
for (int mm = 0; mm < m; ++mm) {
|
||||||
|
for (int nn = 0; nn < n; ++nn) {
|
||||||
|
ComplexD c_mn(0.0);
|
||||||
|
for (int kk = 0; kk < k, ++kk)
|
||||||
|
c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
|
||||||
|
Cmn[mm + nn*ldc + p*sdc] = (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid);
|
@ -223,7 +223,7 @@ public:
|
|||||||
text+=usecond();
|
text+=usecond();
|
||||||
ttot+=usecond();
|
ttot+=usecond();
|
||||||
|
|
||||||
std::cout << GridLogPerformance<<"Coarse Mult Aviews "<<tviews<<" us"<<std::endl;
|
std::cout << GridLogPerformance<<"Coarse 1rhs Mult Aviews "<<tviews<<" us"<<std::endl;
|
||||||
std::cout << GridLogPerformance<<"Coarse Mult exch "<<texch<<" us"<<std::endl;
|
std::cout << GridLogPerformance<<"Coarse Mult exch "<<texch<<" us"<<std::endl;
|
||||||
std::cout << GridLogPerformance<<"Coarse Mult mult "<<tmult<<" us"<<std::endl;
|
std::cout << GridLogPerformance<<"Coarse Mult mult "<<tmult<<" us"<<std::endl;
|
||||||
std::cout << GridLogPerformance<<" of which mult2 "<<tmult2<<" us"<<std::endl;
|
std::cout << GridLogPerformance<<" of which mult2 "<<tmult2<<" us"<<std::endl;
|
||||||
@ -232,8 +232,9 @@ public:
|
|||||||
std::cout << GridLogPerformance<<"Coarse Mult copy "<<tcopy<<" us"<<std::endl;
|
std::cout << GridLogPerformance<<"Coarse Mult copy "<<tcopy<<" us"<<std::endl;
|
||||||
std::cout << GridLogPerformance<<"Coarse Mult tot "<<ttot<<" us"<<std::endl;
|
std::cout << GridLogPerformance<<"Coarse Mult tot "<<ttot<<" us"<<std::endl;
|
||||||
// std::cout << GridLogPerformance<<std::endl;
|
// std::cout << GridLogPerformance<<std::endl;
|
||||||
|
std::cout << GridLogPerformance<<"Coarse Kernel flops "<< flops<<std::endl;
|
||||||
std::cout << GridLogPerformance<<"Coarse Kernel flop/s "<< flops/tmult<<" mflop/s"<<std::endl;
|
std::cout << GridLogPerformance<<"Coarse Kernel flop/s "<< flops/tmult<<" mflop/s"<<std::endl;
|
||||||
std::cout << GridLogPerformance<<"Coarse Kernel bytes/s"<< bytes/tmult<<" MB/s"<<std::endl;
|
std::cout << GridLogPerformance<<"Coarse Kernel bytes/s "<< bytes/tmult<<" MB/s"<<std::endl;
|
||||||
std::cout << GridLogPerformance<<"Coarse overall flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
|
std::cout << GridLogPerformance<<"Coarse overall flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
|
||||||
std::cout << GridLogPerformance<<"Coarse total bytes "<< bytes/1e6<<" MB"<<std::endl;
|
std::cout << GridLogPerformance<<"Coarse total bytes "<< bytes/1e6<<" MB"<<std::endl;
|
||||||
|
|
||||||
@ -420,11 +421,6 @@ public:
|
|||||||
tinv+=usecond();
|
tinv+=usecond();
|
||||||
}
|
}
|
||||||
|
|
||||||
for(int p=0;p<geom.npoint;p++){
|
|
||||||
Coordinate coor({0,0,0,0,0});
|
|
||||||
auto sval = peekSite(_A[p],coor);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Only needed if nonhermitian
|
// Only needed if nonhermitian
|
||||||
if ( ! hermitian ) {
|
if ( ! hermitian ) {
|
||||||
std::cout << GridLogMessage<<"PopulateAdag "<<std::endl;
|
std::cout << GridLogMessage<<"PopulateAdag "<<std::endl;
|
||||||
|
@ -27,8 +27,26 @@ Author: Peter Boyle <pboyle@bnl.gov>
|
|||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
|
#include <Grid/algorithms/multigrid/BatchedBlas.h>
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
|
||||||
|
// Move this to accelerator.h
|
||||||
|
// Also give a copy device.
|
||||||
|
// Rename acceleratorPut
|
||||||
|
// Rename acceleratorGet
|
||||||
|
template<class T> void deviceSet(T& dev,T&host)
|
||||||
|
{
|
||||||
|
acceleratorCopyToDevice(&host,&dev,sizeof(T));
|
||||||
|
}
|
||||||
|
template<class T> T deviceGet(T& dev)
|
||||||
|
{
|
||||||
|
T host;
|
||||||
|
acceleratorCopyFromDevice(&dev,&host,sizeof(T));
|
||||||
|
return host;
|
||||||
|
}
|
||||||
|
|
||||||
// Fine Object == (per site) type of fine field
|
// Fine Object == (per site) type of fine field
|
||||||
// nbasis == number of deflation vectors
|
// nbasis == number of deflation vectors
|
||||||
template<class Fobj,class CComplex,int nbasis>
|
template<class Fobj,class CComplex,int nbasis>
|
||||||
@ -40,6 +58,7 @@ public:
|
|||||||
|
|
||||||
typedef iVector<CComplex,nbasis > siteVector;
|
typedef iVector<CComplex,nbasis > siteVector;
|
||||||
typedef iMatrix<CComplex,nbasis > siteMatrix;
|
typedef iMatrix<CComplex,nbasis > siteMatrix;
|
||||||
|
typedef iVector<SComplex,nbasis > calcVector;
|
||||||
typedef iMatrix<SComplex,nbasis > calcMatrix;
|
typedef iMatrix<SComplex,nbasis > calcMatrix;
|
||||||
typedef Lattice<iScalar<CComplex> > CoarseComplexField;
|
typedef Lattice<iScalar<CComplex> > CoarseComplexField;
|
||||||
typedef Lattice<siteVector> CoarseVector;
|
typedef Lattice<siteVector> CoarseVector;
|
||||||
@ -60,9 +79,13 @@ public:
|
|||||||
PaddedCell Cell;
|
PaddedCell Cell;
|
||||||
GeneralLocalStencil Stencil;
|
GeneralLocalStencil Stencil;
|
||||||
|
|
||||||
std::vector<deviceVector<calcMatrix> > _A;
|
deviceVector<calcVector> BLAS_B;
|
||||||
std::vector<CoarseVector> MultTemporaries;
|
deviceVector<calcVector> BLAS_C;
|
||||||
deviceVector<GeneralStencilEntryReordered> StencilMasked;
|
std::vector<deviceVector<calcMatrix> > BLAS_A;
|
||||||
|
|
||||||
|
std::vector<deviceVector<ComplexD *> > BLAS_AP;
|
||||||
|
std::vector<deviceVector<ComplexD *> > BLAS_BP;
|
||||||
|
deviceVector<ComplexD *> BLAS_CP;
|
||||||
|
|
||||||
///////////////////////
|
///////////////////////
|
||||||
// Interface
|
// Interface
|
||||||
@ -76,58 +99,209 @@ public:
|
|||||||
_CoarseGridMulti(CoarseGridMulti),
|
_CoarseGridMulti(CoarseGridMulti),
|
||||||
geom(_CoarseGridMulti,Op.geom.hops,Op.geom.skip+1),
|
geom(_CoarseGridMulti,Op.geom.hops,Op.geom.skip+1),
|
||||||
Cell(Op.geom.Depth(),_CoarseGridMulti),
|
Cell(Op.geom.Depth(),_CoarseGridMulti),
|
||||||
Stencil(Cell.grids.back(),geom.shifts)
|
Stencil(Cell.grids.back(),geom.shifts) // padded cell stencil
|
||||||
{
|
{
|
||||||
_A.resize(geom.npoint);
|
|
||||||
int32_t padded_sites = _Op._A[0].Grid()->lSites();
|
int32_t padded_sites = _Op._A[0].Grid()->lSites();
|
||||||
|
int32_t unpadded_sites = _CoarseGrid->lSites();
|
||||||
|
|
||||||
|
int32_t nrhs = CoarseGridMulti->FullDimensions()[0]; // # RHS
|
||||||
|
int32_t orhs = nrhs/CComplex::Nsimd();
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////
|
||||||
|
// Device data vector storage
|
||||||
|
/////////////////////////////////////////////////
|
||||||
|
BLAS_A.resize(geom.npoint);
|
||||||
for(int p=0;p<geom.npoint;p++){
|
for(int p=0;p<geom.npoint;p++){
|
||||||
_A[p].resize(padded_sites);
|
BLAS_A[p].resize (unpadded_sites); // no ghost zone, npoint elements
|
||||||
}
|
}
|
||||||
std::cout << GridLogMessage<<"MultiGeneralCoarsenedMatrix "<<_CoarseGrid->lSites()<<" coarse sites "<<_Op._A[0].Grid()->lSites() <<std::endl;
|
BLAS_B.resize(nrhs *padded_sites); // includes ghost zone
|
||||||
|
BLAS_C.resize(nrhs *unpadded_sites); // no ghost zone
|
||||||
|
|
||||||
StencilMasked.resize(_CoarseGridMulti->oSites()*geom.npoint);
|
BLAS_AP.resize(geom.npoint);
|
||||||
std::vector<GeneralStencilEntryReordered> StencilTmp;
|
BLAS_BP.resize(geom.npoint);
|
||||||
|
for(int p=0;p<geom.npoint;p++){
|
||||||
|
BLAS_AP[p].resize(unpadded_sites);
|
||||||
|
BLAS_BP[p].resize(unpadded_sites);
|
||||||
|
}
|
||||||
|
BLAS_CP.resize(unpadded_sites);
|
||||||
|
|
||||||
int32_t j=0;
|
/////////////////////////////////////////////////
|
||||||
int32_t sites = Stencil._entries.size()/geom.npoint;
|
// Pointers to data
|
||||||
for(int32_t s=0;s<sites;s++){
|
/////////////////////////////////////////////////
|
||||||
|
|
||||||
|
// Site identity mapping for A, C
|
||||||
|
for(int p=0;p<geom.npoint;p++){
|
||||||
|
for(int ss=0;ss<unpadded_sites;ss++){
|
||||||
|
ComplexD *ptr = (ComplexD *)&BLAS_A[p][ss];
|
||||||
|
//ComplexD *ptr = (ComplexD *)&BLAS_A[p][0]; std::cout << " A ptr "<<std::hex<<ptr<<std::dec<<" "<<ss<<"/"<<BLAS_A[p].size()<<std::endl;
|
||||||
|
deviceSet(BLAS_AP[p][ss],ptr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for(int ss=0;ss<unpadded_sites;ss++){
|
||||||
|
ComplexD *ptr = (ComplexD *)&BLAS_C[ss*nrhs];
|
||||||
|
//ComplexD *ptr = (ComplexD *)&BLAS_C[0]; std::cout << " C ptr "<<std::hex<<ptr<<std::dec<<" "<<ss<<"/"<<BLAS_C.size()<<std::endl;
|
||||||
|
deviceSet(BLAS_CP[ss],ptr);
|
||||||
|
}
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////
|
||||||
|
// Neighbour table is more complicated
|
||||||
|
/////////////////////////////////////////////////
|
||||||
|
int32_t j=0; // Interior point counter (unpadded)
|
||||||
|
for(int32_t s=0;s<padded_sites;s++){ // 4 volume, padded
|
||||||
int ghost_zone=0;
|
int ghost_zone=0;
|
||||||
for(int32_t point = 0 ; point < geom.npoint; point++){
|
for(int32_t point = 0 ; point < geom.npoint; point++){
|
||||||
int i=s*geom.npoint+point;
|
int i=s*orhs*geom.npoint+point;
|
||||||
if( Stencil._entries[i]._wrap ) {
|
if( Stencil._entries[i]._wrap ) { // stencil is indexed by the oSite of the CoarseGridMulti, hence orhs factor
|
||||||
ghost_zone=1;
|
ghost_zone=1; // If general stencil wrapped in any direction, wrap=1
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// std::cout << "site " <<s<<"/"<<sites <<" ghost_zone "<<ghost_zone<<std::endl;
|
// GeneralStencilEntryReordered tmp;
|
||||||
GeneralStencilEntryReordered tmp;
|
|
||||||
if( ghost_zone==0) {
|
if( ghost_zone==0) {
|
||||||
for(int32_t point = 0 ; point < geom.npoint; point++){
|
for(int32_t point = 0 ; point < geom.npoint; point++){
|
||||||
int i=s*geom.npoint+point;
|
int i=s*orhs*geom.npoint+point;
|
||||||
tmp._offset = Stencil._entries[i]._offset;
|
int32_t nbr = Stencil._entries[i]._offset*CComplex::Nsimd(); // oSite -> lSite
|
||||||
tmp._wrap= Stencil._entries[i]._wrap; // Should be no premute and j=site
|
// std::cout << " B ptr "<< nbr<<"/"<<BLAS_B.size()<<std::endl;
|
||||||
tmp._input = s;
|
assert(nbr<BLAS_B.size());
|
||||||
StencilTmp.push_back(tmp);
|
ComplexD * ptr = (ComplexD *)&BLAS_B[nbr];
|
||||||
|
// ComplexD * ptr = (ComplexD *)&BLAS_B[0];
|
||||||
|
// std::cout << " B ptr unpadded "<<std::hex<<ptr<<std::dec<<" "<<s<<"/"<<padded_sites<<std::endl;
|
||||||
|
// std::cout << " B ptr padded "<<std::hex<<ptr<<std::dec<<" "<<j<<"/"<<unpadded_sites<<std::endl;
|
||||||
|
deviceSet(BLAS_BP[point][j],ptr); // neighbour indexing in ghost zone volume
|
||||||
|
// auto tmp = deviceGet(*BLAS_BP[point][j]); // debug trigger SEGV if bad ptr
|
||||||
}
|
}
|
||||||
j++;
|
j++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
std::cout << " oSites " << _CoarseGridMulti->oSites()<<std::endl;
|
assert(j==unpadded_sites);
|
||||||
std::cout << " npoint " << geom.npoint<<std::endl;
|
|
||||||
std::cout << " StencilTmp "<<StencilTmp.size()<<std::endl;
|
|
||||||
|
|
||||||
assert(_CoarseGridMulti->oSites()*geom.npoint==StencilTmp.size());
|
|
||||||
acceleratorCopyToDevice(&StencilTmp[0],&StencilMasked[0],sizeof(GeneralStencilEntryReordered)*StencilTmp.size());
|
|
||||||
CopyMatrix();
|
CopyMatrix();
|
||||||
}
|
}
|
||||||
|
template<class vobj> void GridtoBLAS(const Lattice<vobj> &from,deviceVector<typename vobj::scalar_object> &to)
|
||||||
|
{
|
||||||
|
#if 0
|
||||||
|
std::vector<typename vobj::scalar_object> tmp;
|
||||||
|
unvectorizeToLexOrdArray(tmp,from);
|
||||||
|
assert(tmp.size()==from.Grid()->lSites());
|
||||||
|
assert(tmp.size()==to.size());
|
||||||
|
to.resize(tmp.size());
|
||||||
|
acceleratorCopyToDevice(&tmp[0],&to[0],sizeof(typename vobj::scalar_object)*tmp.size());
|
||||||
|
#else
|
||||||
|
typedef typename vobj::scalar_object sobj;
|
||||||
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
|
typedef typename vobj::vector_type vector_type;
|
||||||
|
|
||||||
|
GridBase *Fg = from.Grid();
|
||||||
|
assert(!Fg->_isCheckerBoarded);
|
||||||
|
int nd = Fg->_ndimension;
|
||||||
|
|
||||||
|
to.resize(Fg->lSites());
|
||||||
|
|
||||||
|
Coordinate LocalLatt = Fg->LocalDimensions();
|
||||||
|
size_t nsite = 1;
|
||||||
|
for(int i=0;i<nd;i++) nsite *= LocalLatt[i];
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// do the index calc on the GPU
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
Coordinate f_ostride = Fg->_ostride;
|
||||||
|
Coordinate f_istride = Fg->_istride;
|
||||||
|
Coordinate f_rdimensions = Fg->_rdimensions;
|
||||||
|
|
||||||
|
autoView(from_v,from,AcceleratorRead);
|
||||||
|
auto to_v = &to[0];
|
||||||
|
|
||||||
|
const int words=sizeof(vobj)/sizeof(vector_type);
|
||||||
|
accelerator_for(idx,nsite,1,{
|
||||||
|
|
||||||
|
Coordinate from_coor, base;
|
||||||
|
Lexicographic::CoorFromIndex(base,idx,LocalLatt);
|
||||||
|
for(int i=0;i<nd;i++){
|
||||||
|
from_coor[i] = base[i];
|
||||||
|
}
|
||||||
|
int from_oidx = 0; for(int d=0;d<nd;d++) from_oidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
|
||||||
|
int from_lane = 0; for(int d=0;d<nd;d++) from_lane+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
|
||||||
|
|
||||||
|
const vector_type* from = (const vector_type *)&from_v[from_oidx];
|
||||||
|
scalar_type* to = (scalar_type *)&to_v[idx];
|
||||||
|
|
||||||
|
scalar_type stmp;
|
||||||
|
for(int w=0;w<words;w++){
|
||||||
|
stmp = getlane(from[w], from_lane);
|
||||||
|
to[w] = stmp;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
template<class vobj> void BLAStoGrid(Lattice<vobj> &grid,deviceVector<typename vobj::scalar_object> &in)
|
||||||
|
{
|
||||||
|
#if 0
|
||||||
|
std::vector<typename vobj::scalar_object> tmp;
|
||||||
|
tmp.resize(in.size());
|
||||||
|
// std::cout << "BLAStoGrid volume " <<tmp.size()<<" "<< grid.Grid()->lSites()<<std::endl;
|
||||||
|
assert(in.size()==grid.Grid()->lSites());
|
||||||
|
acceleratorCopyFromDevice(&in[0],&tmp[0],sizeof(typename vobj::scalar_object)*in.size());
|
||||||
|
vectorizeFromLexOrdArray(tmp,grid);
|
||||||
|
#else
|
||||||
|
typedef typename vobj::scalar_object sobj;
|
||||||
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
|
typedef typename vobj::vector_type vector_type;
|
||||||
|
|
||||||
|
GridBase *Tg = grid.Grid();
|
||||||
|
assert(!Tg->_isCheckerBoarded);
|
||||||
|
int nd = Tg->_ndimension;
|
||||||
|
|
||||||
|
assert(in.size()==Tg->lSites());
|
||||||
|
|
||||||
|
Coordinate LocalLatt = Tg->LocalDimensions();
|
||||||
|
size_t nsite = 1;
|
||||||
|
for(int i=0;i<nd;i++) nsite *= LocalLatt[i];
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// do the index calc on the GPU
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
Coordinate t_ostride = Tg->_ostride;
|
||||||
|
Coordinate t_istride = Tg->_istride;
|
||||||
|
Coordinate t_rdimensions = Tg->_rdimensions;
|
||||||
|
|
||||||
|
autoView(to_v,grid,AcceleratorWrite);
|
||||||
|
auto from_v = &in[0];
|
||||||
|
|
||||||
|
const int words=sizeof(vobj)/sizeof(vector_type);
|
||||||
|
accelerator_for(idx,nsite,1,{
|
||||||
|
|
||||||
|
Coordinate to_coor, base;
|
||||||
|
Lexicographic::CoorFromIndex(base,idx,LocalLatt);
|
||||||
|
for(int i=0;i<nd;i++){
|
||||||
|
to_coor[i] = base[i];
|
||||||
|
}
|
||||||
|
int to_oidx = 0; for(int d=0;d<nd;d++) to_oidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
|
||||||
|
int to_lane = 0; for(int d=0;d<nd;d++) to_lane+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);
|
||||||
|
|
||||||
|
vector_type* to = (vector_type *)&to_v[to_oidx];
|
||||||
|
scalar_type* from = (scalar_type *)&from_v[idx];
|
||||||
|
|
||||||
|
scalar_type stmp;
|
||||||
|
for(int w=0;w<words;w++){
|
||||||
|
stmp=from[w];
|
||||||
|
putlane(to[w], stmp, to_lane);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
#endif
|
||||||
|
}
|
||||||
void CopyMatrix (void)
|
void CopyMatrix (void)
|
||||||
{
|
{
|
||||||
// Clone "A" to be lexicographic in the physics coords
|
// Clone "A" to be lexicographic in the physics coords
|
||||||
// Use unvectorisetolexordarray
|
// Use unvectorisetolexordarray
|
||||||
// Copy to device
|
// Copy to device
|
||||||
std::vector<calcMatrix> tmp;
|
|
||||||
for(int p=0;p<geom.npoint;p++){
|
for(int p=0;p<geom.npoint;p++){
|
||||||
unvectorizeToLexOrdArray(tmp,_Op._A[p]);
|
//Unpadded
|
||||||
acceleratorCopyToDevice(&tmp[0],&_A[p][0],sizeof(calcMatrix)*tmp.size());
|
auto Aup = _Op.Cell.Extract(_Op._A[p]);
|
||||||
|
// Coordinate coor({0,0,0,0,0});
|
||||||
|
// auto sval = peekSite(Aup,coor);
|
||||||
|
// std::cout << "CopyMatrix: p "<<p<<" Aup[0] :"<<sval<<std::endl;
|
||||||
|
// sval = peekSite(_Op._A[p],coor);
|
||||||
|
// std::cout << "CopyMatrix: p "<<p<<" _Op._Ap[0] :"<<sval<<std::endl;
|
||||||
|
GridtoBLAS(Aup,BLAS_A[p]);
|
||||||
|
// std::cout << "Copy Matrix p "<<p<<" "<< deviceGet(BLAS_A[p][0])<<std::endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
void Mdag(const CoarseVector &in, CoarseVector &out)
|
void Mdag(const CoarseVector &in, CoarseVector &out)
|
||||||
@ -136,18 +310,23 @@ public:
|
|||||||
}
|
}
|
||||||
void M (const CoarseVector &in, CoarseVector &out)
|
void M (const CoarseVector &in, CoarseVector &out)
|
||||||
{
|
{
|
||||||
RealD tviews=0; RealD ttot=0; RealD tmult=0; RealD texch=0; RealD text=0; RealD ttemps=0; RealD tcopy=0;
|
std::cout << GridLogMessage << "New Mrhs coarse"<<std::endl;
|
||||||
RealD tmult2=0;
|
|
||||||
|
|
||||||
ttot=-usecond();
|
|
||||||
conformable(CoarseGrid(),in.Grid());
|
conformable(CoarseGrid(),in.Grid());
|
||||||
conformable(in.Grid(),out.Grid());
|
conformable(in.Grid(),out.Grid());
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
CoarseVector tin=in;
|
|
||||||
|
|
||||||
texch-=usecond();
|
RealD t_tot;
|
||||||
CoarseVector pin = Cell.ExchangePeriodic(tin);
|
RealD t_exch;
|
||||||
texch+=usecond();
|
RealD t_GtoB;
|
||||||
|
RealD t_BtoG;
|
||||||
|
RealD t_mult;
|
||||||
|
|
||||||
|
t_tot=-usecond();
|
||||||
|
CoarseVector tin=in;
|
||||||
|
t_exch=-usecond();
|
||||||
|
CoarseVector pin = Cell.ExchangePeriodic(tin); //padded input
|
||||||
|
t_exch+=usecond();
|
||||||
|
|
||||||
CoarseVector pout(pin.Grid());
|
CoarseVector pout(pin.Grid());
|
||||||
|
|
||||||
int npoint = geom.npoint;
|
int npoint = geom.npoint;
|
||||||
@ -157,192 +336,61 @@ public:
|
|||||||
const int Nsimd = CComplex::Nsimd();
|
const int Nsimd = CComplex::Nsimd();
|
||||||
|
|
||||||
RealD flops,bytes;
|
RealD flops,bytes;
|
||||||
int64_t nrhs =pin.Grid()->GlobalDimensions()[0]/Nsimd;
|
int64_t osites=in.Grid()->oSites(); // unpadded
|
||||||
|
int64_t unpadded_vol = _CoarseGrid->lSites();
|
||||||
|
|
||||||
|
flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd();
|
||||||
|
bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0]
|
||||||
|
+ 2.0*osites*sizeof(siteVector)*npoint;
|
||||||
|
|
||||||
|
int64_t nrhs =pin.Grid()->GlobalDimensions()[0];
|
||||||
assert(nrhs>=1);
|
assert(nrhs>=1);
|
||||||
|
|
||||||
#if 0
|
std::cout << GridLogMessage << "New Mrhs GridtoBLAS in sizes "<<in.Grid()->lSites()<<" "<<pin.Grid()->lSites()<<std::endl;
|
||||||
{
|
t_GtoB=-usecond();
|
||||||
tviews-=usecond();
|
GridtoBLAS(pin,BLAS_B);
|
||||||
autoView( in_v , pin, AcceleratorRead);
|
// out = Zero();
|
||||||
autoView( out_v , pout, AcceleratorWriteDiscard);
|
// GridtoBLAS(out,BLAS_C);
|
||||||
tviews+=usecond();
|
t_GtoB+=usecond();
|
||||||
|
|
||||||
// Static and prereserve to keep UVM region live and not resized across multiple calls
|
GridBLAS BLAS;
|
||||||
ttemps-=usecond();
|
|
||||||
MultTemporaries.resize(npoint,in.Grid());
|
|
||||||
ttemps+=usecond();
|
|
||||||
|
|
||||||
std::vector<Aview> AcceleratorViewContainer_h;
|
t_mult=-usecond();
|
||||||
std::vector<Vview> AcceleratorVecViewContainer_h;
|
for(int p=0;p<geom.npoint;p++){
|
||||||
|
RealD c = 1.0;
|
||||||
tviews-=usecond();
|
if (p==0) c = 0.0;
|
||||||
for(int p=0;p<npoint;p++) {
|
ComplexD beta(c);
|
||||||
AcceleratorViewContainer_h.push_back( &_A[p][0]);
|
// std::cout << GridLogMessage << "New Mrhs coarse gemmBatched "<<p<<std::endl;
|
||||||
AcceleratorVecViewContainer_h.push_back(MultTemporaries[p].View(AcceleratorWrite));
|
BLAS.gemmBatched(nbasis,nrhs,nbasis,
|
||||||
|
ComplexD(1.0),
|
||||||
|
BLAS_AP[p],
|
||||||
|
BLAS_BP[p],
|
||||||
|
ComplexD(c),
|
||||||
|
BLAS_CP);
|
||||||
}
|
}
|
||||||
tviews+=usecond();
|
t_mult+=usecond();
|
||||||
|
// std::cout << GridLogMessage << "New Mrhs coarse BLAStoGrid "<<std::endl;
|
||||||
static deviceVector<Aview> AcceleratorViewContainer; AcceleratorViewContainer.resize(npoint);
|
t_BtoG=-usecond();
|
||||||
static deviceVector<Vview> AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(npoint);
|
BLAStoGrid(out,BLAS_C);
|
||||||
|
t_BtoG+=usecond();
|
||||||
auto Aview_p = &AcceleratorViewContainer[0];
|
t_tot+=usecond();
|
||||||
auto Vview_p = &AcceleratorVecViewContainer[0];
|
// auto check =deviceGet(BLAS_C[0]);
|
||||||
|
// std::cout << "C[0] "<<check<<std::endl;
|
||||||
tcopy-=usecond();
|
// Coordinate coor({0,0,0,0,0,0});
|
||||||
acceleratorCopyToDevice(&AcceleratorViewContainer_h[0],&AcceleratorViewContainer[0],npoint *sizeof(Aview));
|
// peekLocalSite(check,out,coor);
|
||||||
acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],npoint *sizeof(Vview));
|
// std::cout << "C[0] "<< check<<std::endl;
|
||||||
tcopy+=usecond();
|
std::cout << GridLogMessage << "New Mrhs coarse DONE "<<std::endl;
|
||||||
|
std::cout << GridLogMessage<<"Coarse Mult exch "<<t_exch<<" us"<<std::endl;
|
||||||
int32_t bound = _A[0].size();
|
std::cout << GridLogMessage<<"Coarse Mult mult "<<t_mult<<" us"<<std::endl;
|
||||||
int64_t osites=pin.Grid()->oSites();
|
std::cout << GridLogMessage<<"Coarse Mult GtoB "<<t_GtoB<<" us"<<std::endl;
|
||||||
flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd();
|
std::cout << GridLogMessage<<"Coarse Mult BtoG "<<t_BtoG<<" us"<<std::endl;
|
||||||
bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0]
|
std::cout << GridLogMessage<<"Coarse Mult tot "<<t_tot<<" us"<<std::endl;
|
||||||
+ 2.0*osites*sizeof(siteVector)*npoint;
|
std::cout << GridLogMessage<<std::endl;
|
||||||
|
std::cout << GridLogMessage<<"Coarse Kernel flops "<< flops<<std::endl;
|
||||||
// std::cout << " osites "<<osites <<" bound "<<bound<<std::endl;
|
std::cout << GridLogMessage<<"Coarse Kernel flop/s "<< flops/t_mult<<" mflop/s"<<std::endl;
|
||||||
// std::cout << " padded local dims "<<pin.Grid()->LocalDimensions()<<std::endl;
|
std::cout << GridLogMessage<<"Coarse Kernel bytes/s "<< bytes/t_mult/1000<<" GB/s"<<std::endl;
|
||||||
// std::cout << " unpadded local dims "<<in.Grid()->LocalDimensions()<<std::endl;
|
std::cout << GridLogMessage<<"Coarse overall flops/s "<< flops/t_tot<<" mflop/s"<<std::endl;
|
||||||
tmult-=usecond();
|
std::cout << GridLogMessage<<"Coarse total bytes "<< bytes/1e6<<" MB"<<std::endl;
|
||||||
autoView( Stencil_v , Stencil, AcceleratorRead);
|
|
||||||
accelerator_for(rspb, osites*nbasis*npoint, Nsimd, {
|
|
||||||
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
|
|
||||||
int32_t ss = rspb/(nbasis*npoint);
|
|
||||||
int32_t bp = rspb%(nbasis*npoint);
|
|
||||||
int32_t point= bp/nbasis;
|
|
||||||
int32_t b = bp%nbasis;
|
|
||||||
assert(ss<bound);
|
|
||||||
auto SE = Stencil_v.GetEntry(point,ss);
|
|
||||||
if ( SE->_permute == 0 ) {
|
|
||||||
int32_t snbr= SE->_offset;
|
|
||||||
auto nbr = coalescedReadGeneralPermute(in_v[snbr],SE->_permute,Nd);
|
|
||||||
auto res = Aview_p[point][ss](0,b)*nbr(0);
|
|
||||||
for(int bb=1;bb<nbasis;bb++) {
|
|
||||||
res = res + Aview_p[point][ss](bb,b)*nbr(bb);
|
|
||||||
}
|
|
||||||
coalescedWrite(Vview_p[point][ss](b),res);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
tmult2-=usecond();
|
|
||||||
accelerator_for(sb, osites*nbasis, Nsimd, {
|
|
||||||
int ss = sb/nbasis;
|
|
||||||
int b = sb%nbasis;
|
|
||||||
auto res = coalescedRead(Vview_p[0][ss](b));
|
|
||||||
for(int point=1;point<npoint;point++){
|
|
||||||
res = res + coalescedRead(Vview_p[point][ss](b));
|
|
||||||
}
|
|
||||||
coalescedWrite(out_v[ss](b),res);
|
|
||||||
});
|
|
||||||
tmult2+=usecond();
|
|
||||||
tmult+=usecond();
|
|
||||||
|
|
||||||
for(int p=0;p<npoint;p++) {
|
|
||||||
AcceleratorVecViewContainer_h[p].ViewClose();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
text-=usecond();
|
|
||||||
out = Cell.Extract(pout);
|
|
||||||
text+=usecond();
|
|
||||||
ttot+=usecond();
|
|
||||||
#else
|
|
||||||
{
|
|
||||||
tviews-=usecond();
|
|
||||||
autoView( in_v , pin, AcceleratorRead);
|
|
||||||
autoView( out_v , out, AcceleratorWriteDiscard);
|
|
||||||
tviews+=usecond();
|
|
||||||
|
|
||||||
// Static and prereserve to keep UVM region live and not resized across multiple calls
|
|
||||||
ttemps-=usecond();
|
|
||||||
MultTemporaries.resize(npoint,in.Grid());
|
|
||||||
ttemps+=usecond();
|
|
||||||
|
|
||||||
std::vector<Aview> AcceleratorViewContainer_h;
|
|
||||||
std::vector<Vview> AcceleratorVecViewContainer_h;
|
|
||||||
|
|
||||||
tviews-=usecond();
|
|
||||||
for(int p=0;p<npoint;p++) {
|
|
||||||
AcceleratorViewContainer_h.push_back( &_A[p][0]);
|
|
||||||
AcceleratorVecViewContainer_h.push_back(MultTemporaries[p].View(AcceleratorWrite));
|
|
||||||
}
|
|
||||||
tviews+=usecond();
|
|
||||||
|
|
||||||
static deviceVector<Aview> AcceleratorViewContainer; AcceleratorViewContainer.resize(npoint);
|
|
||||||
static deviceVector<Vview> AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(npoint);
|
|
||||||
|
|
||||||
auto Aview_p = &AcceleratorViewContainer[0];
|
|
||||||
auto Vview_p = &AcceleratorVecViewContainer[0];
|
|
||||||
|
|
||||||
tcopy-=usecond();
|
|
||||||
acceleratorCopyToDevice(&AcceleratorViewContainer_h[0],&AcceleratorViewContainer[0],npoint *sizeof(Aview));
|
|
||||||
acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],npoint *sizeof(Vview));
|
|
||||||
tcopy+=usecond();
|
|
||||||
|
|
||||||
int32_t bound = _A[0].size();
|
|
||||||
int64_t osites=in.Grid()->oSites();
|
|
||||||
flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd();
|
|
||||||
bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0]
|
|
||||||
+ 2.0*osites*sizeof(siteVector)*npoint;
|
|
||||||
|
|
||||||
// std::cout << " osites "<<osites <<" bound "<<bound<< " stencilsize "<<StencilMasked.size()<<std::endl;
|
|
||||||
// std::cout << " padded local dims "<<pin.Grid()->LocalDimensions()<<std::endl;
|
|
||||||
// std::cout << " unpadded local dims "<<in.Grid()->LocalDimensions()<<std::endl;
|
|
||||||
tmult-=usecond();
|
|
||||||
auto Stencil_v = &StencilMasked[0];
|
|
||||||
accelerator_for(rspb, StencilMasked.size()*nbasis, Nsimd, {
|
|
||||||
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
|
|
||||||
int32_t ss = rspb/(nbasis*npoint); // site of unpadded
|
|
||||||
int32_t bp = rspb%(nbasis*npoint);
|
|
||||||
int32_t point= bp/nbasis;
|
|
||||||
int32_t b = bp%nbasis;
|
|
||||||
auto SE = &Stencil_v[ss*npoint+point];
|
|
||||||
int32_t s = SE->_input; // site of padded
|
|
||||||
int32_t snbr= SE->_offset;
|
|
||||||
auto nbr = coalescedRead(in_v[snbr]);
|
|
||||||
auto res = Aview_p[point][s](0,b)*nbr(0);
|
|
||||||
for(int bb=1;bb<nbasis;bb++) {
|
|
||||||
res = res + Aview_p[point][s](bb,b)*nbr(bb);
|
|
||||||
}
|
|
||||||
// std::cout << " unpadded " << ss<<" padded " << s<< " point "<<point <<" row " <<b<<" "<< innerProduct(res,res) <<std::endl;
|
|
||||||
// std::cout << " unpadded " << ss<<" point "<<point <<" row " <<b<<" res "<< innerProduct(res,res) <<std::endl;
|
|
||||||
// std::cout << " unpadded " << ss<<" point "<<point <<" row " <<b<<" nbrIP "<< innerProduct(nbr,nbr) <<std::endl;
|
|
||||||
// std::cout << " unpadded " << ss<<" point "<<point <<" row " <<b<<" nbr "<< nbr <<std::endl;
|
|
||||||
// std::cout << " unpadded " << ss<<" point "<<point <<" row " <<b<<" nbr "<< in_v[snbr] <<std::endl;
|
|
||||||
// std::cout << " unpadded " << ss<<" point "<<point <<" row " <<b<<" A "<< innerProduct(Aview_p[point][s],Aview_p[point][s]) <<std::endl;
|
|
||||||
coalescedWrite(Vview_p[point][ss](b),res);
|
|
||||||
});
|
|
||||||
tmult2-=usecond();
|
|
||||||
accelerator_for(sb, osites*nbasis, Nsimd, {
|
|
||||||
int ss = sb/nbasis;
|
|
||||||
int b = sb%nbasis;
|
|
||||||
auto res = coalescedRead(Vview_p[0][ss](b));
|
|
||||||
for(int point=1;point<npoint;point++){
|
|
||||||
res = res + coalescedRead(Vview_p[point][ss](b));
|
|
||||||
}
|
|
||||||
coalescedWrite(out_v[ss](b),res);
|
|
||||||
});
|
|
||||||
tmult2+=usecond();
|
|
||||||
tmult+=usecond();
|
|
||||||
for(int p=0;p<npoint;p++) {
|
|
||||||
AcceleratorVecViewContainer_h[p].ViewClose();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
ttot+=usecond();
|
|
||||||
#endif
|
|
||||||
|
|
||||||
std::cout << GridLogMessage<<"Coarse Mult Aviews "<<tviews<<" us"<<std::endl;
|
|
||||||
std::cout << GridLogMessage<<"Coarse Mult exch "<<texch<<" us"<<std::endl;
|
|
||||||
std::cout << GridLogMessage<<"Coarse Mult mult "<<tmult<<" us"<<std::endl;
|
|
||||||
std::cout << GridLogMessage<<" of which mult2 "<<tmult2<<" us"<<std::endl;
|
|
||||||
std::cout << GridLogMessage<<"Coarse Mult ext "<<text<<" us"<<std::endl;
|
|
||||||
std::cout << GridLogMessage<<"Coarse Mult temps "<<ttemps<<" us"<<std::endl;
|
|
||||||
std::cout << GridLogMessage<<"Coarse Mult copy "<<tcopy<<" us"<<std::endl;
|
|
||||||
std::cout << GridLogMessage<<"Coarse Mult tot "<<ttot<<" us"<<std::endl;
|
|
||||||
// std::cout << GridLogMessage<<std::endl;
|
|
||||||
// std::cout << GridLogMessage<<"Coarse Kernel flop/s "<< flops/tmult<<" mflop/s"<<std::endl;
|
|
||||||
// std::cout << GridLogMessage<<"Coarse Kernel bytes/s"<< bytes/tmult<<" MB/s"<<std::endl;
|
|
||||||
// std::cout << GridLogMessage<<"Coarse overall flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
|
|
||||||
// std::cout << GridLogMessage<<"Coarse total bytes "<< bytes/1e6<<" MB"<<std::endl;
|
|
||||||
|
|
||||||
};
|
};
|
||||||
virtual void Mdiag (const Field &in, Field &out){ assert(0);};
|
virtual void Mdiag (const Field &in, Field &out){ assert(0);};
|
||||||
virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);};
|
virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);};
|
||||||
|
@ -29,6 +29,7 @@ Author: Peter Boyle <pboyle@bnl.gov>
|
|||||||
|
|
||||||
#include <Grid/algorithms/multigrid/Aggregates.h>
|
#include <Grid/algorithms/multigrid/Aggregates.h>
|
||||||
#include <Grid/algorithms/multigrid/Geometry.h>
|
#include <Grid/algorithms/multigrid/Geometry.h>
|
||||||
|
#include <Grid/algorithms/multigrid/BatchedBlas.h>
|
||||||
#include <Grid/algorithms/multigrid/CoarsenedMatrix.h>
|
#include <Grid/algorithms/multigrid/CoarsenedMatrix.h>
|
||||||
#include <Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h>
|
#include <Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h>
|
||||||
#include <Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h>
|
#include <Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h>
|
||||||
|
@ -745,6 +745,9 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
|
|||||||
typedef typename vobj::scalar_type scalar_type;
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
typedef typename vobj::vector_type vector_type;
|
typedef typename vobj::vector_type vector_type;
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// the checks should guarantee that the operations are local
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
GridBase *Fg = From.Grid();
|
GridBase *Fg = From.Grid();
|
||||||
GridBase *Tg = To.Grid();
|
GridBase *Tg = To.Grid();
|
||||||
assert(!Fg->_isCheckerBoarded);
|
assert(!Fg->_isCheckerBoarded);
|
||||||
@ -758,41 +761,12 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
|
|||||||
for(int d=0;d<nd;d++){
|
for(int d=0;d<nd;d++){
|
||||||
assert(Fg->_processors[d] == Tg->_processors[d]);
|
assert(Fg->_processors[d] == Tg->_processors[d]);
|
||||||
}
|
}
|
||||||
// the above should guarantee that the operations are local
|
|
||||||
|
|
||||||
#if 1
|
|
||||||
size_t nsite = 1;
|
size_t nsite = 1;
|
||||||
for(int i=0;i<nd;i++) nsite *= RegionSize[i];
|
for(int i=0;i<nd;i++) nsite *= RegionSize[i];
|
||||||
|
|
||||||
size_t tbytes = 4*nsite*sizeof(int);
|
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
int *table = (int*)malloc(tbytes);
|
// do the index calc on the GPU
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
RealD t_cpu=-usecond();
|
|
||||||
#if 0
|
|
||||||
thread_for(idx, nsite, {
|
|
||||||
Coordinate from_coor, to_coor;
|
|
||||||
size_t rem = idx;
|
|
||||||
for(int i=0;i<nd;i++){
|
|
||||||
size_t base_i = rem % RegionSize[i]; rem /= RegionSize[i];
|
|
||||||
from_coor[i] = base_i + FromLowerLeft[i];
|
|
||||||
to_coor[i] = base_i + ToLowerLeft[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
int foidx = Fg->oIndex(from_coor);
|
|
||||||
int fiidx = Fg->iIndex(from_coor);
|
|
||||||
int toidx = Tg->oIndex(to_coor);
|
|
||||||
int tiidx = Tg->iIndex(to_coor);
|
|
||||||
int* tt = table + 4*idx;
|
|
||||||
tt[0] = foidx;
|
|
||||||
tt[1] = fiidx;
|
|
||||||
tt[2] = toidx;
|
|
||||||
tt[3] = tiidx;
|
|
||||||
});
|
|
||||||
|
|
||||||
int* table_d = (int*)acceleratorAllocDevice(tbytes);
|
|
||||||
acceleratorCopyToDevice(table,table_d,tbytes);
|
|
||||||
#else
|
|
||||||
int* table_d = (int*)acceleratorAllocDevice(tbytes);
|
|
||||||
Coordinate f_ostride = Fg->_ostride;
|
Coordinate f_ostride = Fg->_ostride;
|
||||||
Coordinate f_istride = Fg->_istride;
|
Coordinate f_istride = Fg->_istride;
|
||||||
Coordinate f_rdimensions = Fg->_rdimensions;
|
Coordinate f_rdimensions = Fg->_rdimensions;
|
||||||
@ -800,105 +774,35 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
|
|||||||
Coordinate t_istride = Tg->_istride;
|
Coordinate t_istride = Tg->_istride;
|
||||||
Coordinate t_rdimensions = Tg->_rdimensions;
|
Coordinate t_rdimensions = Tg->_rdimensions;
|
||||||
|
|
||||||
accelerator_for(idx, nsite, 1, {
|
|
||||||
Coordinate from_coor, to_coor;
|
|
||||||
size_t rem = idx;
|
|
||||||
for(int i=0;i<nd;i++){
|
|
||||||
size_t base_i = rem % RegionSize[i]; rem /= RegionSize[i];
|
|
||||||
from_coor[i] = base_i + FromLowerLeft[i];
|
|
||||||
to_coor[i] = base_i + ToLowerLeft[i];
|
|
||||||
}
|
|
||||||
int foidx = 0; for(int d=0;d<nd;d++) foidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
|
|
||||||
int fiidx = 0; for(int d=0;d<nd;d++) fiidx+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
|
|
||||||
int toidx = 0; for(int d=0;d<nd;d++) toidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
|
|
||||||
int tiidx = 0; for(int d=0;d<nd;d++) tiidx+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);
|
|
||||||
int* tt = table_d + 4*idx;
|
|
||||||
tt[0] = foidx;
|
|
||||||
tt[1] = fiidx;
|
|
||||||
tt[2] = toidx;
|
|
||||||
tt[3] = tiidx;
|
|
||||||
});
|
|
||||||
#endif
|
|
||||||
t_cpu+=usecond();
|
|
||||||
|
|
||||||
typedef typename vobj::vector_type vector_type;
|
typedef typename vobj::vector_type vector_type;
|
||||||
typedef typename vobj::scalar_type scalar_type;
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
|
|
||||||
autoView(from_v,From,AcceleratorRead);
|
autoView(from_v,From,AcceleratorRead);
|
||||||
autoView(to_v,To,AcceleratorWrite);
|
autoView(to_v,To,AcceleratorWrite);
|
||||||
RealD t_acc=-usecond();
|
|
||||||
const int words=sizeof(vobj)/sizeof(vector_type);
|
const int words=sizeof(vobj)/sizeof(vector_type);
|
||||||
accelerator_for(idx,nsite,words,{
|
accelerator_for(idx,nsite,1,{
|
||||||
int* tt = table_d + 4*idx;
|
|
||||||
int from_oidx = *tt++;
|
Coordinate from_coor, to_coor, base;
|
||||||
int from_lane = *tt++;
|
Lexicographic::CoorFromIndex(base,idx,RegionSize);
|
||||||
int to_oidx = *tt++;
|
for(int i=0;i<nd;i++){
|
||||||
int to_lane = *tt;
|
from_coor[i] = base[i] + FromLowerLeft[i];
|
||||||
|
to_coor[i] = base[i] + ToLowerLeft[i];
|
||||||
|
}
|
||||||
|
int from_oidx = 0; for(int d=0;d<nd;d++) from_oidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
|
||||||
|
int from_lane = 0; for(int d=0;d<nd;d++) from_lane+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
|
||||||
|
int to_oidx = 0; for(int d=0;d<nd;d++) to_oidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
|
||||||
|
int to_lane = 0; for(int d=0;d<nd;d++) to_lane+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);
|
||||||
|
|
||||||
const vector_type* from = (const vector_type *)&from_v[from_oidx];
|
const vector_type* from = (const vector_type *)&from_v[from_oidx];
|
||||||
vector_type* to = (vector_type *)&to_v[to_oidx];
|
vector_type* to = (vector_type *)&to_v[to_oidx];
|
||||||
|
|
||||||
scalar_type stmp;
|
scalar_type stmp;
|
||||||
#ifdef GRID_SIMT
|
|
||||||
int w = acceleratorSIMTlane(words);
|
|
||||||
stmp = getlane(from[w], from_lane);
|
|
||||||
putlane(to[w], stmp, to_lane);
|
|
||||||
#else
|
|
||||||
for(int w=0;w<words;w++){
|
for(int w=0;w<words;w++){
|
||||||
stmp = getlane(from[w], from_lane);
|
stmp = getlane(from[w], from_lane);
|
||||||
putlane(to[w], stmp, to_lane);
|
putlane(to[w], stmp, to_lane);
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
});
|
});
|
||||||
t_acc+=usecond();
|
|
||||||
// std::cout << " localCopyRegion cpu " <<t_cpu/1000<<" ms"<<std::endl;
|
|
||||||
// std::cout << " localCopyRegion acc " <<t_acc/1000<<" ms"<<std::endl;
|
|
||||||
acceleratorFreeDevice(table_d);
|
|
||||||
free(table);
|
|
||||||
|
|
||||||
|
|
||||||
#else
|
|
||||||
Coordinate ldf = Fg->_ldimensions;
|
|
||||||
Coordinate rdf = Fg->_rdimensions;
|
|
||||||
Coordinate isf = Fg->_istride;
|
|
||||||
Coordinate osf = Fg->_ostride;
|
|
||||||
Coordinate rdt = Tg->_rdimensions;
|
|
||||||
Coordinate ist = Tg->_istride;
|
|
||||||
Coordinate ost = Tg->_ostride;
|
|
||||||
|
|
||||||
autoView( t_v , To, CpuWrite);
|
|
||||||
autoView( f_v , From, CpuRead);
|
|
||||||
thread_for(idx,Fg->lSites(),{
|
|
||||||
sobj s;
|
|
||||||
Coordinate Fcoor(nd);
|
|
||||||
Coordinate Tcoor(nd);
|
|
||||||
Lexicographic::CoorFromIndex(Fcoor,idx,ldf);
|
|
||||||
int in_region=1;
|
|
||||||
for(int d=0;d<nd;d++){
|
|
||||||
if ( (Fcoor[d] < FromLowerLeft[d]) || (Fcoor[d]>=FromLowerLeft[d]+RegionSize[d]) ){
|
|
||||||
in_region=0;
|
|
||||||
}
|
|
||||||
Tcoor[d] = ToLowerLeft[d]+ Fcoor[d]-FromLowerLeft[d];
|
|
||||||
}
|
|
||||||
if (in_region) {
|
|
||||||
#if 0
|
|
||||||
Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]); // inner index from
|
|
||||||
Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]); // inner index to
|
|
||||||
Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]); // outer index from
|
|
||||||
Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]); // outer index to
|
|
||||||
scalar_type * fp = (scalar_type *)&f_v[odx_f];
|
|
||||||
scalar_type * tp = (scalar_type *)&t_v[odx_t];
|
|
||||||
for(int w=0;w<words;w++){
|
|
||||||
tp[w].putlane(fp[w].getlane(idx_f),idx_t);
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
peekLocalSite(s,f_v,Fcoor);
|
|
||||||
pokeLocalSite(s,t_v,Tcoor);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -990,7 +894,7 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//FIXME: make this run entirely on GPU
|
||||||
//Insert subvolume orthogonal to direction 'orthog' with slice index 'slice_lo' from 'lowDim' onto slice index 'slice_hi' of higherDim
|
//Insert subvolume orthogonal to direction 'orthog' with slice index 'slice_lo' from 'lowDim' onto slice index 'slice_hi' of higherDim
|
||||||
//The local dimensions of both 'lowDim' and 'higherDim' orthogonal to 'orthog' should be the same
|
//The local dimensions of both 'lowDim' and 'higherDim' orthogonal to 'orthog' should be the same
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
|
@ -91,7 +91,7 @@ template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
|
|||||||
//for cross platform
|
//for cross platform
|
||||||
// FIXME -- can put internal indices into thread loop
|
// FIXME -- can put internal indices into thread loop
|
||||||
auto buf_p = & buf[0];
|
auto buf_p = & buf[0];
|
||||||
autoView(lat_v, lat, AcceleratorRead);
|
autoView(lat_v, lat, AcceleratorWrite);
|
||||||
accelerator_for(ss, face_ovol/simd[dim],Nsimd,{
|
accelerator_for(ss, face_ovol/simd[dim],Nsimd,{
|
||||||
|
|
||||||
// scalar layout won't coalesce
|
// scalar layout won't coalesce
|
||||||
@ -329,8 +329,6 @@ public:
|
|||||||
if(dim==0) conformable(old_grid,unpadded_grid);
|
if(dim==0) conformable(old_grid,unpadded_grid);
|
||||||
else conformable(old_grid,grids[dim-1]);
|
else conformable(old_grid,grids[dim-1]);
|
||||||
|
|
||||||
// std::cout << " dim "<<dim<<" local "<<local << " padding to "<<plocal<<std::endl;
|
|
||||||
|
|
||||||
double tins=0, tshift=0;
|
double tins=0, tshift=0;
|
||||||
|
|
||||||
int islocal = 0 ;
|
int islocal = 0 ;
|
||||||
@ -339,6 +337,7 @@ public:
|
|||||||
if ( islocal ) {
|
if ( islocal ) {
|
||||||
|
|
||||||
// replace with a copy and maybe grid swizzle
|
// replace with a copy and maybe grid swizzle
|
||||||
|
// return in;??
|
||||||
double t = usecond();
|
double t = usecond();
|
||||||
padded = in;
|
padded = in;
|
||||||
tins += usecond() - t;
|
tins += usecond() - t;
|
||||||
@ -396,7 +395,7 @@ public:
|
|||||||
GridBase *old_grid = in.Grid();
|
GridBase *old_grid = in.Grid();
|
||||||
GridCartesian *new_grid = grids[dim];//These are new grids
|
GridCartesian *new_grid = grids[dim];//These are new grids
|
||||||
Lattice<vobj> padded(new_grid);
|
Lattice<vobj> padded(new_grid);
|
||||||
Lattice<vobj> shifted(old_grid);
|
// Lattice<vobj> shifted(old_grid);
|
||||||
Coordinate local =old_grid->LocalDimensions();
|
Coordinate local =old_grid->LocalDimensions();
|
||||||
Coordinate plocal =new_grid->LocalDimensions();
|
Coordinate plocal =new_grid->LocalDimensions();
|
||||||
if(dim==0) conformable(old_grid,unpadded_grid);
|
if(dim==0) conformable(old_grid,unpadded_grid);
|
||||||
@ -409,14 +408,10 @@ public:
|
|||||||
if ( processors[dim] == 1 ) islocal = 1;
|
if ( processors[dim] == 1 ) islocal = 1;
|
||||||
|
|
||||||
if ( islocal ) {
|
if ( islocal ) {
|
||||||
|
padded=in; // slightly different interface could avoid a copy operation
|
||||||
// replace with a copy and maybe grid swizzle
|
|
||||||
double t = usecond();
|
|
||||||
padded = in;
|
|
||||||
tins += usecond() - t;
|
|
||||||
// return in; ?
|
|
||||||
} else {
|
} else {
|
||||||
Face_exchange(in,padded,dim,depth);
|
Face_exchange(in,padded,dim,depth);
|
||||||
|
return padded;
|
||||||
}
|
}
|
||||||
return padded;
|
return padded;
|
||||||
}
|
}
|
||||||
@ -527,8 +522,6 @@ public:
|
|||||||
////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////
|
||||||
// Scatter all faces
|
// Scatter all faces
|
||||||
////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////
|
||||||
// DumpSliceNorm(std::string("Face_exchange to before scatter"),to,dimension);
|
|
||||||
|
|
||||||
plane=0;
|
plane=0;
|
||||||
|
|
||||||
t=usecond();
|
t=usecond();
|
||||||
@ -550,18 +543,16 @@ public:
|
|||||||
ScatterSlice(recv_buf,to,d,dimension,plane*buffer_size); plane++;
|
ScatterSlice(recv_buf,to,d,dimension,plane*buffer_size); plane++;
|
||||||
}
|
}
|
||||||
t_scatter+= usecond() - t;
|
t_scatter+= usecond() - t;
|
||||||
// DumpSliceNorm(std::string("Face_exchange to scatter 1st "),to,dimension);
|
|
||||||
t_tot+=usecond();
|
t_tot+=usecond();
|
||||||
|
|
||||||
//DumpSliceNorm(std::string("Face_exchange to done"),to,dimension);
|
std::cout << GridLogDebug << "PaddedCell::Expand new timings: gather :" << t_gather/1000 << "ms"<<std::endl;
|
||||||
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << t_gather/1000 << "ms"<<std::endl;
|
std::cout << GridLogDebug << "PaddedCell::Expand new timings: gather :" << 2.0*bytes/t_gather << "MB/s"<<std::endl;
|
||||||
// std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << 2.0*bytes/t_gather << "MB/s"<<std::endl;
|
std::cout << GridLogDebug << "PaddedCell::Expand new timings: scatter:" << t_scatter/1000 << "ms"<<std::endl;
|
||||||
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << t_scatter/1000 << "ms"<<std::endl;
|
std::cout << GridLogDebug << "PaddedCell::Expand new timings: scatter:" << 2.0*bytes/t_scatter<< "MB/s"<<std::endl;
|
||||||
// std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << 2.0*bytes/t_scatter<< "MB/s"<<std::endl;
|
std::cout << GridLogDebug << "PaddedCell::Expand new timings: copy :" << t_copy/1000 << "ms"<<std::endl;
|
||||||
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: copy :" << t_copy/1000 << "ms"<<std::endl;
|
std::cout << GridLogDebug << "PaddedCell::Expand new timings: comms :" << t_comms/1000 << "ms"<<std::endl;
|
||||||
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms :" << t_comms/1000 << "ms"<<std::endl;
|
std::cout << GridLogDebug << "PaddedCell::Expand new timings: total :" << t_tot/1000 << "ms"<<std::endl;
|
||||||
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: total :" << t_tot/1000 << "ms"<<std::endl;
|
std::cout << GridLogDebug << "PaddedCell::Expand new timings: comms :" << (RealD)4.0*bytes/t_comms << "MB/s"<<std::endl;
|
||||||
// std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms :" << (RealD)4.0*bytes/t_comms << "MB/s"<<std::endl;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
|
@ -90,7 +90,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
|
|||||||
|
|
||||||
for (int i = 0; i < logstreams.size(); i++) {
|
for (int i = 0; i < logstreams.size(); i++) {
|
||||||
if (logstreams[i] == std::string("Tracing")) GridLogTracing.Active(1);
|
if (logstreams[i] == std::string("Tracing")) GridLogTracing.Active(1);
|
||||||
if (logstreams[i] == std::string("Memory")) GridLogMemory.Active(0);
|
if (logstreams[i] == std::string("Memory")) GridLogMemory.Active(1);
|
||||||
if (logstreams[i] == std::string("Warning")) GridLogWarning.Active(1);
|
if (logstreams[i] == std::string("Warning")) GridLogWarning.Active(1);
|
||||||
if (logstreams[i] == std::string("NoMessage")) GridLogMessage.Active(0);
|
if (logstreams[i] == std::string("NoMessage")) GridLogMessage.Active(0);
|
||||||
if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1);
|
if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1);
|
||||||
|
@ -6,7 +6,6 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
|
|||||||
--enable-tracing=timer \
|
--enable-tracing=timer \
|
||||||
--enable-accelerator=hip \
|
--enable-accelerator=hip \
|
||||||
--enable-gen-simd-width=64 \
|
--enable-gen-simd-width=64 \
|
||||||
--enable-tracing=roctx \
|
|
||||||
--disable-gparity \
|
--disable-gparity \
|
||||||
--disable-fermion-reps \
|
--disable-fermion-reps \
|
||||||
--enable-simd=GPU \
|
--enable-simd=GPU \
|
||||||
@ -17,7 +16,7 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
|
|||||||
--disable-fermion-reps \
|
--disable-fermion-reps \
|
||||||
CXX=hipcc MPICXX=mpicxx \
|
CXX=hipcc MPICXX=mpicxx \
|
||||||
CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
|
CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
|
||||||
LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 "
|
LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -36,6 +36,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
using namespace std;
|
using namespace std;
|
||||||
using namespace Grid;
|
using namespace Grid;
|
||||||
|
|
||||||
|
gridblasHandle_t GridBLAS::gridblasHandle;
|
||||||
|
int GridBLAS::gridblasInit;
|
||||||
|
|
||||||
///////////////////////
|
///////////////////////
|
||||||
// Tells little dirac op to use MdagM as the .Op()
|
// Tells little dirac op to use MdagM as the .Op()
|
||||||
///////////////////////
|
///////////////////////
|
||||||
@ -107,7 +110,7 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
|
DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
|
||||||
|
|
||||||
const int nbasis = 8;
|
const int nbasis = 62;
|
||||||
const int cb = 0 ;
|
const int cb = 0 ;
|
||||||
LatticeFermion prom(FGrid);
|
LatticeFermion prom(FGrid);
|
||||||
|
|
||||||
@ -136,13 +139,12 @@ int main (int argc, char ** argv)
|
|||||||
typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
|
typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
|
||||||
typedef LittleDiracOperator::CoarseVector CoarseVector;
|
typedef LittleDiracOperator::CoarseVector CoarseVector;
|
||||||
|
|
||||||
NextToNearestStencilGeometry5D geom(Coarse5d);
|
NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d);
|
||||||
LittleDiracOperator LittleDiracOp(geom,FGrid,Coarse5d);
|
LittleDiracOperator LittleDiracOp(geom,FGrid,Coarse5d);
|
||||||
LittleDiracOperator LittleDiracOpCol(geom,FGrid,Coarse5d);
|
LittleDiracOperator LittleDiracOpCol(geom,FGrid,Coarse5d);
|
||||||
|
|
||||||
HermOpAdaptor<LatticeFermionD> HOA(HermDefOp);
|
HermOpAdaptor<LatticeFermionD> HOA(HermDefOp);
|
||||||
|
|
||||||
int pp=16;
|
|
||||||
LittleDiracOp.CoarsenOperator(HOA,Aggregates);
|
LittleDiracOp.CoarsenOperator(HOA,Aggregates);
|
||||||
|
|
||||||
///////////////////////////////////////////////////
|
///////////////////////////////////////////////////
|
||||||
@ -229,14 +231,16 @@ int main (int argc, char ** argv)
|
|||||||
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
|
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
|
||||||
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
|
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
|
||||||
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
|
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Create a higher dim coarse grid
|
// Create a higher dim coarse grid
|
||||||
const int nrhs=vComplex::Nsimd();
|
//////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
const int nrhs=vComplex::Nsimd()*3;
|
||||||
|
|
||||||
Coordinate mpi=GridDefaultMpi();
|
Coordinate mpi=GridDefaultMpi();
|
||||||
Coordinate rhMpi ({1,1,mpi[0],mpi[1],mpi[2],mpi[3]});
|
Coordinate rhMpi ({1,1,mpi[0],mpi[1],mpi[2],mpi[3]});
|
||||||
Coordinate rhLatt({nrhs,1,clatt[0],clatt[2],clatt[2],clatt[3]});
|
Coordinate rhLatt({nrhs,1,clatt[0],clatt[1],clatt[2],clatt[3]});
|
||||||
Coordinate rhSimd({nrhs,1, 1,1,1,1});
|
Coordinate rhSimd({vComplex::Nsimd(),1, 1,1,1,1});
|
||||||
|
|
||||||
GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi);
|
GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi);
|
||||||
|
|
||||||
@ -248,31 +252,48 @@ int main (int argc, char ** argv)
|
|||||||
CoarseVector rh_res(CoarseMrhs);
|
CoarseVector rh_res(CoarseMrhs);
|
||||||
random(rh_CRNG,rh_phi);
|
random(rh_CRNG,rh_phi);
|
||||||
|
|
||||||
|
std::cout << "Warmup"<<std::endl;
|
||||||
mrhs.M(rh_phi,rh_res);
|
mrhs.M(rh_phi,rh_res);
|
||||||
const int ncall=100;
|
const int ncall=5;
|
||||||
RealD t0=-usecond();
|
RealD t0=-usecond();
|
||||||
for(int i=0;i<ncall;i++){
|
for(int i=0;i<ncall;i++){
|
||||||
|
std::cout << "Call "<<i<<"/"<<ncall<<std::endl;
|
||||||
mrhs.M(rh_phi,rh_res);
|
mrhs.M(rh_phi,rh_res);
|
||||||
}
|
}
|
||||||
t0+=usecond();
|
t0+=usecond();
|
||||||
RealD t1=0;
|
RealD t1=0;
|
||||||
for(int r=0;r<nrhs;r++){
|
for(int r=0;r<nrhs;r++){
|
||||||
|
std::cout << " compare to single RHS "<<r<<"/"<<nrhs<<std::endl;
|
||||||
ExtractSlice(phi,rh_phi,r,0);
|
ExtractSlice(phi,rh_phi,r,0);
|
||||||
ExtractSlice(chi,rh_res,r,0);
|
ExtractSlice(chi,rh_res,r,0);
|
||||||
LittleDiracOp.M(phi,Aphi);
|
LittleDiracOp.M(phi,Aphi);
|
||||||
t1-=usecond();
|
t1-=usecond();
|
||||||
for(int i=0;i<ncall;i++){
|
for(int i=0;i<ncall;i++){
|
||||||
|
std::cout << "Call "<<i<<"/"<<ncall<<std::endl;
|
||||||
LittleDiracOp.M(phi,Aphi);
|
LittleDiracOp.M(phi,Aphi);
|
||||||
}
|
}
|
||||||
t1+=usecond();
|
t1+=usecond();
|
||||||
|
Coordinate site({0,0,0,0,0});
|
||||||
|
auto bad = peekSite(chi,site);
|
||||||
|
auto good = peekSite(Aphi,site);
|
||||||
std::cout << " mrhs [" <<r <<"] "<< norm2(chi)<<std::endl;
|
std::cout << " mrhs [" <<r <<"] "<< norm2(chi)<<std::endl;
|
||||||
std::cout << " srhs [" <<r <<"] "<< norm2(Aphi)<<std::endl;
|
std::cout << " srhs [" <<r <<"] "<< norm2(Aphi)<<std::endl;
|
||||||
chi=chi-Aphi;
|
chi=chi-Aphi;
|
||||||
std::cout << r << " diff " << norm2(chi)<<std::endl;
|
RealD diff =norm2(chi);
|
||||||
|
std::cout << r << " diff " << diff<<std::endl;
|
||||||
|
assert(diff < 1.0e-10);
|
||||||
}
|
}
|
||||||
std::cout << nrhs<< " mrhs " << t0/ncall/nrhs <<" us"<<std::endl;
|
std::cout << nrhs<< " mrhs " << t0/ncall/nrhs <<" us"<<std::endl;
|
||||||
std::cout << nrhs<< " srhs " << t1/ncall/nrhs <<" us"<<std::endl;
|
std::cout << nrhs<< " srhs " << t1/ncall/nrhs <<" us"<<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
|
||||||
|
|
||||||
Grid_finalize();
|
Grid_finalize();
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user