mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-09 23:45:36 +00:00
Compare commits
14 Commits
b461184797
...
557fa483ff
Author | SHA1 | Date | |
---|---|---|---|
|
557fa483ff | ||
|
fc15d55df6 | ||
|
53573d7d94 | ||
|
bb3c177000 | ||
|
a3322b470f | ||
|
f8f408e7a9 | ||
|
baac1127d0 | ||
|
6f1328160c | ||
|
04cf902791 | ||
|
7a5b1c1a19 | ||
|
18d2d7da4a | ||
|
6ae52da571 | ||
|
4ee9c68053 | ||
|
a15b4378a3 |
1052
BLAS_benchmark/BatchBlasBench.cc
Normal file
1052
BLAS_benchmark/BatchBlasBench.cc
Normal file
File diff suppressed because it is too large
Load Diff
2
BLAS_benchmark/compile-command
Normal file
2
BLAS_benchmark/compile-command
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
|
||||||
|
mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench
|
@ -208,6 +208,9 @@ public:
|
|||||||
assert(Bkn.size()==batchCount);
|
assert(Bkn.size()==batchCount);
|
||||||
assert(Cmn.size()==batchCount);
|
assert(Cmn.size()==batchCount);
|
||||||
|
|
||||||
|
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
|
||||||
|
assert(OpB!=GridBLAS_OP_T);
|
||||||
|
|
||||||
int lda = m; // m x k column major
|
int lda = m; // m x k column major
|
||||||
int ldb = k; // k x n column major
|
int ldb = k; // k x n column major
|
||||||
int ldc = m; // m x b column major
|
int ldc = m; // m x b column major
|
||||||
@ -267,7 +270,6 @@ public:
|
|||||||
assert(err==CUBLAS_STATUS_SUCCESS);
|
assert(err==CUBLAS_STATUS_SUCCESS);
|
||||||
#endif
|
#endif
|
||||||
#ifdef GRID_SYCL
|
#ifdef GRID_SYCL
|
||||||
std::cerr << " Calling SYCL batched ZGEMM "<<std::endl;
|
|
||||||
int64_t m64=m;
|
int64_t m64=m;
|
||||||
int64_t n64=n;
|
int64_t n64=n;
|
||||||
int64_t k64=k;
|
int64_t k64=k;
|
||||||
@ -275,10 +277,20 @@ public:
|
|||||||
int64_t ldb64=ldb;
|
int64_t ldb64=ldb;
|
||||||
int64_t ldc64=ldc;
|
int64_t ldc64=ldc;
|
||||||
int64_t batchCount64=batchCount;
|
int64_t batchCount64=batchCount;
|
||||||
oneapi::mkl::transpose notransp =oneapi::mkl::transpose::N;
|
|
||||||
|
oneapi::mkl::transpose iOpA;
|
||||||
|
oneapi::mkl::transpose iOpB;
|
||||||
|
|
||||||
|
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
|
||||||
|
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
|
||||||
|
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
|
||||||
|
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
|
||||||
|
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
|
||||||
|
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
|
||||||
|
|
||||||
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
|
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
|
||||||
¬ransp,
|
&iOpA,
|
||||||
¬ransp,
|
&iOpB,
|
||||||
&m64,&n64,&k64,
|
&m64,&n64,&k64,
|
||||||
(ComplexD *) &alpha_p[0],
|
(ComplexD *) &alpha_p[0],
|
||||||
(const ComplexD **)&Amk[0], (const int64_t *)&lda64,
|
(const ComplexD **)&Amk[0], (const int64_t *)&lda64,
|
||||||
@ -287,42 +299,100 @@ public:
|
|||||||
(ComplexD **)&Cmn[0], (const int64_t *)&ldc64,
|
(ComplexD **)&Cmn[0], (const int64_t *)&ldc64,
|
||||||
(int64_t)1,&batchCount64,std::vector<sycl::event>());
|
(int64_t)1,&batchCount64,std::vector<sycl::event>());
|
||||||
synchronise();
|
synchronise();
|
||||||
std::cerr << " Called SYCL batched ZGEMM "<<std::endl;
|
#if 0
|
||||||
|
// This code was used to check the mat mul on Sunspot/OneMKL
|
||||||
|
std::cerr << " Called SYCL batched ZGEMM OpA "<< OpA << " OpB "<<OpB <<std::endl;
|
||||||
std::vector<ComplexD> A(m*k); // pointer list to matrices
|
std::vector<ComplexD> A(m*k); // pointer list to matrices
|
||||||
std::vector<ComplexD> B(k*n);
|
std::vector<ComplexD> B(k*n);
|
||||||
std::vector<ComplexD> C(m*n);
|
std::vector<ComplexD> C(m*n);
|
||||||
int sda = lda*k;
|
// int sda = lda*k;
|
||||||
int sdb = ldb*k;
|
// int sdb = ldb*k;
|
||||||
int sdc = ldc*n;
|
// int sdc = ldc*n;
|
||||||
|
std::cerr << " Checking the GEMM results "<<std::endl;
|
||||||
for (int p = 0; p < 1; ++p) {
|
for (int p = 0; p < 1; ++p) {
|
||||||
acceleratorCopyFromDevice((void *)&Amk[p][0],(void *)&A[0],m*k*sizeof(ComplexD));
|
ComplexD * Amk_p; // pointer list to matrices
|
||||||
acceleratorCopyFromDevice((void *)&Bkn[p][0],(void *)&B[0],k*n*sizeof(ComplexD));
|
ComplexD * Bkn_p; // pointer list to matrices
|
||||||
acceleratorCopyFromDevice((void *)&Cmn[p][0],(void *)&C[0],m*n*sizeof(ComplexD));
|
ComplexD * Cmn_p; // pointer list to matrices
|
||||||
|
acceleratorCopyFromDevice((void *)&Amk[p],(void *)&Amk_p,sizeof(ComplexD*));
|
||||||
|
acceleratorCopyFromDevice((void *)&Bkn[p],(void *)&Bkn_p,sizeof(ComplexD*));
|
||||||
|
acceleratorCopyFromDevice((void *)&Cmn[p],(void *)&Cmn_p,sizeof(ComplexD*));
|
||||||
|
std::cerr << " p " << p << " copied pointers "<<std::endl;
|
||||||
|
acceleratorCopyFromDevice((void *)Amk_p,(void *)&A[0],m*k*sizeof(ComplexD));
|
||||||
|
acceleratorCopyFromDevice((void *)Bkn_p,(void *)&B[0],k*n*sizeof(ComplexD));
|
||||||
|
acceleratorCopyFromDevice((void *)Cmn_p,(void *)&C[0],m*n*sizeof(ComplexD));
|
||||||
|
std::cerr << " p " << p << " copied matrices "<<std::endl;
|
||||||
|
std::cerr << " C[0] "<<C[0]<<std::endl;
|
||||||
|
std::cerr << " A[0] "<<A[0]<<std::endl;
|
||||||
|
std::cerr << " B[0] "<<B[0]<<std::endl;
|
||||||
|
std::cerr << " m "<<m<<std::endl;
|
||||||
|
std::cerr << " n "<<n<<std::endl;
|
||||||
|
std::cerr << " k "<<k<<std::endl;
|
||||||
for (int mm = 0; mm < m; ++mm) {
|
for (int mm = 0; mm < m; ++mm) {
|
||||||
for (int nn = 0; nn < n; ++nn) {
|
for (int nn = 0; nn < n; ++nn) {
|
||||||
ComplexD c_mn(0.0);
|
ComplexD c_mn(0.0);
|
||||||
for (int kk = 0; kk < k; ++kk)
|
for (int kk = 0; kk < k; ++kk) {
|
||||||
c_mn += A[mm + kk*lda ] * B[kk + nn*ldb];
|
int idx_a, idx_b;
|
||||||
std::cout << " beta "<<beta<<" C_"<<mm<<","<<nn<<" "<<c_mn<<" "<<C[mm + nn*ldc]<<std::endl;
|
// int lda = m; // m x k column major
|
||||||
|
// int ldb = k; // k x n column major
|
||||||
|
// int ldc = m; // m x b column major
|
||||||
|
if(OpA!=GridBLAS_OP_N) {
|
||||||
|
idx_a =kk + mm*lda;
|
||||||
|
} else {
|
||||||
|
idx_a =mm + kk*lda;
|
||||||
|
}
|
||||||
|
if(OpB!=GridBLAS_OP_N) {
|
||||||
|
idx_b =nn + kk*ldb;
|
||||||
|
} else {
|
||||||
|
idx_b =kk + nn*ldb;
|
||||||
|
}
|
||||||
|
// std::cerr << " idx_a "<<idx_a<<" idx_b "<<idx_b<<std::endl;
|
||||||
|
|
||||||
|
ComplexD Ac = A[idx_a];
|
||||||
|
ComplexD Bc = B[idx_b];
|
||||||
|
if(OpA==GridBLAS_OP_C) Ac = conjugate(Ac);
|
||||||
|
if(OpB==GridBLAS_OP_C) Bc = conjugate(Bc);
|
||||||
|
|
||||||
|
c_mn += Ac*Bc;
|
||||||
|
}
|
||||||
|
std::cerr << " beta "<<beta<<" alpha "<<alpha<<" C_"<<mm<<","<<nn<<" "<<c_mn<<" "<<C[mm + nn*ldc]<<std::endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
||||||
// Need a default/reference implementation
|
// Need a default/reference implementation; use Eigen
|
||||||
int sda = lda*k;
|
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
|
||||||
int sdb = ldb*k;
|
thread_for (p, batchCount, {
|
||||||
int sdc = ldc*n;
|
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
|
||||||
for (int p = 0; p < batchCount; ++p) {
|
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
|
||||||
for (int mm = 0; mm < m; ++mm) {
|
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||||
for (int nn = 0; nn < n; ++nn) {
|
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||||
ComplexD c_mn(0.0);
|
});
|
||||||
for (int kk = 0; kk < k; ++kk)
|
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
|
||||||
c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
|
thread_for (p, batchCount, {
|
||||||
Cmn[p][mm + nn*ldc] = (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
|
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
|
||||||
}
|
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
|
||||||
|
});
|
||||||
|
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
|
||||||
|
thread_for (p, batchCount, {
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
|
||||||
|
});
|
||||||
|
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
|
||||||
|
thread_for (p, batchCount, {
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||||
|
} );
|
||||||
|
} else {
|
||||||
|
assert(0);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
RealD t1=usecond();
|
RealD t1=usecond();
|
||||||
RealD flops = 8.0*m*n*k*batchCount;
|
RealD flops = 8.0*m*n*k*batchCount;
|
||||||
@ -344,6 +414,9 @@ public:
|
|||||||
RealD t2=usecond();
|
RealD t2=usecond();
|
||||||
int32_t batchCount = Amk.size();
|
int32_t batchCount = Amk.size();
|
||||||
|
|
||||||
|
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
|
||||||
|
assert(OpB!=GridBLAS_OP_T);
|
||||||
|
|
||||||
int lda = m; // m x k column major
|
int lda = m; // m x k column major
|
||||||
int ldb = k; // k x n column major
|
int ldb = k; // k x n column major
|
||||||
int ldc = m; // m x b column major
|
int ldc = m; // m x b column major
|
||||||
@ -411,10 +484,20 @@ public:
|
|||||||
int64_t ldb64=ldb;
|
int64_t ldb64=ldb;
|
||||||
int64_t ldc64=ldc;
|
int64_t ldc64=ldc;
|
||||||
int64_t batchCount64=batchCount;
|
int64_t batchCount64=batchCount;
|
||||||
oneapi::mkl::transpose notransp =oneapi::mkl::transpose::N;
|
|
||||||
|
oneapi::mkl::transpose iOpA;
|
||||||
|
oneapi::mkl::transpose iOpB;
|
||||||
|
|
||||||
|
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
|
||||||
|
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
|
||||||
|
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
|
||||||
|
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
|
||||||
|
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
|
||||||
|
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
|
||||||
|
|
||||||
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
|
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
|
||||||
¬ransp,
|
&iOpA,
|
||||||
¬ransp,
|
&iOpB,
|
||||||
&m64,&n64,&k64,
|
&m64,&n64,&k64,
|
||||||
(ComplexF *) &alpha_p[0],
|
(ComplexF *) &alpha_p[0],
|
||||||
(const ComplexF **)&Amk[0], (const int64_t *)&lda64,
|
(const ComplexF **)&Amk[0], (const int64_t *)&lda64,
|
||||||
@ -425,22 +508,38 @@ public:
|
|||||||
synchronise();
|
synchronise();
|
||||||
#endif
|
#endif
|
||||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
||||||
int sda = lda*k;
|
// Need a default/reference implementation; use Eigen
|
||||||
int sdb = ldb*k;
|
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
|
||||||
int sdc = ldc*n;
|
thread_for (p, batchCount, {
|
||||||
ComplexF alphaf(real(alpha),imag(alpha));
|
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
|
||||||
ComplexF betaf(real(beta),imag(beta));
|
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
|
||||||
// Need a default/reference implementation
|
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||||
for (int p = 0; p < batchCount; ++p) {
|
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||||
for (int mm = 0; mm < m; ++mm) {
|
});
|
||||||
for (int nn = 0; nn < n; ++nn) {
|
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
|
||||||
ComplexF c_mn(0.0);
|
thread_for (p, batchCount, {
|
||||||
for (int kk = 0; kk < k; ++kk)
|
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
|
||||||
c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
|
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
|
||||||
Cmn[p][mm + nn*ldc] = (alphaf)*c_mn + (betaf)*Cmn[p][mm + nn*ldc ];
|
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||||
}
|
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
|
||||||
|
});
|
||||||
|
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
|
||||||
|
thread_for (p, batchCount, {
|
||||||
|
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
|
||||||
|
});
|
||||||
|
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
|
||||||
|
thread_for (p, batchCount, {
|
||||||
|
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
|
||||||
|
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||||
|
} );
|
||||||
|
} else {
|
||||||
|
assert(0);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
RealD t1=usecond();
|
RealD t1=usecond();
|
||||||
RealD flops = 8.0*m*n*k*batchCount;
|
RealD flops = 8.0*m*n*k*batchCount;
|
||||||
@ -463,6 +562,9 @@ public:
|
|||||||
RealD t2=usecond();
|
RealD t2=usecond();
|
||||||
int32_t batchCount = Amk.size();
|
int32_t batchCount = Amk.size();
|
||||||
|
|
||||||
|
assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
|
||||||
|
assert(OpB!=GridBLAS_OP_C);
|
||||||
|
|
||||||
int lda = m; // m x k column major
|
int lda = m; // m x k column major
|
||||||
int ldb = k; // k x n column major
|
int ldb = k; // k x n column major
|
||||||
int ldc = m; // m x b column major
|
int ldc = m; // m x b column major
|
||||||
@ -529,10 +631,20 @@ public:
|
|||||||
int64_t ldb64=ldb;
|
int64_t ldb64=ldb;
|
||||||
int64_t ldc64=ldc;
|
int64_t ldc64=ldc;
|
||||||
int64_t batchCount64=batchCount;
|
int64_t batchCount64=batchCount;
|
||||||
oneapi::mkl::transpose notransp =oneapi::mkl::transpose::N;
|
|
||||||
|
oneapi::mkl::transpose iOpA;
|
||||||
|
oneapi::mkl::transpose iOpB;
|
||||||
|
|
||||||
|
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
|
||||||
|
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
|
||||||
|
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
|
||||||
|
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
|
||||||
|
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
|
||||||
|
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
|
||||||
|
|
||||||
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
|
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
|
||||||
¬ransp,
|
&iOpA,
|
||||||
¬ransp,
|
&iOpB,
|
||||||
&m64,&n64,&k64,
|
&m64,&n64,&k64,
|
||||||
(float *) &alpha_p[0],
|
(float *) &alpha_p[0],
|
||||||
(const float **)&Amk[0], (const int64_t *)&lda64,
|
(const float **)&Amk[0], (const int64_t *)&lda64,
|
||||||
@ -540,23 +652,41 @@ public:
|
|||||||
(float *) &beta_p[0],
|
(float *) &beta_p[0],
|
||||||
(float **)&Cmn[0], (const int64_t *)&ldc64,
|
(float **)&Cmn[0], (const int64_t *)&ldc64,
|
||||||
(int64_t)1,&batchCount64,std::vector<sycl::event>());
|
(int64_t)1,&batchCount64,std::vector<sycl::event>());
|
||||||
synchronise();
|
synchronise();
|
||||||
#endif
|
#endif
|
||||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
||||||
int sda = lda*k;
|
// Need a default/reference implementation; use Eigen
|
||||||
int sdb = ldb*k;
|
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
|
||||||
int sdc = ldc*n;
|
thread_for (p, batchCount, {
|
||||||
// Need a default/reference implementation
|
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
|
||||||
for (int p = 0; p < batchCount; ++p) {
|
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
|
||||||
for (int mm = 0; mm < m; ++mm) {
|
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||||
for (int nn = 0; nn < n; ++nn) {
|
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||||
RealD c_mn(0.0);
|
});
|
||||||
for (int kk = 0; kk < k; ++kk)
|
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
||||||
c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
|
thread_for (p, batchCount, {
|
||||||
Cmn[p][mm + nn*ldc] = (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
|
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
|
||||||
}
|
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
|
||||||
|
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||||
|
});
|
||||||
|
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
||||||
|
thread_for (p, batchCount, {
|
||||||
|
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||||
|
});
|
||||||
|
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
||||||
|
thread_for (p, batchCount, {
|
||||||
|
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
|
||||||
|
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||||
|
} );
|
||||||
|
} else {
|
||||||
|
assert(0);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
RealD t1=usecond();
|
RealD t1=usecond();
|
||||||
RealD flops = 2.0*m*n*k*batchCount;
|
RealD flops = 2.0*m*n*k*batchCount;
|
||||||
@ -567,7 +697,6 @@ public:
|
|||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
// Double precision real GEMM
|
// Double precision real GEMM
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
void gemmBatched(GridBLASOperation_t OpA,
|
void gemmBatched(GridBLASOperation_t OpA,
|
||||||
GridBLASOperation_t OpB,
|
GridBLASOperation_t OpB,
|
||||||
int m,int n, int k,
|
int m,int n, int k,
|
||||||
@ -580,6 +709,9 @@ public:
|
|||||||
RealD t2=usecond();
|
RealD t2=usecond();
|
||||||
int32_t batchCount = Amk.size();
|
int32_t batchCount = Amk.size();
|
||||||
|
|
||||||
|
assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
|
||||||
|
assert(OpB!=GridBLAS_OP_C);
|
||||||
|
|
||||||
int lda = m; // m x k column major
|
int lda = m; // m x k column major
|
||||||
int ldb = k; // k x n column major
|
int ldb = k; // k x n column major
|
||||||
int ldc = m; // m x b column major
|
int ldc = m; // m x b column major
|
||||||
@ -647,10 +779,20 @@ public:
|
|||||||
int64_t ldb64=ldb;
|
int64_t ldb64=ldb;
|
||||||
int64_t ldc64=ldc;
|
int64_t ldc64=ldc;
|
||||||
int64_t batchCount64=batchCount;
|
int64_t batchCount64=batchCount;
|
||||||
oneapi::mkl::transpose notransp =oneapi::mkl::transpose::N;
|
|
||||||
|
oneapi::mkl::transpose iOpA;
|
||||||
|
oneapi::mkl::transpose iOpB;
|
||||||
|
|
||||||
|
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
|
||||||
|
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
|
||||||
|
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
|
||||||
|
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
|
||||||
|
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
|
||||||
|
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
|
||||||
|
|
||||||
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
|
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
|
||||||
¬ransp,
|
&iOpA,
|
||||||
¬ransp,
|
&iOpB,
|
||||||
&m64,&n64,&k64,
|
&m64,&n64,&k64,
|
||||||
(double *) &alpha_p[0],
|
(double *) &alpha_p[0],
|
||||||
(const double **)&Amk[0], (const int64_t *)&lda64,
|
(const double **)&Amk[0], (const int64_t *)&lda64,
|
||||||
@ -658,144 +800,96 @@ public:
|
|||||||
(double *) &beta_p[0],
|
(double *) &beta_p[0],
|
||||||
(double **)&Cmn[0], (const int64_t *)&ldc64,
|
(double **)&Cmn[0], (const int64_t *)&ldc64,
|
||||||
(int64_t)1,&batchCount64,std::vector<sycl::event>());
|
(int64_t)1,&batchCount64,std::vector<sycl::event>());
|
||||||
synchronise();
|
synchronise();
|
||||||
#endif
|
#endif
|
||||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
||||||
int sda = lda*k;
|
// Need a default/reference implementation; use Eigen
|
||||||
int sdb = ldb*k;
|
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
|
||||||
int sdc = ldc*n;
|
thread_for (p, batchCount, {
|
||||||
// Need a default/reference implementation
|
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
|
||||||
for (int p = 0; p < batchCount; ++p) {
|
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
|
||||||
for (int mm = 0; mm < m; ++mm) {
|
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||||
for (int nn = 0; nn < n; ++nn) {
|
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||||
RealD c_mn(0.0);
|
});
|
||||||
for (int kk = 0; kk < k; ++kk)
|
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
||||||
c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
|
thread_for (p, batchCount, {
|
||||||
Cmn[p][mm + nn*ldc] = (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
|
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
|
||||||
}
|
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
|
||||||
|
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||||
|
});
|
||||||
|
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
||||||
|
thread_for (p, batchCount, {
|
||||||
|
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||||
|
});
|
||||||
|
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
||||||
|
thread_for (p, batchCount, {
|
||||||
|
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
|
||||||
|
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
assert(0);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
RealD t1=usecond();
|
RealD t1=usecond();
|
||||||
RealD flops = 2.0*m*n*k*batchCount;
|
RealD flops = 2.0*m*n*k*batchCount;
|
||||||
RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
|
RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<class CComplex>
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Strided case used by benchmark, but generally unused in Grid
|
|
||||||
// Keep a code example in double complex, but don't generate the single and real variants for now
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
void gemmStridedBatched(int m,int n, int k,
|
|
||||||
ComplexD alpha,
|
|
||||||
ComplexD* Amk, // pointer list to matrices
|
|
||||||
ComplexD* Bkn,
|
|
||||||
ComplexD beta,
|
|
||||||
ComplexD* Cmn,
|
|
||||||
int batchCount)
|
|
||||||
{
|
|
||||||
// Use C-row major storage, so transpose calls
|
|
||||||
int lda = m; // m x k column major
|
|
||||||
int ldb = k; // k x n column major
|
|
||||||
int ldc = m; // m x b column major
|
|
||||||
int sda = m*k;
|
|
||||||
int sdb = k*n;
|
|
||||||
int sdc = m*n;
|
|
||||||
deviceVector<ComplexD> alpha_p(1);
|
|
||||||
deviceVector<ComplexD> beta_p(1);
|
|
||||||
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
|
|
||||||
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
|
|
||||||
// std::cout << "blasZgemmStridedBatched mnk "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
|
|
||||||
// std::cout << "blasZgemmStridedBatched ld "<<lda<<","<<ldb<<","<<ldc<<std::endl;
|
|
||||||
// std::cout << "blasZgemmStridedBatched sd "<<sda<<","<<sdb<<","<<sdc<<std::endl;
|
|
||||||
#ifdef GRID_HIP
|
|
||||||
auto err = hipblasZgemmStridedBatched(gridblasHandle,
|
|
||||||
HIPBLAS_OP_N,
|
|
||||||
HIPBLAS_OP_N,
|
|
||||||
m,n,k,
|
|
||||||
(hipblasDoubleComplex *) &alpha_p[0],
|
|
||||||
(hipblasDoubleComplex *) Amk, lda, sda,
|
|
||||||
(hipblasDoubleComplex *) Bkn, ldb, sdb,
|
|
||||||
(hipblasDoubleComplex *) &beta_p[0],
|
|
||||||
(hipblasDoubleComplex *) Cmn, ldc, sdc,
|
|
||||||
batchCount);
|
|
||||||
assert(err==HIPBLAS_STATUS_SUCCESS);
|
|
||||||
#endif
|
|
||||||
#ifdef GRID_CUDA
|
|
||||||
cublasZgemmStridedBatched(gridblasHandle,
|
|
||||||
CUBLAS_OP_N,
|
|
||||||
CUBLAS_OP_N,
|
|
||||||
m,n,k,
|
|
||||||
(cuDoubleComplex *) &alpha_p[0],
|
|
||||||
(cuDoubleComplex *) Amk, lda, sda,
|
|
||||||
(cuDoubleComplex *) Bkn, ldb, sdb,
|
|
||||||
(cuDoubleComplex *) &beta_p[0],
|
|
||||||
(cuDoubleComplex *) Cmn, ldc, sdc,
|
|
||||||
batchCount);
|
|
||||||
#endif
|
|
||||||
#if defined(GRID_SYCL) || defined(GRID_ONE_MKL)
|
|
||||||
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
|
|
||||||
oneapi::mkl::transpose::N,
|
|
||||||
oneapi::mkl::transpose::N,
|
|
||||||
m,n,k,
|
|
||||||
alpha,
|
|
||||||
(const ComplexD *)Amk,lda,sda,
|
|
||||||
(const ComplexD *)Bkn,ldb,sdb,
|
|
||||||
beta,
|
|
||||||
(ComplexD *)Cmn,ldc,sdc,
|
|
||||||
batchCount);
|
|
||||||
synchronise();
|
|
||||||
#endif
|
|
||||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
|
|
||||||
// Need a default/reference implementation
|
|
||||||
for (int p = 0; p < batchCount; ++p) {
|
|
||||||
for (int mm = 0; mm < m; ++mm) {
|
|
||||||
for (int nn = 0; nn < n; ++nn) {
|
|
||||||
ComplexD c_mn(0.0);
|
|
||||||
for (int kk = 0; kk < k; ++kk)
|
|
||||||
c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
|
|
||||||
Cmn[mm + nn*ldc + p*sdc] = (alpha)*c_mn + (beta)*Cmn[mm + nn*ldc + p*sdc];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
double benchmark(int M, int N, int K, int BATCH)
|
double benchmark(int M, int N, int K, int BATCH)
|
||||||
{
|
{
|
||||||
int32_t N_A = M*K*BATCH;
|
int32_t N_A = M*K*BATCH;
|
||||||
int32_t N_B = K*N*BATCH;
|
int32_t N_B = K*N*BATCH;
|
||||||
int32_t N_C = M*N*BATCH;
|
int32_t N_C = M*N*BATCH;
|
||||||
deviceVector<ComplexD> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD));
|
deviceVector<CComplex> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(CComplex));
|
||||||
deviceVector<ComplexD> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD));
|
deviceVector<CComplex> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(CComplex));
|
||||||
deviceVector<ComplexD> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD));
|
deviceVector<CComplex> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(CComplex));
|
||||||
ComplexD alpha(1.0);
|
CComplex alpha(1.0);
|
||||||
ComplexD beta (1.0);
|
CComplex beta (1.0);
|
||||||
RealD flops = 8.0*M*N*K*BATCH;
|
RealD flops = 8.0*M*N*K*BATCH;
|
||||||
int ncall=10;
|
int ncall=1000;
|
||||||
|
deviceVector<CComplex *> As(BATCH);
|
||||||
|
deviceVector<CComplex *> Bs(BATCH);
|
||||||
|
deviceVector<CComplex *> Cs(BATCH);
|
||||||
|
for(int b = 0 ; b < BATCH;b++) {
|
||||||
|
CComplex *ptr;
|
||||||
|
ptr = &A[b*M*K]; acceleratorPut(As[b],ptr);
|
||||||
|
ptr = &B[b*K*N]; acceleratorPut(Bs[b],ptr);
|
||||||
|
ptr = &C[b*M*N]; acceleratorPut(Cs[b],ptr);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Warm up call
|
||||||
|
gemmBatched(M,N,K,
|
||||||
|
alpha,
|
||||||
|
As, // m x k
|
||||||
|
Bs, // k x n
|
||||||
|
beta,
|
||||||
|
Cs);
|
||||||
|
synchronise();
|
||||||
|
|
||||||
RealD t0 = usecond();
|
RealD t0 = usecond();
|
||||||
for(int i=0;i<ncall;i++){
|
for(int i=0;i<ncall;i++){
|
||||||
gemmStridedBatched(M,N,K,
|
gemmBatched(M,N,K,
|
||||||
alpha,
|
alpha,
|
||||||
&A[0], // m x k
|
As, // m x k
|
||||||
&B[0], // k x n
|
Bs, // k x n
|
||||||
beta,
|
beta,
|
||||||
&C[0], // m x n
|
Cs);
|
||||||
BATCH);
|
synchronise();
|
||||||
}
|
}
|
||||||
synchronise();
|
|
||||||
RealD t1 = usecond();
|
RealD t1 = usecond();
|
||||||
RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K)*BATCH;
|
RealD bytes = 1.0*sizeof(CComplex)*(M*N*2+N*K+M*K)*BATCH;
|
||||||
flops = 8.0*M*N*K*BATCH*ncall;
|
flops = 8.0*M*N*K*BATCH*ncall;
|
||||||
flops = flops/(t1-t0)/1.e3;
|
flops = flops/(t1-t0)/1.e3;
|
||||||
return flops; // Returns gigaflops
|
return flops; // Returns gigaflops
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
@ -35,7 +35,7 @@ uint64_t total_host;;
|
|||||||
void MemoryManager::DisplayMallinfo(void)
|
void MemoryManager::DisplayMallinfo(void)
|
||||||
{
|
{
|
||||||
#ifdef __linux__
|
#ifdef __linux__
|
||||||
struct mallinfo mi;
|
struct mallinfo mi; // really want mallinfo2, but glibc version isn't uniform
|
||||||
|
|
||||||
mi = mallinfo();
|
mi = mallinfo();
|
||||||
|
|
||||||
|
@ -264,24 +264,8 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
|
|||||||
const uint64_t sites = grid->oSites();
|
const uint64_t sites = grid->oSites();
|
||||||
|
|
||||||
// Might make all code paths go this way.
|
// Might make all code paths go this way.
|
||||||
#if 0
|
|
||||||
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
|
|
||||||
Vector<inner_t> inner_tmp(sites);
|
|
||||||
auto inner_tmp_v = &inner_tmp[0];
|
|
||||||
{
|
|
||||||
autoView( left_v , left, AcceleratorRead);
|
|
||||||
autoView( right_v,right, AcceleratorRead);
|
|
||||||
// This code could read coalesce
|
|
||||||
// GPU - SIMT lane compliance...
|
|
||||||
accelerator_for( ss, sites, nsimd,{
|
|
||||||
auto x_l = left_v(ss);
|
|
||||||
auto y_l = right_v(ss);
|
|
||||||
coalescedWrite(inner_tmp_v[ss],innerProductD(x_l,y_l));
|
|
||||||
});
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
typedef decltype(innerProduct(vobj(),vobj())) inner_t;
|
typedef decltype(innerProduct(vobj(),vobj())) inner_t;
|
||||||
Vector<inner_t> inner_tmp(sites);
|
deviceVector<inner_t> inner_tmp(sites);
|
||||||
auto inner_tmp_v = &inner_tmp[0];
|
auto inner_tmp_v = &inner_tmp[0];
|
||||||
|
|
||||||
{
|
{
|
||||||
@ -295,7 +279,6 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
|
|||||||
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
|
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
// This is in single precision and fails some tests
|
// This is in single precision and fails some tests
|
||||||
auto anrm = sumD(inner_tmp_v,sites);
|
auto anrm = sumD(inner_tmp_v,sites);
|
||||||
nrm = anrm;
|
nrm = anrm;
|
||||||
|
@ -460,3 +460,9 @@ void vprefetch(const iMatrix<v, N> &vv) {
|
|||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef GRID_SYCL
|
||||||
|
template<class vec> struct sycl::is_device_copyable<Grid::iScalar<vec> > : public std::true_type {};
|
||||||
|
template<class vec,int N> struct sycl::is_device_copyable<Grid::iVector<vec,N> > : public std::true_type {};
|
||||||
|
template<class vec,int N> struct sycl::is_device_copyable<Grid::iMatrix<vec,N> > : public std::true_type {};
|
||||||
|
#endif
|
||||||
|
@ -261,23 +261,25 @@ public:
|
|||||||
fprintf(FP,"\n\n");
|
fprintf(FP,"\n\n");
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template<class CComplex>
|
||||||
static void BLAS(void)
|
static void BLAS(void)
|
||||||
{
|
{
|
||||||
//int nbasis, int nrhs, int coarseVol
|
//int nbasis, int nrhs, int coarseVol
|
||||||
int basis[] = { 16,32,64 };
|
int basis[] = { 16,32,64 };
|
||||||
int rhs[] = { 8,16,32 };
|
int rhs[] = { 8,12,16 };
|
||||||
int vol = 4*4*4*4;
|
int vol = 8*8*8*8;
|
||||||
|
int blk = 4*4*4*4;
|
||||||
|
|
||||||
GridBLAS blas;
|
GridBLAS blas;
|
||||||
|
|
||||||
|
int fpbits = sizeof(CComplex)*4;
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << "= batched GEMM (double precision) "<<std::endl;
|
std::cout<<GridLogMessage << "= batched GEMM fp"<<fpbits<<std::endl;
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << " M "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (coarse mrhs)"<<std::endl;
|
std::cout<<GridLogMessage << " M "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (coarse mrhs)"<<std::endl;
|
||||||
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
|
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
|
||||||
|
|
||||||
fprintf(FP,"GEMM\n\n M, N, K, BATCH, GF/s per rank\n");
|
fprintf(FP,"GEMM\n\n M, N, K, BATCH, GF/s per rank fp%d\n",fpbits);
|
||||||
|
|
||||||
for(int b=0;b<3;b++){
|
for(int b=0;b<3;b++){
|
||||||
for(int r=0;r<3;r++){
|
for(int r=0;r<3;r++){
|
||||||
@ -285,7 +287,7 @@ public:
|
|||||||
int N=rhs[r];
|
int N=rhs[r];
|
||||||
int K=basis[b];
|
int K=basis[b];
|
||||||
int BATCH=vol;
|
int BATCH=vol;
|
||||||
double p=blas.benchmark(M,N,K,BATCH);
|
double p=blas.benchmark<CComplex>(M,N,K,BATCH);
|
||||||
|
|
||||||
fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
|
fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
|
||||||
|
|
||||||
@ -299,9 +301,9 @@ public:
|
|||||||
for(int r=0;r<3;r++){
|
for(int r=0;r<3;r++){
|
||||||
int M=basis[b];
|
int M=basis[b];
|
||||||
int N=rhs[r];
|
int N=rhs[r];
|
||||||
int K=vol;
|
int K=blk;
|
||||||
int BATCH=vol;
|
int BATCH=vol;
|
||||||
double p=blas.benchmark(M,N,K,BATCH);
|
double p=blas.benchmark<CComplex>(M,N,K,BATCH);
|
||||||
|
|
||||||
fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
|
fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
|
||||||
std::cout<<GridLogMessage<<std::setprecision(3)
|
std::cout<<GridLogMessage<<std::setprecision(3)
|
||||||
@ -313,10 +315,10 @@ public:
|
|||||||
for(int b=0;b<3;b++){
|
for(int b=0;b<3;b++){
|
||||||
for(int r=0;r<3;r++){
|
for(int r=0;r<3;r++){
|
||||||
int M=rhs[r];
|
int M=rhs[r];
|
||||||
int N=vol;
|
int N=blk;
|
||||||
int K=basis[b];
|
int K=basis[b];
|
||||||
int BATCH=vol;
|
int BATCH=vol;
|
||||||
double p=blas.benchmark(M,N,K,BATCH);
|
double p=blas.benchmark<CComplex>(M,N,K,BATCH);
|
||||||
|
|
||||||
fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
|
fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
|
||||||
std::cout<<GridLogMessage<<std::setprecision(3)
|
std::cout<<GridLogMessage<<std::setprecision(3)
|
||||||
@ -867,6 +869,7 @@ int main (int argc, char ** argv)
|
|||||||
int do_memory=1;
|
int do_memory=1;
|
||||||
int do_comms =1;
|
int do_comms =1;
|
||||||
int do_blas =1;
|
int do_blas =1;
|
||||||
|
int do_dslash=1;
|
||||||
|
|
||||||
int sel=4;
|
int sel=4;
|
||||||
std::vector<int> L_list({8,12,16,24,32});
|
std::vector<int> L_list({8,12,16,24,32});
|
||||||
@ -877,6 +880,7 @@ int main (int argc, char ** argv)
|
|||||||
std::vector<double> staggered;
|
std::vector<double> staggered;
|
||||||
|
|
||||||
int Ls=1;
|
int Ls=1;
|
||||||
|
if (do_dslash){
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << " Clover dslash 4D vectorised (temporarily Wilson)" <<std::endl;
|
std::cout<<GridLogMessage << " Clover dslash 4D vectorised (temporarily Wilson)" <<std::endl;
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
@ -901,6 +905,7 @@ int main (int argc, char ** argv)
|
|||||||
staggered.push_back(result);
|
staggered.push_back(result);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
|
std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
@ -909,8 +914,33 @@ int main (int argc, char ** argv)
|
|||||||
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<std::endl;
|
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<std::endl;
|
||||||
}
|
}
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
int NN=NN_global;
|
int NN=NN_global;
|
||||||
|
if(do_dslash){
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered (GF/s per node)" <<std::endl;
|
||||||
|
fprintf(FP,"Per node summary table\n");
|
||||||
|
fprintf(FP,"\n");
|
||||||
|
fprintf(FP,"L , Wilson, DWF4, Staggered, GF/s per node\n");
|
||||||
|
fprintf(FP,"\n");
|
||||||
|
for(int l=0;l<L_list.size();l++){
|
||||||
|
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<std::endl;
|
||||||
|
fprintf(FP,"%d , %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.);
|
||||||
|
}
|
||||||
|
fprintf(FP,"\n");
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << " Comparison point result: " << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl;
|
||||||
|
std::cout<<std::setprecision(3);
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
if ( do_memory ) {
|
if ( do_memory ) {
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << " Memory benchmark " <<std::endl;
|
std::cout<<GridLogMessage << " Memory benchmark " <<std::endl;
|
||||||
@ -918,15 +948,6 @@ int main (int argc, char ** argv)
|
|||||||
Benchmark::Memory();
|
Benchmark::Memory();
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( do_blas ) {
|
|
||||||
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
|
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << " Batched BLAS benchmark " <<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
|
||||||
Benchmark::BLAS();
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( do_su4 ) {
|
if ( do_su4 ) {
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << " SU(4) benchmark " <<std::endl;
|
std::cout<<GridLogMessage << " SU(4) benchmark " <<std::endl;
|
||||||
@ -941,27 +962,13 @@ int main (int argc, char ** argv)
|
|||||||
Benchmark::Comms();
|
Benchmark::Comms();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if ( do_blas ) {
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
|
std::cout<<GridLogMessage << " Batched BLAS benchmark " <<std::endl;
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered (GF/s per node)" <<std::endl;
|
|
||||||
fprintf(FP,"Per node summary table\n");
|
|
||||||
fprintf(FP,"\n");
|
|
||||||
fprintf(FP,"L , Wilson, DWF4, Staggered, GF/s per node\n");
|
|
||||||
fprintf(FP,"\n");
|
|
||||||
for(int l=0;l<L_list.size();l++){
|
|
||||||
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<std::endl;
|
|
||||||
fprintf(FP,"%d , %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.);
|
|
||||||
}
|
|
||||||
fprintf(FP,"\n");
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << " Comparison point result: " << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl;
|
|
||||||
std::cout<<std::setprecision(3);
|
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
Benchmark::BLAS<ComplexD>();
|
||||||
|
Benchmark::BLAS<ComplexF>();
|
||||||
|
}
|
||||||
|
|
||||||
Grid_finalize();
|
Grid_finalize();
|
||||||
fclose(FP);
|
fclose(FP);
|
||||||
|
23
systems/Aurora/config-command-leak
Normal file
23
systems/Aurora/config-command-leak
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
source ~/spack/share/spack/setup-env.sh
|
||||||
|
spack load c-lime
|
||||||
|
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
|
||||||
|
export TCMALLOC=`spack find --paths gperftools | grep ^gperftools | awk '{print $2}' `
|
||||||
|
export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH
|
||||||
|
|
||||||
|
../../configure \
|
||||||
|
--enable-debug \
|
||||||
|
--enable-simd=GPU \
|
||||||
|
--enable-gen-simd-width=64 \
|
||||||
|
--enable-comms=mpi-auto \
|
||||||
|
--disable-gparity \
|
||||||
|
--disable-fermion-reps \
|
||||||
|
--with-lime=$CLIME \
|
||||||
|
--enable-shm=nvlink \
|
||||||
|
--enable-accelerator=sycl \
|
||||||
|
--enable-accelerator-aware-mpi=yes\
|
||||||
|
--enable-unified=no \
|
||||||
|
MPICXX=mpicxx \
|
||||||
|
CXX=icpx \
|
||||||
|
LDFLAGS="-fiopenmp -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -Xarch_host -fsanitize=leak -fsycl-device-code-split=per_kernel" \
|
||||||
|
CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel -Xarch_host -fsycl -fsanitize=leak "
|
||||||
|
|
@ -1,13 +1,25 @@
|
|||||||
source ~/spack/share/spack/setup-env.sh
|
source ~/spack/share/spack/setup-env.sh
|
||||||
spack load c-lime
|
spack load c-lime
|
||||||
|
|
||||||
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
|
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
|
||||||
#export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH
|
#spack load libefence
|
||||||
|
#export EFENCE=`spack find --paths libefence | grep ^libefence | awk '{print $2}' `
|
||||||
|
#export LD_LIBRARY_PATH=${EFENCE}/lib:$LD_LIBRARY_PATH
|
||||||
|
#spack load gperftools
|
||||||
|
export TCMALLOC=/home/paboyle/gperftools/install
|
||||||
|
export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH
|
||||||
export INTELGT_AUTO_ATTACH_DISABLE=1
|
export INTELGT_AUTO_ATTACH_DISABLE=1
|
||||||
|
|
||||||
#export ONEAPI_DEVICE_SELECTOR=level_zero:0.0
|
#export ONEAPI_DEVICE_SELECTOR=level_zero:0.0
|
||||||
|
#module load oneapi/release/2023.12.15.001
|
||||||
|
#module use /soft/modulefiles
|
||||||
|
#module load intel_compute_runtime/release/agama-devel-682.22
|
||||||
|
|
||||||
|
#export FI_CXI_DEFAULT_CQ_SIZE=131072
|
||||||
|
#export FI_CXI_CQ_FILL_PERCENT=20
|
||||||
|
#export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
|
||||||
|
#export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode"
|
||||||
|
|
||||||
|
#
|
||||||
# -ftarget-register-alloc-mode=pvc:default
|
# -ftarget-register-alloc-mode=pvc:default
|
||||||
# -ftarget-register-alloc-mode=pvc:small
|
# -ftarget-register-alloc-mode=pvc:small
|
||||||
# -ftarget-register-alloc-mode=pvc:large
|
# -ftarget-register-alloc-mode=pvc:large
|
||||||
@ -20,4 +32,9 @@ export http_proxy=http://proxy.alcf.anl.gov:3128
|
|||||||
export https_proxy=http://proxy.alcf.anl.gov:3128
|
export https_proxy=http://proxy.alcf.anl.gov:3128
|
||||||
git config --global http.proxy http://proxy.alcf.anl.gov:3128
|
git config --global http.proxy http://proxy.alcf.anl.gov:3128
|
||||||
|
|
||||||
|
#source ~/spack/share/spack/setup-env.sh
|
||||||
|
#spack load gperftools
|
||||||
|
#export TCMALLOC=`spack find --paths gperftools | grep ^gperftools | awk '{print $2}' `
|
||||||
|
#export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH
|
||||||
|
|
||||||
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
|
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
|
||||||
|
Loading…
Reference in New Issue
Block a user