Mirror of https://github.com/paboyle/Grid.git (synced 2025-06-13 04:37:05 +01:00)

Compare commits: 60 commits, 832fc08809 ... feature/ft
SHA1
bffd30abec
da919949f9
b12b4fdaff
557fa483ff
fc15d55df6
53573d7d94
bb3c177000
a3322b470f
f8f408e7a9
baac1127d0
6f1328160c
04cf902791
7a5b1c1a19
18d2d7da4a
b461184797
4563b39305
c9d5674d5b
486412635a
8b23a1546a
a901e4e369
804d9367d4
41d8adca95
059e8e5bb0
b3ee8ded96
cf3584ad15
a66973163f
4502a8c8a1
9c902e4c2d
f3eb36adcf
7c246606c1
172c75029e
6ae52da571
4ee9c68053
a15b4378a3
89fdd7f8dd
c328be24b7
a73dc6dbf4
eee2a2657f
12b8be7cb9
63c223ea5d
2877fb4a2c
d299c86633
6ce52092e8
b5926c1d21
9563238e9b
fb9b1d76ca
1739146599
ed20b39ab3
284fc05f15
07a07b6fa3
dc80b08969
a49a161f8d
a6479ca50f
0e607a55e7
c4b9f71357
394e506aea
e19b26341b
cfe1b13225
890c5ea1cd
a87378d3b6
BLAS_benchmark/BatchBlasBench.cc (new file, 1052 lines)
File diff suppressed because it is too large
BLAS_benchmark/compile-command (new file, 2 lines)
@@ -0,0 +1,2 @@
mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench
@@ -30,9 +30,14 @@ directory
#include <type_traits>
#include <cassert>
#include <exception>

#define NAMESPACE_BEGIN(A) namespace A {
#define NAMESPACE_END(A) }
#define GRID_NAMESPACE_BEGIN NAMESPACE_BEGIN(Grid)
#define GRID_NAMESPACE_END NAMESPACE_END(Grid)
#define NAMESPACE_CHECK(x) struct namespaceTEST##x {}; static_assert(std::is_same<namespaceTEST##x, ::namespaceTEST##x>::value,"Not in :: at" );

#define EXCEPTION_CHECK_BEGIN(A) try {
#define EXCEPTION_CHECK_END(A) } catch ( std::exception e ) { BACKTRACEFP(stderr); std::cerr << __PRETTY_FUNCTION__ << " : " <<__LINE__<< " Caught exception "<<e.what()<<std::endl; throw; }
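The hunk above only introduces the macro definitions. As an illustrative sketch (not part of the patch), they would typically combine as follows; demoBlasCall is a hypothetical name, and BACKTRACEFP is assumed to be provided elsewhere in Grid.

GRID_NAMESPACE_BEGIN                 // expands to: namespace Grid {
inline void demoBlasCall(void)       // hypothetical helper, for illustration only
{
  EXCEPTION_CHECK_BEGIN(demo)
  // ... body that may throw std::exception ...
  EXCEPTION_CHECK_END(demo)          // catch, print a backtrace via BACKTRACEFP, rethrow
}
GRID_NAMESPACE_END                   // expands to: }
NAMESPACE_CHECK(Demo);               // compiles only if we really are back at global scope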
@@ -89,9 +89,10 @@ public:
gridblasHandle = theGridAccelerator;
#endif
#ifdef GRID_ONE_MKL
cl::sycl::cpu_selector selector;
cl::sycl::gpu_selector selector;
cl::sycl::device selectedDevice { selector };
gridblasHandle =new sycl::queue (selectedDevice);
cl::sycl::property_list q_prop{cl::sycl::property::queue::in_order()};
gridblasHandle =new sycl::queue (selectedDevice,q_prop);
#endif
gridblasInit=1;
}
@@ -207,6 +208,9 @@ public:
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);

assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
assert(OpB!=GridBLAS_OP_T);

int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
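As a small illustration (not in the patch): with lda = m, ldb = k and ldc = m as above, the column-major addressing matches the reference GEMM loop that appears later in this change set, i.e. element A(mm,kk) of batch p is Amk[p][mm + kk*lda], B(kk,nn) is Bkn[p][kk + nn*ldb], and C(mm,nn) is Cmn[p][mm + nn*ldc].

// Illustration only: a trivial helper expressing the same addressing rule.
static inline int colMajorIndex(int row, int col, int ld) { return row + col*ld; }
// e.g. Amk[p][ colMajorIndex(mm,kk,lda) ] is element (mm,kk) of the p-th m x k matrix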
@ -266,26 +270,130 @@ public:
|
||||
assert(err==CUBLAS_STATUS_SUCCESS);
|
||||
#endif
|
||||
#ifdef GRID_SYCL
|
||||
//MKL’s cblas_<T>gemm_batch & OneAPI
|
||||
#warning "oneMKL implementation not built "
|
||||
#endif
|
||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
||||
// Need a default/reference implementation
|
||||
int sda = lda*k;
|
||||
int sdb = ldb*k;
|
||||
int sdc = ldc*n;
|
||||
for (int p = 0; p < batchCount; ++p) {
|
||||
for (int mm = 0; mm < m; ++mm) {
|
||||
for (int nn = 0; nn < n; ++nn) {
|
||||
ComplexD c_mn(0.0);
|
||||
for (int kk = 0; kk < k; ++kk)
|
||||
c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
|
||||
Cmn[p][mm + nn*ldc] = (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
|
||||
int64_t m64=m;
|
||||
int64_t n64=n;
|
||||
int64_t k64=k;
|
||||
int64_t lda64=lda;
|
||||
int64_t ldb64=ldb;
|
||||
int64_t ldc64=ldc;
|
||||
int64_t batchCount64=batchCount;
|
||||
|
||||
oneapi::mkl::transpose iOpA;
|
||||
oneapi::mkl::transpose iOpB;
|
||||
|
||||
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
|
||||
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
|
||||
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
|
||||
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
|
||||
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
|
||||
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
|
||||
|
||||
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
|
||||
&iOpA,
|
||||
&iOpB,
|
||||
&m64,&n64,&k64,
|
||||
(ComplexD *) &alpha_p[0],
|
||||
(const ComplexD **)&Amk[0], (const int64_t *)&lda64,
|
||||
(const ComplexD **)&Bkn[0], (const int64_t *)&ldb64,
|
||||
(ComplexD *) &beta_p[0],
|
||||
(ComplexD **)&Cmn[0], (const int64_t *)&ldc64,
|
||||
(int64_t)1,&batchCount64,std::vector<sycl::event>());
|
||||
synchronise();
|
||||
#if 0
|
||||
// This code was used to check the mat mul on Sunspot/OneMKL
|
||||
std::cerr << " Called SYCL batched ZGEMM OpA "<< OpA << " OpB "<<OpB <<std::endl;
|
||||
std::vector<ComplexD> A(m*k); // pointer list to matrices
|
||||
std::vector<ComplexD> B(k*n);
|
||||
std::vector<ComplexD> C(m*n);
|
||||
// int sda = lda*k;
|
||||
// int sdb = ldb*k;
|
||||
// int sdc = ldc*n;
|
||||
std::cerr << " Checking the GEMM results "<<std::endl;
|
||||
for (int p = 0; p < 1; ++p) {
|
||||
ComplexD * Amk_p; // pointer list to matrices
|
||||
ComplexD * Bkn_p; // pointer list to matrices
|
||||
ComplexD * Cmn_p; // pointer list to matrices
|
||||
acceleratorCopyFromDevice((void *)&Amk[p],(void *)&Amk_p,sizeof(ComplexD*));
|
||||
acceleratorCopyFromDevice((void *)&Bkn[p],(void *)&Bkn_p,sizeof(ComplexD*));
|
||||
acceleratorCopyFromDevice((void *)&Cmn[p],(void *)&Cmn_p,sizeof(ComplexD*));
|
||||
std::cerr << " p " << p << " copied pointers "<<std::endl;
|
||||
acceleratorCopyFromDevice((void *)Amk_p,(void *)&A[0],m*k*sizeof(ComplexD));
|
||||
acceleratorCopyFromDevice((void *)Bkn_p,(void *)&B[0],k*n*sizeof(ComplexD));
|
||||
acceleratorCopyFromDevice((void *)Cmn_p,(void *)&C[0],m*n*sizeof(ComplexD));
|
||||
std::cerr << " p " << p << " copied matrices "<<std::endl;
|
||||
std::cerr << " C[0] "<<C[0]<<std::endl;
|
||||
std::cerr << " A[0] "<<A[0]<<std::endl;
|
||||
std::cerr << " B[0] "<<B[0]<<std::endl;
|
||||
std::cerr << " m "<<m<<std::endl;
|
||||
std::cerr << " n "<<n<<std::endl;
|
||||
std::cerr << " k "<<k<<std::endl;
|
||||
for (int mm = 0; mm < m; ++mm) {
|
||||
for (int nn = 0; nn < n; ++nn) {
|
||||
ComplexD c_mn(0.0);
|
||||
for (int kk = 0; kk < k; ++kk) {
|
||||
int idx_a, idx_b;
|
||||
// int lda = m; // m x k column major
|
||||
// int ldb = k; // k x n column major
|
||||
// int ldc = m; // m x b column major
|
||||
if(OpA!=GridBLAS_OP_N) {
|
||||
idx_a =kk + mm*lda;
|
||||
} else {
|
||||
idx_a =mm + kk*lda;
|
||||
}
|
||||
if(OpB!=GridBLAS_OP_N) {
|
||||
idx_b =nn + kk*ldb;
|
||||
} else {
|
||||
idx_b =kk + nn*ldb;
|
||||
}
|
||||
// std::cerr << " idx_a "<<idx_a<<" idx_b "<<idx_b<<std::endl;
|
||||
|
||||
ComplexD Ac = A[idx_a];
|
||||
ComplexD Bc = B[idx_b];
|
||||
if(OpA==GridBLAS_OP_C) Ac = conjugate(Ac);
|
||||
if(OpB==GridBLAS_OP_C) Bc = conjugate(Bc);
|
||||
|
||||
c_mn += Ac*Bc;
|
||||
}
|
||||
std::cerr << " beta "<<beta<<" alpha "<<alpha<<" C_"<<mm<<","<<nn<<" "<<c_mn<<" "<<C[mm + nn*ldc]<<std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
// synchronise();
|
||||
#endif
|
||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
||||
// Need a default/reference implementation; use Eigen
|
||||
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||
} );
|
||||
} else {
|
||||
assert(0);
|
||||
}
|
||||
#endif
|
||||
RealD t1=usecond();
|
||||
RealD flops = 8.0*m*n*k*batchCount;
|
||||
RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount;
|
||||
@@ -306,6 +414,9 @@ public:
RealD t2=usecond();
int32_t batchCount = Amk.size();

assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
assert(OpB!=GridBLAS_OP_T);

int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
@ -366,26 +477,69 @@ public:
|
||||
assert(err==CUBLAS_STATUS_SUCCESS);
|
||||
#endif
|
||||
#ifdef GRID_SYCL
|
||||
//MKL’s cblas_<T>gemm_batch & OneAPI
|
||||
#warning "oneMKL implementation not built "
|
||||
int64_t m64=m;
|
||||
int64_t n64=n;
|
||||
int64_t k64=k;
|
||||
int64_t lda64=lda;
|
||||
int64_t ldb64=ldb;
|
||||
int64_t ldc64=ldc;
|
||||
int64_t batchCount64=batchCount;
|
||||
|
||||
oneapi::mkl::transpose iOpA;
|
||||
oneapi::mkl::transpose iOpB;
|
||||
|
||||
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
|
||||
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
|
||||
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
|
||||
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
|
||||
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
|
||||
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
|
||||
|
||||
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
|
||||
&iOpA,
|
||||
&iOpB,
|
||||
&m64,&n64,&k64,
|
||||
(ComplexF *) &alpha_p[0],
|
||||
(const ComplexF **)&Amk[0], (const int64_t *)&lda64,
|
||||
(const ComplexF **)&Bkn[0], (const int64_t *)&ldb64,
|
||||
(ComplexF *) &beta_p[0],
|
||||
(ComplexF **)&Cmn[0], (const int64_t *)&ldc64,
|
||||
(int64_t)1,&batchCount64,std::vector<sycl::event>());
|
||||
synchronise();
|
||||
#endif
|
||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
||||
int sda = lda*k;
|
||||
int sdb = ldb*k;
|
||||
int sdc = ldc*n;
|
||||
ComplexF alphaf(real(alpha),imag(alpha));
|
||||
ComplexF betaf(real(beta),imag(beta));
|
||||
// Need a default/reference implementation
|
||||
for (int p = 0; p < batchCount; ++p) {
|
||||
for (int mm = 0; mm < m; ++mm) {
|
||||
for (int nn = 0; nn < n; ++nn) {
|
||||
ComplexF c_mn(0.0);
|
||||
for (int kk = 0; kk < k; ++kk)
|
||||
c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
|
||||
Cmn[p][mm + nn*ldc] = (alphaf)*c_mn + (betaf)*Cmn[p][mm + nn*ldc ];
|
||||
}
|
||||
// Need a default/reference implementation; use Eigen
|
||||
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||
} );
|
||||
} else {
|
||||
assert(0);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
RealD t1=usecond();
|
||||
RealD flops = 8.0*m*n*k*batchCount;
|
||||
@@ -408,6 +562,9 @@ public:
RealD t2=usecond();
int32_t batchCount = Amk.size();

assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
assert(OpB!=GridBLAS_OP_C);

int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
@ -467,24 +624,69 @@ public:
|
||||
assert(err==CUBLAS_STATUS_SUCCESS);
|
||||
#endif
|
||||
#ifdef GRID_SYCL
|
||||
//MKL’s cblas_<T>gemm_batch & OneAPI
|
||||
#warning "oneMKL implementation not built "
|
||||
int64_t m64=m;
|
||||
int64_t n64=n;
|
||||
int64_t k64=k;
|
||||
int64_t lda64=lda;
|
||||
int64_t ldb64=ldb;
|
||||
int64_t ldc64=ldc;
|
||||
int64_t batchCount64=batchCount;
|
||||
|
||||
oneapi::mkl::transpose iOpA;
|
||||
oneapi::mkl::transpose iOpB;
|
||||
|
||||
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
|
||||
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
|
||||
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
|
||||
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
|
||||
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
|
||||
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
|
||||
|
||||
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
|
||||
&iOpA,
|
||||
&iOpB,
|
||||
&m64,&n64,&k64,
|
||||
(float *) &alpha_p[0],
|
||||
(const float **)&Amk[0], (const int64_t *)&lda64,
|
||||
(const float **)&Bkn[0], (const int64_t *)&ldb64,
|
||||
(float *) &beta_p[0],
|
||||
(float **)&Cmn[0], (const int64_t *)&ldc64,
|
||||
(int64_t)1,&batchCount64,std::vector<sycl::event>());
|
||||
synchronise();
|
||||
#endif
|
||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
||||
int sda = lda*k;
|
||||
int sdb = ldb*k;
|
||||
int sdc = ldc*n;
|
||||
// Need a default/reference implementation
|
||||
for (int p = 0; p < batchCount; ++p) {
|
||||
for (int mm = 0; mm < m; ++mm) {
|
||||
for (int nn = 0; nn < n; ++nn) {
|
||||
RealD c_mn(0.0);
|
||||
for (int kk = 0; kk < k; ++kk)
|
||||
c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
|
||||
Cmn[p][mm + nn*ldc] = (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
|
||||
}
|
||||
// Need a default/reference implementation; use Eigen
|
||||
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
} );
|
||||
} else {
|
||||
assert(0);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
RealD t1=usecond();
|
||||
RealD flops = 2.0*m*n*k*batchCount;
|
||||
@@ -495,7 +697,6 @@ public:
///////////////////////////////////////////////////////////////////////////
// Double precision real GEMM
///////////////////////////////////////////////////////////////////////////

void gemmBatched(GridBLASOperation_t OpA,
GridBLASOperation_t OpB,
int m,int n, int k,
@@ -508,6 +709,9 @@ public:
RealD t2=usecond();
int32_t batchCount = Amk.size();

assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
assert(OpB!=GridBLAS_OP_C);

int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
@ -568,160 +772,124 @@ public:
|
||||
assert(err==CUBLAS_STATUS_SUCCESS);
|
||||
#endif
|
||||
#ifdef GRID_SYCL
|
||||
/*
|
||||
int64_t m64=m;
|
||||
int64_t n64=n;
|
||||
int64_t k64=k;
|
||||
int64_t lda64=lda;
|
||||
int64_t ldb64=ldb;
|
||||
int64_t ldc64=ldc;
|
||||
int64_t batchCount64=batchCount;
|
||||
oneapi::mkl::blas::column_major::gemm_batch(*theGridAccelerator,
|
||||
onemkl::transpose::N,
|
||||
onemkl::transpose::N,
|
||||
&m64,&n64,&k64,
|
||||
(double *) &alpha_p[0],
|
||||
(double **)&Amk[0], lda,
|
||||
(double **)&Bkn[0], ldb,
|
||||
(double *) &beta_p[0],
|
||||
(double **)&Cmn[0], ldc,
|
||||
1,&batchCount64);
|
||||
*/
|
||||
//MKL’s cblas_<T>gemm_batch & OneAPI
|
||||
#warning "oneMKL implementation not built "
|
||||
|
||||
oneapi::mkl::transpose iOpA;
|
||||
oneapi::mkl::transpose iOpB;
|
||||
|
||||
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
|
||||
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
|
||||
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
|
||||
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
|
||||
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
|
||||
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
|
||||
|
||||
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
|
||||
&iOpA,
|
||||
&iOpB,
|
||||
&m64,&n64,&k64,
|
||||
(double *) &alpha_p[0],
|
||||
(const double **)&Amk[0], (const int64_t *)&lda64,
|
||||
(const double **)&Bkn[0], (const int64_t *)&ldb64,
|
||||
(double *) &beta_p[0],
|
||||
(double **)&Cmn[0], (const int64_t *)&ldc64,
|
||||
(int64_t)1,&batchCount64,std::vector<sycl::event>());
|
||||
synchronise();
|
||||
#endif
|
||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
||||
int sda = lda*k;
|
||||
int sdb = ldb*k;
|
||||
int sdc = ldc*n;
|
||||
// Need a default/reference implementation
|
||||
for (int p = 0; p < batchCount; ++p) {
|
||||
for (int mm = 0; mm < m; ++mm) {
|
||||
for (int nn = 0; nn < n; ++nn) {
|
||||
RealD c_mn(0.0);
|
||||
for (int kk = 0; kk < k; ++kk)
|
||||
c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
|
||||
Cmn[p][mm + nn*ldc] = (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
|
||||
}
|
||||
// Need a default/reference implementation; use Eigen
|
||||
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
});
|
||||
} else {
|
||||
assert(0);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
RealD t1=usecond();
|
||||
RealD flops = 2.0*m*n*k*batchCount;
|
||||
RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
|
||||
}
|
||||
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Strided case used by benchmark, but generally unused in Grid
|
||||
// Keep a code example in double complex, but don't generate the single and real variants for now
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
void gemmStridedBatched(int m,int n, int k,
|
||||
ComplexD alpha,
|
||||
ComplexD* Amk, // pointer list to matrices
|
||||
ComplexD* Bkn,
|
||||
ComplexD beta,
|
||||
ComplexD* Cmn,
|
||||
int batchCount)
|
||||
{
|
||||
// Use C-row major storage, so transpose calls
|
||||
int lda = m; // m x k column major
|
||||
int ldb = k; // k x n column major
|
||||
int ldc = m; // m x b column major
|
||||
int sda = m*k;
|
||||
int sdb = k*n;
|
||||
int sdc = m*n;
|
||||
deviceVector<ComplexD> alpha_p(1);
|
||||
deviceVector<ComplexD> beta_p(1);
|
||||
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
|
||||
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
|
||||
// std::cout << "blasZgemmStridedBatched mnk "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
|
||||
// std::cout << "blasZgemmStridedBatched ld "<<lda<<","<<ldb<<","<<ldc<<std::endl;
|
||||
// std::cout << "blasZgemmStridedBatched sd "<<sda<<","<<sdb<<","<<sdc<<std::endl;
|
||||
#ifdef GRID_HIP
|
||||
auto err = hipblasZgemmStridedBatched(gridblasHandle,
|
||||
HIPBLAS_OP_N,
|
||||
HIPBLAS_OP_N,
|
||||
m,n,k,
|
||||
(hipblasDoubleComplex *) &alpha_p[0],
|
||||
(hipblasDoubleComplex *) Amk, lda, sda,
|
||||
(hipblasDoubleComplex *) Bkn, ldb, sdb,
|
||||
(hipblasDoubleComplex *) &beta_p[0],
|
||||
(hipblasDoubleComplex *) Cmn, ldc, sdc,
|
||||
batchCount);
|
||||
assert(err==HIPBLAS_STATUS_SUCCESS);
|
||||
#endif
|
||||
#ifdef GRID_CUDA
|
||||
cublasZgemmStridedBatched(gridblasHandle,
|
||||
CUBLAS_OP_N,
|
||||
CUBLAS_OP_N,
|
||||
m,n,k,
|
||||
(cuDoubleComplex *) &alpha_p[0],
|
||||
(cuDoubleComplex *) Amk, lda, sda,
|
||||
(cuDoubleComplex *) Bkn, ldb, sdb,
|
||||
(cuDoubleComplex *) &beta_p[0],
|
||||
(cuDoubleComplex *) Cmn, ldc, sdc,
|
||||
batchCount);
|
||||
#endif
|
||||
#if defined(GRID_SYCL) || defined(GRID_ONE_MKL)
|
||||
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
|
||||
oneapi::mkl::transpose::N,
|
||||
oneapi::mkl::transpose::N,
|
||||
m,n,k,
|
||||
alpha,
|
||||
(const ComplexD *)Amk,lda,sda,
|
||||
(const ComplexD *)Bkn,ldb,sdb,
|
||||
beta,
|
||||
(ComplexD *)Cmn,ldc,sdc,
|
||||
batchCount);
|
||||
#endif
|
||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
|
||||
// Need a default/reference implementation
|
||||
for (int p = 0; p < batchCount; ++p) {
|
||||
for (int mm = 0; mm < m; ++mm) {
|
||||
for (int nn = 0; nn < n; ++nn) {
|
||||
ComplexD c_mn(0.0);
|
||||
for (int kk = 0; kk < k; ++kk)
|
||||
c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
|
||||
Cmn[mm + nn*ldc + p*sdc] = (alpha)*c_mn + (beta)*Cmn[mm + nn*ldc + p*sdc];
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class CComplex>
|
||||
double benchmark(int M, int N, int K, int BATCH)
|
||||
{
|
||||
int32_t N_A = M*K*BATCH;
|
||||
int32_t N_B = K*N*BATCH;
|
||||
int32_t N_C = M*N*BATCH;
|
||||
deviceVector<ComplexD> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD));
|
||||
deviceVector<ComplexD> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD));
|
||||
deviceVector<ComplexD> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD));
|
||||
ComplexD alpha(1.0);
|
||||
ComplexD beta (1.0);
|
||||
deviceVector<CComplex> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(CComplex));
|
||||
deviceVector<CComplex> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(CComplex));
|
||||
deviceVector<CComplex> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(CComplex));
|
||||
CComplex alpha(1.0);
|
||||
CComplex beta (1.0);
|
||||
RealD flops = 8.0*M*N*K*BATCH;
|
||||
int ncall=10;
|
||||
int ncall=1000;
|
||||
deviceVector<CComplex *> As(BATCH);
|
||||
deviceVector<CComplex *> Bs(BATCH);
|
||||
deviceVector<CComplex *> Cs(BATCH);
|
||||
for(int b = 0 ; b < BATCH;b++) {
|
||||
CComplex *ptr;
|
||||
ptr = &A[b*M*K]; acceleratorPut(As[b],ptr);
|
||||
ptr = &B[b*K*N]; acceleratorPut(Bs[b],ptr);
|
||||
ptr = &C[b*M*N]; acceleratorPut(Cs[b],ptr);
|
||||
}
|
||||
|
||||
// Warm up call
|
||||
gemmBatched(M,N,K,
|
||||
alpha,
|
||||
As, // m x k
|
||||
Bs, // k x n
|
||||
beta,
|
||||
Cs);
|
||||
synchronise();
|
||||
|
||||
RealD t0 = usecond();
|
||||
for(int i=0;i<ncall;i++){
|
||||
gemmStridedBatched(M,N,K,
|
||||
alpha,
|
||||
&A[0], // m x k
|
||||
&B[0], // k x n
|
||||
beta,
|
||||
&C[0], // m x n
|
||||
BATCH);
|
||||
gemmBatched(M,N,K,
|
||||
alpha,
|
||||
As, // m x k
|
||||
Bs, // k x n
|
||||
beta,
|
||||
Cs);
|
||||
synchronise();
|
||||
}
|
||||
synchronise();
RealD t1 = usecond();
RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K)*BATCH;
RealD bytes = 1.0*sizeof(CComplex)*(M*N*2+N*K+M*K)*BATCH;
flops = 8.0*M*N*K*BATCH*ncall;
flops = flops/(t1-t0)/1.e3;
return flops; // Returns gigaflops
}
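A quick check of the unit conversion above (not part of the patch), assuming usecond() returns wall-clock time in microseconds as its name suggests: the benchmark counts 8*M*N*K*BATCH*ncall real floating-point operations (8 per complex multiply-add), and since 1 FLOP/us = 1e-3 GFLOP/s,

GFLOP/s = (8*M*N*K*BATCH*ncall) / ((t1-t0) in us) / 1.e3

which is exactly the expression returned, consistent with the comment "Returns gigaflops".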
|
||||
|
||||
|
||||
|
||||
|
||||
};
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
@ -279,11 +279,11 @@ public:
|
||||
Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
|
||||
diagonalize(eval2,lmd2,lme2,Nu,Nm,Nm,Qt,grid);
|
||||
_sort.push(eval2,Nm);
|
||||
// Glog << "#Ritz value before shift: "<< std::endl;
|
||||
Glog << "#Ritz value before shift: "<< std::endl;
|
||||
for(int i=0; i<Nm; ++i){
|
||||
// std::cout.precision(13);
|
||||
// std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
|
||||
// std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
|
||||
std::cout.precision(13);
|
||||
std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
|
||||
std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
@ -297,7 +297,8 @@ public:
|
||||
|
||||
unpackHermitBlockTriDiagMatToEigen(lmd,lme,Nu,Nblock_m,Nm,Nm,BTDM);
|
||||
|
||||
for(int ip=Nk; ip<Nm; ++ip){
|
||||
for(int ip=Nk; ip<Nm; ++ip){
|
||||
Glog << " ip "<<ip<<" / "<<Nm<<std::endl;
|
||||
shiftedQRDecompEigen(BTDM,Nu,Nm,eval2[ip],Q);
|
||||
}
|
||||
|
||||
@ -325,7 +326,7 @@ public:
|
||||
Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
|
||||
diagonalize(eval2,lmd2,lme2,Nu,Nk,Nm,Qt,grid);
|
||||
_sort.push(eval2,Nk);
|
||||
// Glog << "#Ritz value after shift: "<< std::endl;
|
||||
Glog << "#Ritz value after shift: "<< std::endl;
|
||||
for(int i=0; i<Nk; ++i){
|
||||
// std::cout.precision(13);
|
||||
// std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
|
||||
@ -467,10 +468,10 @@ public:
|
||||
|
||||
// set initial vector
|
||||
for (int i=0; i<Nu; ++i) {
|
||||
// Glog << "norm2(src[" << i << "])= "<< norm2(src[i]) << std::endl;
|
||||
Glog << "norm2(src[" << i << "])= "<< norm2(src[i]) << std::endl;
|
||||
evec[i] = src[i];
|
||||
orthogonalize(evec[i],evec,i);
|
||||
// Glog << "norm2(evec[" << i << "])= "<< norm2(evec[i]) << std::endl;
|
||||
Glog << "norm2(evec[" << i << "])= "<< norm2(evec[i]) << std::endl;
|
||||
}
|
||||
// exit(-43);
|
||||
|
||||
@ -506,11 +507,11 @@ public:
|
||||
Qt = Eigen::MatrixXcd::Identity(Nr,Nr);
|
||||
diagonalize(eval2,lmd2,lme2,Nu,Nr,Nr,Qt,grid);
|
||||
_sort.push(eval2,Nr);
|
||||
// Glog << "#Ritz value: "<< std::endl;
|
||||
Glog << "#Ritz value: "<< std::endl;
|
||||
for(int i=0; i<Nr; ++i){
|
||||
// std::cout.precision(13);
|
||||
// std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
|
||||
// std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
|
||||
std::cout.precision(13);
|
||||
std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
|
||||
std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
|
||||
}
|
||||
|
||||
// Convergence test
|
||||
@ -570,6 +571,7 @@ public:
|
||||
Glog << fname + " NOT converged ; Summary :\n";
|
||||
} else {
|
||||
Glog << fname + " CONVERGED ; Summary :\n";
|
||||
Nstop = Nconv_guess; // Just take them all
|
||||
// Sort convered eigenpairs.
|
||||
std::vector<Field> Btmp(Nstop,grid); // waste of space replicating
|
||||
|
||||
@ -642,7 +644,7 @@ private:
|
||||
// for (int u=0; u<mrhs; ++u) Glog << " out["<<u<<"] = "<<norm2(out[u])<<std::endl;
|
||||
k_start +=mrhs;
|
||||
}
|
||||
// Glog << "LinAlg "<< std::endl;
|
||||
Glog << "LinAlg "<< std::endl;
|
||||
|
||||
if (b>0) {
|
||||
for (int u=0; u<Nu; ++u) {
|
||||
@ -676,7 +678,7 @@ private:
|
||||
}
|
||||
w_copy[u] = w[u];
|
||||
}
|
||||
// Glog << "LinAlg done"<< std::endl;
|
||||
Glog << "LinAlg done"<< std::endl;
|
||||
|
||||
// In block version, the steps 6 and 7 in Lanczos construction is
|
||||
// replaced by the QR decomposition of new basis block.
|
||||
@ -689,15 +691,15 @@ private:
|
||||
}
|
||||
|
||||
// re-orthogonalization for numerical stability
|
||||
// Glog << "Gram Schmidt"<< std::endl;
|
||||
Glog << "Gram Schmidt"<< std::endl;
|
||||
orthogonalize(w,Nu,evec,R);
|
||||
// QR part
|
||||
for (int u=1; u<Nu; ++u) {
|
||||
orthogonalize(w[u],w,u);
|
||||
}
|
||||
// Glog << "Gram Schmidt done "<< std::endl;
|
||||
Glog << "Gram Schmidt done "<< std::endl;
|
||||
|
||||
// Glog << "LinAlg "<< std::endl;
|
||||
Glog << "LinAlg "<< std::endl;
|
||||
for (int u=0; u<Nu; ++u) {
|
||||
//for (int v=0; v<Nu; ++v) {
|
||||
for (int v=u; v<Nu; ++v) {
|
||||
@ -714,7 +716,7 @@ private:
|
||||
// Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl;
|
||||
}
|
||||
}
|
||||
// Glog << "LinAlg done "<< std::endl;
|
||||
Glog << "LinAlg done "<< std::endl;
|
||||
|
||||
if (b < Nm/Nu-1) {
|
||||
for (int u=0; u<Nu; ++u) {
|
||||
@ -779,7 +781,7 @@ private:
|
||||
|
||||
for ( int u=0; u<Nu; ++u ) {
|
||||
for (int k=0; k<Nk; ++k ) {
|
||||
// Glog << "lmd "<<u<<" "<<k<<" "<<lmd[u][k] -conjugate(lmd[u][k])<<std::endl;
|
||||
// Glog << "lmd "<<u<<" "<<k<<" "<<lmd[u][k] -conjugate(lmd[u][k])<<std::endl;
|
||||
BlockTriDiag(k,u+(k/Nu)*Nu) = lmd[u][k];
|
||||
}
|
||||
}
|
||||
@ -933,7 +935,7 @@ if (1){
|
||||
int Nu, int Nb, int Nk, int Nm,
|
||||
Eigen::MatrixXcd& M)
|
||||
{
|
||||
//Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n';
|
||||
Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n';
|
||||
assert( Nk%Nu == 0 && Nm%Nu == 0 );
|
||||
assert( Nk <= Nm );
|
||||
M = Eigen::MatrixXcd::Zero(Nk,Nk);
|
||||
@ -951,7 +953,7 @@ if (1){
|
||||
M(u+(k/Nu)*Nu,k-Nu) = lme[u][k-Nu];
|
||||
}
|
||||
}
|
||||
//Glog << "unpackHermitBlockTriDiagMatToEigen() end" << endl;
|
||||
Glog << "unpackHermitBlockTriDiagMatToEigen() end" << std::endl;
|
||||
}
|
||||
|
||||
|
||||
@ -961,7 +963,7 @@ if (1){
|
||||
int Nu, int Nb, int Nk, int Nm,
|
||||
Eigen::MatrixXcd& M)
|
||||
{
|
||||
//Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n';
|
||||
Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n';
|
||||
assert( Nk%Nu == 0 && Nm%Nu == 0 );
|
||||
assert( Nk <= Nm );
|
||||
|
||||
@ -977,7 +979,7 @@ if (1){
|
||||
lme[u][k-Nu] = M(u+(k/Nu)*Nu,k-Nu);
|
||||
}
|
||||
}
|
||||
//Glog << "packHermitBlockTriDiagMatfromEigen() end" << endl;
|
||||
Glog << "packHermitBlockTriDiagMatfromEigen() end" <<std::endl;
|
||||
}
|
||||
|
||||
|
||||
@ -986,7 +988,7 @@ if (1){
|
||||
RealD Dsh,
|
||||
Eigen::MatrixXcd& Qprod)
|
||||
{
|
||||
//Glog << "shiftedQRDecompEigen() begin" << '\n';
|
||||
Glog << "shiftedQRDecompEigen() begin" << '\n';
|
||||
Eigen::MatrixXcd Q = Eigen::MatrixXcd::Zero(Nm,Nm);
|
||||
Eigen::MatrixXcd R = Eigen::MatrixXcd::Zero(Nm,Nm);
|
||||
Eigen::MatrixXcd Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
|
||||
@ -1002,6 +1004,7 @@ if (1){
|
||||
// lower triangular part used to represent series
|
||||
// of Q sequence.
|
||||
|
||||
Glog << "shiftedQRDecompEigen() Housholder & QR" << '\n';
|
||||
// equivalent operation of Qprod *= Q
|
||||
//M = Eigen::MatrixXcd::Zero(Nm,Nm);
|
||||
|
||||
@ -1022,6 +1025,7 @@ if (1){
|
||||
|
||||
Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
|
||||
|
||||
Glog << "shiftedQRDecompEigen() Mtmp create" << '\n';
|
||||
for (int i=0; i<Nm; ++i) {
|
||||
for (int j=0; j<Nm-(Nu+1); ++j) {
|
||||
for (int k=0; k<Nu+1+j; ++k) {
|
||||
@ -1029,6 +1033,7 @@ if (1){
|
||||
}
|
||||
}
|
||||
}
|
||||
Glog << "shiftedQRDecompEigen() Mtmp loop1" << '\n';
|
||||
for (int i=0; i<Nm; ++i) {
|
||||
for (int j=Nm-(Nu+1); j<Nm; ++j) {
|
||||
for (int k=0; k<Nm; ++k) {
|
||||
@ -1036,6 +1041,7 @@ if (1){
|
||||
}
|
||||
}
|
||||
}
|
||||
Glog << "shiftedQRDecompEigen() Mtmp loop2" << '\n';
|
||||
|
||||
//static int ntimes = 2;
|
||||
//for (int j=0; j<Nm-(ntimes*Nu); ++j) {
|
||||
@ -1061,11 +1067,13 @@ if (1){
|
||||
Mtmp(j,i) = conj(Mtmp(i,j));
|
||||
}
|
||||
}
|
||||
Glog << "shiftedQRDecompEigen() Mtmp loop3" << '\n';
|
||||
|
||||
for (int i=0; i<Nm; ++i) {
|
||||
Mtmp(i,i) = real(Mtmp(i,i)) + Dsh;
|
||||
}
|
||||
|
||||
Glog << "shiftedQRDecompEigen() Mtmp loop4" << '\n';
|
||||
M = Mtmp;
|
||||
|
||||
//M = Q.adjoint()*(M*Q);
|
||||
@ -1077,7 +1085,7 @@ if (1){
|
||||
// }
|
||||
//}
|
||||
|
||||
//Glog << "shiftedQRDecompEigen() end" << endl;
|
||||
Glog << "shiftedQRDecompEigen() end" <<std::endl;
|
||||
}
|
||||
|
||||
void exampleQRDecompEigen(void)
|
||||
|
@ -499,6 +499,87 @@ namespace Grid {
|
||||
}
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Site diagonal is identity, left preconditioned by Mee^inv
// ( 1 - Mee^inv Meo Moo^inv Moe ) phi = Mee_inv ( Mee - Meo Moo^inv Moe Mee^inv ) phi = Mee_inv eta
//
// Solve:
// ( 1 - Mee^inv Meo Moo^inv Moe )^dag ( 1 - Mee^inv Meo Moo^inv Moe ) phi = ( 1 - Mee^inv Meo Moo^inv Moe )^dag Mee_inv eta
//
// Old notation e<->o
//
// Left precon by Moo^-1
// b) (Doo^{dag} M_oo^-dag) (Moo^-1 Doo) psi_o = [ (D_oo)^dag M_oo^-dag ] Moo^-1 L^{-1} eta_o
// eta_o' = (D_oo)^dag M_oo^-dag Moo^-1 (eta_o - Moe Mee^{-1} eta_e)
///////////////////////////////////////////////////////////////////////////////////////////////////////
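For readability, the comment block above can be restated in standard notation (a direct transcription of the comment, not a new derivation). Writing K = 1 - M_{ee}^{-1} M_{eo} M_{oo}^{-1} M_{oe}, the left-preconditioned system and its normal equations are

K \phi = M_{ee}^{-1} \eta , \qquad K^\dagger K \, \phi = K^\dagger M_{ee}^{-1} \eta ,

and in the "old notation" with e and o interchanged, the system actually solved is

(D_{oo}^\dagger M_{oo}^{-\dagger}) (M_{oo}^{-1} D_{oo}) \, \psi_o = D_{oo}^\dagger M_{oo}^{-\dagger} M_{oo}^{-1} (\eta_o - M_{oe} M_{ee}^{-1} \eta_e) .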
template<class Field> class SchurRedBlackDiagOneSolve : public SchurRedBlackBase<Field> {
|
||||
public:
|
||||
typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
|
||||
|
||||
/////////////////////////////////////////////////////
|
||||
// Wrap the usual normal equations Schur trick
|
||||
/////////////////////////////////////////////////////
|
||||
SchurRedBlackDiagOneSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false,
|
||||
const bool _solnAsInitGuess = false)
|
||||
: SchurRedBlackBase<Field>(HermitianRBSolver,initSubGuess,_solnAsInitGuess) {};
|
||||
|
||||
virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
|
||||
{
|
||||
GridBase *grid = _Matrix.RedBlackGrid();
|
||||
GridBase *fgrid= _Matrix.Grid();
|
||||
|
||||
SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
|
||||
|
||||
Field tmp(grid);
|
||||
Field Mtmp(grid);
|
||||
|
||||
pickCheckerboard(Even,src_e,src);
|
||||
pickCheckerboard(Odd ,src_o,src);
|
||||
|
||||
/////////////////////////////////////////////////////
|
||||
// src_o = Mpcdag *MooeeInv * (source_o - Moe MeeInv source_e)
|
||||
/////////////////////////////////////////////////////
|
||||
_Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
|
||||
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
|
||||
Mtmp=src_o-Mtmp;
|
||||
_Matrix.MooeeInv(Mtmp,tmp); assert( tmp.Checkerboard() ==Odd);
|
||||
|
||||
// get the right MpcDag
|
||||
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.Checkerboard() ==Odd);
|
||||
}
|
||||
|
||||
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
|
||||
{
|
||||
GridBase *grid = _Matrix.RedBlackGrid();
|
||||
GridBase *fgrid= _Matrix.Grid();
|
||||
|
||||
Field tmp(grid);
|
||||
Field sol_e(grid);
|
||||
|
||||
|
||||
///////////////////////////////////////////////////
|
||||
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
|
||||
///////////////////////////////////////////////////
|
||||
_Matrix.Meooe(sol_o,tmp); assert( tmp.Checkerboard() ==Even);
|
||||
tmp = src_e-tmp; assert( src_e.Checkerboard() ==Even);
|
||||
_Matrix.MooeeInv(tmp,sol_e); assert( sol_e.Checkerboard() ==Even);
|
||||
|
||||
setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
|
||||
setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd );
|
||||
};
|
||||
|
||||
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
|
||||
{
|
||||
SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
|
||||
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
|
||||
};
|
||||
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
|
||||
{
|
||||
SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
|
||||
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
|
||||
}
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Site diagonal is identity, right preconditioned by Mee^inv
|
||||
// ( 1 - Meo Moo^inv Moe Mee^inv ) phi =( 1 - Meo Moo^inv Moe Mee^inv ) Mee psi = = eta = eta
|
||||
|
@@ -54,6 +54,9 @@ public:
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes);
if ( (_Tp*)ptr == (_Tp *) NULL ) {
printf("Grid CPU Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
}
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr;
}
@@ -100,6 +103,9 @@ public:
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes);
if ( (_Tp*)ptr == (_Tp *) NULL ) {
printf("Grid Shared Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
}
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr;
}
@@ -145,6 +151,9 @@ public:
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes);
if ( (_Tp*)ptr == (_Tp *) NULL ) {
printf("Grid Device Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
}
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr;
}
@ -16,6 +16,44 @@ NAMESPACE_BEGIN(Grid);
|
||||
uint64_t total_shared;
|
||||
uint64_t total_device;
|
||||
uint64_t total_host;;
|
||||
|
||||
#if defined(__has_feature)
|
||||
#if __has_feature(leak_sanitizer)
|
||||
#define ASAN_LEAK_CHECK
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ASAN_LEAK_CHECK
|
||||
#include <sanitizer/asan_interface.h>
|
||||
#include <sanitizer/common_interface_defs.h>
|
||||
#include <sanitizer/lsan_interface.h>
|
||||
#define LEAK_CHECK(A) { __lsan_do_recoverable_leak_check(); }
|
||||
#else
|
||||
#define LEAK_CHECK(A) { }
|
||||
#endif
|
||||
|
||||
void MemoryManager::DisplayMallinfo(void)
|
||||
{
|
||||
#ifdef __linux__
|
||||
struct mallinfo mi; // really want mallinfo2, but glibc version isn't uniform
|
||||
|
||||
mi = mallinfo();
|
||||
|
||||
std::cout << "MemoryManager: Total non-mmapped bytes (arena): "<< (size_t)mi.arena<<std::endl;
|
||||
std::cout << "MemoryManager: # of free chunks (ordblks): "<< (size_t)mi.ordblks<<std::endl;
|
||||
std::cout << "MemoryManager: # of free fastbin blocks (smblks): "<< (size_t)mi.smblks<<std::endl;
|
||||
std::cout << "MemoryManager: # of mapped regions (hblks): "<< (size_t)mi.hblks<<std::endl;
|
||||
std::cout << "MemoryManager: Bytes in mapped regions (hblkhd): "<< (size_t)mi.hblkhd<<std::endl;
|
||||
std::cout << "MemoryManager: Max. total allocated space (usmblks): "<< (size_t)mi.usmblks<<std::endl;
|
||||
std::cout << "MemoryManager: Free bytes held in fastbins (fsmblks): "<< (size_t)mi.fsmblks<<std::endl;
|
||||
std::cout << "MemoryManager: Total allocated space (uordblks): "<< (size_t)mi.uordblks<<std::endl;
|
||||
std::cout << "MemoryManager: Total free space (fordblks): "<< (size_t)mi.fordblks<<std::endl;
|
||||
std::cout << "MemoryManager: Topmost releasable block (keepcost): "<< (size_t)mi.keepcost<<std::endl;
|
||||
#endif
|
||||
LEAK_CHECK();
|
||||
|
||||
}
|
||||
|
||||
void MemoryManager::PrintBytes(void)
|
||||
{
|
||||
std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
|
||||
@ -35,7 +73,7 @@ void MemoryManager::PrintBytes(void)
|
||||
#ifdef GRID_CUDA
|
||||
cuda_mem();
|
||||
#endif
|
||||
|
||||
DisplayMallinfo();
|
||||
}
|
||||
|
||||
uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; }
|
||||
|
@@ -211,6 +211,7 @@ private:
#endif

public:
static void DisplayMallinfo(void);
static void NotifyDeletion(void * CpuPtr);
static void Print(void);
static void PrintAll(void);

@@ -91,6 +91,7 @@ public:
////////////////////////////////////////////////////////////////
virtual int CheckerBoarded(int dim)=0;
virtual int CheckerBoard(const Coordinate &site)=0;
virtual int CheckerDim(void){ return 0; };
virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0;

@@ -60,6 +60,7 @@ public:
int _checker_dim;
std::vector<int> _checker_board;

virtual int CheckerDim(void){ return _checker_dim; };
virtual int CheckerBoarded(int dim){
if( dim==_checker_dim) return 1;
else return 0;
@@ -236,11 +236,18 @@ public:
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
vobj vtmp;
vtmp = r;
#if 1
auto me = View(CpuWrite);
thread_for(ss,me.size(),{
me[ss]= r;
});
#else
auto me = View(AcceleratorWrite);
accelerator_for(ss,me.size(),vobj::Nsimd(),{
auto stmp=coalescedRead(vtmp);
coalescedWrite(me[ss],stmp);
});
#endif
me.ViewClose();
return *this;
}
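As a small usage illustration (grounded in the same pattern used later in this change set, e.g. LatticeComplex one(_UGrid); one = ComplexD(1.0,0.0);), the scalar assignment above lets a field be filled with a constant in one line:

// Illustration only; grid is assumed to be an existing GridCartesian*.
LatticeComplex one(grid);
one = ComplexD(1.0, 0.0);   // dispatches to the operator=(const sobj &) shown above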
@ -264,24 +264,8 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
|
||||
const uint64_t sites = grid->oSites();
|
||||
|
||||
// Might make all code paths go this way.
|
||||
#if 0
|
||||
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
|
||||
Vector<inner_t> inner_tmp(sites);
|
||||
auto inner_tmp_v = &inner_tmp[0];
|
||||
{
|
||||
autoView( left_v , left, AcceleratorRead);
|
||||
autoView( right_v,right, AcceleratorRead);
|
||||
// This code could read coalesce
|
||||
// GPU - SIMT lane compliance...
|
||||
accelerator_for( ss, sites, nsimd,{
|
||||
auto x_l = left_v(ss);
|
||||
auto y_l = right_v(ss);
|
||||
coalescedWrite(inner_tmp_v[ss],innerProductD(x_l,y_l));
|
||||
});
|
||||
}
|
||||
#else
|
||||
typedef decltype(innerProduct(vobj(),vobj())) inner_t;
|
||||
Vector<inner_t> inner_tmp(sites);
|
||||
deviceVector<inner_t> inner_tmp(sites);
|
||||
auto inner_tmp_v = &inner_tmp[0];
|
||||
|
||||
{
|
||||
@ -295,7 +279,6 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
|
||||
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
|
||||
});
|
||||
}
|
||||
#endif
|
||||
// This is in single precision and fails some tests
|
||||
auto anrm = sumD(inner_tmp_v,sites);
|
||||
nrm = anrm;
|
||||
@ -373,7 +356,8 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
|
||||
nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
|
||||
#else
|
||||
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
|
||||
Vector<inner_t> inner_tmp(sites);
|
||||
deviceVector<inner_t> inner_tmp;
|
||||
inner_tmp.resize(sites);
|
||||
auto inner_tmp_v = &inner_tmp[0];
|
||||
|
||||
accelerator_for( ss, sites, nsimd,{
|
||||
|
@ -9,14 +9,18 @@ inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer os
|
||||
{
|
||||
typedef typename vobj::scalar_object sobj;
|
||||
typedef typename vobj::scalar_objectD sobjD;
|
||||
sobj *mysum =(sobj *) malloc_shared(sizeof(sobj),*theGridAccelerator);
|
||||
static Vector<sobj> mysum;
|
||||
mysum.resize(1);
|
||||
sobj *mysum_p = & mysum[0];
|
||||
sobj identity; zeroit(identity);
|
||||
mysum[0] = identity;
|
||||
sobj ret ;
|
||||
|
||||
Integer nsimd= vobj::Nsimd();
|
||||
|
||||
|
||||
const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() });
|
||||
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
|
||||
auto Reduction = cl::sycl::reduction(mysum,identity,std::plus<>());
|
||||
auto Reduction = cl::sycl::reduction(mysum_p,identity,std::plus<>(),PropList);
|
||||
cgh.parallel_for(cl::sycl::range<1>{osites},
|
||||
Reduction,
|
||||
[=] (cl::sycl::id<1> item, auto &sum) {
|
||||
@ -26,7 +30,7 @@ inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer os
|
||||
});
|
||||
theGridAccelerator->wait();
|
||||
ret = mysum[0];
|
||||
free(mysum,*theGridAccelerator);
|
||||
// free(mysum,*theGridAccelerator);
|
||||
sobjD dret; convertType(dret,ret);
|
||||
return dret;
|
||||
}
|
||||
@ -73,19 +77,23 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
|
||||
template<class Word> Word svm_xor(Word *vec,uint64_t L)
|
||||
{
|
||||
Word xorResult; xorResult = 0;
|
||||
Word *d_sum =(Word *)cl::sycl::malloc_shared(sizeof(Word),*theGridAccelerator);
|
||||
static Vector<Word> d_sum;
|
||||
d_sum.resize(1);
|
||||
Word *d_sum_p=&d_sum[0];
|
||||
Word identity; identity=0;
|
||||
d_sum[0] = identity;
|
||||
const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() });
|
||||
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
|
||||
auto Reduction = cl::sycl::reduction(d_sum,identity,std::bit_xor<>());
|
||||
auto Reduction = cl::sycl::reduction(d_sum_p,identity,std::bit_xor<>(),PropList);
|
||||
cgh.parallel_for(cl::sycl::range<1>{L},
|
||||
Reduction,
|
||||
[=] (cl::sycl::id<1> index, auto &sum) {
|
||||
sum ^=vec[index];
|
||||
sum^=vec[index];
|
||||
});
|
||||
});
|
||||
theGridAccelerator->wait();
|
||||
Word ret = d_sum[0];
|
||||
free(d_sum,*theGridAccelerator);
|
||||
// free(d_sum,*theGridAccelerator);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -42,50 +42,21 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
|
||||
assert((fine->_rdimensions[d] / coarse->_rdimensions[d])* coarse->_rdimensions[d]==fine->_rdimensions[d]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// remove and insert a half checkerboard
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full)
|
||||
{
|
||||
half.Checkerboard() = cb;
|
||||
|
||||
autoView( half_v, half, CpuWrite);
|
||||
autoView( full_v, full, CpuRead);
|
||||
thread_for(ss, full.Grid()->oSites(),{
|
||||
int cbos;
|
||||
Coordinate coor;
|
||||
full.Grid()->oCoorFromOindex(coor,ss);
|
||||
cbos=half.Grid()->CheckerBoard(coor);
|
||||
|
||||
if (cbos==cb) {
|
||||
int ssh=half.Grid()->oIndex(coor);
|
||||
half_v[ssh] = full_v[ss];
|
||||
}
|
||||
});
|
||||
acceleratorPickCheckerboard(cb,half,full);
|
||||
}
|
||||
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half)
|
||||
{
|
||||
int cb = half.Checkerboard();
|
||||
autoView( half_v , half, CpuRead);
|
||||
autoView( full_v , full, CpuWrite);
|
||||
thread_for(ss,full.Grid()->oSites(),{
|
||||
|
||||
Coordinate coor;
|
||||
int cbos;
|
||||
|
||||
full.Grid()->oCoorFromOindex(coor,ss);
|
||||
cbos=half.Grid()->CheckerBoard(coor);
|
||||
|
||||
if (cbos==cb) {
|
||||
int ssh=half.Grid()->oIndex(coor);
|
||||
full_v[ss]=half_v[ssh];
|
||||
}
|
||||
});
|
||||
acceleratorSetCheckerboard(full,half);
|
||||
}
|
||||
|
||||
template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int checker_dim_half=0)
|
||||
template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int dummy=0)
|
||||
{
|
||||
half.Checkerboard() = cb;
|
||||
autoView(half_v, half, AcceleratorWrite);
|
||||
@ -95,6 +66,7 @@ template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj
|
||||
unsigned long ndim_half = half.Grid()->_ndimension;
|
||||
Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
|
||||
Coordinate ostride_half = half.Grid()->_ostride;
|
||||
int checker_dim_half = half.Grid()->CheckerDim();
|
||||
accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{
|
||||
|
||||
Coordinate coor;
|
||||
@ -119,7 +91,7 @@ template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj
|
||||
}
|
||||
});
|
||||
}
|
||||
template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int checker_dim_half=0)
|
||||
template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int dummy=0)
|
||||
{
|
||||
int cb = half.Checkerboard();
|
||||
autoView(half_v , half, AcceleratorRead);
|
||||
@ -129,6 +101,7 @@ template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,
|
||||
unsigned long ndim_half = half.Grid()->_ndimension;
|
||||
Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
|
||||
Coordinate ostride_half = half.Grid()->_ostride;
|
||||
int checker_dim_half = half.Grid()->CheckerDim();
|
||||
accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{
|
||||
|
||||
Coordinate coor;
|
||||
|
@@ -90,6 +90,7 @@ public:
exit(1);
}
Parameters.StartingType = arg;
std::cout <<GridLogMessage << " GenericHMCrunner --StartingType "<<arg<<std::endl;
}

if (GridCmdOptionExists(argv, argv + argc, "--StartingTrajectory")) {
@@ -97,6 +98,7 @@ public:
std::vector<int> ivec(0);
GridCmdOptionIntVector(arg, ivec);
Parameters.StartTrajectory = ivec[0];
std::cout <<GridLogMessage << " GenericHMCrunner --StartingTrajectory "<<ivec[0]<<std::endl;
}

if (GridCmdOptionExists(argv, argv + argc, "--Trajectories")) {
@@ -104,6 +106,7 @@ public:
std::vector<int> ivec(0);
GridCmdOptionIntVector(arg, ivec);
Parameters.Trajectories = ivec[0];
std::cout << GridLogMessage<<" GenericHMCrunner Command Line --Trajectories "<<ivec[0]<<std::endl;
}

if (GridCmdOptionExists(argv, argv + argc, "--Thermalizations")) {
@@ -111,6 +114,7 @@ public:
std::vector<int> ivec(0);
GridCmdOptionIntVector(arg, ivec);
Parameters.NoMetropolisUntil = ivec[0];
std::cout << GridLogMessage<<" GenericHMCrunner --Thermalizations "<<ivec[0]<<std::endl;
}
if (GridCmdOptionExists(argv, argv + argc, "--ParameterFile")) {
arg = GridCmdOptionPayload(argv, argv + argc, "--ParameterFile");
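The hunks above all follow the same parse-and-echo pattern. As a hedged sketch (not part of the patch), a further integer option would be added in exactly the same way; the flag --MyNewOption and the target field SomeIntegerField are purely illustrative names.

// Illustration only: the repeated pattern for an integer command-line option.
if (GridCmdOptionExists(argv, argv + argc, "--MyNewOption")) {              // hypothetical flag
  arg = GridCmdOptionPayload(argv, argv + argc, "--MyNewOption");
  std::vector<int> ivec(0);
  GridCmdOptionIntVector(arg, ivec);                                        // parse comma-separated integers
  Parameters.SomeIntegerField = ivec[0];                                    // hypothetical parameter field
  std::cout << GridLogMessage << " GenericHMCrunner --MyNewOption " << ivec[0] << std::endl;
}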
@ -137,9 +137,11 @@ public:
|
||||
|
||||
double start_force = usecond();
|
||||
|
||||
MemoryManager::Print();
|
||||
as[level].actions.at(a)->deriv_timer_start();
|
||||
as[level].actions.at(a)->deriv(Smearer, force); // deriv should NOT include Ta
|
||||
as[level].actions.at(a)->deriv_timer_stop();
|
||||
MemoryManager::Print();
|
||||
|
||||
auto name = as[level].actions.at(a)->action_name();
|
||||
|
||||
@ -246,7 +248,11 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
virtual ~Integrator() {}
|
||||
virtual ~Integrator()
|
||||
{
|
||||
// Pain in the ass to clean up the Level pointers
|
||||
// Guido's design is at fault as per comment above in constructor
|
||||
}
|
||||
|
||||
virtual std::string integrator_name() = 0;
|
||||
|
||||
@ -460,6 +466,7 @@ public:
|
||||
for (int level = 0; level < as.size(); ++level) {
|
||||
for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
|
||||
|
||||
MemoryManager::Print();
|
||||
// get gauge field from the SmearingPolicy and
|
||||
// based on the boolean is_smeared in actionID
|
||||
std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl;
|
||||
@ -468,6 +475,7 @@ public:
|
||||
as[level].actions.at(actionID)->S_timer_stop();
|
||||
std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl;
|
||||
H += Hterm;
|
||||
MemoryManager::Print();
|
||||
|
||||
}
|
||||
as[level].apply(S_hireps, Representations, level, H);
|
||||
|
@ -32,7 +32,9 @@ private:
|
||||
// Smear_Stout<Gimpl> *StoutSmearing;
|
||||
// std::vector<GaugeField> SmearedSet;
|
||||
|
||||
GridRedBlackCartesian * UrbGrid; // keep a copy of the redblack grid for life of object
|
||||
std::vector<LatticeLorentzComplex> masks;
|
||||
std::vector<int> cbs;
|
||||
|
||||
typedef typename SU3Adjoint::AMatrix AdjMatrix;
|
||||
typedef typename SU3Adjoint::LatticeAdjMatrix AdjMatrixField;
|
||||
@ -147,6 +149,25 @@ private:
|
||||
}
|
||||
pokeLorentz(Fdet, Fdet_pol, nu);
|
||||
}
|
||||
|
||||
void Compute_MpInvJx_dNxxdSy(int cb,
|
||||
const GaugeLinkField &PlaqL,
|
||||
const GaugeLinkField &PlaqR,
|
||||
AdjMatrixField MpInvJx,
|
||||
AdjVectorField &Fdet2 )
|
||||
{
|
||||
GaugeLinkField PlaqLeo(UrbGrid);
|
||||
GaugeLinkField PlaqReo(UrbGrid);
|
||||
AdjMatrixField MpInvJxeo(UrbGrid);
|
||||
AdjVectorField Fdet2eo(UrbGrid);
|
||||
pickCheckerboard(cb,PlaqLeo,PlaqL);
|
||||
pickCheckerboard(cb,PlaqReo,PlaqR);
|
||||
pickCheckerboard(cb,MpInvJxeo,MpInvJx);
|
||||
Fdet2eo.Checkerboard()=cb;
|
||||
Compute_MpInvJx_dNxxdSy(PlaqLeo,PlaqReo,MpInvJxeo,Fdet2eo);
|
||||
setCheckerboard(Fdet2,Fdet2eo);
|
||||
}
|
||||
|
||||
void Compute_MpInvJx_dNxxdSy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR, AdjMatrixField MpInvJx,AdjVectorField &Fdet2 )
|
||||
{
|
||||
GaugeLinkField UtaU(PlaqL.Grid());
|
||||
@ -278,8 +299,9 @@ public:
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Mask the gauge field
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
int cb = cbs[smr];
|
||||
auto mask=PeekIndex<LorentzIndex>(masks[smr],mu); // the cb mask
|
||||
|
||||
|
||||
Umsk = U;
|
||||
ApplyMask(Umsk,smr);
|
||||
Utmp = peekLorentz(Umsk,mu);
|
||||
@ -442,7 +464,7 @@ public:
|
||||
AdjMatrixField MpInvJx_nu(grid);
|
||||
MpInvJx = (-1.0)*MpAdInv * JxAd;// rho is on the plaq factor
|
||||
|
||||
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx,FdetV);
|
||||
Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx,FdetV);
|
||||
Fdet2_mu=FdetV;
|
||||
Fdet1_mu=Zero();
|
||||
|
||||
@ -499,7 +521,7 @@ public:
|
||||
|
||||
time=-usecond();
|
||||
PlaqR=(-1.0)*PlaqR;
|
||||
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx,FdetV);
|
||||
Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx,FdetV);
|
||||
Fdet2_nu = FdetV;
|
||||
time+=usecond();
|
||||
std::cout << GridLogMessage << "Compute_MpInvJx_dNxxSy (occurs 6x) took "<<time<< " us"<<std::endl;
|
||||
@ -520,7 +542,7 @@ public:
|
||||
|
||||
|
||||
MpInvJx_nu = Cshift(MpInvJx,mu,-1);
|
||||
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
||||
Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
||||
Fdet2_nu = Fdet2_nu+FdetV;
|
||||
|
||||
///////////////// -ve nu /////////////////
|
||||
@ -539,7 +561,7 @@ public:
|
||||
Fdet1_nu = Fdet1_nu + transpose(Nxy)*dJdXe_nMpInv_y;
|
||||
|
||||
MpInvJx_nu = Cshift(MpInvJx,nu,1);
|
||||
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
||||
Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
||||
Fdet2_nu = Fdet2_nu+FdetV;
|
||||
|
||||
// x==
|
||||
@ -560,7 +582,7 @@ public:
|
||||
|
||||
MpInvJx_nu = Cshift(MpInvJx,mu,-1);
|
||||
MpInvJx_nu = Cshift(MpInvJx_nu,nu,1);
|
||||
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
||||
Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
||||
Fdet2_nu = Fdet2_nu+FdetV;
|
||||
|
||||
/////////////////////////////////////////////////////////////////////
|
||||
@ -589,7 +611,7 @@ public:
|
||||
|
||||
MpInvJx_nu = Cshift(MpInvJx,nu,-1);
|
||||
|
||||
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
||||
Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
||||
Fdet2_mu = Fdet2_mu+FdetV;
|
||||
|
||||
// __
|
||||
@ -609,7 +631,7 @@ public:
|
||||
|
||||
MpInvJx_nu = Cshift(MpInvJx,nu,1);
|
||||
|
||||
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
||||
Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
||||
Fdet2_mu = Fdet2_mu+FdetV;
|
||||
|
||||
}
|
||||
@ -931,6 +953,10 @@ private:
|
||||
public:
|
||||
|
||||
/* Standard constructor */
|
||||
virtual ~SmearedConfigurationMasked()
|
||||
{
|
||||
delete UrbGrid;
|
||||
}
|
||||
SmearedConfigurationMasked(GridCartesian* _UGrid, unsigned int Nsmear, Smear_Stout<Gimpl>& Stout)
|
||||
: SmearedConfiguration<Gimpl>(_UGrid, Nsmear,Stout)
|
||||
{
|
||||
@ -939,7 +965,6 @@ public:
|
||||
// was resized in base class
|
||||
assert(this->SmearedSet.size()==Nsmear);
|
||||
|
||||
GridRedBlackCartesian * UrbGrid;
|
||||
UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(_UGrid);
|
||||
LatticeComplex one(_UGrid); one = ComplexD(1.0,0.0);
|
||||
LatticeComplex tmp(_UGrid);
|
||||
@ -947,10 +972,11 @@ public:
|
||||
for (unsigned int i = 0; i < this->smearingLevels; ++i) {
|
||||
|
||||
masks.push_back(*(new LatticeLorentzComplex(_UGrid)));
|
||||
|
||||
int mu= (i/2) %Nd;
|
||||
int cb= (i%2);
|
||||
LatticeComplex tmpcb(UrbGrid);
|
||||
|
||||
cbs.push_back(cb);
|
||||
|
||||
masks[i]=Zero();
|
||||
////////////////////
|
||||
@ -962,7 +988,6 @@ public:
|
||||
PokeIndex<LorentzIndex>(masks[i],tmp, mu);
|
||||
|
||||
}
|
||||
delete UrbGrid;
|
||||
}
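
With the change above, each smearing level carries one direction mask and one parity: level i masks direction mu = (i/2) % Nd and checkerboard cb = i % 2, and the parities are cached in `cbs` so the force code can call the checkerboarded routines. A small standalone sketch of that bookkeeping, assuming Nd = 4 (the printout is purely illustrative):

#include <cstdio>

int main() {
  const int Nd = 4;
  // Eight consecutive levels cover every (direction, parity) pair once.
  for (int i = 0; i < 2 * Nd; i++) {
    int mu = (i / 2) % Nd;  // direction masked at smearing level i
    int cb = (i % 2);       // checkerboard masked at smearing level i
    std::printf("level %d : mu=%d cb=%d\n", i, mu, cb);
  }
  return 0;
}
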
|
||||
|
||||
virtual void smeared_force(GaugeField &SigmaTilde)
|
||||
|
@ -418,32 +418,32 @@ static void LieAlgebraProject(LatticeAlgebraMatrix &out,const LatticeMatrix &in,
|
||||
int hNNm1= NNm1/2;
|
||||
RealD sqrt_2 = sqrt(2.0);
|
||||
Complex ci(0.0,1.0);
|
||||
for(int su2Index=0;su2Index<hNNm1;su2Index++){
|
||||
int i1, i2;
|
||||
su2SubGroupIndex(i1, i2, su2Index);
|
||||
int ax = su2Index*2;
|
||||
int ay = su2Index*2+1;
|
||||
accelerator_for(ss,grid->oSites(),1,{
|
||||
|
||||
const int nsimd= Matrix::Nsimd();
|
||||
accelerator_for(ss,grid->oSites(),nsimd,{
|
||||
for(int su2Index=0;su2Index<hNNm1;su2Index++){
|
||||
int i1, i2;
|
||||
su2SubGroupIndex(i1, i2, su2Index);
|
||||
int ax = su2Index*2;
|
||||
int ay = su2Index*2+1;
|
||||
// in is traceless ANTI-hermitian whereas Grid generators are Hermitian.
|
||||
// trace( Ta x Ci in)
|
||||
// Bet I need to move to real part with mult by -i
|
||||
out_v[ss]()()(ax,b) = 0.5*(real(in_v[ss]()()(i2,i1)) - real(in_v[ss]()()(i1,i2)));
|
||||
out_v[ss]()()(ay,b) = 0.5*(imag(in_v[ss]()()(i1,i2)) + imag(in_v[ss]()()(i2,i1)));
|
||||
});
|
||||
}
|
||||
for(int diagIndex=0;diagIndex<N-1;diagIndex++){
|
||||
int k = diagIndex + 1; // diagIndex starts from 0
|
||||
int a = NNm1+diagIndex;
|
||||
RealD scale = 1.0/sqrt(2.0*k*(k+1));
|
||||
accelerator_for(ss,grid->oSites(),vComplex::Nsimd(),{
|
||||
auto tmp = in_v[ss]()()(0,0);
|
||||
coalescedWrite(out_v[ss]()()(ax,b),0.5*(real(in_v(ss)()()(i2,i1)) - real(in_v(ss)()()(i1,i2))));
|
||||
coalescedWrite(out_v[ss]()()(ay,b),0.5*(imag(in_v(ss)()()(i1,i2)) + imag(in_v(ss)()()(i2,i1))));
|
||||
}
|
||||
for(int diagIndex=0;diagIndex<N-1;diagIndex++){
|
||||
int k = diagIndex + 1; // diagIndex starts from 0
|
||||
int a = NNm1+diagIndex;
|
||||
RealD scale = 1.0/sqrt(2.0*k*(k+1));
|
||||
auto tmp = in_v(ss)()()(0,0);
|
||||
for(int i=1;i<k;i++){
|
||||
tmp=tmp+in_v[ss]()()(i,i);
|
||||
tmp=tmp+in_v(ss)()()(i,i);
|
||||
}
|
||||
tmp = tmp - in_v[ss]()()(k,k)*k;
|
||||
out_v[ss]()()(a,b) =imag(tmp) * scale;
|
||||
});
|
||||
}
|
||||
tmp = tmp - in_v(ss)()()(k,k)*k;
|
||||
coalescedWrite(out_v[ss]()()(a,b),imag(tmp) * scale);
|
||||
}
|
||||
});
|
||||
}
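
The rewritten projection fuses the su(2)-pair and Cartan parts into a single accelerator_for over sites, with coalescedWrite for SIMT-safe stores. In the normalisation suggested by the code (in is traceless anti-Hermitian, the generators are Hermitian, NNm1 = N(N-1)), the coefficients it evaluates are, as a sketch,

\begin{aligned}
\text{out}_{2s}(b)         &= \tfrac{1}{2}\big(\operatorname{Re}\,\text{in}_{i_2 i_1} - \operatorname{Re}\,\text{in}_{i_1 i_2}\big),\\
\text{out}_{2s+1}(b)       &= \tfrac{1}{2}\big(\operatorname{Im}\,\text{in}_{i_1 i_2} + \operatorname{Im}\,\text{in}_{i_2 i_1}\big),\\
\text{out}_{N(N-1)+k-1}(b) &= \frac{1}{\sqrt{2k(k+1)}}\,\operatorname{Im}\!\Big(\sum_{i=0}^{k-1}\text{in}_{ii} - k\,\text{in}_{kk}\Big), \qquad k = 1,\dots,N-1,
\end{aligned}

where (i_1, i_2) is the row pair returned by su2SubGroupIndex for su(2) subgroup s.
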
|
||||
|
||||
|
||||
|
@ -118,7 +118,7 @@ static void generatorDiagonal(int diagIndex, iGroupMatrix<cplx> &ta) {
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Map a su2 subgroup number to the pair of rows that are non zero
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
static void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) {
|
||||
static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) {
|
||||
assert((su2_index >= 0) && (su2_index < (ncolour * (ncolour - 1)) / 2));
|
||||
|
||||
int spare = su2_index;
|
||||
|
@ -99,6 +99,8 @@ using std::log;
using std::exp;
using std::sin;
using std::cos;
using std::asin;
using std::acos;

accelerator_inline RealF conjugate(const RealF & r){ return r; }

@ -460,3 +460,9 @@ void vprefetch(const iMatrix<v, N> &vv) {
NAMESPACE_END(Grid);

#ifdef GRID_SYCL
template<class vec> struct sycl::is_device_copyable<Grid::iScalar<vec> > : public std::true_type {};
template<class vec,int N> struct sycl::is_device_copyable<Grid::iVector<vec,N> > : public std::true_type {};
template<class vec,int N> struct sycl::is_device_copyable<Grid::iMatrix<vec,N> > : public std::true_type {};
#endif
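
These specialisations mark Grid's tensor wrappers as device-copyable for SYCL, which is what lets kernel lambdas capture small iScalar/iVector/iMatrix objects by value; without the trait, DPC++ rejects the capture at device-compile time. A minimal sketch of the kind of capture this enables (the queue, kernel and raw buffer are assumptions for illustration, not Grid's own launch path):

#include <sycl/sycl.hpp>
#include <Grid/Grid.h>

// Scale a raw device buffer by a site scalar captured by value.
void scale_sites(sycl::queue &q, Grid::iScalar<float> s, float *data, size_t n)
{
  // Without sycl::is_device_copyable<Grid::iScalar<float>>, the by-value
  // capture of 's' below would be rejected when compiling for the device.
  q.parallel_for(sycl::range<1>(n), [=](sycl::id<1> i) {
    data[i] *= s._internal;
  }).wait();
}
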
|
||||
|
@ -539,12 +539,6 @@ inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize
|
||||
|
||||
#endif
|
||||
|
||||
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
|
||||
{
|
||||
acceleratorCopyDeviceToDeviceAsynch(from,to,bytes);
|
||||
acceleratorCopySynchronise();
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////
|
||||
// CPU Target - No accelerator just thread instead
|
||||
//////////////////////////////////////////////
|
||||
@ -553,7 +547,6 @@ inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
|
||||
|
||||
#undef GRID_SIMT
|
||||
|
||||
|
||||
inline void acceleratorMem(void)
|
||||
{
|
||||
/*
|
||||
@ -656,6 +649,12 @@ accelerator_inline void acceleratorFence(void)
|
||||
return;
|
||||
}
|
||||
|
||||
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
|
||||
{
|
||||
acceleratorCopyDeviceToDeviceAsynch(from,to,bytes);
|
||||
acceleratorCopySynchronise();
|
||||
}
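
Moving the blocking acceleratorCopyDeviceToDevice here makes the asynchronous-copy-plus-synchronise composition common to every target, including the CPU fallback. A small usage sketch, assuming Grid's device allocator calls and an arbitrary buffer size:

#include <Grid/Grid.h>
using namespace Grid;

void duplicate_device_buffer(size_t bytes)
{
  void *src = acceleratorAllocDevice(bytes);
  void *dst = acceleratorAllocDevice(bytes);
  acceleratorCopyDeviceToDevice(src, dst, bytes); // async copy, then CopySynchronise
  acceleratorFreeDevice(src);
  acceleratorFreeDevice(dst);
}
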
|
||||
|
||||
template<class T> void acceleratorPut(T& dev,T&host)
|
||||
{
|
||||
acceleratorCopyToDevice(&host,&dev,sizeof(T));
|
||||
|
238 HMC/ComputeWilsonFlow.cc Normal file
@ -0,0 +1,238 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: HMC/ComputeWilsonFlow.cc
|
||||
|
||||
Copyright (C) 2017
|
||||
|
||||
Author: Guido Cossu <guido.cossu@ed.ac.uk>
|
||||
Author: Shuhei Yamamoto <syamamoto@bnl.gov>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/Grid.h>
|
||||
#include <string>
|
||||
|
||||
namespace Grid{
|
||||
struct WFParameters: Serializable {
|
||||
GRID_SERIALIZABLE_CLASS_MEMBERS(WFParameters,
|
||||
int, steps,
|
||||
double, step_size,
|
||||
int, meas_interval,
|
||||
double, maxTau, // for the adaptive algorithm
|
||||
int, meas_interval_density,
|
||||
std::string, path);
|
||||
|
||||
|
||||
template <class ReaderClass >
|
||||
WFParameters(Reader<ReaderClass>& Reader){
|
||||
read(Reader, "WilsonFlow", *this);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
struct ConfParameters: Serializable {
|
||||
GRID_SERIALIZABLE_CLASS_MEMBERS(ConfParameters,
|
||||
std::string, conf_path,
|
||||
std::string, conf_prefix,
|
||||
std::string, conf_smr_prefix,
|
||||
std::string, rng_prefix,
|
||||
int, StartConfiguration,
|
||||
int, EndConfiguration,
|
||||
int, Skip);
|
||||
|
||||
template <class ReaderClass >
|
||||
ConfParameters(Reader<ReaderClass>& Reader){
|
||||
read(Reader, "Configurations", *this);
|
||||
}
|
||||
|
||||
};
|
||||
}
|
||||
|
||||
template <class T> void writeFile(T& in, std::string const fname){
|
||||
#ifdef HAVE_LIME
|
||||
// Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
|
||||
std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl;
|
||||
Grid::emptyUserRecord record;
|
||||
Grid::ScidacWriter WR(in.Grid()->IsBoss());
|
||||
WR.open(fname);
|
||||
WR.writeScidacFieldRecord(in,record,0);
|
||||
WR.close();
|
||||
#endif
|
||||
// What is the appropriate way to throw error?
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
using namespace Grid;
|
||||
|
||||
Grid_init(&argc, &argv);
|
||||
GridLogLayout();
|
||||
|
||||
auto latt_size = GridDefaultLatt();
|
||||
auto simd_layout = GridDefaultSimd(Nd, vComplex::Nsimd());
|
||||
auto mpi_layout = GridDefaultMpi();
|
||||
GridCartesian Grid(latt_size, simd_layout, mpi_layout);
|
||||
|
||||
std::vector<int> seeds({1, 2, 3, 4, 5});
|
||||
GridSerialRNG sRNG;
|
||||
GridParallelRNG pRNG(&Grid);
|
||||
pRNG.SeedFixedIntegers(seeds);
|
||||
|
||||
LatticeGaugeField Umu(&Grid), Uflow(&Grid);
|
||||
|
||||
typedef Grid::XmlReader Serialiser;
|
||||
Serialiser Reader("input.xml", false, "root");
|
||||
WFParameters WFPar(Reader);
|
||||
ConfParameters CPar(Reader);
|
||||
CheckpointerParameters CPPar(CPar.conf_path+CPar.conf_prefix, CPar.conf_path+CPar.conf_smr_prefix, CPar.conf_path+CPar.rng_prefix);
|
||||
NerscHmcCheckpointer<PeriodicGimplR> CPNersc(CPPar);
|
||||
|
||||
for (int conf = CPar.StartConfiguration; conf <= CPar.EndConfiguration; conf+= CPar.Skip){
|
||||
|
||||
CPNersc.CheckpointRestore(conf, Umu, sRNG, pRNG);
|
||||
|
||||
std::cout << std::setprecision(15);
|
||||
std::cout << GridLogMessage << "Initial plaquette: "<< WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu) << std::endl;
|
||||
|
||||
std::string file_pre = WFPar.path;
|
||||
std::string file_post = CPar.conf_prefix + "." + std::to_string(conf);
|
||||
|
||||
WilsonFlow<PeriodicGimplR> WF(WFPar.step_size,WFPar.steps,WFPar.meas_interval);
|
||||
WF.addMeasurement(WFPar.meas_interval_density, [&file_pre,&file_post,&conf](int step, RealD t, const typename PeriodicGimplR::GaugeField &U){
|
||||
|
||||
typedef typename PeriodicGimplR::GaugeLinkField GaugeMat;
|
||||
typedef typename PeriodicGimplR::ComplexField ComplexField;
|
||||
|
||||
assert(Nd == 4);
|
||||
|
||||
// NOTE:
|
||||
// Ideally, turn the following into methods of the appropriate class
|
||||
///////////// Compute Energy Density via Clover Leaf /////////////////////////////////////////////////
|
||||
///// Taken from qcd/smearing/WilsonFlow.h
|
||||
// For plq, use static sitePlaquette from class WilsonLoops in Grid/qcd/utils/WilsonLoops.h and divide it by #faces=(1.0 * Nd * (Nd - 1)) / 2.0, ncol=3
|
||||
//E = 1/2 tr( F_munu F_munu )
|
||||
//However as F_numu = -F_munu, only need to sum the trace of the squares of the following 6 field strengths:
|
||||
//F_01 F_02 F_03 F_12 F_13 F_23
|
||||
GaugeMat F(U.Grid());
|
||||
//LatticeComplexD R(U.Grid());
|
||||
ComplexField R(U.Grid());
|
||||
R = Zero();
|
||||
|
||||
for(int mu=0;mu<3;mu++){
|
||||
for(int nu=mu+1;nu<4;nu++){
|
||||
WilsonLoops<PeriodicGimplR>::FieldStrength(F, U, mu, nu);
|
||||
R = R + trace(F*F);
|
||||
}
|
||||
}
|
||||
R = (-1.0) * R;
|
||||
|
||||
//// Taken from qcd/utils/WilsonLoops.h
|
||||
|
||||
// Bx = -iF(y,z), By = -iF(z,y), Bz = -iF(x,y)
|
||||
GaugeMat Bx(U.Grid()), By(U.Grid()), Bz(U.Grid());
|
||||
WilsonLoops<PeriodicGimplR>::FieldStrength(Bx, U, Ydir, Zdir);
|
||||
WilsonLoops<PeriodicGimplR>::FieldStrength(By, U, Zdir, Xdir);
|
||||
WilsonLoops<PeriodicGimplR>::FieldStrength(Bz, U, Xdir, Ydir);
|
||||
|
||||
// Ex = -iF(t,x), Ey = -iF(t,y), Ez = -iF(t,z)
|
||||
GaugeMat Ex(U.Grid()), Ey(U.Grid()), Ez(U.Grid());
|
||||
WilsonLoops<PeriodicGimplR>::FieldStrength(Ex, U, Tdir, Xdir);
|
||||
WilsonLoops<PeriodicGimplR>::FieldStrength(Ey, U, Tdir, Ydir);
|
||||
WilsonLoops<PeriodicGimplR>::FieldStrength(Ez, U, Tdir, Zdir);
|
||||
|
||||
double coeff = 8.0/(32.0*M_PI*M_PI);
|
||||
ComplexField qfield = coeff*trace(Bx*Ex + By*Ey + Bz*Ez);
|
||||
//ComplexField qfield Plq(U.Grid());
|
||||
//WilsonLoops<PeriodicGimplR>::sitePlaquette(Plq, U);
|
||||
//double coeff = 2.0 / (1.0 * Nd * (Nd - 1)) / 3.0;
|
||||
//Plq = coeff * Plq;
|
||||
|
||||
int tau = std::round(t);
|
||||
std::string efile = file_pre + "E_dnsty_" + std::to_string(tau) + "_" + file_post;
|
||||
writeFile(R,efile);
|
||||
std::string tfile = file_pre + "Top_dnsty_" + std::to_string(tau) + "_" + file_post;
|
||||
writeFile(qfield,tfile);
|
||||
|
||||
RealD E = real(sum(R))/ RealD(U.Grid()->gSites());
|
||||
RealD T = real( sum(qfield) );
|
||||
Coordinate scoor; for (int mu=0; mu < Nd; mu++) scoor[mu] = 0;
|
||||
RealD E0 = real(peekSite(R,scoor));
|
||||
RealD T0 = real(peekSite(qfield,scoor));
|
||||
std::cout << GridLogMessage << "[WilsonFlow] Saved energy density (clover) & topo. charge density: " << conf << " " << step << " " << tau << " "
|
||||
<< "(E_avg,T_sum) " << E << " " << T << " (E, T at origin) " << E0 << " " << T0 << std::endl;
|
||||
|
||||
});
|
||||
|
||||
int t=WFPar.maxTau;
|
||||
WF.smear(Uflow, Umu);
|
||||
|
||||
RealD WFlow_plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(Uflow);
|
||||
RealD WFlow_TC = WilsonLoops<PeriodicGimplR>::TopologicalCharge(Uflow);
|
||||
RealD WFlow_T0 = WF.energyDensityPlaquette(t,Uflow); // t
|
||||
RealD WFlow_EC = WF.energyDensityCloverleaf(t,Uflow);
|
||||
std::cout << GridLogMessage << "Plaquette "<< conf << " " << WFlow_plaq << std::endl;
|
||||
std::cout << GridLogMessage << "T0 "<< conf << " " << WFlow_T0 << std::endl;
|
||||
std::cout << GridLogMessage << "TC0 "<< conf << " " << WFlow_EC << std::endl;
|
||||
std::cout << GridLogMessage << "TopologicalCharge "<< conf << " " << WFlow_TC << std::endl;
|
||||
|
||||
std::cout<< GridLogMessage << " Admissibility check:\n";
|
||||
const double sp_adm = 0.067; // admissible threshold
|
||||
const double pl_adm = 1.0-sp_adm/Nc;
|
||||
std::cout << GridLogMessage << " (pl_adm =" << pl_adm << ")\n";
|
||||
|
||||
// Need min and reduce min for this function
|
||||
//double sp_max = NC_*(1.0-stpl.plaq_min(U,pl_adm));
|
||||
double sp_ave = Nc*(1.0-WFlow_plaq);
|
||||
|
||||
//std::cout<< GridLogMessage << " sp_max = " << sp_max <<"\n";
|
||||
std::cout<< GridLogMessage << " sp_ave = " << sp_ave <<"\n";
|
||||
std::cout<< GridLogMessage << " (sp_admissible = "<< sp_adm <<")\n";
|
||||
//std::cout<< GridLogMessage << " sp_admissible - sp_max = "<<sp_adm-sp_max <<"\n";
|
||||
std::cout<< GridLogMessage << " sp_admissible - sp_ave = "<<sp_adm-sp_ave <<"\n";
|
||||
}
|
||||
Grid_finalize();
|
||||
} // main
|
||||
|
||||
|
||||
/*
|
||||
Input file example
|
||||
|
||||
|
||||
JSON
|
||||
|
||||
{
|
||||
"WilsonFlow":{
|
||||
"steps": 200,
|
||||
"step_size": 0.01,
|
||||
"meas_interval": 50,
|
||||
"maxTau": 2.0
|
||||
},
|
||||
"Configurations":{
|
||||
"conf_prefix": "ckpoint_lat",
|
||||
"rng_prefix": "ckpoint_rng",
|
||||
"StartConfiguration": 3000,
|
||||
"EndConfiguration": 3000,
|
||||
"Skip": 5
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
*/
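
The measurement callback registered above dumps two site densities per recorded flow time, then prints their volume average and sum. In continuum-like notation, and up to the conventions of WilsonLoops::FieldStrength, the quantities it accumulates are (a sketch matching the coefficients in the code):

E(x) \;=\; -\sum_{\mu<\nu} \mathrm{tr}\!\left[F_{\mu\nu}(x)\,F_{\mu\nu}(x)\right],
\qquad
q(x) \;=\; \frac{1}{32\pi^{2}}\,\varepsilon_{\mu\nu\rho\sigma}\,
           \mathrm{tr}\!\left[F_{\mu\nu}(x)F_{\rho\sigma}(x)\right]
      \;=\; \frac{8}{32\pi^{2}}\,
           \mathrm{tr}\!\left[B_{x}E_{x}+B_{y}E_{y}+B_{z}E_{z}\right],

with B_i and E_i the clover-leaf magnetic and electric field strengths built in the lambda; the log line reports the volume average of E(x), the lattice sum of q(x), and both densities at the origin.
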
|
@ -58,7 +58,7 @@ int main(int argc, char **argv) {
|
||||
HMCparameters HMCparams;
|
||||
HMCparams.StartTrajectory = 0;
|
||||
HMCparams.Trajectories = 200;
|
||||
HMCparams.NoMetropolisUntil= 20;
|
||||
HMCparams.NoMetropolisUntil= 0;
|
||||
// "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
|
||||
HMCparams.StartingType =std::string("ColdStart");
|
||||
HMCparams.MD = MD;
|
||||
@ -70,7 +70,7 @@ int main(int argc, char **argv) {
|
||||
CheckpointerParameters CPparams;
|
||||
CPparams.config_prefix = "ckpoint_EODWF_lat";
|
||||
CPparams.rng_prefix = "ckpoint_EODWF_rng";
|
||||
CPparams.saveInterval = 10;
|
||||
CPparams.saveInterval = 1;
|
||||
CPparams.format = "IEEE64BIG";
|
||||
TheHMC.Resources.LoadNerscCheckpointer(CPparams);
|
||||
|
||||
@ -186,6 +186,8 @@ int main(int argc, char **argv) {
|
||||
|
||||
/////////////////////////////////////////////////////////////
|
||||
// HMC parameters are serialisable
|
||||
TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file
|
||||
TheHMC.initializeGaugeFieldAndRNGs(U);
|
||||
|
||||
std::cout << GridLogMessage << " Running the HMC "<< std::endl;
|
||||
TheHMC.Run(); // no smearing
|
||||
|
@ -30,11 +30,13 @@ directory
|
||||
#include <string>
|
||||
|
||||
template <class T> void readFile(T& out, std::string const fname){
|
||||
#ifdef HAVE_LIME
|
||||
Grid::emptyUserRecord record;
|
||||
Grid::ScidacReader RD;
|
||||
RD.open(fname);
|
||||
RD.readScidacFieldRecord(out,record);
|
||||
RD.close();
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
@ -31,11 +31,13 @@ directory
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
template <class T> void writeFile(T& out, std::string const fname){
|
||||
#ifdef HAVE_LIME
|
||||
emptyUserRecord record;
|
||||
ScidacWriter WR(out.Grid()->IsBoss());
|
||||
WR.open(fname);
|
||||
WR.writeScidacFieldRecord(out,record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
|
||||
WR.close();
|
||||
#endif
|
||||
}
|
||||
NAMESPACE_END(Grid);
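
Both helpers compile to no-ops when Grid is built without LIME, so nothing signals a skipped write at run time. A usage sketch for the writeFile helper above (the field, grid and file name are placeholders):

#include <Grid/Grid.h>
using namespace Grid;

// Dump a complex site density with the Scidac helper defined above.
void dump_density(LatticeComplexD &density, int conf)
{
  std::string fname = "E_dnsty_" + std::to_string(conf) + ".scidac";
  writeFile(density, fname);  // ScidacWriter, lexicographic site ordering
  std::cout << GridLogMessage << "wrote " << fname << std::endl;
}
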
|
||||
int main(int argc, char **argv) {
|
||||
|
@ -261,23 +261,25 @@ public:
|
||||
fprintf(FP,"\n\n");
|
||||
};
|
||||
|
||||
|
||||
template<class CComplex>
|
||||
static void BLAS(void)
|
||||
{
|
||||
//int nbasis, int nrhs, int coarseVol
|
||||
int basis[] = { 16,32,64 };
|
||||
int rhs[] = { 8,16,32 };
|
||||
int vol = 4*4*4*4;
|
||||
int rhs[] = { 8,12,16 };
|
||||
int vol = 8*8*8*8;
|
||||
int blk = 4*4*4*4;
|
||||
|
||||
GridBLAS blas;
|
||||
|
||||
|
||||
int fpbits = sizeof(CComplex)*4;
|
||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||
std::cout<<GridLogMessage << "= batched GEMM (double precision) "<<std::endl;
|
||||
std::cout<<GridLogMessage << "= batched GEMM fp"<<fpbits<<std::endl;
|
||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||
std::cout<<GridLogMessage << " M "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (coarse mrhs)"<<std::endl;
|
||||
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
|
||||
|
||||
fprintf(FP,"GEMM\n\n M, N, K, BATCH, GF/s per rank\n");
|
||||
fprintf(FP,"GEMM\n\n M, N, K, BATCH, GF/s per rank fp%d\n",fpbits);
|
||||
|
||||
for(int b=0;b<3;b++){
|
||||
for(int r=0;r<3;r++){
|
||||
@ -285,7 +287,7 @@ public:
|
||||
int N=rhs[r];
|
||||
int K=basis[b];
|
||||
int BATCH=vol;
|
||||
double p=blas.benchmark(M,N,K,BATCH);
|
||||
double p=blas.benchmark<CComplex>(M,N,K,BATCH);
|
||||
|
||||
fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
|
||||
|
||||
@ -299,9 +301,9 @@ public:
|
||||
for(int r=0;r<3;r++){
|
||||
int M=basis[b];
|
||||
int N=rhs[r];
|
||||
int K=vol;
|
||||
int K=blk;
|
||||
int BATCH=vol;
|
||||
double p=blas.benchmark(M,N,K,BATCH);
|
||||
double p=blas.benchmark<CComplex>(M,N,K,BATCH);
|
||||
|
||||
fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
|
||||
std::cout<<GridLogMessage<<std::setprecision(3)
|
||||
@ -313,10 +315,10 @@ public:
|
||||
for(int b=0;b<3;b++){
|
||||
for(int r=0;r<3;r++){
|
||||
int M=rhs[r];
|
||||
int N=vol;
|
||||
int N=blk;
|
||||
int K=basis[b];
|
||||
int BATCH=vol;
|
||||
double p=blas.benchmark(M,N,K,BATCH);
|
||||
double p=blas.benchmark<CComplex>(M,N,K,BATCH);
|
||||
|
||||
fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
|
||||
std::cout<<GridLogMessage<<std::setprecision(3)
|
||||
@ -867,6 +869,7 @@ int main (int argc, char ** argv)
|
||||
int do_memory=1;
|
||||
int do_comms =1;
|
||||
int do_blas =1;
|
||||
int do_dslash=1;
|
||||
|
||||
int sel=4;
|
||||
std::vector<int> L_list({8,12,16,24,32});
|
||||
@ -877,6 +880,7 @@ int main (int argc, char ** argv)
|
||||
std::vector<double> staggered;
|
||||
|
||||
int Ls=1;
|
||||
if (do_dslash){
|
||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||
std::cout<<GridLogMessage << " Clover dslash 4D vectorised (temporarily Wilson)" <<std::endl;
|
||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||
@ -901,6 +905,7 @@ int main (int argc, char ** argv)
|
||||
staggered.push_back(result);
|
||||
}
|
||||
|
||||
|
||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||
std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
|
||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||
@ -909,8 +914,33 @@ int main (int argc, char ** argv)
|
||||
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<std::endl;
|
||||
}
|
||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||
}
|
||||
|
||||
int NN=NN_global;
|
||||
if(do_dslash){
|
||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||
std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
|
||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||
std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered (GF/s per node)" <<std::endl;
|
||||
fprintf(FP,"Per node summary table\n");
|
||||
fprintf(FP,"\n");
|
||||
fprintf(FP,"L , Wilson, DWF4, Staggered, GF/s per node\n");
|
||||
fprintf(FP,"\n");
|
||||
for(int l=0;l<L_list.size();l++){
|
||||
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<std::endl;
|
||||
fprintf(FP,"%d , %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.);
|
||||
}
|
||||
fprintf(FP,"\n");
|
||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||
|
||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||
std::cout<<GridLogMessage << " Comparison point result: " << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl;
|
||||
std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl;
|
||||
std::cout<<std::setprecision(3);
|
||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||
}
|
||||
|
||||
|
||||
if ( do_memory ) {
|
||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||
std::cout<<GridLogMessage << " Memory benchmark " <<std::endl;
|
||||
@ -918,15 +948,6 @@ int main (int argc, char ** argv)
|
||||
Benchmark::Memory();
|
||||
}
|
||||
|
||||
if ( do_blas ) {
|
||||
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
|
||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||
std::cout<<GridLogMessage << " Batched BLAS benchmark " <<std::endl;
|
||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||
Benchmark::BLAS();
|
||||
#endif
|
||||
}
|
||||
|
||||
if ( do_su4 ) {
|
||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||
std::cout<<GridLogMessage << " SU(4) benchmark " <<std::endl;
|
||||
@ -941,28 +962,14 @@ int main (int argc, char ** argv)
|
||||
Benchmark::Comms();
|
||||
}
|
||||
|
||||
if ( do_blas ) {
|
||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||
std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
|
||||
std::cout<<GridLogMessage << " Batched BLAS benchmark " <<std::endl;
|
||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||
std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered (GF/s per node)" <<std::endl;
|
||||
fprintf(FP,"Per node summary table\n");
|
||||
fprintf(FP,"\n");
|
||||
fprintf(FP,"L , Wilson, DWF4, Staggered, GF/s per node\n");
|
||||
fprintf(FP,"\n");
|
||||
for(int l=0;l<L_list.size();l++){
|
||||
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<std::endl;
|
||||
fprintf(FP,"%d , %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.);
|
||||
}
|
||||
fprintf(FP,"\n");
|
||||
|
||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||
|
||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||
std::cout<<GridLogMessage << " Comparison point result: " << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl;
|
||||
std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl;
|
||||
std::cout<<std::setprecision(3);
|
||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||
|
||||
Benchmark::BLAS<ComplexD>();
|
||||
Benchmark::BLAS<ComplexF>();
|
||||
}
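
Benchmark::BLAS is now templated on the scalar type, so the driver reports batched-GEMM rates for both fp64 and fp32 complex. The Gflop/s columns in the tables appear to use the conventional count of eight real operations per complex multiply-add, i.e. as a sketch

\mathrm{Gflop/s} \;\approx\; \frac{8\,M\,N\,K \times \mathrm{BATCH}}{10^{9}\; t\,[\mathrm{s}]},

so, for example, the M = N = K = 64, BATCH = 256 entry in the Frontier CSV below corresponds to roughly 5.4\times 10^{8} flops per batched call.
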
|
||||
|
||||
Grid_finalize();
|
||||
fclose(FP);
|
||||
}
|
||||
|
@ -1,16 +1,18 @@

export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel -fsycl -fno-exceptions -fsycl-targets=spir64_gen -Xs -device -Xs pvc "
../../configure \
--enable-simd=GPU \
--enable-gen-simd-width=64 \
--enable-comms=mpi-auto \
--enable-debug \
--disable-gparity \
--disable-fermion-reps \
--with-lime=$CLIME \
--enable-shm=nvlink \
--enable-accelerator=sycl \
--enable-accelerator-aware-mpi=yes\
--enable-unified=no \
MPICXX=mpicxx \
CXX=icpx \
LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl" \
CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel"
CXX=icpx

23 systems/Aurora/config-command-leak Normal file
@ -0,0 +1,23 @@
|
||||
source ~/spack/share/spack/setup-env.sh
|
||||
spack load c-lime
|
||||
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
|
||||
export TCMALLOC=`spack find --paths gperftools | grep ^gperftools | awk '{print $2}' `
|
||||
export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH
|
||||
|
||||
../../configure \
|
||||
--enable-debug \
|
||||
--enable-simd=GPU \
|
||||
--enable-gen-simd-width=64 \
|
||||
--enable-comms=mpi-auto \
|
||||
--disable-gparity \
|
||||
--disable-fermion-reps \
|
||||
--with-lime=$CLIME \
|
||||
--enable-shm=nvlink \
|
||||
--enable-accelerator=sycl \
|
||||
--enable-accelerator-aware-mpi=yes\
|
||||
--enable-unified=no \
|
||||
MPICXX=mpicxx \
|
||||
CXX=icpx \
|
||||
LDFLAGS="-fiopenmp -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -Xarch_host -fsanitize=leak -fsycl-device-code-split=per_kernel" \
|
||||
CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel -Xarch_host -fsycl -fsanitize=leak "
|
||||
|
22 systems/Aurora/config-command-sanitize Normal file
@ -0,0 +1,22 @@
|
||||
# -fsycl-targets=spir64_gen -Xs\" -device pvc \"
|
||||
# -fsycl-targets=intel_gpu_pvc_vg,intel_gpu_pvc
|
||||
# -fsycl-targets=intel_gpu_pvc
|
||||
|
||||
unset DEVICE
|
||||
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -Xarch_host -fsanitize=address"
|
||||
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel -fsycl -fno-exceptions -Xarch_host -fsanitize=address -fsycl-targets=spir64_gen -Xs -device -Xs pvc "
|
||||
../../configure \
|
||||
--enable-simd=GPU \
|
||||
--enable-gen-simd-width=64 \
|
||||
--enable-comms=mpi-auto \
|
||||
--enable-debug \
|
||||
--disable-gparity \
|
||||
--disable-fermion-reps \
|
||||
--with-lime=$CLIME \
|
||||
--enable-shm=nvlink \
|
||||
--enable-accelerator=sycl \
|
||||
--enable-accelerator-aware-mpi=yes\
|
||||
--enable-unified=no \
|
||||
MPICXX=mpicxx \
|
||||
CXX=icpx
|
||||
|
@ -1,14 +1,22 @@
|
||||
source ~/spack/share/spack/setup-env.sh
|
||||
spack load c-lime
|
||||
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
|
||||
#spack load libefence
|
||||
#export EFENCE=`spack find --paths libefence | grep ^libefence | awk '{print $2}' `
|
||||
#export LD_LIBRARY_PATH=${EFENCE}/lib:$LD_LIBRARY_PATH
|
||||
#spack load gperftools
|
||||
export TCMALLOC=/home/paboyle/gperftools/install
|
||||
export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH
|
||||
export INTELGT_AUTO_ATTACH_DISABLE=1
|
||||
|
||||
#export ONEAPI_DEVICE_SELECTOR=level_zero:0.0
|
||||
|
||||
module load oneapi/release/2023.12.15.001
|
||||
|
||||
#module load oneapi/release/2023.12.15.001
|
||||
#module use /soft/modulefiles
|
||||
#module load intel_compute_runtime/release/agama-devel-682.22
|
||||
|
||||
export FI_CXI_DEFAULT_CQ_SIZE=131072
|
||||
export FI_CXI_CQ_FILL_PERCENT=20
|
||||
|
||||
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
|
||||
#export FI_CXI_DEFAULT_CQ_SIZE=131072
|
||||
#export FI_CXI_CQ_FILL_PERCENT=20
|
||||
#export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
|
||||
#export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode"
|
||||
|
||||
#
|
||||
@ -16,13 +24,17 @@ export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
|
||||
# -ftarget-register-alloc-mode=pvc:small
|
||||
# -ftarget-register-alloc-mode=pvc:large
|
||||
# -ftarget-register-alloc-mode=pvc:auto
|
||||
#
|
||||
#export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
|
||||
|
||||
export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
|
||||
export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
|
||||
export http_proxy=http://proxy.alcf.anl.gov:3128
|
||||
export https_proxy=http://proxy.alcf.anl.gov:3128
|
||||
#export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
|
||||
git config --global http.proxy http://proxy.alcf.anl.gov:3128
|
||||
|
||||
#source ~/spack/share/spack/setup-env.sh
|
||||
#spack load gperftools
|
||||
#export TCMALLOC=`spack find --paths gperftools | grep ^gperftools | awk '{print $2}' `
|
||||
#export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH
|
||||
|
||||
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
|
||||
|
76 systems/Frontier/benchmarks/Benchmark_usqcd.csv Normal file
@ -0,0 +1,76 @@
|
||||
Memory Bandwidth
|
||||
|
||||
Bytes, GB/s per node
|
||||
6291456, 379.297050
|
||||
100663296, 3754.674992
|
||||
509607936, 6521.472413
|
||||
1610612736, 8513.456479
|
||||
3932160000, 9018.901766
|
||||
|
||||
|
||||
GEMM
|
||||
|
||||
M, N, K, BATCH, GF/s per rank
|
||||
16, 8, 16, 256, 0.564958
|
||||
16, 16, 16, 256, 243.148058
|
||||
16, 32, 16, 256, 440.346877
|
||||
32, 8, 32, 256, 439.194136
|
||||
32, 16, 32, 256, 847.334141
|
||||
32, 32, 32, 256, 1430.892623
|
||||
64, 8, 64, 256, 1242.756741
|
||||
64, 16, 64, 256, 2196.689493
|
||||
64, 32, 64, 256, 3697.458072
|
||||
16, 8, 256, 256, 899.582627
|
||||
16, 16, 256, 256, 1673.537756
|
||||
16, 32, 256, 256, 2959.597089
|
||||
32, 8, 256, 256, 1558.858630
|
||||
32, 16, 256, 256, 2864.839445
|
||||
32, 32, 256, 256, 4810.671254
|
||||
64, 8, 256, 256, 2386.092942
|
||||
64, 16, 256, 256, 4451.665937
|
||||
64, 32, 256, 256, 5942.124095
|
||||
8, 256, 16, 256, 799.867271
|
||||
16, 256, 16, 256, 1584.624888
|
||||
32, 256, 16, 256, 1949.422338
|
||||
8, 256, 32, 256, 1389.417474
|
||||
16, 256, 32, 256, 2668.344493
|
||||
32, 256, 32, 256, 3234.162120
|
||||
8, 256, 64, 256, 2150.925128
|
||||
16, 256, 64, 256, 4012.488132
|
||||
32, 256, 64, 256, 5154.785521
|
||||
|
||||
|
||||
|
||||
Communications
|
||||
|
||||
Packet bytes, direction, GB/s per node
|
||||
4718592, 1, 245.026198
|
||||
4718592, 2, 251.180996
|
||||
4718592, 3, 361.110977
|
||||
4718592, 5, 247.898447
|
||||
4718592, 6, 249.867523
|
||||
4718592, 7, 359.033061
|
||||
15925248, 1, 255.030946
|
||||
15925248, 2, 264.453890
|
||||
15925248, 3, 392.949183
|
||||
15925248, 5, 256.040644
|
||||
15925248, 6, 264.681896
|
||||
15925248, 7, 392.102622
|
||||
37748736, 1, 258.823333
|
||||
37748736, 2, 268.181577
|
||||
37748736, 3, 401.478191
|
||||
37748736, 5, 258.995363
|
||||
37748736, 6, 268.206586
|
||||
37748736, 7, 400.397611
|
||||
|
||||
|
||||
Per node summary table
|
||||
|
||||
L , Wilson, DWF4, Staggered, GF/s per node
|
||||
|
||||
8 , 155, 1386, 50
|
||||
12 , 694, 4208, 230
|
||||
16 , 1841, 6675, 609
|
||||
24 , 3934, 8573, 1641
|
||||
32 , 5083, 9771, 3086
|
||||
|
|
702 systems/Frontier/benchmarks/Benchmark_usqcd.log Normal file
@ -0,0 +1,702 @@
|
||||
RANK 1 using GPU 1
|
||||
RANK 5 using GPU 6
|
||||
RANK 0 using GPU 0
|
||||
RANK 2 using GPU 2
|
||||
RANK 3 using GPU 3
|
||||
RANK 6 using GPU 5
|
||||
RANK 7 using GPU 4
|
||||
RANK 4 using GPU 7
|
||||
world_rank 0 has 1 devices
|
||||
AcceleratorHipInit: ========================
|
||||
AcceleratorHipInit: Device Number : 0
|
||||
AcceleratorHipInit: ========================
|
||||
AcceleratorHipInit: Device identifier: AMD Instinct MI250X
|
||||
AcceleratorHipInit: totalGlobalMem: 68702699520
|
||||
AcceleratorHipInit: isMultiGpuBoard: 0
|
||||
AcceleratorHipInit: warpSize: 64
|
||||
AcceleratorHipInit: using default device
|
||||
AcceleratorHipInit: assume user or srun sets ROCR_VISIBLE_DEVICES and numa binding
|
||||
AcceleratorHipInit: Configure options --enable-setdevice=no
|
||||
local rank 0 device 0 bus id: 0000:c1:00.0
|
||||
AcceleratorHipInit: ================================================
|
||||
SharedMemoryMpi: World communicator of size 8
|
||||
SharedMemoryMpi: Node communicator of size 8
|
||||
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 4294967296bytes at 0x7ff651800000 - 7ff7517fffff for comms buffers
|
||||
Setting up IPC
|
||||
|
||||
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||
__|_ | | | | | | | | | | | | _|__
|
||||
__|_ _|__
|
||||
__|_ GGGG RRRR III DDDD _|__
|
||||
__|_ G R R I D D _|__
|
||||
__|_ G R R I D D _|__
|
||||
__|_ G GG RRRR I D D _|__
|
||||
__|_ G G R R I D D _|__
|
||||
__|_ GGGG R R III DDDD _|__
|
||||
__|_ _|__
|
||||
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||
| | | | | | | | | | | | | |
|
||||
|
||||
|
||||
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
Current Grid git commit hash=9a1ad6a5eb29a369d74784e7483c60e578323d76: (HEAD -> develop, origin/develop, origin/HEAD) clean
|
||||
|
||||
Grid : Message : ================================================
|
||||
Grid : Message : MPI is initialised and logging filters activated
|
||||
Grid : Message : ================================================
|
||||
Grid : Message : This rank is running on host frontier01320
|
||||
Grid : Message : Requested 4294967296 byte stencil comms buffers
|
||||
Grid : Message : MemoryManager Cache 54962159616 bytes
|
||||
Grid : Message : MemoryManager::Init() setting up
|
||||
Grid : Message : MemoryManager::Init() cache pool for recent host allocations: SMALL 8 LARGE 2 HUGE 0
|
||||
Grid : Message : MemoryManager::Init() cache pool for recent device allocations: SMALL 16 LARGE 8 Huge 0
|
||||
Grid : Message : MemoryManager::Init() cache pool for recent shared allocations: SMALL 16 LARGE 8 Huge 0
|
||||
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
|
||||
Grid : Message : MemoryManager::Init() Using hipMalloc
|
||||
Grid : Message : 0.293720 s : ==================================================================================
|
||||
Grid : Message : 0.293790 s : = Grid is setup to use 1 threads
|
||||
Grid : Message : 0.293800 s : ==================================================================================
|
||||
Grid : Message : 0.293810 s : Grid Default Decomposition patterns
|
||||
Grid : Message : 0.293810 s : OpenMP threads : 1
|
||||
Grid : Message : 0.293820 s : MPI tasks : 1 2 2 2
|
||||
Grid : Message : 0.293870 s : vReal : 512bits ; 1 2 2 2
|
||||
Grid : Message : 0.293890 s : vRealF : 512bits ; 2 2 2 2
|
||||
Grid : Message : 0.293910 s : vRealD : 512bits ; 1 2 2 2
|
||||
Grid : Message : 0.293920 s : vComplex : 512bits ; 1 1 2 2
|
||||
Grid : Message : 0.293930 s : vComplexF : 512bits ; 1 2 2 2
|
||||
Grid : Message : 0.293960 s : vComplexD : 512bits ; 1 1 2 2
|
||||
Grid : Message : 0.293970 s : ==================================================================================
|
||||
Grid : Message : 0.293980 s : ==================================================================================
|
||||
Grid : Message : 0.293990 s : Clover dslash 4D vectorised (temporarily Wilson)
|
||||
Grid : Message : 0.294000 s : ==================================================================================
|
||||
Grid : Message : 0.301330 s : ==================================================================================
|
||||
Grid : Message : 0.301360 s : Benchmark DWF on 8^4 local volume
|
||||
Grid : Message : 0.301370 s : * Nc : 3
|
||||
Grid : Message : 0.301380 s : * Global volume : 8 16 16 16
|
||||
Grid : Message : 0.301410 s : * Ls : 1
|
||||
Grid : Message : 0.301420 s : * ranks : 8
|
||||
Grid : Message : 0.301430 s : * nodes : 1
|
||||
Grid : Message : 0.301440 s : * ranks/node : 8
|
||||
Grid : Message : 0.301450 s : * ranks geom : 1 2 2 2
|
||||
Grid : Message : 0.301460 s : * Using 1 threads
|
||||
Grid : Message : 0.301470 s : ==================================================================================
|
||||
Grid : Message : 0.345030 s : Initialised RNGs
|
||||
Grid : Message : 0.158302 s : ==================================================================================
|
||||
Grid : Message : 0.158310 s : * Using GENERIC Nc WilsonKernels
|
||||
Grid : Message : 0.158311 s : * Using Overlapped Comms/Compute
|
||||
Grid : Message : 0.158312 s : * SINGLE precision
|
||||
Grid : Message : 0.158313 s : ==================================================================================
|
||||
Grid : Message : 0.240681 s : Deo FlopsPerSite is 1344
|
||||
Grid : Message : 0.240711 s : Deo mflop/s = 154914.0 (130.8) 139367.7-159565.9
|
||||
Grid : Message : 0.240715 s : Deo mflop/s per rank 19364.3
|
||||
Grid : Message : 0.240716 s : Deo mflop/s per node 154914.0
|
||||
Grid : Message : 0.240718 s : ==================================================================================
|
||||
Grid : Message : 0.240719 s : * Using UNROLLED WilsonKernels
|
||||
Grid : Message : 0.240719 s : * Using Overlapped Comms/Compute
|
||||
Grid : Message : 0.240719 s : * SINGLE precision
|
||||
Grid : Message : 0.240719 s : ==================================================================================
|
||||
Grid : Message : 0.315028 s : Deo FlopsPerSite is 1344.0
|
||||
Grid : Message : 0.315033 s : Deo mflop/s = 151459.5 (142.0) 131856.9-157286.4
|
||||
Grid : Message : 0.315036 s : Deo mflop/s per rank 18932.4
|
||||
Grid : Message : 0.315037 s : Deo mflop/s per node 151459.5
|
||||
Grid : Message : 0.315038 s : ==================================================================================
|
||||
Grid : Message : 0.315040 s : 8^4 x 1 Deo Best mflop/s = 154914.0 ; 154914.0 per node
|
||||
Grid : Message : 0.315042 s : 8^4 x 1 Deo Worst mflop/s = 151459.5 ; 151459.5 per node
|
||||
Grid : Message : 0.315043 s : G/S/C ; G/O/C ; G/S/S ; G/O/S
|
||||
Grid : Message : 0.315043 s : 154914.0 ; 151459.5 ;
|
||||
Grid : Message : 0.315044 s : ==================================================================================
|
||||
Grid : Message : 0.316507 s : ==================================================================================
|
||||
Grid : Message : 0.316510 s : Benchmark DWF on 12^4 local volume
|
||||
Grid : Message : 0.316511 s : * Nc : 3
|
||||
Grid : Message : 0.316512 s : * Global volume : 12 24 24 24
|
||||
Grid : Message : 0.316515 s : * Ls : 1
|
||||
Grid : Message : 0.316516 s : * ranks : 8
|
||||
Grid : Message : 0.316517 s : * nodes : 1
|
||||
Grid : Message : 0.316518 s : * ranks/node : 8
|
||||
Grid : Message : 0.316518 s : * ranks geom : 1 2 2 2
|
||||
Grid : Message : 0.316519 s : * Using 1 threads
|
||||
Grid : Message : 0.316520 s : ==================================================================================
|
||||
Grid : Message : 0.327883 s : Initialised RNGs
|
||||
Grid : Message : 0.786395 s : ==================================================================================
|
||||
Grid : Message : 0.786404 s : * Using GENERIC Nc WilsonKernels
|
||||
Grid : Message : 0.786405 s : * Using Overlapped Comms/Compute
|
||||
Grid : Message : 0.786406 s : * SINGLE precision
|
||||
Grid : Message : 0.786406 s : ==================================================================================
|
||||
Grid : Message : 0.871646 s : Deo FlopsPerSite is 1344.0
|
||||
Grid : Message : 0.871659 s : Deo mflop/s = 684982.2 (632.4) 609162.5-714594.5
|
||||
Grid : Message : 0.871663 s : Deo mflop/s per rank 85622.8
|
||||
Grid : Message : 0.871664 s : Deo mflop/s per node 684982.2
|
||||
Grid : Message : 0.871665 s : ==================================================================================
|
||||
Grid : Message : 0.871665 s : * Using UNROLLED WilsonKernels
|
||||
Grid : Message : 0.871665 s : * Using Overlapped Comms/Compute
|
||||
Grid : Message : 0.871665 s : * SINGLE precision
|
||||
Grid : Message : 0.871665 s : ==================================================================================
|
||||
Grid : Message : 0.953697 s : Deo FlopsPerSite is 1344.0
|
||||
Grid : Message : 0.953702 s : Deo mflop/s = 693556.6 (576.5) 663552.0-719204.7
|
||||
Grid : Message : 0.953705 s : Deo mflop/s per rank 86694.6
|
||||
Grid : Message : 0.953706 s : Deo mflop/s per node 693556.6
|
||||
Grid : Message : 0.953707 s : ==================================================================================
|
||||
Grid : Message : 0.953708 s : 12^4 x 1 Deo Best mflop/s = 693556.6 ; 693556.6 per node
|
||||
Grid : Message : 0.953710 s : 12^4 x 1 Deo Worst mflop/s = 684982.2 ; 684982.2 per node
|
||||
Grid : Message : 0.953712 s : G/S/C ; G/O/C ; G/S/S ; G/O/S
|
||||
Grid : Message : 0.953712 s : 684982.2 ; 693556.6 ;
|
||||
Grid : Message : 0.953713 s : ==================================================================================
|
||||
Grid : Message : 0.957609 s : ==================================================================================
|
||||
Grid : Message : 0.957613 s : Benchmark DWF on 16^4 local volume
|
||||
Grid : Message : 0.957614 s : * Nc : 3
|
||||
Grid : Message : 0.957615 s : * Global volume : 16 32 32 32
|
||||
Grid : Message : 0.957620 s : * Ls : 1
|
||||
Grid : Message : 0.957621 s : * ranks : 8
|
||||
Grid : Message : 0.957622 s : * nodes : 1
|
||||
Grid : Message : 0.957623 s : * ranks/node : 8
|
||||
Grid : Message : 0.957623 s : * ranks geom : 1 2 2 2
|
||||
Grid : Message : 0.957624 s : * Using 1 threads
|
||||
Grid : Message : 0.957625 s : ==================================================================================
|
||||
Grid : Message : 0.985828 s : Initialised RNGs
|
||||
Grid : Message : 2.379761 s : ==================================================================================
|
||||
Grid : Message : 2.379772 s : * Using GENERIC Nc WilsonKernels
|
||||
Grid : Message : 2.379773 s : * Using Overlapped Comms/Compute
|
||||
Grid : Message : 2.379774 s : * SINGLE precision
|
||||
Grid : Message : 2.379775 s : ==================================================================================
|
||||
Grid : Message : 2.486712 s : Deo FlopsPerSite is 1344.0
|
||||
Grid : Message : 2.486725 s : Deo mflop/s = 1803226.1 (1139.4) 1646362.3-1864135.1
|
||||
Grid : Message : 2.486729 s : Deo mflop/s per rank 225403.3
|
||||
Grid : Message : 2.486731 s : Deo mflop/s per node 1803226.1
|
||||
Grid : Message : 2.486732 s : ==================================================================================
|
||||
Grid : Message : 2.486732 s : * Using UNROLLED WilsonKernels
|
||||
Grid : Message : 2.486732 s : * Using Overlapped Comms/Compute
|
||||
Grid : Message : 2.486732 s : * SINGLE precision
|
||||
Grid : Message : 2.486732 s : ==================================================================================
|
||||
Grid : Message : 2.584407 s : Deo FlopsPerSite is 1344.0
|
||||
Grid : Message : 2.584412 s : Deo mflop/s = 1840587.3 (1119.6) 1779401.7-1914791.0
|
||||
Grid : Message : 2.584415 s : Deo mflop/s per rank 230073.4
|
||||
Grid : Message : 2.584416 s : Deo mflop/s per node 1840587.3
|
||||
Grid : Message : 2.584417 s : ==================================================================================
|
||||
Grid : Message : 2.584418 s : 16^4 x 1 Deo Best mflop/s = 1840587.3 ; 1840587.3 per node
|
||||
Grid : Message : 2.584420 s : 16^4 x 1 Deo Worst mflop/s = 1803226.1 ; 1803226.1 per node
|
||||
Grid : Message : 2.584422 s : G/S/C ; G/O/C ; G/S/S ; G/O/S
|
||||
Grid : Message : 2.584422 s : 1803226.1 ; 1840587.3 ;
|
||||
Grid : Message : 2.584423 s : ==================================================================================
|
||||
Grid : Message : 2.592858 s : ==================================================================================
|
||||
Grid : Message : 2.592862 s : Benchmark DWF on 24^4 local volume
|
||||
Grid : Message : 2.592863 s : * Nc : 3
|
||||
Grid : Message : 2.592864 s : * Global volume : 24 48 48 48
|
||||
Grid : Message : 2.592869 s : * Ls : 1
|
||||
Grid : Message : 2.592870 s : * ranks : 8
|
||||
Grid : Message : 2.592871 s : * nodes : 1
|
||||
Grid : Message : 2.592872 s : * ranks/node : 8
|
||||
Grid : Message : 2.592872 s : * ranks geom : 1 2 2 2
|
||||
Grid : Message : 2.592873 s : * Using 1 threads
|
||||
Grid : Message : 2.592874 s : ==================================================================================
|
||||
Grid : Message : 2.715623 s : Initialised RNGs
|
||||
Grid : Message : 9.608838 s : ==================================================================================
|
||||
Grid : Message : 9.608852 s : * Using GENERIC Nc WilsonKernels
|
||||
Grid : Message : 9.608853 s : * Using Overlapped Comms/Compute
|
||||
Grid : Message : 9.608854 s : * SINGLE precision
|
||||
Grid : Message : 9.608855 s : ==================================================================================
|
||||
Grid : Message : 9.870294 s : Deo FlopsPerSite is 1344.0
|
||||
Grid : Message : 9.870309 s : Deo mflop/s = 3861903.3 (1708.9) 3511078.3-3937368.2
|
||||
Grid : Message : 9.870313 s : Deo mflop/s per rank 482737.9
|
||||
Grid : Message : 9.870314 s : Deo mflop/s per node 3861903.3
|
||||
Grid : Message : 9.870315 s : ==================================================================================
|
||||
Grid : Message : 9.870316 s : * Using UNROLLED WilsonKernels
|
||||
Grid : Message : 9.870316 s : * Using Overlapped Comms/Compute
|
||||
Grid : Message : 9.870317 s : * SINGLE precision
|
||||
Grid : Message : 9.870317 s : ==================================================================================
|
||||
Grid : Message : 10.101619 s : Deo FlopsPerSite is 1344.0
|
||||
Grid : Message : 10.101624 s : Deo mflop/s = 3933599.5 (1412.7) 3835758.7-4008152.3
|
||||
Grid : Message : 10.101627 s : Deo mflop/s per rank 491699.9
|
||||
Grid : Message : 10.101628 s : Deo mflop/s per node 3933599.5
|
||||
Grid : Message : 10.101629 s : ==================================================================================
|
||||
Grid : Message : 10.101629 s : 24^4 x 1 Deo Best mflop/s = 3933599.5 ; 3933599.5 per node
|
||||
Grid : Message : 10.101631 s : 24^4 x 1 Deo Worst mflop/s = 3861903.3 ; 3861903.3 per node
|
||||
Grid : Message : 10.101633 s : G/S/C ; G/O/C ; G/S/S ; G/O/S
|
||||
Grid : Message : 10.101633 s : 3861903.3 ; 3933599.5 ;
|
||||
Grid : Message : 10.101634 s : ==================================================================================
|
||||
Grid : Message : 10.139642 s : ==================================================================================
|
||||
Grid : Message : 10.139652 s : Benchmark DWF on 32^4 local volume
|
||||
Grid : Message : 10.139653 s : * Nc : 3
|
||||
Grid : Message : 10.139654 s : * Global volume : 32 64 64 64
|
||||
Grid : Message : 10.139661 s : * Ls : 1
|
||||
Grid : Message : 10.139661 s : * ranks : 8
|
||||
Grid : Message : 10.139662 s : * nodes : 1
|
||||
Grid : Message : 10.139662 s : * ranks/node : 8
|
||||
Grid : Message : 10.139662 s : * ranks geom : 1 2 2 2
|
||||
Grid : Message : 10.139663 s : * Using 1 threads
|
||||
Grid : Message : 10.139663 s : ==================================================================================
|
||||
Grid : Message : 10.502161 s : Initialised RNGs
|
||||
Grid : Message : 32.211092 s : ==================================================================================
|
||||
Grid : Message : 32.211107 s : * Using GENERIC Nc WilsonKernels
|
||||
Grid : Message : 32.211108 s : * Using Overlapped Comms/Compute
|
||||
Grid : Message : 32.211109 s : * SINGLE precision
|
||||
Grid : Message : 32.211110 s : ==================================================================================
|
||||
Grid : Message : 32.841718 s : Deo FlopsPerSite is 1344.0
|
||||
Grid : Message : 32.841732 s : Deo mflop/s = 4988499.9 (2722.5) 4244837.8-5120022.3
|
||||
Grid : Message : 32.841736 s : Deo mflop/s per rank 623562.5
|
||||
Grid : Message : 32.841737 s : Deo mflop/s per node 4988499.9
|
||||
Grid : Message : 32.841738 s : ==================================================================================
|
||||
Grid : Message : 32.841739 s : * Using UNROLLED WilsonKernels
|
||||
Grid : Message : 32.841739 s : * Using Overlapped Comms/Compute
|
||||
Grid : Message : 32.841740 s : * SINGLE precision
|
||||
Grid : Message : 32.841740 s : ==================================================================================
|
||||
Grid : Message : 33.407434 s : Deo FlopsPerSite is 1344.0
|
||||
Grid : Message : 33.407442 s : Deo mflop/s = 5082758.0 (1883.1) 4971027.0-5205119.6
|
||||
Grid : Message : 33.407446 s : Deo mflop/s per rank 635344.7
|
||||
Grid : Message : 33.407447 s : Deo mflop/s per node 5082758.0
|
||||
Grid : Message : 33.407448 s : ==================================================================================
|
||||
Grid : Message : 33.407448 s : 32^4 x 1 Deo Best mflop/s = 5082758.0 ; 5082758.0 per node
|
||||
Grid : Message : 33.407450 s : 32^4 x 1 Deo Worst mflop/s = 4988499.9 ; 4988499.9 per node
|
||||
Grid : Message : 33.407452 s : G/S/C ; G/O/C ; G/S/S ; G/O/S
|
||||
Grid : Message : 33.407452 s : 4988499.9 ; 5082758.0 ;
|
||||
Grid : Message : 33.407453 s : ==================================================================================
|
||||
Grid : Message : 33.506785 s : ==================================================================================
|
||||
Grid : Message : 33.506798 s : Domain wall dslash 4D vectorised
|
||||
Grid : Message : 33.506799 s : ==================================================================================
|
||||
Grid : Message : 33.530686 s : ==================================================================================
|
||||
Grid : Message : 33.530689 s : Benchmark DWF on 8^4 local volume
|
||||
Grid : Message : 33.530690 s : * Nc : 3
|
||||
Grid : Message : 33.530691 s : * Global volume : 8 16 16 16
|
||||
Grid : Message : 33.530698 s : * Ls : 12
|
||||
Grid : Message : 33.530699 s : * ranks : 8
|
||||
Grid : Message : 33.530700 s : * nodes : 1
|
||||
Grid : Message : 33.530701 s : * ranks/node : 8
|
||||
Grid : Message : 33.530702 s : * ranks geom : 1 2 2 2
|
||||
Grid : Message : 33.530703 s : * Using 1 threads
|
||||
Grid : Message : 33.530704 s : ==================================================================================
|
||||
Grid : Message : 33.545465 s : Initialised RNGs
|
||||
Grid : Message : 33.752384 s : ==================================================================================
|
||||
Grid : Message : 33.752397 s : * Using GENERIC Nc WilsonKernels
|
||||
Grid : Message : 33.752398 s : * Using Overlapped Comms/Compute
|
||||
Grid : Message : 33.752399 s : * SINGLE precision
|
||||
Grid : Message : 33.752400 s : ==================================================================================
|
||||
Grid : Message : 33.851964 s : Deo FlopsPerSite is 1344.0
|
||||
Grid : Message : 33.851977 s : Deo mflop/s = 1383287.7 (849.8) 1321205.8-1420651.4
|
||||
Grid : Message : 33.851981 s : Deo mflop/s per rank 172911.0
|
||||
Grid : Message : 33.851983 s : Deo mflop/s per node 1383287.7
|
||||
Grid : Message : 33.851984 s : ==================================================================================
|
||||
Grid : Message : 33.851984 s : * Using UNROLLED WilsonKernels
|
||||
Grid : Message : 33.851984 s : * Using Overlapped Comms/Compute
|
||||
Grid : Message : 33.851984 s : * SINGLE precision
|
||||
Grid : Message : 33.851984 s : ==================================================================================
|
||||
Grid : Message : 33.949235 s : Deo FlopsPerSite is 1344.0
|
||||
Grid : Message : 33.949240 s : Deo mflop/s = 1386335.8 (734.6) 1341325.6-1428330.6
|
||||
Grid : Message : 33.949243 s : Deo mflop/s per rank 173292.0
|
||||
Grid : Message : 33.949244 s : Deo mflop/s per node 1386335.8
|
||||
Grid : Message : 33.949245 s : ==================================================================================
|
||||
Grid : Message : 33.949245 s : 8^4 x 12 Deo Best mflop/s = 1386335.8 ; 1386335.8 per node
|
||||
Grid : Message : 33.949247 s : 8^4 x 12 Deo Worst mflop/s = 1383287.7 ; 1383287.7 per node
|
||||
Grid : Message : 33.949249 s : G/S/C ; G/O/C ; G/S/S ; G/O/S
|
||||
Grid : Message : 33.949249 s : 1383287.7 ; 1386335.8 ;
|
||||
Grid : Message : 33.949250 s : ==================================================================================
|
||||
Grid : Message : 33.952789 s : ==================================================================================
|
||||
Grid : Message : 33.952793 s : Benchmark DWF on 12^4 local volume
|
||||
Grid : Message : 33.952794 s : * Nc : 3
|
||||
Grid : Message : 33.952795 s : * Global volume : 12 24 24 24
|
||||
Grid : Message : 33.952800 s : * Ls : 12
|
||||
Grid : Message : 33.952801 s : * ranks : 8
|
||||
Grid : Message : 33.952802 s : * nodes : 1
|
||||
Grid : Message : 33.952803 s : * ranks/node : 8
|
||||
Grid : Message : 33.952803 s : * ranks geom : 1 2 2 2
|
||||
Grid : Message : 33.952804 s : * Using 1 threads
|
||||
Grid : Message : 33.952805 s : ==================================================================================
|
||||
Grid : Message : 34.362200 s : Initialised RNGs
|
||||
Grid : Message : 34.969821 s : ==================================================================================
|
||||
Grid : Message : 34.969832 s : * Using GENERIC Nc WilsonKernels
|
||||
Grid : Message : 34.969833 s : * Using Overlapped Comms/Compute
|
||||
Grid : Message : 34.969834 s : * SINGLE precision
|
||||
Grid : Message : 34.969835 s : ==================================================================================
|
||||
Grid : Message : 35.135545 s : Deo FlopsPerSite is 1344.0
|
||||
Grid : Message : 35.135558 s : Deo mflop/s = 4208495.6 (2165.0) 4053699.5-4315228.5
|
||||
Grid : Message : 35.135562 s : Deo mflop/s per rank 526062.0
|
||||
Grid : Message : 35.135563 s : Deo mflop/s per node 4208495.6
|
||||
Grid : Message : 35.135564 s : ==================================================================================
|
||||
Grid : Message : 35.135565 s : * Using UNROLLED WilsonKernels
|
||||
Grid : Message : 35.135565 s : * Using Overlapped Comms/Compute
|
||||
Grid : Message : 35.135565 s : * SINGLE precision
|
||||
Grid : Message : 35.135565 s : ==================================================================================
|
||||
Grid : Message : 35.299710 s : Deo FlopsPerSite is 1344.0
|
||||
Grid : Message : 35.299715 s : Deo mflop/s = 4156968.7 (1450.2) 4053699.5-4219939.5
|
||||
Grid : Message : 35.299718 s : Deo mflop/s per rank 519621.1
|
||||
Grid : Message : 35.299719 s : Deo mflop/s per node 4156968.7
|
||||
Grid : Message : 35.299721 s : ==================================================================================
|
||||
Grid : Message : 35.299721 s : 12^4 x 12 Deo Best mflop/s = 4208495.6 ; 4208495.6 per node
|
||||
Grid : Message : 35.299723 s : 12^4 x 12 Deo Worst mflop/s = 4156968.7 ; 4156968.7 per node
|
||||
Grid : Message : 35.299725 s : G/S/C ; G/O/C ; G/S/S ; G/O/S
|
||||
Grid : Message : 35.299725 s : 4208495.6 ; 4156968.7 ;
|
||||
Grid : Message : 35.299726 s : ==================================================================================
|
||||
Grid : Message : 35.309687 s : ==================================================================================
|
||||
Grid : Message : 35.309693 s : Benchmark DWF on 16^4 local volume
|
||||
Grid : Message : 35.309694 s : * Nc : 3
|
||||
Grid : Message : 35.309695 s : * Global volume : 16 32 32 32
|
||||
Grid : Message : 35.309701 s : * Ls : 12
|
||||
Grid : Message : 35.309702 s : * ranks : 8
|
||||
Grid : Message : 35.309703 s : * nodes : 1
|
||||
Grid : Message : 35.309704 s : * ranks/node : 8
|
||||
Grid : Message : 35.309704 s : * ranks geom : 1 2 2 2
|
||||
Grid : Message : 35.309705 s : * Using 1 threads
|
||||
Grid : Message : 35.309706 s : ==================================================================================
|
||||
Grid : Message : 35.448780 s : Initialised RNGs
|
||||
Grid : Message : 38.468764 s : ==================================================================================
|
||||
Grid : Message : 38.468777 s : * Using GENERIC Nc WilsonKernels
|
||||
Grid : Message : 38.468778 s : * Using Overlapped Comms/Compute
|
||||
Grid : Message : 38.468779 s : * SINGLE precision
|
||||
Grid : Message : 38.468780 s : ==================================================================================
|
||||
Grid : Message : 38.801024 s : Deo FlopsPerSite is 1344.0
|
||||
Grid : Message : 38.801040 s : Deo mflop/s = 6674673.6 (2168.6) 6484445.4-6797200.1
|
||||
Grid : Message : 38.801044 s : Deo mflop/s per rank 834334.2
|
||||
Grid : Message : 38.801045 s : Deo mflop/s per node 6674673.6
|
||||
Grid : Message : 38.801046 s : ==================================================================================
|
||||
Grid : Message : 38.801047 s : * Using UNROLLED WilsonKernels
|
||||
Grid : Message : 38.801048 s : * Using Overlapped Comms/Compute
|
||||
Grid : Message : 38.801049 s : * SINGLE precision
|
||||
Grid : Message : 38.801049 s : ==================================================================================
|
||||
Grid : Message : 39.129777 s : Deo FlopsPerSite is 1344.0
|
||||
Grid : Message : 39.129783 s : Deo mflop/s = 6560128.4 (2117.4) 6405846.1-6679081.3
|
||||
Grid : Message : 39.129786 s : Deo mflop/s per rank 820016.1
|
||||
Grid : Message : 39.129787 s : Deo mflop/s per node 6560128.4
|
||||
Grid : Message : 39.129788 s : ==================================================================================
|
||||
Grid : Message : 39.129788 s : 16^4 x 12 Deo Best mflop/s = 6674673.6 ; 6674673.6 per node
|
||||
Grid : Message : 39.129790 s : 16^4 x 12 Deo Worst mflop/s = 6560128.4 ; 6560128.4 per node
|
||||
Grid : Message : 39.129792 s : G/S/C ; G/O/C ; G/S/S ; G/O/S
|
||||
Grid : Message : 39.129793 s : 6674673.6 ; 6560128.4 ;
|
||||
Grid : Message : 39.129795 s : ==================================================================================
|
||||
Grid : Message : 39.161251 s : ==================================================================================
|
||||
Grid : Message : 39.161265 s : Benchmark DWF on 24^4 local volume
|
||||
Grid : Message : 39.161266 s : * Nc : 3
|
||||
Grid : Message : 39.161267 s : * Global volume : 24 48 48 48
|
||||
Grid : Message : 39.161274 s : * Ls : 12
|
||||
Grid : Message : 39.161275 s : * ranks : 8
|
||||
Grid : Message : 39.161276 s : * nodes : 1
|
||||
Grid : Message : 39.161277 s : * ranks/node : 8
|
||||
Grid : Message : 39.161277 s : * ranks geom : 1 2 2 2
|
||||
Grid : Message : 39.161278 s : * Using 1 threads
|
||||
Grid : Message : 39.161279 s : ==================================================================================
|
||||
Grid : Message : 39.911996 s : Initialised RNGs
|
||||
Grid : Message : 54.971914 s : ==================================================================================
|
||||
Grid : Message : 54.971928 s : * Using GENERIC Nc WilsonKernels
|
||||
Grid : Message : 54.971929 s : * Using Overlapped Comms/Compute
|
||||
Grid : Message : 54.971930 s : * SINGLE precision
|
||||
Grid : Message : 54.971931 s : ==================================================================================
|
||||
Grid : Message : 56.309445 s : Deo FlopsPerSite is 1344.0
|
||||
Grid : Message : 56.309462 s : Deo mflop/s = 8572660.7 (1374.9) 8483366.4-8644399.6
|
||||
Grid : Message : 56.309467 s : Deo mflop/s per rank 1071582.6
|
||||
Grid : Message : 56.309468 s : Deo mflop/s per node 8572660.7
|
||||
Grid : Message : 56.309469 s : ==================================================================================
|
||||
Grid : Message : 56.309471 s : * Using UNROLLED WilsonKernels
|
||||
Grid : Message : 56.309472 s : * Using Overlapped Comms/Compute
|
||||
Grid : Message : 56.309473 s : * SINGLE precision
|
||||
Grid : Message : 56.309474 s : ==================================================================================
|
||||
Grid : Message : 57.640707 s : Deo FlopsPerSite is 1344.0
|
||||
Grid : Message : 57.640714 s : Deo mflop/s = 8200141.3 (1445.8) 8113545.6-8286307.9
|
||||
Grid : Message : 57.640717 s : Deo mflop/s per rank 1025017.7
|
||||
Grid : Message : 57.640718 s : Deo mflop/s per node 8200141.3
|
||||
Grid : Message : 57.640719 s : ==================================================================================
|
||||
Grid : Message : 57.640720 s : 24^4 x 12 Deo Best mflop/s = 8572660.7 ; 8572660.7 per node
|
||||
Grid : Message : 57.640723 s : 24^4 x 12 Deo Worst mflop/s = 8200141.3 ; 8200141.3 per node
|
||||
Grid : Message : 57.640725 s : G/S/C ; G/O/C ; G/S/S ; G/O/S
|
||||
Grid : Message : 57.640725 s : 8572660.7 ; 8200141.3 ;
|
||||
Grid : Message : 57.640727 s : ==================================================================================
|
||||
Grid : Message : 57.806175 s : ==================================================================================
|
||||
Grid : Message : 57.806190 s : Benchmark DWF on 32^4 local volume
|
||||
Grid : Message : 57.806191 s : * Nc : 3
|
||||
Grid : Message : 57.806192 s : * Global volume : 32 64 64 64
|
||||
Grid : Message : 57.806200 s : * Ls : 12
|
||||
Grid : Message : 57.806200 s : * ranks : 8
|
||||
Grid : Message : 57.806200 s : * nodes : 1
|
||||
Grid : Message : 57.806200 s : * ranks/node : 8
|
||||
Grid : Message : 57.806200 s : * ranks geom : 1 2 2 2
|
||||
Grid : Message : 57.806201 s : * Using 1 threads
|
||||
Grid : Message : 57.806201 s : ==================================================================================
|
||||
Grid : Message : 60.313153 s : Initialised RNGs
|
||||
Grid : Message : 107.830286 s : ==================================================================================
|
||||
Grid : Message : 107.830306 s : * Using GENERIC Nc WilsonKernels
|
||||
Grid : Message : 107.830307 s : * Using Overlapped Comms/Compute
|
||||
Grid : Message : 107.830308 s : * SINGLE precision
|
||||
Grid : Message : 107.830309 s : ==================================================================================
|
||||
Grid : Message : 111.479603 s : Deo FlopsPerSite is 1344.0
|
||||
Grid : Message : 111.479625 s : Deo mflop/s = 9771387.8 (1000.8) 9688589.9-9830800.0
|
||||
Grid : Message : 111.479629 s : Deo mflop/s per rank 1221423.5
|
||||
Grid : Message : 111.479630 s : Deo mflop/s per node 9771387.8
|
||||
Grid : Message : 111.479631 s : ==================================================================================
|
||||
Grid : Message : 111.479631 s : * Using UNROLLED WilsonKernels
|
||||
Grid : Message : 111.479631 s : * Using Overlapped Comms/Compute
|
||||
Grid : Message : 111.479631 s : * SINGLE precision
|
||||
Grid : Message : 111.479631 s : ==================================================================================
|
||||
Grid : Message : 115.406559 s : Deo FlopsPerSite is 1344.0
|
||||
Grid : Message : 115.406573 s : Deo mflop/s = 8785297.3 (1739.6) 8628282.5-8911307.5
|
||||
Grid : Message : 115.406576 s : Deo mflop/s per rank 1098162.2
|
||||
Grid : Message : 115.406577 s : Deo mflop/s per node 8785297.3
|
||||
Grid : Message : 115.406578 s : ==================================================================================
|
||||
Grid : Message : 115.406578 s : 32^4 x 12 Deo Best mflop/s = 9771387.8 ; 9771387.8 per node
|
||||
Grid : Message : 115.406580 s : 32^4 x 12 Deo Worst mflop/s = 8785297.3 ; 8785297.3 per node
|
||||
Grid : Message : 115.406581 s : G/S/C ; G/O/C ; G/S/S ; G/O/S
|
||||
Grid : Message : 115.406581 s : 9771387.8 ; 8785297.3 ;
|
||||
Grid : Message : 115.406582 s : ==================================================================================
|
||||
Grid : Message : 115.918888 s : ==================================================================================
|
||||
Grid : Message : 115.918902 s : Improved Staggered dslash 4D vectorised
|
||||
Grid : Message : 115.918903 s : ==================================================================================
|
||||
Grid : Message : 115.920344 s : ==================================================================================
|
||||
Grid : Message : 115.920346 s : Benchmark ImprovedStaggered on 8^4 local volume
|
||||
Grid : Message : 115.920347 s : * Global volume : 8 16 16 16
|
||||
Grid : Message : 115.920354 s : * ranks : 8
|
||||
Grid : Message : 115.920355 s : * nodes : 1
|
||||
Grid : Message : 115.920356 s : * ranks/node : 8
|
||||
Grid : Message : 115.920357 s : * ranks geom : 1 2 2 2
|
||||
Grid : Message : 115.920376 s : * Using 1 threads
|
||||
Grid : Message : 115.920377 s : ==================================================================================
|
||||
Grid : Message : 115.923522 s : Initialised RNGs
|
||||
Grid : Message : 116.904870 s : ==================================================================================
|
||||
Grid : Message : 116.904950 s : * Using GENERIC Nc StaggeredKernels
|
||||
Grid : Message : 116.904960 s : * SINGLE precision
|
||||
Grid : Message : 116.904970 s : ==================================================================================
|
||||
Grid : Message : 116.288979 s : Deo mflop/s = 49708.9 (22.9) 44075.3-50609.3
|
||||
Grid : Message : 116.289000 s : Deo mflop/s per rank 6213.6
|
||||
Grid : Message : 116.289002 s : Deo mflop/s per node 49708.9
|
||||
Grid : Message : 116.289003 s : ==================================================================================
|
||||
Grid : Message : 116.289004 s : * SINGLE precision
|
||||
Grid : Message : 116.289005 s : ==================================================================================
|
||||
Grid : Message : 116.481632 s : Deo mflop/s = 49737.1 (13.5) 48517.0-50338.0
|
||||
Grid : Message : 116.481639 s : Deo mflop/s per rank 6217.1
|
||||
Grid : Message : 116.481640 s : Deo mflop/s per node 49737.1
|
||||
Grid : Message : 116.481641 s : ==================================================================================
|
||||
Grid : Message : 116.481642 s : 8^4 Deo Best mflop/s = 49737.1 ; 49737.1 per node
|
||||
Grid : Message : 116.481644 s : 8^4 Deo Worst mflop/s = 49708.9 ; 49708.9 per node
|
||||
Grid : Message : 116.481646 s : G/S/C ; G/O/C ; G/S/S ; G/O/S
|
||||
Grid : Message : 116.481646 s : 49708.9 ; 49737.1 ;
|
||||
Grid : Message : 116.481647 s : ==================================================================================
|
||||
Grid : Message : 116.483458 s : ==================================================================================
|
||||
Grid : Message : 116.483461 s : Benchmark ImprovedStaggered on 12^4 local volume
|
||||
Grid : Message : 116.483462 s : * Global volume : 12 24 24 24
|
||||
Grid : Message : 116.483465 s : * ranks : 8
|
||||
Grid : Message : 116.483466 s : * nodes : 1
|
||||
Grid : Message : 116.483466 s : * ranks/node : 8
|
||||
Grid : Message : 116.483466 s : * ranks geom : 1 2 2 2
|
||||
Grid : Message : 116.483467 s : * Using 1 threads
|
||||
Grid : Message : 116.483468 s : ==================================================================================
|
||||
Grid : Message : 116.489279 s : Initialised RNGs
|
||||
Grid : Message : 116.945016 s : ==================================================================================
|
||||
Grid : Message : 116.945025 s : * Using GENERIC Nc StaggeredKernels
|
||||
Grid : Message : 116.945026 s : * SINGLE precision
|
||||
Grid : Message : 116.945027 s : ==================================================================================
|
||||
Grid : Message : 117.159821 s : Deo mflop/s = 229778.4 (89.5) 223656.1-233547.5
|
||||
Grid : Message : 117.159835 s : Deo mflop/s per rank 28722.3
|
||||
Grid : Message : 117.159837 s : Deo mflop/s per node 229778.4
|
||||
Grid : Message : 117.159838 s : ==================================================================================
|
||||
Grid : Message : 117.159838 s : * SINGLE precision
|
||||
Grid : Message : 117.159838 s : ==================================================================================
|
||||
Grid : Message : 117.371102 s : Deo mflop/s = 229516.6 (61.8) 225781.1-233547.5
|
||||
Grid : Message : 117.371109 s : Deo mflop/s per rank 28689.6
|
||||
Grid : Message : 117.371110 s : Deo mflop/s per node 229516.6
|
||||
Grid : Message : 117.371111 s : ==================================================================================
|
||||
Grid : Message : 117.371111 s : 12^4 Deo Best mflop/s = 229778.4 ; 229778.4 per node
|
||||
Grid : Message : 117.371113 s : 12^4 Deo Worst mflop/s = 229516.6 ; 229516.6 per node
|
||||
Grid : Message : 117.371115 s : G/S/C ; G/O/C ; G/S/S ; G/O/S
|
||||
Grid : Message : 117.371115 s : 229778.4 ; 229516.6 ;
|
||||
Grid : Message : 117.371116 s : ==================================================================================
|
||||
Grid : Message : 117.373669 s : ==================================================================================
|
||||
Grid : Message : 117.373673 s : Benchmark ImprovedStaggered on 16^4 local volume
|
||||
Grid : Message : 117.373674 s : * Global volume : 16 32 32 32
|
||||
Grid : Message : 117.373678 s : * ranks : 8
|
||||
Grid : Message : 117.373679 s : * nodes : 1
|
||||
Grid : Message : 117.373679 s : * ranks/node : 8
|
||||
Grid : Message : 117.373679 s : * ranks geom : 1 2 2 2
|
||||
Grid : Message : 117.373680 s : * Using 1 threads
|
||||
Grid : Message : 117.373681 s : ==================================================================================
|
||||
Grid : Message : 117.386495 s : Initialised RNGs
|
||||
Grid : Message : 118.755695 s : ==================================================================================
|
||||
Grid : Message : 118.755706 s : * Using GENERIC Nc StaggeredKernels
|
||||
Grid : Message : 118.755707 s : * SINGLE precision
|
||||
Grid : Message : 118.755708 s : ==================================================================================
|
||||
Grid : Message : 119.178990 s : Deo mflop/s = 608844.0 (126.1) 596065.5-615608.7
|
||||
Grid : Message : 119.179160 s : Deo mflop/s per rank 76105.5
|
||||
Grid : Message : 119.179180 s : Deo mflop/s per node 608844.0
|
||||
Grid : Message : 119.179190 s : ==================================================================================
|
||||
Grid : Message : 119.179200 s : * SINGLE precision
|
||||
Grid : Message : 119.179200 s : ==================================================================================
|
||||
Grid : Message : 119.271093 s : Deo mflop/s = 605259.7 (188.7) 591372.1-614349.7
|
||||
Grid : Message : 119.271101 s : Deo mflop/s per rank 75657.5
|
||||
Grid : Message : 119.271103 s : Deo mflop/s per node 605259.7
|
||||
Grid : Message : 119.271104 s : ==================================================================================
|
||||
Grid : Message : 119.271105 s : 16^4 Deo Best mflop/s = 608844.0 ; 608844.0 per node
|
||||
Grid : Message : 119.271107 s : 16^4 Deo Worst mflop/s = 605259.7 ; 605259.7 per node
|
||||
Grid : Message : 119.271109 s : G/S/C ; G/O/C ; G/S/S ; G/O/S
|
||||
Grid : Message : 119.271109 s : 608844.0 ; 605259.7 ;
|
||||
Grid : Message : 119.271110 s : ==================================================================================
|
||||
Grid : Message : 119.275303 s : ==================================================================================
|
||||
Grid : Message : 119.275308 s : Benchmark ImprovedStaggered on 24^4 local volume
|
||||
Grid : Message : 119.275309 s : * Global volume : 24 48 48 48
|
||||
Grid : Message : 119.275315 s : * ranks : 8
|
||||
Grid : Message : 119.275316 s : * nodes : 1
|
||||
Grid : Message : 119.275317 s : * ranks/node : 8
|
||||
Grid : Message : 119.275317 s : * ranks geom : 1 2 2 2
|
||||
Grid : Message : 119.275318 s : * Using 1 threads
|
||||
Grid : Message : 119.275319 s : ==================================================================================
|
||||
Grid : Message : 119.328765 s : Initialised RNGs
|
||||
Grid : Message : 126.866160 s : ==================================================================================
|
||||
Grid : Message : 126.866270 s : * Using GENERIC Nc StaggeredKernels
|
||||
Grid : Message : 126.866280 s : * SINGLE precision
|
||||
Grid : Message : 126.866290 s : ==================================================================================
|
||||
Grid : Message : 126.604376 s : Deo mflop/s = 1641161.6 (335.5) 1619660.5-1663961.9
|
||||
Grid : Message : 126.604392 s : Deo mflop/s per rank 205145.2
|
||||
Grid : Message : 126.604394 s : Deo mflop/s per node 1641161.6
|
||||
Grid : Message : 126.604395 s : ==================================================================================
|
||||
Grid : Message : 126.604396 s : * SINGLE precision
|
||||
Grid : Message : 126.604396 s : ==================================================================================
|
||||
Grid : Message : 127.829420 s : Deo mflop/s = 1620972.4 (344.9) 1602593.4-1644174.3
|
||||
Grid : Message : 127.829520 s : Deo mflop/s per rank 202621.6
|
||||
Grid : Message : 127.829530 s : Deo mflop/s per node 1620972.4
|
||||
Grid : Message : 127.829540 s : ==================================================================================
|
||||
Grid : Message : 127.829550 s : 24^4 Deo Best mflop/s = 1641161.6 ; 1641161.6 per node
|
||||
Grid : Message : 127.829570 s : 24^4 Deo Worst mflop/s = 1620972.4 ; 1620972.4 per node
|
||||
Grid : Message : 127.829590 s : G/S/C ; G/O/C ; G/S/S ; G/O/S
|
||||
Grid : Message : 127.829590 s : 1641161.6 ; 1620972.4 ;
|
||||
Grid : Message : 127.829600 s : ==================================================================================
|
||||
Grid : Message : 127.107891 s : ==================================================================================
|
||||
Grid : Message : 127.107903 s : Benchmark ImprovedStaggered on 32^4 local volume
|
||||
Grid : Message : 127.107904 s : * Global volume : 32 64 64 64
|
||||
Grid : Message : 127.107912 s : * ranks : 8
|
||||
Grid : Message : 127.107913 s : * nodes : 1
|
||||
Grid : Message : 127.107914 s : * ranks/node : 8
|
||||
Grid : Message : 127.107914 s : * ranks geom : 1 2 2 2
|
||||
Grid : Message : 127.107915 s : * Using 1 threads
|
||||
Grid : Message : 127.107916 s : ==================================================================================
|
||||
Grid : Message : 127.257116 s : Initialised RNGs
|
||||
Grid : Message : 148.527930 s : ==================================================================================
|
||||
Grid : Message : 148.527941 s : * Using GENERIC Nc StaggeredKernels
|
||||
Grid : Message : 148.527942 s : * SINGLE precision
|
||||
Grid : Message : 148.527943 s : ==================================================================================
|
||||
Grid : Message : 149.401625 s : Deo mflop/s = 3085543.7 (956.0) 2934476.4-3115147.4
|
||||
Grid : Message : 149.401643 s : Deo mflop/s per rank 385693.0
|
||||
Grid : Message : 149.401645 s : Deo mflop/s per node 3085543.7
|
||||
Grid : Message : 149.401646 s : ==================================================================================
|
||||
Grid : Message : 149.401647 s : * SINGLE precision
|
||||
Grid : Message : 149.401648 s : ==================================================================================
|
||||
Grid : Message : 150.204533 s : Deo mflop/s = 3053468.5 (343.9) 3030688.8-3077255.0
|
||||
Grid : Message : 150.204540 s : Deo mflop/s per rank 381683.6
|
||||
Grid : Message : 150.204541 s : Deo mflop/s per node 3053468.5
|
||||
Grid : Message : 150.204542 s : ==================================================================================
|
||||
Grid : Message : 150.204543 s : 32^4 Deo Best mflop/s = 3085543.7 ; 3085543.7 per node
|
||||
Grid : Message : 150.204545 s : 32^4 Deo Worst mflop/s = 3053468.5 ; 3053468.5 per node
|
||||
Grid : Message : 150.204547 s : G/S/C ; G/O/C ; G/S/S ; G/O/S
|
||||
Grid : Message : 150.204547 s : 3085543.7 ; 3053468.5 ;
|
||||
Grid : Message : 150.204548 s : ==================================================================================
|
||||
Grid : Message : 150.292848 s : ==================================================================================
Grid : Message : 150.292864 s : Summary table Ls=12
Grid : Message : 150.292866 s : ==================================================================================
Grid : Message : 150.292866 s : L    Clover       DWF4         Staggered
Grid : Message : 150.292867 s : 8    154914.0     1386335.8    49737.1
Grid : Message : 150.292880 s : 12   693556.6     4208495.6    229778.4
Grid : Message : 150.292882 s : 16   1840587.3    6674673.6    608844.0
Grid : Message : 150.292884 s : 24   3933599.5    8572660.7    1641161.6
Grid : Message : 150.292886 s : 32   5082758.0    9771387.8    3085543.7
Grid : Message : 150.292888 s : ==================================================================================
Grid : Message : 150.292888 s : ==================================================================================
Grid : Message : 150.292888 s : Memory benchmark
Grid : Message : 150.292888 s : ==================================================================================
Grid : Message : 150.295495 s : ==================================================================================
Grid : Message : 150.295497 s : = Benchmarking a*x + y bandwidth
Grid : Message : 150.295498 s : ==================================================================================
Grid : Message : 150.295499 s : L    bytes            GB/s      Gflop/s   seconds   GB/s / node
Grid : Message : 150.295500 s : ----------------------------------------------------------
Grid : Message : 160.682233 s : 8    6291456.000      379.297   31.608    10.367    379.297
Grid : Message : 161.851979 s : 16   100663296.000    3754.675  312.890   1.047     3754.675
Grid : Message : 162.458098 s : 24   509607936.000    6521.472  543.456   0.603     6521.472
Grid : Message : 162.924116 s : 32   1610612736.000   8513.456  709.455   0.462     8513.456
Grid : Message : 163.363877 s : 40   3932160000.000   9018.902  751.575   0.436     9018.902
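The GB/s and Gflop/s columns are locked together by the arithmetic intensity of a double-precision a*x + y: each output element costs two flops (one multiply, one add) against 24 bytes of traffic (read x, read y, write the result), so Gflop/s is GB/s divided by 12, which is exactly the ratio seen in every row. A minimal sketch of that bookkeeping, with a hypothetical element count N (chosen to match the 6291456-byte L=8 row) and a hypothetical measured time rather than the benchmark's real loop:

#include <cstdio>

int main() {
  const double N = 262144;         // hypothetical number of doubles per vector
  const double t_seconds = 1.0e-3; // hypothetical measured time for one a*x+y sweep
  const double bytes  = 3.0 * 8.0 * N;  // read x, read y, write result (double precision)
  const double flops  = 2.0 * N;        // one multiply + one add per element
  const double GBps   = bytes / t_seconds * 1.0e-9;
  const double Gflops = flops / t_seconds * 1.0e-9;
  std::printf("GB/s %.3f  Gflop/s %.3f  (byte/flop ratio %.1f)\n", GBps, Gflops, GBps / Gflops);
  return 0;
}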
Grid : Message : 163.363976 s : ==================================================================================
Grid : Message : 163.363978 s : Batched BLAS benchmark
Grid : Message : 163.363979 s : ==================================================================================
hipblasCreate
Grid : Message : 163.364046 s : ==================================================================================
Grid : Message : 163.364048 s : = batched GEMM (double precision)
Grid : Message : 163.364048 s : ==================================================================================
Grid : Message : 163.364048 s : M    N    K    Gflop/s / rank (coarse mrhs)
Grid : Message : 163.364049 s : ----------------------------------------------------------
Grid : Message : 163.438476 s : 16   8    16   256   0.565
Grid : Message : 163.438944 s : 16   16   16   256   243.148
Grid : Message : 163.439501 s : 16   32   16   256   440.347
Grid : Message : 163.440003 s : 32   8    32   256   439.194
Grid : Message : 163.440463 s : 32   16   32   256   847.334
Grid : Message : 163.441051 s : 32   32   32   256   1430.893
Grid : Message : 163.441679 s : 64   8    64   256   1242.757
Grid : Message : 163.442354 s : 64   16   64   256   2196.689
Grid : Message : 163.443196 s : 64   32   64   256   3697.458
Grid : Message : 163.443200 s : ----------------------------------------------------------
Grid : Message : 163.443201 s : M    N    K    Gflop/s / rank (block project)
Grid : Message : 163.443202 s : ----------------------------------------------------------
Grid : Message : 163.444013 s : 16   8    256   256   899.583
Grid : Message : 163.444933 s : 16   16   256   256   1673.538
Grid : Message : 163.446013 s : 16   32   256   256   2959.597
Grid : Message : 163.446951 s : 32   8    256   256   1558.859
Grid : Message : 163.447970 s : 32   16   256   256   2864.839
Grid : Message : 163.449240 s : 32   32   256   256   4810.671
Grid : Message : 163.450524 s : 64   8    256   256   2386.093
Grid : Message : 163.451877 s : 64   16   256   256   4451.666
Grid : Message : 163.453806 s : 64   32   256   256   5942.124
Grid : Message : 163.453809 s : ----------------------------------------------------------
Grid : Message : 163.453810 s : M    N    K    Gflop/s / rank (block promote)
Grid : Message : 163.453811 s : ----------------------------------------------------------
Grid : Message : 163.454716 s : 8    256   16   256   799.867
Grid : Message : 163.455690 s : 16   256   16   256   1584.625
Grid : Message : 163.457209 s : 32   256   16   256   1949.422
Grid : Message : 163.458254 s : 8    256   32   256   1389.417
Grid : Message : 163.459339 s : 16   256   32   256   2668.344
Grid : Message : 163.461158 s : 32   256   32   256   3234.162
Grid : Message : 163.462566 s : 8    256   64   256   2150.925
Grid : Message : 163.464066 s : 16   256   64   256   4012.488
Grid : Message : 163.466272 s : 32   256   64   256   5154.786
Grid : Message : 163.466276 s : ==================================================================================
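In each row the fourth column appears to be the batch count (256 small GEMMs per call), and the Gflop/s figure is the summed flop count of the batch divided by wall-clock time. A minimal sketch of that conversion, assuming real double-precision arithmetic (2*M*N*K flops per multiply) and a hypothetical measured time:

#include <cstdio>

// Sketch only: converts a batched-GEMM timing into a Gflop/s figure.
// M, N, K, batchCount and t_seconds are assumed inputs, not the benchmark's variables.
double batched_gemm_gflops(int M, int N, int K, int batchCount, double t_seconds) {
  const double flops = 2.0 * M * N * K * batchCount; // real double-precision GEMM
  return flops / t_seconds * 1.0e-9;
}

int main() {
  // e.g. the 64 x 32 x 64, batch-256 case with a hypothetical 18 microsecond timing
  std::printf("%.1f Gflop/s\n", batched_gemm_gflops(64, 32, 64, 256, 18.0e-6));
  return 0;
}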
Grid : Message : 163.466277 s : ==================================================================================
Grid : Message : 163.466278 s : Communications benchmark
Grid : Message : 163.466279 s : ==================================================================================
Grid : Message : 163.466280 s : ====================================================================================================
Grid : Message : 163.466280 s : = Benchmarking threaded STENCIL halo exchange in 3 dimensions
Grid : Message : 163.466281 s : ====================================================================================================
Grid : Message : 163.466281 s : L    Ls   bytes      MB/s uni     MB/s bidi
Grid : Message : 163.521339 s : 16   12   4718592    122513.099   245026.198
Grid : Message : 163.551417 s : 16   12   4718592    125590.498   251180.996
Grid : Message : 163.572339 s : 16   12   4718592    180555.489   361110.977
Grid : Message : 163.602810 s : 16   12   4718592    123949.223   247898.447
Grid : Message : 163.633041 s : 16   12   4718592    124933.761   249867.523
Grid : Message : 163.654084 s : 16   12   4718592    179516.530   359033.061
Grid : Message : 163.756280 s : 24   12   15925248   127515.473   255030.946
Grid : Message : 163.852651 s : 24   12   15925248   132226.945   264453.890
Grid : Message : 163.917510 s : 24   12   15925248   196474.591   392949.183
Grid : Message : 164.170390 s : 24   12   15925248   128020.322   256040.644
Grid : Message : 164.113321 s : 24   12   15925248   132340.948   264681.896
Grid : Message : 164.178314 s : 24   12   15925248   196051.311   392102.622
Grid : Message : 164.413983 s : 32   12   37748736   129411.666   258823.333
Grid : Message : 164.639218 s : 32   12   37748736   134090.789   268181.577
Grid : Message : 164.789675 s : 32   12   37748736   200739.096   401478.191
Grid : Message : 165.228910 s : 32   12   37748736   129497.681   258995.363
Grid : Message : 165.248096 s : 32   12   37748736   134103.293   268206.586
Grid : Message : 165.398958 s : 32   12   37748736   200198.805   400397.611
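The bytes column is consistent with one face of single-precision Wilson spinors per direction: for local volume L^4 with Ls=12, a face holds L^3 * Ls sites of 24 single-precision reals (96 bytes), e.g. 16^3 * 12 * 96 = 4718592, and the bidirectional rate is simply twice the unidirectional one. A small sketch of that bookkeeping (the 96-byte per-site payload is inferred from the numbers, not read from the benchmark source):

#include <cstdio>

int main() {
  const int Ls = 12;
  const int site_bytes = 24 * 4; // 24 reals per spinor, single precision (inferred)
  for (int L : {16, 24, 32}) {
    const long face_sites = (long)L * L * L * Ls;
    std::printf("L=%d  halo bytes per face = %ld\n", L, face_sites * site_bytes);
  }
  return 0;
}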
Grid : Message : 165.399411 s : ==================================================================================
Grid : Message : 165.399413 s : Per Node Summary table Ls=12
Grid : Message : 165.399414 s : ==================================================================================
Grid : Message : 165.399414 s : L    Clover        DWF4          Staggered   (GF/s per node)
Grid : Message : 165.399417 s : 8    154914.003    1386335.817   49737.127
Grid : Message : 165.399423 s : 12   693556.579    4208495.611   229778.435
Grid : Message : 165.399426 s : 16   1840587.280   6674673.647   608844.000
Grid : Message : 165.399429 s : 24   3933599.545   8572660.656   1641161.613
Grid : Message : 165.399432 s : 32   5082757.996   9771387.820   3085543.742
Grid : Message : 165.399435 s : ==================================================================================
Grid : Message : 165.399435 s : ==================================================================================
Grid : Message : 165.399435 s : Comparison point result: 9172024.238 Mflop/s per node
Grid : Message : 165.399436 s : Comparison point is 0.5*(9771387.820+8572660.656)
Grid : Message : 165.399438 s : ==================================================================================
Grid : Message : 165.399438 s : *******************************************
Grid : Message : 165.399438 s : ******* Grid Finalize ******
Grid : Message : 165.399438 s : *******************************************
38	systems/Frontier/benchmarks/benchusqcd.slurm	Executable file
@@ -0,0 +1,38 @@
#!/bin/bash -l
#SBATCH --job-name=bench
##SBATCH --partition=small-g
##SBATCH -q debug
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=7
#SBATCH --gpus-per-node=8
#SBATCH --time=00:30:00
#SBATCH --account=phy157_dwf
#SBATCH --gpu-bind=none
#SBATCH --exclusive
#SBATCH --mem=0

cat << EOF > select_gpu
#!/bin/bash
export GPU_MAP=(0 1 2 3 7 6 5 4)
export NUMA_MAP=(3 3 1 1 2 2 0 0)
export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]}
export HIP_VISIBLE_DEVICES=\$GPU
unset ROCR_VISIBLE_DEVICES
echo RANK \$SLURM_LOCALID using GPU \$GPU
exec numactl -m \$NUMA -N \$NUMA \$*
EOF

chmod +x ./select_gpu

root=$HOME/Frontier/Grid/systems/Frontier/
source ${root}/sourceme.sh

export OMP_NUM_THREADS=7
export MPICH_GPU_SUPPORT_ENABLED=1
#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM

srun ./select_gpu ./Benchmark_usqcd --grid 32.32.32.32 --mpi 1.2.2.2 --accelerator-threads 8 --comms-overlap --shm 4096 --shm-mpi 0 --grid $vol > Benchmark_usqcd.log

@@ -3,7 +3,7 @@ spack load c-lime
module load emacs
module load PrgEnv-gnu
module load rocm
module load cray-mpich/8.1.23
module load cray-mpich
module load gmp
module load cray-fftw
module load craype-accel-amd-gfx90a
@@ -2,11 +2,11 @@
--enable-comms=mpi \
--enable-simd=GPU \
--enable-shm=nvlink \
--enable-gen-simd-width=64 \
--enable-accelerator=cuda \
--enable-gen-simd-width=64 \
--disable-gparity \
--with-lime=/mnt/lustre/tursafs1/home/tc002/tc002/dc-boyl1/spack/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/c-lime-2-3-9-e6wxqrid6rqmd45z7n32dxkvkykpvyez \
--enable-accelerator-cshift \
--disable-unified \
CXX=nvcc \
LDFLAGS="-cudart shared " \
CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++14 -cudart shared"
LDFLAGS="-cudart shared -lcublas " \
CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++17 -cudart shared --diag-suppress 177,550,611"
@@ -1,6 +1,7 @@
module load cuda/11.4.1 openmpi/4.1.1-cuda11.4.1 ucx/1.12.0-cuda11.4.1
#module load cuda/11.4.1 openmpi/4.1.1 ucx/1.10.1
export PREFIX=/home/tc002/tc002/shared/env/prefix/
export LD_LIBRARY_PATH=$PREFIX/lib/:$LD_LIBRARY_PATH
module load cuda/12.3
module load ucx/1.15.0-cuda12.3
module load openmpi/4.1.5-cuda12.3
source /home/dp207/dp207/shared/env/production/env-base.sh
source /home/dp207/dp207/shared/env/production/env-gpu.sh
unset SBATCH_EXPORT
@@ -142,7 +142,9 @@ int main (int argc, char ** argv)
  std:: cout << " CG site flops = "<< CGsiteflops <<std::endl;
  int iters;

  time_t now;
  time_t start = time(NULL);
  UGrid->Broadcast(0,(void *)&start,sizeof(start));

  FlightRecorder::ContinueOnFail = 0;
  FlightRecorder::PrintEntireLog = 0;
@@ -162,9 +164,9 @@ int main (int argc, char ** argv)
    }
    std::cerr << "******************* SINGLE PRECISION SOLVE "<<iter<<std::endl;
    result_o = Zero();
    t1=usecond();
    t1=usecond();
    mCG(src_o,result_o);
    t2=usecond();
    t2=usecond();
    iters = mCG.TotalInnerIterations; //Number of inner CG iterations
    flops = MdagMsiteflops*4*FrbGrid->gSites()*iters;
    flops+= CGsiteflops*FrbGrid->gSites()*iters;
@@ -176,7 +178,8 @@ int main (int argc, char ** argv)

    std::cout << " FlightRecorder is OK! "<<std::endl;
    iter ++;
  } while (time(NULL) < (start + nsecs/10) );
    now = time(NULL); UGrid->Broadcast(0,(void *)&now,sizeof(now));
  } while (now < (start + nsecs/10) );

  std::cout << GridLogMessage << "::::::::::::: Starting double precision CG" << std::endl;
  ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
@@ -189,7 +192,7 @@ int main (int argc, char ** argv)
    }
    std::cerr << "******************* DOUBLE PRECISION SOLVE "<<i<<std::endl;
    result_o_2 = Zero();
    t1=usecond();
    t1=usecond();
    CG(HermOpEO,src_o,result_o_2);
    t2=usecond();
    iters = CG.IterationsToComplete;
@@ -201,8 +204,9 @@ int main (int argc, char ** argv)
    std::cout << " DoublePrecision error count "<< FlightRecorder::ErrorCount()<<std::endl;
    assert(FlightRecorder::ErrorCount()==0);
    std::cout << " FlightRecorder is OK! "<<std::endl;
    now = time(NULL); UGrid->Broadcast(0,(void *)&now,sizeof(now));
    i++;
  } while (time(NULL) < (start + nsecs) );
  } while (now < (start + nsecs) );

  LatticeFermionD diff_o(FrbGrid);
  RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2);
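The change above swaps a per-rank time(NULL) test for a clock that is read and then broadcast from rank 0 (UGrid->Broadcast), so every rank takes the same number of trips through the timed loop and no rank is left waiting in a collective. A minimal standalone sketch of the same idea in plain MPI (illustrative only, not Grid's API):

#include <mpi.h>
#include <ctime>
#include <cstdio>

// Sketch only: rank 0 owns the clock; everyone agrees on when the timed loop ends.
int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  const int nsecs = 5;                 // hypothetical time budget
  time_t start = time(NULL);
  MPI_Bcast(&start, sizeof(start), MPI_BYTE, 0, MPI_COMM_WORLD);

  time_t now;
  int iter = 0;
  do {
    // ... one unit of work involving collectives ...
    iter++;
    now = time(NULL);
    MPI_Bcast(&now, sizeof(now), MPI_BYTE, 0, MPI_COMM_WORLD);
  } while (now < (start + nsecs));

  if (rank == 0) std::printf("all ranks did %d iterations\n", iter);
  MPI_Finalize();
  return 0;
}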
118	tests/debug/Test_8888.cc	Normal file
@@ -0,0 +1,118 @@
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./tests/Test_general_coarse_hdcg.cc
|
||||
|
||||
Copyright (C) 2023
|
||||
|
||||
Author: Peter Boyle <pboyle@bnl.gov>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/Grid.h>
|
||||
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
|
||||
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h>
|
||||
#include <Grid/algorithms/iterative/AdefMrhs.h>
|
||||
|
||||
using namespace std;
|
||||
using namespace Grid;
|
||||
|
||||
int main (int argc, char ** argv)
|
||||
{
|
||||
Grid_init(&argc,&argv);
|
||||
|
||||
const int Ls=8;
|
||||
const int nbasis = 40;
|
||||
const int cb = 0 ;
|
||||
RealD mass=0.01;
|
||||
RealD M5=1.8;
|
||||
RealD b=1.0;
|
||||
RealD c=0.0;
|
||||
|
||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
|
||||
GridDefaultSimd(Nd,vComplex::Nsimd()),
|
||||
GridDefaultMpi());
|
||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
||||
|
||||
///////////////////////// RNGs /////////////////////////////////
|
||||
std::vector<int> seeds4({1,2,3,4});
|
||||
std::vector<int> seeds5({5,6,7,8});
|
||||
std::vector<int> cseeds({5,6,7,8});
|
||||
|
||||
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
|
||||
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
|
||||
|
||||
///////////////////////// Configuration /////////////////////////////////
|
||||
LatticeGaugeField Umu(UGrid);
|
||||
|
||||
FieldMetaData header;
|
||||
std::string file("ckpoint_EODWF_lat.125");
|
||||
NerscIO::readConfiguration(Umu,header,file);
|
||||
|
||||
//////////////////////// Fermion action //////////////////////////////////
|
||||
MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c);
|
||||
|
||||
MdagMLinearOperator<MobiusFermionD, LatticeFermion> HermOp(Ddwf);
|
||||
|
||||
|
||||
std::cout << "**************************************"<<std::endl;
|
||||
std::cout << " Fine Power method "<<std::endl;
|
||||
std::cout << "**************************************"<<std::endl;
|
||||
|
||||
LatticeFermionD pm_src(FGrid);
|
||||
pm_src = ComplexD(1.0);
|
||||
PowerMethod<LatticeFermionD> fPM;
|
||||
fPM(HermOp,pm_src);
|
||||
|
||||
|
||||
std::cout << "**************************************"<<std::endl;
|
||||
std::cout << " Fine Lanczos (poly, low) "<<std::endl;
|
||||
std::cout << "**************************************"<<std::endl;
|
||||
|
||||
int Nk=80;
|
||||
int Nm=Nk*3;
|
||||
int Nstop=8;
|
||||
int Nconv_test_interval=1;
|
||||
|
||||
// Chebyshev<LatticeFermionD> IRLChebyLo(0.2,64.0,201); // 1 iter
|
||||
Chebyshev<LatticeFermionD> IRLChebyLo(0.0,55.0,101); // 1 iter
|
||||
FunctionHermOp<LatticeFermionD> PolyOp(IRLChebyLo,HermOp);
|
||||
PlainHermOp<LatticeFermionD> Op(HermOp);
|
||||
|
||||
ImplicitlyRestartedLanczos IRL(PolyOp,
|
||||
Op,
|
||||
Nk, // sought vecs
|
||||
Nk, // sought vecs
|
||||
Nm, // spare vecs
|
||||
1.0e-8,
|
||||
10 // Max iterations
|
||||
);
|
||||
|
||||
int Nconv;
|
||||
std::vector<RealD> eval(Nm);
|
||||
std::vector<LatticeFermionD> evec(Nm,FGrid);
|
||||
LatticeFermionD irl_src(FGrid);
|
||||
|
||||
IRL.calc(eval,evec,irl_src,Nconv);
|
||||
|
||||
Grid_finalize();
|
||||
return 0;
|
||||
}
|
@ -392,9 +392,27 @@ void TestCGschur(What & Ddwf,
|
||||
GridParallelRNG *RNG5)
|
||||
{
|
||||
LatticeFermion src (FGrid); random(*RNG5,src);
|
||||
LatticeFermion result(FGrid); result=Zero();
|
||||
LatticeFermion result1(FGrid); result1=Zero();
|
||||
LatticeFermion result2(FGrid); result2=Zero();
|
||||
LatticeFermion result3(FGrid); result3=Zero();
|
||||
|
||||
ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
|
||||
SchurRedBlackDiagMooeeSolve<LatticeFermion> SchurSolver(CG);
|
||||
SchurSolver(Ddwf,src,result);
|
||||
SchurSolver(Ddwf,src,result1);
|
||||
|
||||
SchurRedBlackDiagOneSolve<LatticeFermion> SchurSolverSymm1(CG);
|
||||
SchurSolverSymm1(Ddwf,src,result2);
|
||||
|
||||
SchurRedBlackDiagTwoSolve<LatticeFermion> SchurSolverSymm2(CG);
|
||||
SchurSolverSymm2(Ddwf,src,result3);
|
||||
|
||||
std::cout << GridLogMessage << " Standard " <<norm2(result1)<<std::endl;
|
||||
|
||||
std::cout << GridLogMessage << " Symm1 " <<norm2(result2)<<std::endl;
|
||||
result2=result2-result1;
|
||||
std::cout << GridLogMessage << " diff " <<norm2(result2) <<std::endl;
|
||||
|
||||
std::cout << GridLogMessage << " Symm2 " <<norm2(result3)<<std::endl;
|
||||
result3=result3-result1;
|
||||
std::cout << GridLogMessage << " diff " <<norm2(result3) <<std::endl;
|
||||
}
|
||||
|
@ -244,7 +244,7 @@ int main (int argc, char ** argv)
|
||||
|
||||
GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi);
|
||||
|
||||
|
||||
#if 0
|
||||
MultiGeneralCoarsenedMatrix mrhs(LittleDiracOp,CoarseMrhs);
|
||||
typedef decltype(mrhs) MultiGeneralCoarsenedMatrix_t;
|
||||
|
||||
@ -307,7 +307,8 @@ int main (int argc, char ** argv)
|
||||
rh_res= Zero();
|
||||
mrhsCG(MrhsCoarseOp,rh_src,rh_res);
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
std::cout<<GridLogMessage<<std::endl;
|
||||
std::cout<<GridLogMessage<<std::endl;
|
||||
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
|
||||
|
@ -1,4 +1,4 @@
|
||||
/*************************************************************************************
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
@ -26,84 +26,13 @@ Author: Peter Boyle <pboyle@bnl.gov>
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/Grid.h>
|
||||
#include <Grid/lattice/PaddedCell.h>
|
||||
#include <Grid/stencil/GeneralLocalStencil.h>
|
||||
//#include <Grid/algorithms/GeneralCoarsenedMatrix.h>
|
||||
#include <Grid/algorithms/iterative/AdefGeneric.h>
|
||||
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
|
||||
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h>
|
||||
#include <Grid/algorithms/iterative/AdefMrhs.h>
|
||||
|
||||
using namespace std;
|
||||
using namespace Grid;
|
||||
|
||||
template<class Coarsened>
|
||||
void SaveOperator(Coarsened &Operator,std::string file)
|
||||
{
|
||||
#ifdef HAVE_LIME
|
||||
emptyUserRecord record;
|
||||
ScidacWriter WR(Operator.Grid()->IsBoss());
|
||||
assert(Operator._A.size()==Operator.geom.npoint);
|
||||
WR.open(file);
|
||||
for(int p=0;p<Operator._A.size();p++){
|
||||
auto tmp = Operator.Cell.Extract(Operator._A[p]);
|
||||
WR.writeScidacFieldRecord(tmp,record);
|
||||
}
|
||||
WR.close();
|
||||
#endif
|
||||
}
|
||||
template<class Coarsened>
|
||||
void LoadOperator(Coarsened &Operator,std::string file)
|
||||
{
|
||||
#ifdef HAVE_LIME
|
||||
emptyUserRecord record;
|
||||
Grid::ScidacReader RD ;
|
||||
RD.open(file);
|
||||
assert(Operator._A.size()==Operator.geom.npoint);
|
||||
for(int p=0;p<Operator.geom.npoint;p++){
|
||||
conformable(Operator._A[p].Grid(),Operator.CoarseGrid());
|
||||
RD.readScidacFieldRecord(Operator._A[p],record);
|
||||
}
|
||||
RD.close();
|
||||
Operator.ExchangeCoarseLinks();
|
||||
#endif
|
||||
}
|
||||
template<class aggregation>
|
||||
void SaveBasis(aggregation &Agg,std::string file)
|
||||
{
|
||||
#ifdef HAVE_LIME
|
||||
emptyUserRecord record;
|
||||
ScidacWriter WR(Agg.FineGrid->IsBoss());
|
||||
WR.open(file);
|
||||
for(int b=0;b<Agg.subspace.size();b++){
|
||||
WR.writeScidacFieldRecord(Agg.subspace[b],record);
|
||||
}
|
||||
WR.close();
|
||||
#endif
|
||||
}
|
||||
template<class aggregation>
|
||||
void LoadBasis(aggregation &Agg, std::string file)
|
||||
{
|
||||
#ifdef HAVE_LIME
|
||||
emptyUserRecord record;
|
||||
ScidacReader RD ;
|
||||
RD.open(file);
|
||||
for(int b=0;b<Agg.subspace.size();b++){
|
||||
RD.readScidacFieldRecord(Agg.subspace[b],record);
|
||||
}
|
||||
RD.close();
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
template<class Field> class TestSolver : public LinearFunction<Field> {
|
||||
public:
|
||||
TestSolver() {};
|
||||
void operator() (const Field &in, Field &out){ out = Zero(); }
|
||||
};
|
||||
|
||||
|
||||
RealD InverseApproximation(RealD x){
|
||||
return 1.0/x;
|
||||
}
|
||||
|
||||
// Want Op in CoarsenOp to call MatPcDagMatPc
|
||||
template<class Field>
|
||||
class HermOpAdaptor : public LinearOperatorBase<Field>
|
||||
@ -119,33 +48,37 @@ public:
|
||||
void OpDirAll (const Field &in, std::vector<Field> &out) { assert(0); };
|
||||
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
|
||||
};
|
||||
template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field>
|
||||
|
||||
template<class Field> class CGSmoother : public LinearFunction<Field>
|
||||
{
|
||||
public:
|
||||
using LinearFunction<Field>::operator();
|
||||
typedef LinearOperatorBase<Field> FineOperator;
|
||||
FineOperator & _SmootherOperator;
|
||||
Chebyshev<Field> Cheby;
|
||||
ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator) :
|
||||
int iters;
|
||||
CGSmoother(int _iters, FineOperator &SmootherOperator) :
|
||||
_SmootherOperator(SmootherOperator),
|
||||
Cheby(_lo,_hi,_ord,InverseApproximation)
|
||||
iters(_iters)
|
||||
{
|
||||
std::cout << GridLogMessage<<" Chebyshev smoother order "<<_ord<<" ["<<_lo<<","<<_hi<<"]"<<std::endl;
|
||||
std::cout << GridLogMessage<<" Mirs smoother order "<<iters<<std::endl;
|
||||
};
|
||||
void operator() (const Field &in, Field &out)
|
||||
{
|
||||
Field tmp(in.Grid());
|
||||
tmp = in;
|
||||
Cheby(_SmootherOperator,tmp,out);
|
||||
ConjugateGradient<Field> CG(0.0,iters,false); // non-converge is just fine in a smoother
|
||||
|
||||
out=Zero();
|
||||
|
||||
CG(_SmootherOperator,in,out);
|
||||
}
|
||||
};
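// Aside: a self-contained toy illustration of the fixed-iteration-CG-as-smoother
// idea in the CGSmoother above. A 1D Laplacian stands in for the Grid operator;
// none of this is Grid code, and the tolerance-free, deliberately unconverged
// solve is the whole point of using CG as a smoother.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

static void apply(const std::vector<double> &x, std::vector<double> &y) {
  const int n = (int)x.size();             // SPD tridiagonal operator (toy)
  for (int i = 0; i < n; i++) {
    y[i] = 2.0 * x[i];
    if (i > 0)     y[i] -= x[i - 1];
    if (i < n - 1) y[i] -= x[i + 1];
  }
}

static void cg_smooth(const std::vector<double> &b, std::vector<double> &x, int iters) {
  const int n = (int)b.size();
  std::vector<double> r(b), p(b), Ap(n);
  std::fill(x.begin(), x.end(), 0.0);      // smoother always starts from zero
  double rr = 0; for (double v : r) rr += v * v;
  for (int k = 0; k < iters; k++) {        // fixed count, no convergence test
    apply(p, Ap);
    double pAp = 0; for (int i = 0; i < n; i++) pAp += p[i] * Ap[i];
    const double alpha = rr / pAp;
    for (int i = 0; i < n; i++) { x[i] += alpha * p[i]; r[i] -= alpha * Ap[i]; }
    double rr_new = 0; for (double v : r) rr_new += v * v;
    for (int i = 0; i < n; i++) p[i] = r[i] + (rr_new / rr) * p[i];
    rr = rr_new;
  }
}

int main() {
  std::vector<double> b(64, 1.0), x(64), Ax(64);
  cg_smooth(b, x, 8);                      // e.g. 8 sweeps, as a smoother would use
  apply(x, Ax);
  double rnorm = 0;
  for (int i = 0; i < 64; i++) rnorm += (b[i] - Ax[i]) * (b[i] - Ax[i]);
  std::printf("residual after 8 CG sweeps: %g (not converged, by design)\n", std::sqrt(rnorm));
  return 0;
}
// End of aside.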
|
||||
|
||||
|
||||
int main (int argc, char ** argv)
|
||||
{
|
||||
Grid_init(&argc,&argv);
|
||||
|
||||
const int Ls=24;
|
||||
const int nbasis = 40;
|
||||
const int nbasis = 60;
|
||||
const int cb = 0 ;
|
||||
RealD mass=0.00078;
|
||||
RealD M5=1.8;
|
||||
@ -160,10 +93,12 @@ int main (int argc, char ** argv)
|
||||
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
||||
|
||||
// Construct a coarsened grid with 4^4 cell
|
||||
Coordinate Block({4,4,4,4});
|
||||
Coordinate clatt = GridDefaultLatt();
|
||||
for(int d=0;d<clatt.size();d++){
|
||||
clatt[d] = clatt[d]/4;
|
||||
clatt[d] = clatt[d]/Block[d];
|
||||
}
|
||||
|
||||
GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt,
|
||||
GridDefaultSimd(Nd,vComplex::Nsimd()),
|
||||
GridDefaultMpi());;
|
||||
@ -182,7 +117,7 @@ int main (int argc, char ** argv)
|
||||
LatticeGaugeField Umu(UGrid);
|
||||
|
||||
FieldMetaData header;
|
||||
std::string file("ckpoint_lat.4000");
|
||||
std::string file("ckpoint_EODWF_lat.125");
|
||||
NerscIO::readConfiguration(Umu,header,file);
|
||||
|
||||
//////////////////////// Fermion action //////////////////////////////////
|
||||
@ -192,15 +127,7 @@ int main (int argc, char ** argv)
|
||||
|
||||
typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix;
|
||||
HermFineMatrix FineHermOp(HermOpEO);
|
||||
|
||||
LatticeFermion result(FrbGrid); result=Zero();
|
||||
|
||||
LatticeFermion src(FrbGrid); random(RNG5,src);
|
||||
|
||||
// Run power method on FineHermOp
|
||||
PowerMethod<LatticeFermion> PM; PM(HermOpEO,src);
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////
|
||||
///////////// Coarse basis and Little Dirac Operator ///////
|
||||
////////////////////////////////////////////////////////////
|
||||
@ -208,219 +135,170 @@ int main (int argc, char ** argv)
|
||||
typedef LittleDiracOperator::CoarseVector CoarseVector;
|
||||
|
||||
NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d);
|
||||
NearestStencilGeometry5D geom_nn(Coarse5d);
|
||||
|
||||
// Warning: This routine calls PVdagM.Op, not PVdagM.HermOp
|
||||
|
||||
typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
|
||||
Subspace Aggregates(Coarse5d,FrbGrid,cb);
|
||||
|
||||
////////////////////////////////////////////////////////////
|
||||
// Need to check about red-black grid coarsening
|
||||
////////////////////////////////////////////////////////////
|
||||
LittleDiracOperator LittleDiracOp(geom,FrbGrid,Coarse5d);
|
||||
|
||||
bool load=false;
|
||||
if ( load ) {
|
||||
LoadBasis(Aggregates,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.scidac");
|
||||
LoadOperator(LittleDiracOp,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.scidac");
|
||||
} else {
|
||||
Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,
|
||||
95.0,0.1,
|
||||
// 400,200,200 -- 48 iters
|
||||
// 600,200,200 -- 38 iters, 162s
|
||||
// 600,200,100 -- 38 iters, 169s
|
||||
// 600,200,50 -- 88 iters. 370s
|
||||
800,
|
||||
200,
|
||||
100,
|
||||
0.0);
|
||||
LittleDiracOp.CoarsenOperator(FineHermOp,Aggregates);
|
||||
SaveBasis(Aggregates,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.scidac");
|
||||
SaveOperator(LittleDiracOp,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.scidac");
|
||||
}
|
||||
|
||||
// Try projecting to one hop only
|
||||
LittleDiracOperator LittleDiracOpProj(geom_nn,FrbGrid,Coarse5d);
|
||||
LittleDiracOpProj.ProjectNearestNeighbour(0.01,LittleDiracOp); // smaller shift 0.02? n
|
||||
int refine=1;
|
||||
// Aggregates.CreateSubspaceMultishift(RNG5,HermOpEO,
|
||||
// 0.0003,1.0e-5,2000); // Lo, tol, maxit
|
||||
// Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,95.,0.01,1500);// <== last run
|
||||
std::cout << "**************************************"<<std::endl;
|
||||
std::cout << "Create Subspace"<<std::endl;
|
||||
std::cout << "**************************************"<<std::endl;
|
||||
Aggregates.CreateSubspaceChebyshevNew(RNG5,HermOpEO,95.);
|
||||
|
||||
typedef HermitianLinearOperator<LittleDiracOperator,CoarseVector> HermMatrix;
|
||||
HermMatrix CoarseOp (LittleDiracOp);
|
||||
HermMatrix CoarseOpProj (LittleDiracOpProj);
|
||||
std::cout << "**************************************"<<std::endl;
|
||||
std::cout << "Refine Subspace"<<std::endl;
|
||||
std::cout << "**************************************"<<std::endl;
|
||||
Aggregates.RefineSubspace(HermOpEO,0.001,1.0e-3,3000); // 172 iters
|
||||
|
||||
//////////////////////////////////////////
|
||||
// Build a coarse lanczos
|
||||
//////////////////////////////////////////
|
||||
Chebyshev<CoarseVector> IRLCheby(0.2,40.0,71); // 1 iter
|
||||
FunctionHermOp<CoarseVector> IRLOpCheby(IRLCheby,CoarseOp);
|
||||
PlainHermOp<CoarseVector> IRLOp (CoarseOp);
|
||||
int Nk=48;
|
||||
int Nm=64;
|
||||
std::cout << "**************************************"<<std::endl;
|
||||
std::cout << "Coarsen after refine"<<std::endl;
|
||||
std::cout << "**************************************"<<std::endl;
|
||||
Aggregates.Orthogonalise();
|
||||
|
||||
std::cout << "**************************************"<<std::endl;
|
||||
std::cout << "Building MultiRHS Coarse operator"<<std::endl;
|
||||
std::cout << "**************************************"<<std::endl;
|
||||
ConjugateGradient<CoarseVector> coarseCG(4.0e-2,20000,true);
|
||||
|
||||
const int nrhs=12;
|
||||
|
||||
Coordinate mpi=GridDefaultMpi();
|
||||
Coordinate rhMpi ({1,1,mpi[0],mpi[1],mpi[2],mpi[3]});
|
||||
Coordinate rhLatt({nrhs,1,clatt[0],clatt[1],clatt[2],clatt[3]});
|
||||
Coordinate rhSimd({vComplex::Nsimd(),1, 1,1,1,1});
|
||||
|
||||
GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi);
  typedef MultiGeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> MultiGeneralCoarsenedMatrix_t;
  MultiGeneralCoarsenedMatrix_t mrhs(geom,CoarseMrhs);

  mrhs.CoarsenOperator(FineHermOp,Aggregates,Coarse5d);

  std::cout << "**************************************"<<std::endl;
  std::cout << " Coarse Lanczos "<<std::endl;
  std::cout << "**************************************"<<std::endl;

  typedef HermitianLinearOperator<MultiGeneralCoarsenedMatrix_t,CoarseVector> MrhsHermMatrix;
  Chebyshev<CoarseVector> IRLCheby(0.01,42.0,301); // 1 iter
  MrhsHermMatrix MrhsCoarseOp (mrhs);

  CoarseVector pm_src(CoarseMrhs);
  pm_src = ComplexD(1.0);
  PowerMethod<CoarseVector> cPM;
  cPM(MrhsCoarseOp,pm_src);

  int Nk=192;
  int Nm=384;
  int Nstop=Nk;
  ImplicitlyRestartedLanczos<CoarseVector> IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-5,20);
  int Nconv_test_interval=1;

  ImplicitlyRestartedBlockLanczosCoarse<CoarseVector> IRL(MrhsCoarseOp,
                                                          Coarse5d,
                                                          CoarseMrhs,
                                                          nrhs,
                                                          IRLCheby,
                                                          Nstop,
                                                          Nconv_test_interval,
                                                          nrhs,
                                                          Nk,
                                                          Nm,
                                                          1e-5,10);
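  // The single-vector ImplicitlyRestartedLanczos above is superseded by the blocked
  // multi-RHS variant: the same style of Chebyshev filter (IRLCheby) is applied to
  // MrhsCoarseOp with block size nrhs; the named arguments mirror the usual IRL sizes
  // (Nstop, Nconv_test_interval, Nk, Nm), and the trailing 1e-5,10 are presumably the
  // residual tolerance and a restart/iteration cap.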
  int Nconv;
  std::vector<RealD> eval(Nm);
  std::vector<CoarseVector> evec(Nm,Coarse5d);
  CoarseVector c_src(Coarse5d);
  //c_src=1.0;
  random(CRNG,c_src);

  CoarseVector c_res(Coarse5d);
  CoarseVector c_ref(Coarse5d);

  PowerMethod<CoarseVector> cPM; cPM(CoarseOp,c_src);

  IRL.calc(eval,evec,c_src,Nconv);
  DeflatedGuesser<CoarseVector> DeflCoarseGuesser(evec,eval);
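  // DeflatedGuesser forms the starting guess from the converged coarse eigenpairs,
  // essentially x0 = sum_i v_i (v_i,b)/lambda_i, so the subsequent CG only has to
  // resolve the component of the source outside the deflation space.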
  std::vector<CoarseVector> c_src(nrhs,Coarse5d);

  //////////////////////////////////////////
  // Build a coarse space solver
  // Block projector for coarse/fine
  //////////////////////////////////////////
  int maxit=20000;
  ConjugateGradient<CoarseVector> CG(1.0e-8,maxit,false);
  ConjugateGradient<LatticeFermionD> CGfine(1.0e-8,10000,false);
  ZeroGuesser<CoarseVector> CoarseZeroGuesser;

  // HPDSolver<CoarseVector> HPDSolve(CoarseOp,CG,CoarseZeroGuesser);
  HPDSolver<CoarseVector> HPDSolve(CoarseOp,CG,DeflCoarseGuesser);
  c_res=Zero();
  HPDSolve(c_src,c_res); c_ref = c_res;
  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
  std::cout << GridLogMessage<<"ref norm "<<norm2(c_ref)<<std::endl;
  //////////////////////////////////////////////////////////////////////////
  // Deflated (with real op EV's) solve for the projected coarse op
  // Work towards ADEF1 in the coarse space
  //////////////////////////////////////////////////////////////////////////
  HPDSolver<CoarseVector> HPDSolveProj(CoarseOpProj,CG,DeflCoarseGuesser);
  c_res=Zero();
  HPDSolveProj(c_src,c_res);
  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
  std::cout << GridLogMessage<<"res norm "<<norm2(c_res)<<std::endl;
  c_res = c_res - c_ref;
  std::cout << "Projected solver error "<<norm2(c_res)<<std::endl;
  std::cout << "**************************************"<<std::endl;
  std::cout << "Calling mRHS HDCG"<<std::endl;
  std::cout << "**************************************"<<std::endl;
  MultiRHSBlockProject<LatticeFermionD> MrhsProjector;
  MrhsProjector.Allocate(nbasis,FrbGrid,Coarse5d);
  MrhsProjector.ImportBasis(Aggregates.subspace);

  //////////////////////////////////////////////////////////////////////
  // Coarse ADEF1 with deflation space
  //////////////////////////////////////////////////////////////////////
  ChebyshevSmoother<CoarseVector,HermMatrix >
    CoarseSmoother(1.0,37.,8,CoarseOpProj); // just go to sloppy 0.1 convergence
  // CoarseSmoother(0.1,37.,8,CoarseOpProj); //
  // CoarseSmoother(0.5,37.,6,CoarseOpProj); // 8 iter 0.36s
  // CoarseSmoother(0.5,37.,12,CoarseOpProj); // 8 iter, 0.55s
  // CoarseSmoother(0.5,37.,8,CoarseOpProj);// 7-9 iter
  // CoarseSmoother(1.0,37.,8,CoarseOpProj); // 0.4 - 0.5s solve to 0.04, 7-9 iter
  // ChebyshevSmoother<CoarseVector,HermMatrix > CoarseSmoother(0.5,36.,10,CoarseOpProj); // 311

  ////////////////////////////////////////////////////////
  // CG, Cheby mode spacing 200,200
  // Unprojected Coarse CG solve to 1e-8 : 190 iters, 4.9s
  // Unprojected Coarse CG solve to 4e-2 : 33 iters, 0.8s
  // Projected Coarse CG solve to 1e-8 : 100 iters, 0.36s
  ////////////////////////////////////////////////////////
  // CoarseSmoother(1.0,48.,8,CoarseOpProj); 48 evecs
  ////////////////////////////////////////////////////////
  // ADEF1 Coarse solve to 1e-8 : 44 iters, 2.34s 2.1x gain
  // ADEF1 Coarse solve to 4e-2 : 7 iters, 0.4s
  // HDCG 38 iters 162s
  //
  // CoarseSmoother(1.0,40.,8,CoarseOpProj); 48 evecs
  // ADEF1 Coarse solve to 1e-8 : 37 iters, 2.0s 2.1x gain
  // ADEF1 Coarse solve to 4e-2 : 6 iters, 0.36s
  // HDCG 38 iters 169s

  TwoLevelADEF1defl<CoarseVector>
    cADEF1(1.0e-8, 500,
           CoarseOp,
           CoarseSmoother,
           evec,eval);
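  // cADEF1 is a two-level ADEF1 preconditioned solve purely in the coarse space:
  // deflation built from (evec,eval) combined with the Chebyshev smoother on the
  // one-hop projected operator, targeting CoarseOp first to 1.0e-8 and, just below,
  // with the tolerance relaxed to 5.0e-2 for use as a sloppy inner solve.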
  c_res=Zero();
  cADEF1(c_src,c_res);
  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
  std::cout << GridLogMessage<<"cADEF1 res norm "<<norm2(c_res)<<std::endl;
  c_res = c_res - c_ref;
  std::cout << "cADEF1 solver error "<<norm2(c_res)<<std::endl;

  // cADEF1.Tolerance = 4.0e-2;
  // cADEF1.Tolerance = 1.0e-1;
  cADEF1.Tolerance = 5.0e-2;
  c_res=Zero();
  cADEF1(c_src,c_res);
  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
  std::cout << GridLogMessage<<"cADEF1 res norm "<<norm2(c_res)<<std::endl;
  c_res = c_res - c_ref;
  std::cout << "cADEF1 solver error "<<norm2(c_res)<<std::endl;

  //////////////////////////////////////////
  // Build a smoother
  //////////////////////////////////////////
  // ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(10.0,100.0,10,FineHermOp); //499
  // ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(3.0,100.0,10,FineHermOp); //383
  // ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(1.0,100.0,10,FineHermOp); //328
  // std::vector<RealD> los({0.5,1.0,3.0}); // 147/142/146 nbasis 1
  // std::vector<RealD> los({1.0,2.0}); // Nbasis 24: 88,86 iterations
  // std::vector<RealD> los({2.0,4.0}); // Nbasis 32 == 52, iters
  // std::vector<RealD> los({2.0,4.0}); // Nbasis 40 == 36,36 iters

  //
  // Turns approx 2700 iterations into 340 fine multiplies with Nbasis 40
  // Need to measure cost of coarse space.
  //
  // -- i) Reduce coarse residual -- 0.04
  // -- ii) Lanczos on coarse space -- done
  // -- iii) Possible 1 hop project and/or preconditioning it - easy - PrecCG it and
  //         use a limited stencil. Reread BFM code to check on evecs / deflation strategy with prec
  //
  std::vector<RealD> los({3.0}); // Nbasis 40 == 36,36 iters

  // std::vector<int> ords({7,8,10}); // Nbasis 40 == 40,38,36 iters (320,342,396 mults)
  std::vector<int> ords({7}); // Nbasis 40 == 40 iters (320 mults)

  for(int l=0;l<los.size();l++){

    RealD lo = los[l];

    for(int o=0;o<ords.size();o++){

      ConjugateGradient<CoarseVector> CGsloppy(4.0e-2,maxit,false);
      HPDSolver<CoarseVector> HPDSolveSloppy(CoarseOp,CGsloppy,DeflCoarseGuesser);

      // ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(lo,92,10,FineHermOp); // 36 best case
      ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(lo,92,ords[o],FineHermOp); // 311

      //////////////////////////////////////////
      // Build a HDCG solver
      //////////////////////////////////////////
      TwoLevelADEF2<LatticeFermion,CoarseVector,Subspace>
        HDCG(1.0e-8, 100,
             FineHermOp,
             Smoother,
             HPDSolveSloppy,
             HPDSolve,
             Aggregates);

      TwoLevelADEF2<LatticeFermion,CoarseVector,Subspace>
        HDCGdefl(1.0e-8, 100,
                 FineHermOp,
                 Smoother,
                 cADEF1,
                 HPDSolve,
                 Aggregates);
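      // Two variants of the two-level ADEF2 (HDCG) solver are exercised: HDCG uses the
      // sloppy deflated coarse CG (HPDSolveSloppy) for the coarse correction, while
      // HDCGdefl swaps in the cADEF1 solver constructed above; both keep the fine
      // Chebyshev smoother and the tighter HPDSolve (apparently for the starting
      // vectors, by analogy with the mrhs construction further down).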
      result=Zero();
      HDCGdefl(src,result);

      result=Zero();
      HDCG(src,result);


    }
  std::cout << "**************************************"<<std::endl;
  std::cout << " Recompute coarse evecs "<<std::endl;
  std::cout << "**************************************"<<std::endl;
  evec.resize(Nm,Coarse5d);
  eval.resize(Nm);
  for(int r=0;r<nrhs;r++){
    random(CRNG,c_src[r]);
  }

  // Standard CG
  result=Zero();
  CGfine(HermOpEO, src, result);
  IRL.calc(eval,evec,c_src,Nconv,LanczosType::irbl);

  ///////////////////////
  // Deflation guesser object
  ///////////////////////
  std::cout << "**************************************"<<std::endl;
  std::cout << " Reimport coarse evecs "<<std::endl;
  std::cout << "**************************************"<<std::endl;
  MultiRHSDeflation<CoarseVector> MrhsGuesser;
  MrhsGuesser.ImportEigenBasis(evec,eval);

  //////////////////////////
  // Extra HDCG parameters
  //////////////////////////
  int maxit=3000;
  ConjugateGradient<CoarseVector> CG(5.0e-2,maxit,false);
  RealD lo=2.0;
  int ord = 7;

  DoNothingGuesser<CoarseVector> DoNothing;
  HPDSolver<CoarseVector> HPDSolveMrhs(MrhsCoarseOp,CG,DoNothing);

  /////////////////////////////////////////////////
  // Mirs smoother
  /////////////////////////////////////////////////
  RealD MirsShift = lo;
  ShiftedHermOpLinearOperator<LatticeFermionD> ShiftedFineHermOp(HermOpEO,MirsShift);
  CGSmoother<LatticeFermionD> CGsmooth(ord,ShiftedFineHermOp) ;
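  // "Mirs" smoother: a fixed number (ord) of CG iterations on the shifted operator
  // HermOpEO + MirsShift. The shift (lo=2.0) regularises the low end of the spectrum,
  // the idea being that the smoother cheaply damps high-mode error and leaves the low
  // modes to the coarse-space correction; the inner CG is not expected to converge.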

  TwoLevelADEF2mrhs<LatticeFermion,CoarseVector>
    HDCGmrhs(1.0e-8, 500,
             FineHermOp,
             CGsmooth,
             HPDSolveMrhs, // Used in M1
             HPDSolveMrhs, // Used in Vstart
             MrhsProjector,
             MrhsGuesser,
             CoarseMrhs);

  std::vector<LatticeFermionD> src_mrhs(nrhs,FrbGrid);
  std::vector<LatticeFermionD> res_mrhs(nrhs,FrbGrid);

  for(int r=0;r<nrhs;r++){
    random(RNG5,src_mrhs[r]);
    res_mrhs[r]=Zero();
  }

  HDCGmrhs(src_mrhs,res_mrhs);

  // Standard CG
#if 1
  {
    std::cout << "**************************************"<<std::endl;
    std::cout << "Calling red black CG"<<std::endl;
    std::cout << "**************************************"<<std::endl;

    LatticeFermion result(FrbGrid); result=Zero();
    LatticeFermion src(FrbGrid); random(RNG5,src);
    result=Zero();

    ConjugateGradient<LatticeFermionD> CGfine(1.0e-8,30000,false);
    CGfine(HermOpEO, src, result);
  }
#endif
  Grid_finalize();
  return 0;
}

@ -145,7 +145,7 @@ int main (int argc, char ** argv)
  Grid_init(&argc,&argv);

  const int Ls=24;
  const int nbasis = 60;
  const int nbasis = 62;
  const int cb = 0 ;
  RealD mass=0.00078;
  RealD M5=1.8;
@ -160,7 +160,7 @@ int main (int argc, char ** argv)
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

  // Construct a coarsened grid with 4^4 cell
  Coordinate Block({4,4,4,4});
  Coordinate Block({4,4,6,4});
  Coordinate clatt = GridDefaultLatt();
  for(int d=0;d<clatt.size();d++){
    clatt[d] = clatt[d]/Block[d];
396
tests/debug/Test_general_coarse_hdcg_phys96_mixed.cc
Normal file
@ -0,0 +1,396 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./tests/Test_general_coarse_hdcg.cc

    Copyright (C) 2023

Author: Peter Boyle <pboyle@bnl.gov>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h>
#include <Grid/algorithms/iterative/AdefMrhs.h>

using namespace std;
using namespace Grid;

template<class aggregation>
void SaveBasis(aggregation &Agg,std::string file)
{
#ifdef HAVE_LIME
  emptyUserRecord record;
  ScidacWriter WR(Agg.FineGrid->IsBoss());
  WR.open(file);
  for(int b=0;b<Agg.subspace.size();b++){
    WR.writeScidacFieldRecord(Agg.subspace[b],record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
    // WR.writeScidacFieldRecord(Agg.subspace[b],record);
  }
  WR.close();
#endif
}
template<class aggregation>
void LoadBasis(aggregation &Agg, std::string file)
{
#ifdef HAVE_LIME
  emptyUserRecord record;
  ScidacReader RD ;
  RD.open(file);
  for(int b=0;b<Agg.subspace.size();b++){
    RD.readScidacFieldRecord(Agg.subspace[b],record,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
    // RD.readScidacFieldRecord(Agg.subspace[b],record,0);
  }
  RD.close();
#endif
}
template<class CoarseVector>
void SaveEigenvectors(std::vector<RealD> &eval,
                      std::vector<CoarseVector> &evec,
                      std::string evec_file,
                      std::string eval_file)
{
#ifdef HAVE_LIME
  emptyUserRecord record;
  ScidacWriter WR(evec[0].Grid()->IsBoss());
  WR.open(evec_file);
  for(int b=0;b<evec.size();b++){
    WR.writeScidacFieldRecord(evec[b],record,0,0);
  }
  WR.close();
  XmlWriter WRx(eval_file);
  write(WRx,"evals",eval);
#endif
}
template<class CoarseVector>
void LoadEigenvectors(std::vector<RealD> &eval,
                      std::vector<CoarseVector> &evec,
                      std::string evec_file,
                      std::string eval_file)
{
#ifdef HAVE_LIME
  XmlReader RDx(eval_file);
  read(RDx,"evals",eval);
  emptyUserRecord record;

  Grid::ScidacReader RD ;
  RD.open(evec_file);
  assert(evec.size()==eval.size());
  for(int k=0;k<eval.size();k++) {
    RD.readScidacFieldRecord(evec[k],record);
  }
  RD.close();
#endif
}

// Want Op in CoarsenOp to call MatPcDagMatPc
template<class Field>
class HermOpAdaptor : public LinearOperatorBase<Field>
{
  LinearOperatorBase<Field> & wrapped;
public:
  HermOpAdaptor(LinearOperatorBase<Field> &wrapme) : wrapped(wrapme) {};
  void Op (const Field &in, Field &out) { wrapped.HermOp(in,out); }
  void HermOp(const Field &in, Field &out) { wrapped.HermOp(in,out); }
  void AdjOp (const Field &in, Field &out){ wrapped.HermOp(in,out); }
  void OpDiag (const Field &in, Field &out) { assert(0); }
  void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
  void OpDirAll (const Field &in, std::vector<Field> &out) { assert(0); };
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
};

template<class Field> class CGSmoother : public LinearFunction<Field>
{
public:
  using LinearFunction<Field>::operator();
  typedef LinearOperatorBase<Field> FineOperator;
  FineOperator & _SmootherOperator;
  int iters;
  CGSmoother(int _iters, FineOperator &SmootherOperator) :
    _SmootherOperator(SmootherOperator),
    iters(_iters)
  {
    std::cout << GridLogMessage<<" Mirs smoother order "<<iters<<std::endl;
  };
  void operator() (const Field &in, Field &out)
  {
    ConjugateGradient<Field> CG(0.0,iters,false); // non-converge is just fine in a smoother

    out=Zero();

    CG(_SmootherOperator,in,out);
  }
};
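// With the tolerance set to 0.0 the inner CG never terminates early: the smoother always
// performs exactly 'iters' steps, i.e. it applies a low, fixed-degree Krylov polynomial of
// the (shifted) smoothing operator to the input, which is all that is asked of a smoother.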


int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  const int Ls=24;
  const int nbasis = 60;
  const int cb = 0 ;
  RealD mass=0.00078;
  RealD M5=1.8;
  RealD b=1.5;
  RealD c=0.5;

  GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
                                                         GridDefaultSimd(Nd,vComplex::Nsimd()),
                                                         GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

  // Construct a coarsened grid with 4^4 cell
  // Coordinate Block({4,4,6,4});
  Coordinate Block({4,4,4,4});
  Coordinate clatt = GridDefaultLatt();
  for(int d=0;d<clatt.size();d++){
    clatt[d] = clatt[d]/Block[d];
  }

  GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt,
                                                           GridDefaultSimd(Nd,vComplex::Nsimd()),
                                                           GridDefaultMpi());;
  GridCartesian *Coarse5d = SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);

  ///////////////////////// RNGs /////////////////////////////////
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
  std::vector<int> cseeds({5,6,7,8});

  GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
  GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);

  ///////////////////////// Configuration /////////////////////////////////
  LatticeGaugeField Umu(UGrid);

  FieldMetaData header;
  std::string file("/lustre/orion/phy157/proj-shared/phy157_dwf/lehner/ensemble-Ha/ckpoint_lat.2250");
  NerscIO::readConfiguration(Umu,header,file);

  //////////////////////// Fermion action //////////////////////////////////
  MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c);

  SchurDiagMooeeOperator<MobiusFermionD, LatticeFermion> HermOpEO(Ddwf);

  typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix;
  HermFineMatrix FineHermOp(HermOpEO);

  ////////////////////////////////////////////////////////////
  ///////////// Coarse basis and Little Dirac Operator ///////
  ////////////////////////////////////////////////////////////
  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
  typedef LittleDiracOperator::CoarseVector CoarseVector;

  NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d);

  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
  Subspace Aggregates(Coarse5d,FrbGrid,cb);

  ////////////////////////////////////////////////////////////
  // Need to check about red-black grid coarsening
  ////////////////////////////////////////////////////////////

  // std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys96.mixed.2500.60");
  std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Refine.phys96.mixed.2500.60");
  std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Refine.phys96.mixed.2500.60_v2");
  std::string ldop_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.phys96.mixed.60");
  std::string evec_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/evecs.scidac");
  std::string eval_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/eval.xml");
  bool load_agg=true;
  bool load_refine=true;
  bool load_mat=false;
  bool load_evec=false;

  int refine=1;
  if ( load_agg ) {
    if ( !(refine) || (!load_refine) ) {
      LoadBasis(Aggregates,subspace_file);
    }
  } else {
    Aggregates.CreateSubspaceChebyshevNew(RNG5,HermOpEO,95.);
    SaveBasis(Aggregates,subspace_file);
  }

  if ( load_refine ) {
    std::cout << " Load Refine "<< refine_file <<std::endl;
    LoadBasis(Aggregates,refine_file);
  } else {
    Aggregates.RefineSubspace(HermOpEO,0.001,3.0e-4,3000); // 172 iters
    // Aggregates.RefineSubspace(HermOpEO,0.001,3.0e-4,2500); // 172 iters
    SaveBasis(Aggregates,refine_file);
  }
  Aggregates.Orthogonalise();

  std::cout << "**************************************"<<std::endl;
  std::cout << "Building MultiRHS Coarse operator"<<std::endl;
  std::cout << "**************************************"<<std::endl;

  const int nrhs=12;

  Coordinate mpi=GridDefaultMpi();
  Coordinate rhMpi ({1,1,mpi[0],mpi[1],mpi[2],mpi[3]});
  Coordinate rhLatt({nrhs,1,clatt[0],clatt[1],clatt[2],clatt[3]});
  Coordinate rhSimd({vComplex::Nsimd(),1, 1,1,1,1});

  GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi);
  typedef MultiGeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> MultiGeneralCoarsenedMatrix_t;
  MultiGeneralCoarsenedMatrix_t mrhs(geom,CoarseMrhs);

  ///////////////////////
  // Deflation guesser object
  ///////////////////////
  MultiRHSDeflation<CoarseVector> MrhsGuesser;

  //////////////////////////////////////////
  // Block projector for coarse/fine
  //////////////////////////////////////////
  MultiRHSBlockProject<LatticeFermionD> MrhsProjector;

  std::cout << "**************************************"<<std::endl;
  std::cout << "Coarsen after refine"<<std::endl;
  std::cout << "**************************************"<<std::endl;
  mrhs.CoarsenOperator(FineHermOp,Aggregates,Coarse5d);

  std::cout << "**************************************"<<std::endl;
  std::cout << " Coarse Lanczos "<<std::endl;
  std::cout << "**************************************"<<std::endl;

  typedef HermitianLinearOperator<MultiGeneralCoarsenedMatrix_t,CoarseVector> MrhsHermMatrix;
  // Chebyshev<CoarseVector> IRLCheby(0.0012,42.0,301); // 4.4.6.4
  // Chebyshev<CoarseVector> IRLCheby(0.0012,42.0,501); // for 4.4.4.4 blocking 350 evs
  // Chebyshev<CoarseVector> IRLCheby(0.0014,42.0,501); // for 4.4.4.4 blocking 700 evs
  // Chebyshev<CoarseVector> IRLCheby(0.002,42.0,501); // for 4.4.4.4 blocking 1226 evs
  // Chebyshev<CoarseVector> IRLCheby(0.0025,42.0,501); // for 4.4.4.4 blocking 1059 evs
  // 3e-4,2);
  Chebyshev<CoarseVector> IRLCheby(0.0018,42.0,301); // for 4.4.4.4 blocking // 790 evs
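  // The Chebyshev filter maps [0.0018,42.0] (42.0 an upper bound consistent with the
  // power-method estimate below) into [-1,1]; eigenmodes below the lower edge are strongly
  // amplified and so emerge first from the Lanczos process. The commented alternatives
  // record the observed trade-off between the lower edge / order and the number of
  // eigenvectors obtained for each blocking.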

  MrhsHermMatrix MrhsCoarseOp (mrhs);

  CoarseVector pm_src(CoarseMrhs);
  pm_src = ComplexD(1.0);
  PowerMethod<CoarseVector> cPM; cPM(MrhsCoarseOp,pm_src);

  // int Nk=nrhs*30; // 4.4.6.4
  // int Nk=nrhs*80;
  int Nk=nrhs*60; // 720
  int Nm=Nk*4; // 2880 ; generally finishes at 1440
  int Nstop=512;
  int Nconv_test_interval=1;

  ImplicitlyRestartedBlockLanczosCoarse<CoarseVector> IRL(MrhsCoarseOp,
                                                          Coarse5d,
                                                          CoarseMrhs,
                                                          nrhs,
                                                          IRLCheby,
                                                          Nstop,
                                                          Nconv_test_interval,
                                                          nrhs,
                                                          Nk,
                                                          Nm,
                                                          3e-4,2);
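  // Sizing: the Krylov space grows in blocks of nrhs=12 vectors, Nk = 12*60 = 720, with
  // restart headroom Nm = 4*Nk = 2880 (the comment above notes runs generally finish near
  // 1440), and Nstop = 512 eigenpairs are retained for deflation; 3e-4,2 are presumably
  // the residual tolerance and a restart cap.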

  std::vector<RealD> eval(Nm);
  std::vector<CoarseVector> evec(Nm,Coarse5d);
  std::vector<CoarseVector> c_src(nrhs,Coarse5d);

  std::cout << "**************************************"<<std::endl;
  std::cout << " Recompute coarse evecs "<<std::endl;
  std::cout << "**************************************"<<std::endl;
  evec.resize(Nm,Coarse5d);
  eval.resize(Nm);
  for(int r=0;r<nrhs;r++){
    random(CRNG,c_src[r]);
  }
  int Nconv;
  IRL.calc(eval,evec,c_src,Nconv,LanczosType::rbl);
  Nconv = eval.size();
  std::cout << "**************************************"<<std::endl;
  std::cout << " import coarse evecs "<<std::endl;
  std::cout << "**************************************"<<std::endl;
  MrhsGuesser.ImportEigenBasis(evec,eval);

  std::cout << "**************************************"<<std::endl;
  std::cout << "Calling mRHS HDCG"<<std::endl;
  std::cout << "**************************************"<<std::endl;
  MrhsProjector.Allocate(nbasis,FrbGrid,Coarse5d);
  MrhsProjector.ImportBasis(Aggregates.subspace);

  //////////////////////////
  // Extra HDCG parameters
  //////////////////////////
  int maxit=3000;
  ConjugateGradient<CoarseVector> CG(7.5e-2,maxit,false);
  RealD lo=2.0;
  int ord = 7;

  DoNothingGuesser<CoarseVector> DoNothing;
  HPDSolver<CoarseVector> HPDSolveMrhs(MrhsCoarseOp,CG,DoNothing);
  HPDSolver<CoarseVector> HPDSolveMrhsRefine(MrhsCoarseOp,CG,DoNothing);

  /////////////////////////////////////////////////
  // Mirs smoother
  /////////////////////////////////////////////////
  RealD MirsShift = lo;
  ShiftedHermOpLinearOperator<LatticeFermionD> ShiftedFineHermOp(HermOpEO,MirsShift);
  CGSmoother<LatticeFermionD> CGsmooth(ord,ShiftedFineHermOp) ;


  TwoLevelADEF2mrhs<LatticeFermion,CoarseVector>
    HDCGmrhs(1.0e-8, 500,
             FineHermOp,
             CGsmooth,
             HPDSolveMrhs, // Used in M1
             HPDSolveMrhs, // Used in Vstart
             MrhsProjector,
             MrhsGuesser,
             CoarseMrhs);
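  // HDCGmrhs assembles the multi-RHS two-level ADEF2 solver: the fine operator with the
  // Mirs CG smoother, the coarse multi-RHS CG (HPDSolveMrhs) used both in the
  // preconditioner (M1) and for the starting vectors (Vstart), the block projector
  // between fine and coarse spaces, and the eigenvector deflation guesser, all living on
  // the CoarseMrhs grid.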

  std::vector<LatticeFermionD> src_mrhs(nrhs,FrbGrid);
  std::vector<LatticeFermionD> res_mrhs(nrhs,FrbGrid);

  for(int r=0;r<nrhs;r++){
    random(RNG5,src_mrhs[r]);
    res_mrhs[r]=Zero();
  }

  HDCGmrhs(src_mrhs,res_mrhs);

  // Standard CG
#if 0
  {
    std::cout << "**************************************"<<std::endl;
    std::cout << "Calling red black CG"<<std::endl;
    std::cout << "**************************************"<<std::endl;

    LatticeFermion result(FrbGrid); result=Zero();
    LatticeFermion src(FrbGrid); random(RNG5,src);
    result=Zero();

    ConjugateGradient<LatticeFermionD> CGfine(1.0e-8,30000,false);
    CGfine(HermOpEO, src, result);
  }
#endif
  Grid_finalize();
  return 0;
}