mirror of https://github.com/paboyle/Grid.git synced 2025-06-13 04:37:05 +01:00

Compare commits


60 Commits

SHA1 Message Date
bffd30abec Optimise lie algebra project 2024-09-19 15:48:09 -04:00
da919949f9 Clean up the accelerator pick/set checkerboard 2024-08-23 12:34:41 -04:00
b12b4fdaff Attempt at operating on half checkerboard 2024-08-23 11:05:09 -04:00
557fa483ff Blas benchmark committed stand alone 2024-08-20 16:18:43 +00:00
fc15d55df6 Mallinfo 2024-08-20 14:33:09 +00:00
53573d7d94 Better benchmark 2024-08-20 14:31:57 +00:00
bb3c177000 Better benchmarking 2024-08-20 14:31:41 +00:00
a3322b470f Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-08-20 14:30:52 +00:00
f8f408e7a9 BLAS everywhere 2024-07-25 18:09:02 +00:00
baac1127d0 Later intel compiler happiness 2024-07-25 18:06:05 +00:00
6f1328160c Remove SVM use 2024-07-25 18:05:40 +00:00
04cf902791 Mallinfo and ASAN hooks 2024-07-25 18:04:56 +00:00
7a5b1c1a19 Try Catch convenience macro 2024-07-25 18:03:41 +00:00
18d2d7da4a Eigen implementation and SYCL implementation 2024-07-25 18:02:56 +00:00
b461184797 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-07-23 09:53:58 -04:00
4563b39305 New Frontier config 2024-07-23 09:53:08 -04:00
c9d5674d5b Final for paper 2024-07-22 15:26:45 -04:00
486412635a 8^4 test for PETSc 2024-07-22 15:25:17 -04:00
8b23a1546a Force compile temporarily 2024-07-22 15:24:56 -04:00
a901e4e369 Regressed performance for paper 2024-07-22 15:24:04 -04:00
804d9367d4 Regressed performance 2024-07-22 15:23:25 -04:00
41d8adca95 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-07-11 15:38:45 +00:00
059e8e5bb0 New compile option 2024-07-11 15:37:30 +00:00
b3ee8ded96 Respect command line 2024-07-11 15:34:48 +00:00
cf3584ad15 Convenient to monitor memory across an HMC trajectory 2024-07-11 15:30:32 +00:00
a66973163f Device vector not UVM 2024-07-11 15:24:11 +00:00
4502a8c8a1 libc malloc heap info dump on Linux 2024-07-11 15:22:18 +00:00
9c902e4c2d Batched blas, but not working yet on OneAPI 2024-07-11 15:19:49 +00:00
f3eb36adcf Namespace addition 2024-07-11 15:19:19 +00:00
7c246606c1 Schur additional case 2024-07-10 22:04:32 +00:00
172c75029e Redblack additional case 2024-07-10 22:03:59 +00:00
6ae52da571 LLVM leak sanitizer 2024-07-08 15:59:18 +00:00
4ee9c68053 Updated compile environment 2024-07-08 15:57:57 +00:00
a15b4378a3 Sanitizer preservation of options 2024-07-08 15:57:45 +00:00
89fdd7f8dd AOT compilation 2024-07-05 17:47:56 +00:00
c328be24b7 Sanitizer compile options 2024-07-05 17:46:43 +00:00
a73dc6dbf4 Display linux heap info 2024-06-28 16:05:17 +00:00
eee2a2657f Try catch exception wrappers 2024-06-28 16:02:29 +00:00
12b8be7cb9 Best so far on 96^3 350 Evecs converged on 4^4 block 2024-06-18 16:31:37 -04:00
63c223ea5d Verbose 2024-06-18 03:22:01 +00:00
2877fb4a2c More verbose if alloc failure 2024-06-18 03:21:03 +00:00
d299c86633 Std::asin,acos 2024-06-11 16:41:23 -04:00
6ce52092e8 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-06-11 15:16:58 -04:00
b5926c1d21 Broadcast time info 2024-06-11 15:16:25 -04:00
9563238e9b Force initial to identity 2024-06-11 17:51:58 +00:00
fb9b1d76ca Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-06-11 16:48:16 +00:00
1739146599 Property to initialise reduction 2024-06-11 16:47:35 +00:00
ed20b39ab3 Log files from Frontier benchmark 2024-06-11 11:16:20 -04:00
284fc05f15 Protect vs. missing LIME library 2024-06-11 11:08:00 -04:00
07a07b6fa3 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-06-10 15:09:25 -04:00
dc80b08969 96^3 test 2024-06-10 15:07:29 -04:00
a49a161f8d SYCL update to use buffer on reduction variable 2024-06-08 16:05:18 +00:00
a6479ca50f Shuhei's ComputeWilsonFlow main programme 2024-06-05 15:51:11 -04:00
0e607a55e7 Updated for 8^4 test 2024-05-26 20:53:05 +00:00
c4b9f71357 CPU compile ordering is important 2024-05-21 02:22:32 +01:00
394e506aea Compile options for tursa update 2024-05-21 02:10:04 +01:00
e19b26341b Tursa configure update 2024-05-21 01:14:27 +01:00
cfe1b13225 Back out zero change 2024-05-21 01:14:08 +01:00
890c5ea1cd Warning disable 2024-05-20 20:08:31 +01:00
a87378d3b6 Update 2024-05-20 20:08:31 +01:00
45 changed files with 3589 additions and 665 deletions

File diff suppressed because it is too large.


@ -0,0 +1,2 @@
mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench


@ -30,9 +30,14 @@ directory
#include <type_traits>
#include <cassert>
#include <exception>
#define NAMESPACE_BEGIN(A) namespace A {
#define NAMESPACE_END(A) }
#define GRID_NAMESPACE_BEGIN NAMESPACE_BEGIN(Grid)
#define GRID_NAMESPACE_END NAMESPACE_END(Grid)
#define NAMESPACE_CHECK(x) struct namespaceTEST##x {}; static_assert(std::is_same<namespaceTEST##x, ::namespaceTEST##x>::value,"Not in :: at" );
#define EXCEPTION_CHECK_BEGIN(A) try {
#define EXCEPTION_CHECK_END(A) } catch ( std::exception e ) { BACKTRACEFP(stderr); std::cerr << __PRETTY_FUNCTION__ << " : " <<__LINE__<< " Caught exception "<<e.what()<<std::endl; throw; }
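A minimal usage sketch of the try/catch wrappers defined above (the routine and its body are hypothetical; BACKTRACEFP is assumed to be Grid's existing backtrace helper used by the macro):

void ExampleGuardedRoutine(void)
{
  EXCEPTION_CHECK_BEGIN(example);
  // ... any body that may throw a std::exception, e.g. a large allocation ...
  EXCEPTION_CHECK_END(example); // on a throw: backtrace and location are printed, then the exception is rethrown
}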


@ -89,9 +89,10 @@ public:
gridblasHandle = theGridAccelerator;
#endif
#ifdef GRID_ONE_MKL
cl::sycl::cpu_selector selector;
cl::sycl::gpu_selector selector;
cl::sycl::device selectedDevice { selector };
gridblasHandle =new sycl::queue (selectedDevice);
cl::sycl::property_list q_prop{cl::sycl::property::queue::in_order()};
gridblasHandle =new sycl::queue (selectedDevice,q_prop);
#endif
gridblasInit=1;
}
@ -207,6 +208,9 @@ public:
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
assert(OpB!=GridBLAS_OP_T);
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
@ -266,26 +270,130 @@ public:
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
//MKLs cblas_<T>gemm_batch & OneAPI
#warning "oneMKL implementation not built "
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
// Need a default/reference implementation
int sda = lda*k;
int sdb = ldb*k;
int sdc = ldc*n;
for (int p = 0; p < batchCount; ++p) {
for (int mm = 0; mm < m; ++mm) {
for (int nn = 0; nn < n; ++nn) {
ComplexD c_mn(0.0);
for (int kk = 0; kk < k; ++kk)
c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
Cmn[p][mm + nn*ldc] = (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
int64_t m64=m;
int64_t n64=n;
int64_t k64=k;
int64_t lda64=lda;
int64_t ldb64=ldb;
int64_t ldc64=ldc;
int64_t batchCount64=batchCount;
oneapi::mkl::transpose iOpA;
oneapi::mkl::transpose iOpB;
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
&iOpA,
&iOpB,
&m64,&n64,&k64,
(ComplexD *) &alpha_p[0],
(const ComplexD **)&Amk[0], (const int64_t *)&lda64,
(const ComplexD **)&Bkn[0], (const int64_t *)&ldb64,
(ComplexD *) &beta_p[0],
(ComplexD **)&Cmn[0], (const int64_t *)&ldc64,
(int64_t)1,&batchCount64,std::vector<sycl::event>());
synchronise();
#if 0
// This code was used to check the mat mul on Sunspot/OneMKL
std::cerr << " Called SYCL batched ZGEMM OpA "<< OpA << " OpB "<<OpB <<std::endl;
std::vector<ComplexD> A(m*k); // pointer list to matrices
std::vector<ComplexD> B(k*n);
std::vector<ComplexD> C(m*n);
// int sda = lda*k;
// int sdb = ldb*k;
// int sdc = ldc*n;
std::cerr << " Checking the GEMM results "<<std::endl;
for (int p = 0; p < 1; ++p) {
ComplexD * Amk_p; // pointer list to matrices
ComplexD * Bkn_p; // pointer list to matrices
ComplexD * Cmn_p; // pointer list to matrices
acceleratorCopyFromDevice((void *)&Amk[p],(void *)&Amk_p,sizeof(ComplexD*));
acceleratorCopyFromDevice((void *)&Bkn[p],(void *)&Bkn_p,sizeof(ComplexD*));
acceleratorCopyFromDevice((void *)&Cmn[p],(void *)&Cmn_p,sizeof(ComplexD*));
std::cerr << " p " << p << " copied pointers "<<std::endl;
acceleratorCopyFromDevice((void *)Amk_p,(void *)&A[0],m*k*sizeof(ComplexD));
acceleratorCopyFromDevice((void *)Bkn_p,(void *)&B[0],k*n*sizeof(ComplexD));
acceleratorCopyFromDevice((void *)Cmn_p,(void *)&C[0],m*n*sizeof(ComplexD));
std::cerr << " p " << p << " copied matrices "<<std::endl;
std::cerr << " C[0] "<<C[0]<<std::endl;
std::cerr << " A[0] "<<A[0]<<std::endl;
std::cerr << " B[0] "<<B[0]<<std::endl;
std::cerr << " m "<<m<<std::endl;
std::cerr << " n "<<n<<std::endl;
std::cerr << " k "<<k<<std::endl;
for (int mm = 0; mm < m; ++mm) {
for (int nn = 0; nn < n; ++nn) {
ComplexD c_mn(0.0);
for (int kk = 0; kk < k; ++kk) {
int idx_a, idx_b;
// int lda = m; // m x k column major
// int ldb = k; // k x n column major
// int ldc = m; // m x b column major
if(OpA!=GridBLAS_OP_N) {
idx_a =kk + mm*lda;
} else {
idx_a =mm + kk*lda;
}
if(OpB!=GridBLAS_OP_N) {
idx_b =nn + kk*ldb;
} else {
idx_b =kk + nn*ldb;
}
// std::cerr << " idx_a "<<idx_a<<" idx_b "<<idx_b<<std::endl;
ComplexD Ac = A[idx_a];
ComplexD Bc = B[idx_b];
if(OpA==GridBLAS_OP_C) Ac = conjugate(Ac);
if(OpB==GridBLAS_OP_C) Bc = conjugate(Bc);
c_mn += Ac*Bc;
}
std::cerr << " beta "<<beta<<" alpha "<<alpha<<" C_"<<mm<<","<<nn<<" "<<c_mn<<" "<<C[mm + nn*ldc]<<std::endl;
}
}
}
}
#endif
// synchronise();
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
// Need a default/reference implementation; use Eigen
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
});
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
} );
} else {
assert(0);
}
#endif
RealD t1=usecond();
RealD flops = 8.0*m*n*k*batchCount;
RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount;
@ -306,6 +414,9 @@ public:
RealD t2=usecond();
int32_t batchCount = Amk.size();
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
assert(OpB!=GridBLAS_OP_T);
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
@ -366,26 +477,69 @@ public:
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
//MKLs cblas_<T>gemm_batch & OneAPI
#warning "oneMKL implementation not built "
int64_t m64=m;
int64_t n64=n;
int64_t k64=k;
int64_t lda64=lda;
int64_t ldb64=ldb;
int64_t ldc64=ldc;
int64_t batchCount64=batchCount;
oneapi::mkl::transpose iOpA;
oneapi::mkl::transpose iOpB;
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
&iOpA,
&iOpB,
&m64,&n64,&k64,
(ComplexF *) &alpha_p[0],
(const ComplexF **)&Amk[0], (const int64_t *)&lda64,
(const ComplexF **)&Bkn[0], (const int64_t *)&ldb64,
(ComplexF *) &beta_p[0],
(ComplexF **)&Cmn[0], (const int64_t *)&ldc64,
(int64_t)1,&batchCount64,std::vector<sycl::event>());
synchronise();
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
int sda = lda*k;
int sdb = ldb*k;
int sdc = ldc*n;
ComplexF alphaf(real(alpha),imag(alpha));
ComplexF betaf(real(beta),imag(beta));
// Need a default/reference implementation
for (int p = 0; p < batchCount; ++p) {
for (int mm = 0; mm < m; ++mm) {
for (int nn = 0; nn < n; ++nn) {
ComplexF c_mn(0.0);
for (int kk = 0; kk < k; ++kk)
c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
Cmn[p][mm + nn*ldc] = (alphaf)*c_mn + (betaf)*Cmn[p][mm + nn*ldc ];
}
// Need a default/reference implementation; use Eigen
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
});
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
} );
} else {
assert(0);
}
}
#endif
RealD t1=usecond();
RealD flops = 8.0*m*n*k*batchCount;
@ -408,6 +562,9 @@ public:
RealD t2=usecond();
int32_t batchCount = Amk.size();
assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
assert(OpB!=GridBLAS_OP_C);
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
@ -467,24 +624,69 @@ public:
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
//MKLs cblas_<T>gemm_batch & OneAPI
#warning "oneMKL implementation not built "
int64_t m64=m;
int64_t n64=n;
int64_t k64=k;
int64_t lda64=lda;
int64_t ldb64=ldb;
int64_t ldc64=ldc;
int64_t batchCount64=batchCount;
oneapi::mkl::transpose iOpA;
oneapi::mkl::transpose iOpB;
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
&iOpA,
&iOpB,
&m64,&n64,&k64,
(float *) &alpha_p[0],
(const float **)&Amk[0], (const int64_t *)&lda64,
(const float **)&Bkn[0], (const int64_t *)&ldb64,
(float *) &beta_p[0],
(float **)&Cmn[0], (const int64_t *)&ldc64,
(int64_t)1,&batchCount64,std::vector<sycl::event>());
synchronise();
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
int sda = lda*k;
int sdb = ldb*k;
int sdc = ldc*n;
// Need a default/reference implementation
for (int p = 0; p < batchCount; ++p) {
for (int mm = 0; mm < m; ++mm) {
for (int nn = 0; nn < n; ++nn) {
RealD c_mn(0.0);
for (int kk = 0; kk < k; ++kk)
c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
Cmn[p][mm + nn*ldc] = (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
}
// Need a default/reference implementation; use Eigen
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
} );
} else {
assert(0);
}
}
#endif
RealD t1=usecond();
RealD flops = 2.0*m*n*k*batchCount;
@ -495,7 +697,6 @@ public:
///////////////////////////////////////////////////////////////////////////
// Double precision real GEMM
///////////////////////////////////////////////////////////////////////////
void gemmBatched(GridBLASOperation_t OpA,
GridBLASOperation_t OpB,
int m,int n, int k,
@ -508,6 +709,9 @@ public:
RealD t2=usecond();
int32_t batchCount = Amk.size();
assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
assert(OpB!=GridBLAS_OP_C);
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
@ -568,160 +772,124 @@ public:
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
/*
int64_t m64=m;
int64_t n64=n;
int64_t k64=k;
int64_t lda64=lda;
int64_t ldb64=ldb;
int64_t ldc64=ldc;
int64_t batchCount64=batchCount;
oneapi::mkl::blas::column_major::gemm_batch(*theGridAccelerator,
onemkl::transpose::N,
onemkl::transpose::N,
&m64,&n64,&k64,
(double *) &alpha_p[0],
(double **)&Amk[0], lda,
(double **)&Bkn[0], ldb,
(double *) &beta_p[0],
(double **)&Cmn[0], ldc,
1,&batchCount64);
*/
//MKLs cblas_<T>gemm_batch & OneAPI
#warning "oneMKL implementation not built "
oneapi::mkl::transpose iOpA;
oneapi::mkl::transpose iOpB;
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
&iOpA,
&iOpB,
&m64,&n64,&k64,
(double *) &alpha_p[0],
(const double **)&Amk[0], (const int64_t *)&lda64,
(const double **)&Bkn[0], (const int64_t *)&ldb64,
(double *) &beta_p[0],
(double **)&Cmn[0], (const int64_t *)&ldc64,
(int64_t)1,&batchCount64,std::vector<sycl::event>());
synchronise();
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
int sda = lda*k;
int sdb = ldb*k;
int sdc = ldc*n;
// Need a default/reference implementation
for (int p = 0; p < batchCount; ++p) {
for (int mm = 0; mm < m; ++mm) {
for (int nn = 0; nn < n; ++nn) {
RealD c_mn(0.0);
for (int kk = 0; kk < k; ++kk)
c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
Cmn[p][mm + nn*ldc] = (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
}
// Need a default/reference implementation; use Eigen
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
});
} else {
assert(0);
}
}
#endif
RealD t1=usecond();
RealD flops = 2.0*m*n*k*batchCount;
RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
}
////////////////////////////////////////////////////////////////////////////////////////////////
// Strided case used by benchmark, but generally unused in Grid
// Keep a code example in double complex, but don't generate the single and real variants for now
////////////////////////////////////////////////////////////////////////////////////////////////
void gemmStridedBatched(int m,int n, int k,
ComplexD alpha,
ComplexD* Amk, // pointer list to matrices
ComplexD* Bkn,
ComplexD beta,
ComplexD* Cmn,
int batchCount)
{
// Use C-row major storage, so transpose calls
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
int sda = m*k;
int sdb = k*n;
int sdc = m*n;
deviceVector<ComplexD> alpha_p(1);
deviceVector<ComplexD> beta_p(1);
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
// std::cout << "blasZgemmStridedBatched mnk "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
// std::cout << "blasZgemmStridedBatched ld "<<lda<<","<<ldb<<","<<ldc<<std::endl;
// std::cout << "blasZgemmStridedBatched sd "<<sda<<","<<sdb<<","<<sdc<<std::endl;
#ifdef GRID_HIP
auto err = hipblasZgemmStridedBatched(gridblasHandle,
HIPBLAS_OP_N,
HIPBLAS_OP_N,
m,n,k,
(hipblasDoubleComplex *) &alpha_p[0],
(hipblasDoubleComplex *) Amk, lda, sda,
(hipblasDoubleComplex *) Bkn, ldb, sdb,
(hipblasDoubleComplex *) &beta_p[0],
(hipblasDoubleComplex *) Cmn, ldc, sdc,
batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasZgemmStridedBatched(gridblasHandle,
CUBLAS_OP_N,
CUBLAS_OP_N,
m,n,k,
(cuDoubleComplex *) &alpha_p[0],
(cuDoubleComplex *) Amk, lda, sda,
(cuDoubleComplex *) Bkn, ldb, sdb,
(cuDoubleComplex *) &beta_p[0],
(cuDoubleComplex *) Cmn, ldc, sdc,
batchCount);
#endif
#if defined(GRID_SYCL) || defined(GRID_ONE_MKL)
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
oneapi::mkl::transpose::N,
oneapi::mkl::transpose::N,
m,n,k,
alpha,
(const ComplexD *)Amk,lda,sda,
(const ComplexD *)Bkn,ldb,sdb,
beta,
(ComplexD *)Cmn,ldc,sdc,
batchCount);
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
// Need a default/reference implementation
for (int p = 0; p < batchCount; ++p) {
for (int mm = 0; mm < m; ++mm) {
for (int nn = 0; nn < n; ++nn) {
ComplexD c_mn(0.0);
for (int kk = 0; kk < k; ++kk)
c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
Cmn[mm + nn*ldc + p*sdc] = (alpha)*c_mn + (beta)*Cmn[mm + nn*ldc + p*sdc];
}
}
}
#endif
}
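For orientation, a hedged host-side sketch of driving the strided interface above. It exercises only the reference path (no GRID_CUDA / GRID_HIP / GRID_SYCL / GRID_ONE_MKL build), and the enclosing class name GridBLAS and the object blas are assumptions:

#include <vector>
void gemmStridedBatchedExample(GridBLAS &blas) // class name assumed
{
  const int m=4, n=4, k=4, batch=8;
  // column-major m x k, k x n and m x n panels, packed contiguously per batch element
  std::vector<ComplexD> A(m*k*batch, ComplexD(1.0,0.0));
  std::vector<ComplexD> B(k*n*batch, ComplexD(1.0,0.0));
  std::vector<ComplexD> C(m*n*batch, ComplexD(0.0,0.0));
  blas.gemmStridedBatched(m,n,k,
                          ComplexD(1.0,0.0), &A[0], &B[0], // C = alpha A B + beta C
                          ComplexD(0.0,0.0), &C[0],
                          batch);
  // every entry of each C panel should now equal ComplexD(k,0.0)
}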
template<class CComplex>
double benchmark(int M, int N, int K, int BATCH)
{
int32_t N_A = M*K*BATCH;
int32_t N_B = K*N*BATCH;
int32_t N_C = M*N*BATCH;
deviceVector<ComplexD> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD));
deviceVector<ComplexD> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD));
deviceVector<ComplexD> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD));
ComplexD alpha(1.0);
ComplexD beta (1.0);
deviceVector<CComplex> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(CComplex));
deviceVector<CComplex> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(CComplex));
deviceVector<CComplex> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(CComplex));
CComplex alpha(1.0);
CComplex beta (1.0);
RealD flops = 8.0*M*N*K*BATCH;
int ncall=10;
int ncall=1000;
deviceVector<CComplex *> As(BATCH);
deviceVector<CComplex *> Bs(BATCH);
deviceVector<CComplex *> Cs(BATCH);
for(int b = 0 ; b < BATCH;b++) {
CComplex *ptr;
ptr = &A[b*M*K]; acceleratorPut(As[b],ptr);
ptr = &B[b*K*N]; acceleratorPut(Bs[b],ptr);
ptr = &C[b*M*N]; acceleratorPut(Cs[b],ptr);
}
// Warm up call
gemmBatched(M,N,K,
alpha,
As, // m x k
Bs, // k x n
beta,
Cs);
synchronise();
RealD t0 = usecond();
for(int i=0;i<ncall;i++){
gemmStridedBatched(M,N,K,
alpha,
&A[0], // m x k
&B[0], // k x n
beta,
&C[0], // m x n
BATCH);
gemmBatched(M,N,K,
alpha,
As, // m x k
Bs, // k x n
beta,
Cs);
synchronise();
}
synchronise();
RealD t1 = usecond();
RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K)*BATCH;
RealD bytes = 1.0*sizeof(CComplex)*(M*N*2+N*K+M*K)*BATCH;
flops = 8.0*M*N*K*BATCH*ncall;
flops = flops/(t1-t0)/1.e3;
return flops; // Returns gigaflops
}
};
NAMESPACE_END(Grid);
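The templated benchmark above can then be timed per precision; a brief hedged sketch (inside an initialised Grid program, again assuming the enclosing class is named GridBLAS):

GridBLAS blas;
double zgf = blas.benchmark<ComplexD>(64,64,64,256); // batched ZGEMM, returns GF/s
double cgf = blas.benchmark<ComplexF>(64,64,64,256); // batched CGEMM
std::cout << GridLogMessage << "ZGEMM " << zgf << " GF/s, CGEMM " << cgf << " GF/s" << std::endl;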


@ -279,11 +279,11 @@ public:
Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
diagonalize(eval2,lmd2,lme2,Nu,Nm,Nm,Qt,grid);
_sort.push(eval2,Nm);
// Glog << "#Ritz value before shift: "<< std::endl;
Glog << "#Ritz value before shift: "<< std::endl;
for(int i=0; i<Nm; ++i){
// std::cout.precision(13);
// std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
// std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
std::cout.precision(13);
std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
}
//----------------------------------------------------------------------
@ -297,7 +297,8 @@ public:
unpackHermitBlockTriDiagMatToEigen(lmd,lme,Nu,Nblock_m,Nm,Nm,BTDM);
for(int ip=Nk; ip<Nm; ++ip){
for(int ip=Nk; ip<Nm; ++ip){
Glog << " ip "<<ip<<" / "<<Nm<<std::endl;
shiftedQRDecompEigen(BTDM,Nu,Nm,eval2[ip],Q);
}
@ -325,7 +326,7 @@ public:
Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
diagonalize(eval2,lmd2,lme2,Nu,Nk,Nm,Qt,grid);
_sort.push(eval2,Nk);
// Glog << "#Ritz value after shift: "<< std::endl;
Glog << "#Ritz value after shift: "<< std::endl;
for(int i=0; i<Nk; ++i){
// std::cout.precision(13);
// std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
@ -467,10 +468,10 @@ public:
// set initial vector
for (int i=0; i<Nu; ++i) {
// Glog << "norm2(src[" << i << "])= "<< norm2(src[i]) << std::endl;
Glog << "norm2(src[" << i << "])= "<< norm2(src[i]) << std::endl;
evec[i] = src[i];
orthogonalize(evec[i],evec,i);
// Glog << "norm2(evec[" << i << "])= "<< norm2(evec[i]) << std::endl;
Glog << "norm2(evec[" << i << "])= "<< norm2(evec[i]) << std::endl;
}
// exit(-43);
@ -506,11 +507,11 @@ public:
Qt = Eigen::MatrixXcd::Identity(Nr,Nr);
diagonalize(eval2,lmd2,lme2,Nu,Nr,Nr,Qt,grid);
_sort.push(eval2,Nr);
// Glog << "#Ritz value: "<< std::endl;
Glog << "#Ritz value: "<< std::endl;
for(int i=0; i<Nr; ++i){
// std::cout.precision(13);
// std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
// std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
std::cout.precision(13);
std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
}
// Convergence test
@ -570,6 +571,7 @@ public:
Glog << fname + " NOT converged ; Summary :\n";
} else {
Glog << fname + " CONVERGED ; Summary :\n";
Nstop = Nconv_guess; // Just take them all
// Sort convered eigenpairs.
std::vector<Field> Btmp(Nstop,grid); // waste of space replicating
@ -642,7 +644,7 @@ private:
// for (int u=0; u<mrhs; ++u) Glog << " out["<<u<<"] = "<<norm2(out[u])<<std::endl;
k_start +=mrhs;
}
// Glog << "LinAlg "<< std::endl;
Glog << "LinAlg "<< std::endl;
if (b>0) {
for (int u=0; u<Nu; ++u) {
@ -676,7 +678,7 @@ private:
}
w_copy[u] = w[u];
}
// Glog << "LinAlg done"<< std::endl;
Glog << "LinAlg done"<< std::endl;
// In block version, the steps 6 and 7 in Lanczos construction is
// replaced by the QR decomposition of new basis block.
@ -689,15 +691,15 @@ private:
}
// re-orthogonalization for numerical stability
// Glog << "Gram Schmidt"<< std::endl;
Glog << "Gram Schmidt"<< std::endl;
orthogonalize(w,Nu,evec,R);
// QR part
for (int u=1; u<Nu; ++u) {
orthogonalize(w[u],w,u);
}
// Glog << "Gram Schmidt done "<< std::endl;
Glog << "Gram Schmidt done "<< std::endl;
// Glog << "LinAlg "<< std::endl;
Glog << "LinAlg "<< std::endl;
for (int u=0; u<Nu; ++u) {
//for (int v=0; v<Nu; ++v) {
for (int v=u; v<Nu; ++v) {
@ -714,7 +716,7 @@ private:
// Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl;
}
}
// Glog << "LinAlg done "<< std::endl;
Glog << "LinAlg done "<< std::endl;
if (b < Nm/Nu-1) {
for (int u=0; u<Nu; ++u) {
@ -779,7 +781,7 @@ private:
for ( int u=0; u<Nu; ++u ) {
for (int k=0; k<Nk; ++k ) {
// Glog << "lmd "<<u<<" "<<k<<" "<<lmd[u][k] -conjugate(lmd[u][k])<<std::endl;
// Glog << "lmd "<<u<<" "<<k<<" "<<lmd[u][k] -conjugate(lmd[u][k])<<std::endl;
BlockTriDiag(k,u+(k/Nu)*Nu) = lmd[u][k];
}
}
@ -933,7 +935,7 @@ if (1){
int Nu, int Nb, int Nk, int Nm,
Eigen::MatrixXcd& M)
{
//Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n';
Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n';
assert( Nk%Nu == 0 && Nm%Nu == 0 );
assert( Nk <= Nm );
M = Eigen::MatrixXcd::Zero(Nk,Nk);
@ -951,7 +953,7 @@ if (1){
M(u+(k/Nu)*Nu,k-Nu) = lme[u][k-Nu];
}
}
//Glog << "unpackHermitBlockTriDiagMatToEigen() end" << endl;
Glog << "unpackHermitBlockTriDiagMatToEigen() end" << std::endl;
}
@ -961,7 +963,7 @@ if (1){
int Nu, int Nb, int Nk, int Nm,
Eigen::MatrixXcd& M)
{
//Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n';
Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n';
assert( Nk%Nu == 0 && Nm%Nu == 0 );
assert( Nk <= Nm );
@ -977,7 +979,7 @@ if (1){
lme[u][k-Nu] = M(u+(k/Nu)*Nu,k-Nu);
}
}
//Glog << "packHermitBlockTriDiagMatfromEigen() end" << endl;
Glog << "packHermitBlockTriDiagMatfromEigen() end" <<std::endl;
}
@ -986,7 +988,7 @@ if (1){
RealD Dsh,
Eigen::MatrixXcd& Qprod)
{
//Glog << "shiftedQRDecompEigen() begin" << '\n';
Glog << "shiftedQRDecompEigen() begin" << '\n';
Eigen::MatrixXcd Q = Eigen::MatrixXcd::Zero(Nm,Nm);
Eigen::MatrixXcd R = Eigen::MatrixXcd::Zero(Nm,Nm);
Eigen::MatrixXcd Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
@ -1002,6 +1004,7 @@ if (1){
// lower triangular part used to represent series
// of Q sequence.
Glog << "shiftedQRDecompEigen() Housholder & QR" << '\n';
// equivalent operation of Qprod *= Q
//M = Eigen::MatrixXcd::Zero(Nm,Nm);
@ -1022,6 +1025,7 @@ if (1){
Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
Glog << "shiftedQRDecompEigen() Mtmp create" << '\n';
for (int i=0; i<Nm; ++i) {
for (int j=0; j<Nm-(Nu+1); ++j) {
for (int k=0; k<Nu+1+j; ++k) {
@ -1029,6 +1033,7 @@ if (1){
}
}
}
Glog << "shiftedQRDecompEigen() Mtmp loop1" << '\n';
for (int i=0; i<Nm; ++i) {
for (int j=Nm-(Nu+1); j<Nm; ++j) {
for (int k=0; k<Nm; ++k) {
@ -1036,6 +1041,7 @@ if (1){
}
}
}
Glog << "shiftedQRDecompEigen() Mtmp loop2" << '\n';
//static int ntimes = 2;
//for (int j=0; j<Nm-(ntimes*Nu); ++j) {
@ -1061,11 +1067,13 @@ if (1){
Mtmp(j,i) = conj(Mtmp(i,j));
}
}
Glog << "shiftedQRDecompEigen() Mtmp loop3" << '\n';
for (int i=0; i<Nm; ++i) {
Mtmp(i,i) = real(Mtmp(i,i)) + Dsh;
}
Glog << "shiftedQRDecompEigen() Mtmp loop4" << '\n';
M = Mtmp;
//M = Q.adjoint()*(M*Q);
@ -1077,7 +1085,7 @@ if (1){
// }
//}
//Glog << "shiftedQRDecompEigen() end" << endl;
Glog << "shiftedQRDecompEigen() end" <<std::endl;
}
void exampleQRDecompEigen(void)


@ -499,6 +499,87 @@ namespace Grid {
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Site diagonal is identity, left preconditioned by Mee^inv
// ( 1 - Mee^inv Meo Moo^inv Moe ) phi = Mee_inv ( Mee - Meo Moo^inv Moe Mee^inv ) phi = Mee_inv eta
//
// Solve:
// ( 1 - Mee^inv Meo Moo^inv Moe )^dag ( 1 - Mee^inv Meo Moo^inv Moe ) phi = ( 1 - Mee^inv Meo Moo^inv Moe )^dag Mee_inv eta
//
// Old notation e<->o
//
// Left precon by Moo^-1
// b) (Doo^{dag} M_oo^-dag) (Moo^-1 Doo) psi_o = [ (D_oo)^dag M_oo^-dag ] Moo^-1 L^{-1} eta_o
// eta_o' = (D_oo)^dag M_oo^-dag Moo^-1 (eta_o - Moe Mee^{-1} eta_e)
///////////////////////////////////////////////////////////////////////////////////////////////////////
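An editor's restatement of the comment block above, in the convention the code actually uses (the solve is on the odd checkerboard):

\[
\begin{pmatrix} M_{ee} & M_{eo} \\ M_{oe} & M_{oo} \end{pmatrix}
\begin{pmatrix} \psi_e \\ \psi_o \end{pmatrix}
=
\begin{pmatrix} \eta_e \\ \eta_o \end{pmatrix},
\qquad
\psi_e = M_{ee}^{-1}\left(\eta_e - M_{eo}\,\psi_o\right),
\]
so the odd-site system, left-preconditioned by \(M_{oo}^{-1}\) to make the site-diagonal part unity, is
\[
M_{pc}\,\psi_o \;\equiv\; \left(1 - M_{oo}^{-1} M_{oe} M_{ee}^{-1} M_{eo}\right)\psi_o
\;=\; M_{oo}^{-1}\!\left(\eta_o - M_{oe} M_{ee}^{-1}\eta_e\right) \;\equiv\; \eta_o'.
\]
The inner solver is handed the normal equations \(M_{pc}^{\dagger} M_{pc}\,\psi_o = M_{pc}^{\dagger}\eta_o'\): RedBlackSource below builds \(M_{pc}^{\dagger}\eta_o'\), RedBlackSolve runs the Hermitian solver, and RedBlackSolution reconstructs \(\psi_e\) from \(\psi_o\).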
template<class Field> class SchurRedBlackDiagOneSolve : public SchurRedBlackBase<Field> {
public:
typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
/////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick
/////////////////////////////////////////////////////
SchurRedBlackDiagOneSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false,
const bool _solnAsInitGuess = false)
: SchurRedBlackBase<Field>(HermitianRBSolver,initSubGuess,_solnAsInitGuess) {};
virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
{
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
Field tmp(grid);
Field Mtmp(grid);
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd ,src_o,src);
/////////////////////////////////////////////////////
// src_o = Mpcdag *MooeeInv * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
Mtmp=src_o-Mtmp;
_Matrix.MooeeInv(Mtmp,tmp); assert( tmp.Checkerboard() ==Odd);
// get the right MpcDag
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.Checkerboard() ==Odd);
}
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
{
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
Field tmp(grid);
Field sol_e(grid);
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.Checkerboard() ==Even);
tmp = src_e-tmp; assert( src_e.Checkerboard() ==Even);
_Matrix.MooeeInv(tmp,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd );
};
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
{
SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
};
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
{
SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
}
};
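A hedged usage sketch of the new solver class (FGrid, the operator DwfOp, and the CG parameters are illustrative; the call operator is the one inherited from SchurRedBlackBase):

ConjugateGradient<LatticeFermion> CG(1.0e-8, 10000);
SchurRedBlackDiagOneSolve<LatticeFermion> SchurSolver(CG);
LatticeFermion src(FGrid), sol(FGrid); // full-grid source and solution
// DwfOp : any CheckerBoardedSparseMatrixBase<LatticeFermion>, e.g. a Wilson-type action
SchurSolver(DwfOp, src, sol); // red-black source prep, odd-checkerboard solve, reconstruction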
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Site diagonal is identity, right preconditioned by Mee^inv
// ( 1 - Meo Moo^inv Moe Mee^inv ) phi =( 1 - Meo Moo^inv Moe Mee^inv ) Mee psi = = eta = eta


@ -54,6 +54,9 @@ public:
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes);
if ( (_Tp*)ptr == (_Tp *) NULL ) {
printf("Grid CPU Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
}
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr;
}
@ -100,6 +103,9 @@ public:
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes);
if ( (_Tp*)ptr == (_Tp *) NULL ) {
printf("Grid Shared Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
}
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr;
}
@ -145,6 +151,9 @@ public:
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes);
if ( (_Tp*)ptr == (_Tp *) NULL ) {
printf("Grid Device Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
}
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr;
}


@ -16,6 +16,44 @@ NAMESPACE_BEGIN(Grid);
uint64_t total_shared;
uint64_t total_device;
uint64_t total_host;;
#if defined(__has_feature)
#if __has_feature(leak_sanitizer)
#define ASAN_LEAK_CHECK
#endif
#endif
#ifdef ASAN_LEAK_CHECK
#include <sanitizer/asan_interface.h>
#include <sanitizer/common_interface_defs.h>
#include <sanitizer/lsan_interface.h>
#define LEAK_CHECK(A) { __lsan_do_recoverable_leak_check(); }
#else
#define LEAK_CHECK(A) { }
#endif
void MemoryManager::DisplayMallinfo(void)
{
#ifdef __linux__
struct mallinfo mi; // really want mallinfo2, but glibc version isn't uniform
mi = mallinfo();
std::cout << "MemoryManager: Total non-mmapped bytes (arena): "<< (size_t)mi.arena<<std::endl;
std::cout << "MemoryManager: # of free chunks (ordblks): "<< (size_t)mi.ordblks<<std::endl;
std::cout << "MemoryManager: # of free fastbin blocks (smblks): "<< (size_t)mi.smblks<<std::endl;
std::cout << "MemoryManager: # of mapped regions (hblks): "<< (size_t)mi.hblks<<std::endl;
std::cout << "MemoryManager: Bytes in mapped regions (hblkhd): "<< (size_t)mi.hblkhd<<std::endl;
std::cout << "MemoryManager: Max. total allocated space (usmblks): "<< (size_t)mi.usmblks<<std::endl;
std::cout << "MemoryManager: Free bytes held in fastbins (fsmblks): "<< (size_t)mi.fsmblks<<std::endl;
std::cout << "MemoryManager: Total allocated space (uordblks): "<< (size_t)mi.uordblks<<std::endl;
std::cout << "MemoryManager: Total free space (fordblks): "<< (size_t)mi.fordblks<<std::endl;
std::cout << "MemoryManager: Topmost releasable block (keepcost): "<< (size_t)mi.keepcost<<std::endl;
#endif
LEAK_CHECK();
}
void MemoryManager::PrintBytes(void)
{
std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
@ -35,7 +73,7 @@ void MemoryManager::PrintBytes(void)
#ifdef GRID_CUDA
cuda_mem();
#endif
DisplayMallinfo();
}
uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; }


@ -211,6 +211,7 @@ private:
#endif
public:
static void DisplayMallinfo(void);
static void NotifyDeletion(void * CpuPtr);
static void Print(void);
static void PrintAll(void);


@ -91,6 +91,7 @@ public:
////////////////////////////////////////////////////////////////
virtual int CheckerBoarded(int dim)=0;
virtual int CheckerBoard(const Coordinate &site)=0;
virtual int CheckerDim(void){ return 0; };
virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0;


@ -60,6 +60,7 @@ public:
int _checker_dim;
std::vector<int> _checker_board;
virtual int CheckerDim(void){ return _checker_dim; };
virtual int CheckerBoarded(int dim){
if( dim==_checker_dim) return 1;
else return 0;


@ -236,11 +236,18 @@ public:
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
vobj vtmp;
vtmp = r;
#if 1
auto me = View(CpuWrite);
thread_for(ss,me.size(),{
me[ss]= r;
});
#else
auto me = View(AcceleratorWrite);
accelerator_for(ss,me.size(),vobj::Nsimd(),{
auto stmp=coalescedRead(vtmp);
coalescedWrite(me[ss],stmp);
});
#endif
me.ViewClose();
return *this;
}


@ -264,24 +264,8 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
const uint64_t sites = grid->oSites();
// Might make all code paths go this way.
#if 0
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
{
autoView( left_v , left, AcceleratorRead);
autoView( right_v,right, AcceleratorRead);
// This code could read coalesce
// GPU - SIMT lane compliance...
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v(ss);
auto y_l = right_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProductD(x_l,y_l));
});
}
#else
typedef decltype(innerProduct(vobj(),vobj())) inner_t;
Vector<inner_t> inner_tmp(sites);
deviceVector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
{
@ -295,7 +279,6 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
});
}
#endif
// This is in single precision and fails some tests
auto anrm = sumD(inner_tmp_v,sites);
nrm = anrm;
@ -373,7 +356,8 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
#else
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
deviceVector<inner_t> inner_tmp;
inner_tmp.resize(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{


@ -9,14 +9,18 @@ inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer os
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_objectD sobjD;
sobj *mysum =(sobj *) malloc_shared(sizeof(sobj),*theGridAccelerator);
static Vector<sobj> mysum;
mysum.resize(1);
sobj *mysum_p = & mysum[0];
sobj identity; zeroit(identity);
mysum[0] = identity;
sobj ret ;
Integer nsimd= vobj::Nsimd();
const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() });
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
auto Reduction = cl::sycl::reduction(mysum,identity,std::plus<>());
auto Reduction = cl::sycl::reduction(mysum_p,identity,std::plus<>(),PropList);
cgh.parallel_for(cl::sycl::range<1>{osites},
Reduction,
[=] (cl::sycl::id<1> item, auto &sum) {
@ -26,7 +30,7 @@ inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer os
});
theGridAccelerator->wait();
ret = mysum[0];
free(mysum,*theGridAccelerator);
// free(mysum,*theGridAccelerator);
sobjD dret; convertType(dret,ret);
return dret;
}
@ -73,19 +77,23 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
template<class Word> Word svm_xor(Word *vec,uint64_t L)
{
Word xorResult; xorResult = 0;
Word *d_sum =(Word *)cl::sycl::malloc_shared(sizeof(Word),*theGridAccelerator);
static Vector<Word> d_sum;
d_sum.resize(1);
Word *d_sum_p=&d_sum[0];
Word identity; identity=0;
d_sum[0] = identity;
const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() });
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
auto Reduction = cl::sycl::reduction(d_sum,identity,std::bit_xor<>());
auto Reduction = cl::sycl::reduction(d_sum_p,identity,std::bit_xor<>(),PropList);
cgh.parallel_for(cl::sycl::range<1>{L},
Reduction,
[=] (cl::sycl::id<1> index, auto &sum) {
sum ^=vec[index];
sum^=vec[index];
});
});
theGridAccelerator->wait();
Word ret = d_sum[0];
free(d_sum,*theGridAccelerator);
// free(d_sum,*theGridAccelerator);
return ret;
}


@ -42,50 +42,21 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
assert((fine->_rdimensions[d] / coarse->_rdimensions[d])* coarse->_rdimensions[d]==fine->_rdimensions[d]);
}
}
////////////////////////////////////////////////////////////////////////////////////////////
// remove and insert a half checkerboard
////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full)
{
half.Checkerboard() = cb;
autoView( half_v, half, CpuWrite);
autoView( full_v, full, CpuRead);
thread_for(ss, full.Grid()->oSites(),{
int cbos;
Coordinate coor;
full.Grid()->oCoorFromOindex(coor,ss);
cbos=half.Grid()->CheckerBoard(coor);
if (cbos==cb) {
int ssh=half.Grid()->oIndex(coor);
half_v[ssh] = full_v[ss];
}
});
acceleratorPickCheckerboard(cb,half,full);
}
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half)
{
int cb = half.Checkerboard();
autoView( half_v , half, CpuRead);
autoView( full_v , full, CpuWrite);
thread_for(ss,full.Grid()->oSites(),{
Coordinate coor;
int cbos;
full.Grid()->oCoorFromOindex(coor,ss);
cbos=half.Grid()->CheckerBoard(coor);
if (cbos==cb) {
int ssh=half.Grid()->oIndex(coor);
full_v[ss]=half_v[ssh];
}
});
acceleratorSetCheckerboard(full,half);
}
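A round-trip sketch of the helpers above, now routed through the accelerator implementations (FGrid, its red-black counterpart FrbGrid, and the field type are assumptions):

LatticeFermion full(FGrid);                 // full-lattice field
LatticeFermion even(FrbGrid), odd(FrbGrid); // half-checkerboard fields
pickCheckerboard(Even, even, full);         // extract even sites
pickCheckerboard(Odd , odd , full);         // extract odd sites
// ... operate on the half-checkerboard fields ...
setCheckerboard(full, even);                // scatter back: together these two calls
setCheckerboard(full, odd );                // reassemble the full field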
template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int checker_dim_half=0)
template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int dummy=0)
{
half.Checkerboard() = cb;
autoView(half_v, half, AcceleratorWrite);
@ -95,6 +66,7 @@ template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj
unsigned long ndim_half = half.Grid()->_ndimension;
Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
Coordinate ostride_half = half.Grid()->_ostride;
int checker_dim_half = half.Grid()->CheckerDim();
accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{
Coordinate coor;
@ -119,7 +91,7 @@ template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj
}
});
}
template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int checker_dim_half=0)
template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int dummy=0)
{
int cb = half.Checkerboard();
autoView(half_v , half, AcceleratorRead);
@ -129,6 +101,7 @@ template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,
unsigned long ndim_half = half.Grid()->_ndimension;
Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
Coordinate ostride_half = half.Grid()->_ostride;
int checker_dim_half = half.Grid()->CheckerDim();
accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{
Coordinate coor;


@ -90,6 +90,7 @@ public:
exit(1);
}
Parameters.StartingType = arg;
std::cout <<GridLogMessage << " GenericHMCrunner --StartingType "<<arg<<std::endl;
}
if (GridCmdOptionExists(argv, argv + argc, "--StartingTrajectory")) {
@ -97,6 +98,7 @@ public:
std::vector<int> ivec(0);
GridCmdOptionIntVector(arg, ivec);
Parameters.StartTrajectory = ivec[0];
std::cout <<GridLogMessage << " GenericHMCrunner --StartingTrajectory "<<ivec[0]<<std::endl;
}
if (GridCmdOptionExists(argv, argv + argc, "--Trajectories")) {
@ -104,6 +106,7 @@ public:
std::vector<int> ivec(0);
GridCmdOptionIntVector(arg, ivec);
Parameters.Trajectories = ivec[0];
std::cout << GridLogMessage<<" GenericHMCrunner Command Line --Trajectories "<<ivec[0]<<std::endl;
}
if (GridCmdOptionExists(argv, argv + argc, "--Thermalizations")) {
@ -111,6 +114,7 @@ public:
std::vector<int> ivec(0);
GridCmdOptionIntVector(arg, ivec);
Parameters.NoMetropolisUntil = ivec[0];
std::cout << GridLogMessage<<" GenericHMCrunner --Thermalizations "<<ivec[0]<<std::endl;
}
if (GridCmdOptionExists(argv, argv + argc, "--ParameterFile")) {
arg = GridCmdOptionPayload(argv, argv + argc, "--ParameterFile");


@ -137,9 +137,11 @@ public:
double start_force = usecond();
MemoryManager::Print();
as[level].actions.at(a)->deriv_timer_start();
as[level].actions.at(a)->deriv(Smearer, force); // deriv should NOT include Ta
as[level].actions.at(a)->deriv_timer_stop();
MemoryManager::Print();
auto name = as[level].actions.at(a)->action_name();
@ -246,7 +248,11 @@ public:
}
};
virtual ~Integrator() {}
virtual ~Integrator()
{
// Pain in the ass to clean up the Level pointers
// Guido's design is at fault as per comment above in constructor
}
virtual std::string integrator_name() = 0;
@ -460,6 +466,7 @@ public:
for (int level = 0; level < as.size(); ++level) {
for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
MemoryManager::Print();
// get gauge field from the SmearingPolicy and
// based on the boolean is_smeared in actionID
std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl;
@ -468,6 +475,7 @@ public:
as[level].actions.at(actionID)->S_timer_stop();
std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl;
H += Hterm;
MemoryManager::Print();
}
as[level].apply(S_hireps, Representations, level, H);


@ -32,7 +32,9 @@ private:
// Smear_Stout<Gimpl> *StoutSmearing;
// std::vector<GaugeField> SmearedSet;
GridRedBlackCartesian * UrbGrid; // keep a copy of the redblack grid for life of object
std::vector<LatticeLorentzComplex> masks;
std::vector<int> cbs;
typedef typename SU3Adjoint::AMatrix AdjMatrix;
typedef typename SU3Adjoint::LatticeAdjMatrix AdjMatrixField;
@ -147,6 +149,25 @@ private:
}
pokeLorentz(Fdet, Fdet_pol, nu);
}
void Compute_MpInvJx_dNxxdSy(int cb,
const GaugeLinkField &PlaqL,
const GaugeLinkField &PlaqR,
AdjMatrixField MpInvJx,
AdjVectorField &Fdet2 )
{
GaugeLinkField PlaqLeo(UrbGrid);
GaugeLinkField PlaqReo(UrbGrid);
AdjMatrixField MpInvJxeo(UrbGrid);
AdjVectorField Fdet2eo(UrbGrid);
pickCheckerboard(cb,PlaqLeo,PlaqL);
pickCheckerboard(cb,PlaqReo,PlaqR);
pickCheckerboard(cb,MpInvJxeo,MpInvJx);
Fdet2eo.Checkerboard()=cb;
Compute_MpInvJx_dNxxdSy(PlaqLeo,PlaqReo,MpInvJxeo,Fdet2eo);
setCheckerboard(Fdet2,Fdet2eo);
}
void Compute_MpInvJx_dNxxdSy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR, AdjMatrixField MpInvJx,AdjVectorField &Fdet2 )
{
GaugeLinkField UtaU(PlaqL.Grid());
@ -278,8 +299,9 @@ public:
////////////////////////////////////////////////////////////////////////////////
// Mask the gauge field
////////////////////////////////////////////////////////////////////////////////
int cb = cbs[smr];
auto mask=PeekIndex<LorentzIndex>(masks[smr],mu); // the cb mask
Umsk = U;
ApplyMask(Umsk,smr);
Utmp = peekLorentz(Umsk,mu);
@ -442,7 +464,7 @@ public:
AdjMatrixField MpInvJx_nu(grid);
MpInvJx = (-1.0)*MpAdInv * JxAd;// rho is on the plaq factor
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx,FdetV);
Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx,FdetV);
Fdet2_mu=FdetV;
Fdet1_mu=Zero();
@ -499,7 +521,7 @@ public:
time=-usecond();
PlaqR=(-1.0)*PlaqR;
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx,FdetV);
Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx,FdetV);
Fdet2_nu = FdetV;
time+=usecond();
std::cout << GridLogMessage << "Compute_MpInvJx_dNxxSy (occurs 6x) took "<<time<< " us"<<std::endl;
@ -520,7 +542,7 @@ public:
MpInvJx_nu = Cshift(MpInvJx,mu,-1);
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
Fdet2_nu = Fdet2_nu+FdetV;
///////////////// -ve nu /////////////////
@ -539,7 +561,7 @@ public:
Fdet1_nu = Fdet1_nu + transpose(Nxy)*dJdXe_nMpInv_y;
MpInvJx_nu = Cshift(MpInvJx,nu,1);
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
Fdet2_nu = Fdet2_nu+FdetV;
// x==
@ -560,7 +582,7 @@ public:
MpInvJx_nu = Cshift(MpInvJx,mu,-1);
MpInvJx_nu = Cshift(MpInvJx_nu,nu,1);
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
Fdet2_nu = Fdet2_nu+FdetV;
/////////////////////////////////////////////////////////////////////
@ -589,7 +611,7 @@ public:
MpInvJx_nu = Cshift(MpInvJx,nu,-1);
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
Fdet2_mu = Fdet2_mu+FdetV;
// __
@ -609,7 +631,7 @@ public:
MpInvJx_nu = Cshift(MpInvJx,nu,1);
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
Fdet2_mu = Fdet2_mu+FdetV;
}
@ -931,6 +953,10 @@ private:
public:
/* Standard constructor */
virtual ~SmearedConfigurationMasked()
{
delete UrbGrid;
}
SmearedConfigurationMasked(GridCartesian* _UGrid, unsigned int Nsmear, Smear_Stout<Gimpl>& Stout)
: SmearedConfiguration<Gimpl>(_UGrid, Nsmear,Stout)
{
@ -939,7 +965,6 @@ public:
// was resized in base class
assert(this->SmearedSet.size()==Nsmear);
GridRedBlackCartesian * UrbGrid;
UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(_UGrid);
LatticeComplex one(_UGrid); one = ComplexD(1.0,0.0);
LatticeComplex tmp(_UGrid);
@ -947,10 +972,11 @@ public:
for (unsigned int i = 0; i < this->smearingLevels; ++i) {
masks.push_back(*(new LatticeLorentzComplex(_UGrid)));
int mu= (i/2) %Nd;
int cb= (i%2);
LatticeComplex tmpcb(UrbGrid);
cbs.push_back(cb);
masks[i]=Zero();
////////////////////
@ -962,7 +988,6 @@ public:
PokeIndex<LorentzIndex>(masks[i],tmp, mu);
}
delete UrbGrid;
}
virtual void smeared_force(GaugeField &SigmaTilde)


@ -418,32 +418,32 @@ static void LieAlgebraProject(LatticeAlgebraMatrix &out,const LatticeMatrix &in,
int hNNm1= NNm1/2;
RealD sqrt_2 = sqrt(2.0);
Complex ci(0.0,1.0);
for(int su2Index=0;su2Index<hNNm1;su2Index++){
int i1, i2;
su2SubGroupIndex(i1, i2, su2Index);
int ax = su2Index*2;
int ay = su2Index*2+1;
accelerator_for(ss,grid->oSites(),1,{
const int nsimd= Matrix::Nsimd();
accelerator_for(ss,grid->oSites(),nsimd,{
for(int su2Index=0;su2Index<hNNm1;su2Index++){
int i1, i2;
su2SubGroupIndex(i1, i2, su2Index);
int ax = su2Index*2;
int ay = su2Index*2+1;
// in is traceless ANTI-hermitian whereas Grid generators are Hermitian.
// trace( Ta x Ci in)
// Bet I need to move to real part with mult by -i
out_v[ss]()()(ax,b) = 0.5*(real(in_v[ss]()()(i2,i1)) - real(in_v[ss]()()(i1,i2)));
out_v[ss]()()(ay,b) = 0.5*(imag(in_v[ss]()()(i1,i2)) + imag(in_v[ss]()()(i2,i1)));
});
}
for(int diagIndex=0;diagIndex<N-1;diagIndex++){
int k = diagIndex + 1; // diagIndex starts from 0
int a = NNm1+diagIndex;
RealD scale = 1.0/sqrt(2.0*k*(k+1));
accelerator_for(ss,grid->oSites(),vComplex::Nsimd(),{
auto tmp = in_v[ss]()()(0,0);
coalescedWrite(out_v[ss]()()(ax,b),0.5*(real(in_v(ss)()()(i2,i1)) - real(in_v(ss)()()(i1,i2))));
coalescedWrite(out_v[ss]()()(ay,b),0.5*(imag(in_v(ss)()()(i1,i2)) + imag(in_v(ss)()()(i2,i1))));
}
for(int diagIndex=0;diagIndex<N-1;diagIndex++){
int k = diagIndex + 1; // diagIndex starts from 0
int a = NNm1+diagIndex;
RealD scale = 1.0/sqrt(2.0*k*(k+1));
auto tmp = in_v(ss)()()(0,0);
for(int i=1;i<k;i++){
tmp=tmp+in_v[ss]()()(i,i);
tmp=tmp+in_v(ss)()()(i,i);
}
tmp = tmp - in_v[ss]()()(k,k)*k;
out_v[ss]()()(a,b) =imag(tmp) * scale;
});
}
tmp = tmp - in_v(ss)()()(k,k)*k;
coalescedWrite(out_v[ss]()()(a,b),imag(tmp) * scale);
}
});
}


@ -118,7 +118,7 @@ static void generatorDiagonal(int diagIndex, iGroupMatrix<cplx> &ta) {
////////////////////////////////////////////////////////////////////////
// Map a su2 subgroup number to the pair of rows that are non zero
////////////////////////////////////////////////////////////////////////
static void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) {
static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) {
assert((su2_index >= 0) && (su2_index < (ncolour * (ncolour - 1)) / 2));
int spare = su2_index;


@ -99,6 +99,8 @@ using std::log;
using std::exp;
using std::sin;
using std::cos;
using std::asin;
using std::acos;
accelerator_inline RealF conjugate(const RealF & r){ return r; }


@ -460,3 +460,9 @@ void vprefetch(const iMatrix<v, N> &vv) {
NAMESPACE_END(Grid);
#ifdef GRID_SYCL
template<class vec> struct sycl::is_device_copyable<Grid::iScalar<vec> > : public std::true_type {};
template<class vec,int N> struct sycl::is_device_copyable<Grid::iVector<vec,N> > : public std::true_type {};
template<class vec,int N> struct sycl::is_device_copyable<Grid::iMatrix<vec,N> > : public std::true_type {};
#endif


@ -539,12 +539,6 @@ inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize
#endif
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
{
acceleratorCopyDeviceToDeviceAsynch(from,to,bytes);
acceleratorCopySynchronise();
}
//////////////////////////////////////////////
// CPU Target - No accelerator just thread instead
//////////////////////////////////////////////
@ -553,7 +547,6 @@ inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
#undef GRID_SIMT
inline void acceleratorMem(void)
{
/*
@ -656,6 +649,12 @@ accelerator_inline void acceleratorFence(void)
return;
}
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
{
acceleratorCopyDeviceToDeviceAsynch(from,to,bytes);
acceleratorCopySynchronise();
}
template<class T> void acceleratorPut(T& dev,T&host)
{
acceleratorCopyToDevice(&host,&dev,sizeof(T));

HMC/ComputeWilsonFlow.cc (new file, 238 lines)

@ -0,0 +1,238 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: HMC/ComputeWilsonFlow.cc
Copyright (C) 2017
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Shuhei Yamamoto <syamamoto@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <string>
namespace Grid{
struct WFParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(WFParameters,
int, steps,
double, step_size,
int, meas_interval,
double, maxTau, // for the adaptive algorithm
int, meas_interval_density,
std::string, path);
template <class ReaderClass >
WFParameters(Reader<ReaderClass>& Reader){
read(Reader, "WilsonFlow", *this);
}
};
struct ConfParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(ConfParameters,
std::string, conf_path,
std::string, conf_prefix,
std::string, conf_smr_prefix,
std::string, rng_prefix,
int, StartConfiguration,
int, EndConfiguration,
int, Skip);
template <class ReaderClass >
ConfParameters(Reader<ReaderClass>& Reader){
read(Reader, "Configurations", *this);
}
};
}
template <class T> void writeFile(T& in, std::string const fname){
#ifdef HAVE_LIME
// Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl;
Grid::emptyUserRecord record;
Grid::ScidacWriter WR(in.Grid()->IsBoss());
WR.open(fname);
WR.writeScidacFieldRecord(in,record,0);
WR.close();
#endif
// What is the appropriate way to throw error?
}
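// Editorial sketch (not in the committed file): one answer to the question above is to fail loudly
// when LIME support is missing instead of silently skipping the write, e.g.
//   #else
//   std::cout << Grid::GridLogError << "writeFile(" << fname << "): compiled without LIME, field not written" << std::endl;
//   assert(0);
//   #endif
// Both GridLogError and assert() are already used throughout Grid, so this stays within existing
// conventions; treat it as a suggestion rather than the author's fix.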
int main(int argc, char **argv) {
using namespace Grid;
Grid_init(&argc, &argv);
GridLogLayout();
auto latt_size = GridDefaultLatt();
auto simd_layout = GridDefaultSimd(Nd, vComplex::Nsimd());
auto mpi_layout = GridDefaultMpi();
GridCartesian Grid(latt_size, simd_layout, mpi_layout);
std::vector<int> seeds({1, 2, 3, 4, 5});
GridSerialRNG sRNG;
GridParallelRNG pRNG(&Grid);
pRNG.SeedFixedIntegers(seeds);
LatticeGaugeField Umu(&Grid), Uflow(&Grid);
typedef Grid::XmlReader Serialiser;
Serialiser Reader("input.xml", false, "root");
WFParameters WFPar(Reader);
ConfParameters CPar(Reader);
CheckpointerParameters CPPar(CPar.conf_path+CPar.conf_prefix, CPar.conf_path+CPar.conf_smr_prefix, CPar.conf_path+CPar.rng_prefix);
NerscHmcCheckpointer<PeriodicGimplR> CPNersc(CPPar);
for (int conf = CPar.StartConfiguration; conf <= CPar.EndConfiguration; conf+= CPar.Skip){
CPNersc.CheckpointRestore(conf, Umu, sRNG, pRNG);
std::cout << std::setprecision(15);
std::cout << GridLogMessage << "Initial plaquette: "<< WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu) << std::endl;
std::string file_pre = WFPar.path;
std::string file_post = CPar.conf_prefix + "." + std::to_string(conf);
WilsonFlow<PeriodicGimplR> WF(WFPar.step_size,WFPar.steps,WFPar.meas_interval);
WF.addMeasurement(WFPar.meas_interval_density, [&file_pre,&file_post,&conf](int step, RealD t, const typename PeriodicGimplR::GaugeField &U){
typedef typename PeriodicGimplR::GaugeLinkField GaugeMat;
typedef typename PeriodicGimplR::ComplexField ComplexField;
assert(Nd == 4);
// NOTE:
// Ideally, turn the following into methods of the appropriate class
///////////// Compute Energy Density via Clover Leaf /////////////////////////////////////////////////
///// Taken from qcd/smearing/WilsonFlow.h
// For plq, use static sitePlaquette from class WilsonLoops in Grid/qcd/utils/WilsonLoops.h and divide it by #faces=(1.0 * Nd * (Nd - 1)) / 2.0, ncol=3
//E = 1/2 tr( F_munu F_munu )
//However as F_numu = -F_munu, only need to sum the trace of the squares of the following 6 field strengths:
//F_01 F_02 F_03 F_12 F_13 F_23
GaugeMat F(U.Grid());
//LatticeComplexD R(U.Grid());
ComplexField R(U.Grid());
R = Zero();
for(int mu=0;mu<3;mu++){
for(int nu=mu+1;nu<4;nu++){
WilsonLoops<PeriodicGimplR>::FieldStrength(F, U, mu, nu);
R = R + trace(F*F);
}
}
R = (-1.0) * R;
//// Taken from qcd/utils/WilsonLoops.h
// Bx = -iF(y,z), By = -iF(z,y), Bz = -iF(x,y)
GaugeMat Bx(U.Grid()), By(U.Grid()), Bz(U.Grid());
WilsonLoops<PeriodicGimplR>::FieldStrength(Bx, U, Ydir, Zdir);
WilsonLoops<PeriodicGimplR>::FieldStrength(By, U, Zdir, Xdir);
WilsonLoops<PeriodicGimplR>::FieldStrength(Bz, U, Xdir, Ydir);
// Ex = -iF(t,x), Ey = -iF(t,y), Ez = -iF(t,z)
GaugeMat Ex(U.Grid()), Ey(U.Grid()), Ez(U.Grid());
WilsonLoops<PeriodicGimplR>::FieldStrength(Ex, U, Tdir, Xdir);
WilsonLoops<PeriodicGimplR>::FieldStrength(Ey, U, Tdir, Ydir);
WilsonLoops<PeriodicGimplR>::FieldStrength(Ez, U, Tdir, Zdir);
double coeff = 8.0/(32.0*M_PI*M_PI);
ComplexField qfield = coeff*trace(Bx*Ex + By*Ey + Bz*Ez);
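// Editorial cross-check of the coefficient (not part of the committed file): the continuum density is
//   q(x) = (1/32 pi^2) eps_{mu nu rho sigma} tr[ F_{mu nu}(x) F_{rho sigma}(x) ],
// and eps_{mu nu rho sigma} F_{mu nu} F_{rho sigma} = 8 ( F_{01}F_{23} - F_{02}F_{13} + F_{03}F_{12} )
//                                                   = 8 ( Ex Bx + Ey By + Ez Bz ),
// so coeff = 8.0/(32.0*M_PI*M_PI) = 1/(4 pi^2) is the expected prefactor for trace(Bx*Ex + By*Ey + Bz*Ez),
// up to the overall sign convention absorbed into FieldStrength's definition of E and B above.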
//ComplexField qfield Plq(U.Grid());
//WilsonLoops<PeriodicGimplR>::sitePlaquette(Plq, U);
//double coeff = 2.0 / (1.0 * Nd * (Nd - 1)) / 3.0;
//Plq = coeff * Plq;
int tau = std::round(t);
std::string efile = file_pre + "E_dnsty_" + std::to_string(tau) + "_" + file_post;
writeFile(R,efile);
std::string tfile = file_pre + "Top_dnsty_" + std::to_string(tau) + "_" + file_post;
writeFile(qfield,tfile);
RealD E = real(sum(R))/ RealD(U.Grid()->gSites());
RealD T = real( sum(qfield) );
Coordinate scoor; for (int mu=0; mu < Nd; mu++) scoor[mu] = 0;
RealD E0 = real(peekSite(R,scoor));
RealD T0 = real(peekSite(qfield,scoor));
std::cout << GridLogMessage << "[WilsonFlow] Saved energy density (clover) & topo. charge density: " << conf << " " << step << " " << tau << " "
<< "(E_avg,T_sum) " << E << " " << T << " (E, T at origin) " << E0 << " " << T0 << std::endl;
});
int t=WFPar.maxTau;
WF.smear(Uflow, Umu);
RealD WFlow_plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(Uflow);
RealD WFlow_TC = WilsonLoops<PeriodicGimplR>::TopologicalCharge(Uflow);
RealD WFlow_T0 = WF.energyDensityPlaquette(t,Uflow); // t
RealD WFlow_EC = WF.energyDensityCloverleaf(t,Uflow);
std::cout << GridLogMessage << "Plaquette "<< conf << " " << WFlow_plaq << std::endl;
std::cout << GridLogMessage << "T0 "<< conf << " " << WFlow_T0 << std::endl;
std::cout << GridLogMessage << "TC0 "<< conf << " " << WFlow_EC << std::endl;
std::cout << GridLogMessage << "TopologicalCharge "<< conf << " " << WFlow_TC << std::endl;
std::cout<< GridLogMessage << " Admissibility check:\n";
const double sp_adm = 0.067; // admissible threshold
const double pl_adm = 1.0-sp_adm/Nc;
std::cout << GridLogMessage << " (pl_adm =" << pl_adm << ")\n";
// Need min and reduce min for this function
//double sp_max = NC_*(1.0-stpl.plaq_min(U,pl_adm));
double sp_ave = Nc*(1.0-WFlow_plaq);
//std::cout<< GridLogMessage << " sp_max = " << sp_max <<"\n";
std::cout<< GridLogMessage << " sp_ave = " << sp_ave <<"\n";
std::cout<< GridLogMessage << " (sp_admissible = "<< sp_adm <<")\n";
//std::cout<< GridLogMessage << " sp_admissible - sp_max = "<<sp_adm-sp_max <<"\n";
std::cout<< GridLogMessage << " sp_admissible - sp_ave = "<<sp_adm-sp_ave <<"\n";
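// Editorial note (not in the committed file): with Nc = 3 and sp_adm = 0.067 the threshold works out to
// pl_adm = 1 - 0.067/3 ~ 0.9777, and sp_ave = 3*(1 - <plaquette>), so a configuration is reported as
// admissible (on average) when the sp_admissible - sp_ave value printed above is positive.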
}
Grid_finalize();
} // main
/*
Input file example
JSON
{
"WilsonFlow":{
"steps": 200,
"step_size": 0.01,
"meas_interval": 50,
"maxTau": 2.0
},
"Configurations":{
"conf_prefix": "ckpoint_lat",
"rng_prefix": "ckpoint_rng",
"StartConfiguration": 3000,
"EndConfiguration": 3000,
"Skip": 5
}
}
*/
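// Editorial note (not in the committed file): main() above opens Grid::XmlReader("input.xml", false, "root"),
// while the example here is JSON. To drive the program with this JSON as-is one would need to swap the
// Serialiser, roughly (assuming Grid's JSONReader, which sits alongside XmlReader in Grid/serialisation):
//   typedef Grid::JSONReader Serialiser;
//   Serialiser Reader("input.json");
// otherwise translate the same node names (WilsonFlow, Configurations) into XML under the "root" node.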


@ -58,7 +58,7 @@ int main(int argc, char **argv) {
HMCparameters HMCparams;
HMCparams.StartTrajectory = 0;
HMCparams.Trajectories = 200;
HMCparams.NoMetropolisUntil= 20;
HMCparams.NoMetropolisUntil= 0;
// "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
HMCparams.StartingType =std::string("ColdStart");
HMCparams.MD = MD;
@ -70,7 +70,7 @@ int main(int argc, char **argv) {
CheckpointerParameters CPparams;
CPparams.config_prefix = "ckpoint_EODWF_lat";
CPparams.rng_prefix = "ckpoint_EODWF_rng";
CPparams.saveInterval = 10;
CPparams.saveInterval = 1;
CPparams.format = "IEEE64BIG";
TheHMC.Resources.LoadNerscCheckpointer(CPparams);
@ -186,6 +186,8 @@ int main(int argc, char **argv) {
/////////////////////////////////////////////////////////////
// HMC parameters are serialisable
TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file
TheHMC.initializeGaugeFieldAndRNGs(U);
std::cout << GridLogMessage << " Running the HMC "<< std::endl;
TheHMC.Run(); // no smearing


@ -30,11 +30,13 @@ directory
#include <string>
template <class T> void readFile(T& out, std::string const fname){
#ifdef HAVE_LIME
Grid::emptyUserRecord record;
Grid::ScidacReader RD;
RD.open(fname);
RD.readScidacFieldRecord(out,record);
RD.close();
#endif
}


@ -31,11 +31,13 @@ directory
NAMESPACE_BEGIN(Grid);
template <class T> void writeFile(T& out, std::string const fname){
#ifdef HAVE_LIME
emptyUserRecord record;
ScidacWriter WR(out.Grid()->IsBoss());
WR.open(fname);
WR.writeScidacFieldRecord(out,record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
WR.close();
#endif
}
NAMESPACE_END(Grid);
int main(int argc, char **argv) {


@ -261,23 +261,25 @@ public:
fprintf(FP,"\n\n");
};
template<class CComplex>
static void BLAS(void)
{
//int nbasis, int nrhs, int coarseVol
int basis[] = { 16,32,64 };
int rhs[] = { 8,16,32 };
int vol = 4*4*4*4;
int rhs[] = { 8,12,16 };
int vol = 8*8*8*8;
int blk = 4*4*4*4;
GridBLAS blas;
int fpbits = sizeof(CComplex)*4;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= batched GEMM (double precision) "<<std::endl;
std::cout<<GridLogMessage << "= batched GEMM fp"<<fpbits<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " M "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (coarse mrhs)"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
fprintf(FP,"GEMM\n\n M, N, K, BATCH, GF/s per rank\n");
fprintf(FP,"GEMM\n\n M, N, K, BATCH, GF/s per rank fp%d\n",fpbits);
for(int b=0;b<3;b++){
for(int r=0;r<3;r++){
@ -285,7 +287,7 @@ public:
int N=rhs[r];
int K=basis[b];
int BATCH=vol;
double p=blas.benchmark(M,N,K,BATCH);
double p=blas.benchmark<CComplex>(M,N,K,BATCH);
fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
@ -299,9 +301,9 @@ public:
for(int r=0;r<3;r++){
int M=basis[b];
int N=rhs[r];
int K=vol;
int K=blk;
int BATCH=vol;
double p=blas.benchmark(M,N,K,BATCH);
double p=blas.benchmark<CComplex>(M,N,K,BATCH);
fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
std::cout<<GridLogMessage<<std::setprecision(3)
@ -313,10 +315,10 @@ public:
for(int b=0;b<3;b++){
for(int r=0;r<3;r++){
int M=rhs[r];
int N=vol;
int N=blk;
int K=basis[b];
int BATCH=vol;
double p=blas.benchmark(M,N,K,BATCH);
double p=blas.benchmark<CComplex>(M,N,K,BATCH);
fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
std::cout<<GridLogMessage<<std::setprecision(3)
@ -867,6 +869,7 @@ int main (int argc, char ** argv)
int do_memory=1;
int do_comms =1;
int do_blas =1;
int do_dslash=1;
int sel=4;
std::vector<int> L_list({8,12,16,24,32});
@ -877,6 +880,7 @@ int main (int argc, char ** argv)
std::vector<double> staggered;
int Ls=1;
if (do_dslash){
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Clover dslash 4D vectorised (temporarily Wilson)" <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
@ -901,6 +905,7 @@ int main (int argc, char ** argv)
staggered.push_back(result);
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
@ -909,8 +914,33 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<std::endl;
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
}
int NN=NN_global;
if(do_dslash){
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered (GF/s per node)" <<std::endl;
fprintf(FP,"Per node summary table\n");
fprintf(FP,"\n");
fprintf(FP,"L , Wilson, DWF4, Staggered, GF/s per node\n");
fprintf(FP,"\n");
for(int l=0;l<L_list.size();l++){
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<std::endl;
fprintf(FP,"%d , %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.);
}
fprintf(FP,"\n");
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Comparison point result: " << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl;
std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl;
std::cout<<std::setprecision(3);
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
}
if ( do_memory ) {
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Memory benchmark " <<std::endl;
@ -918,15 +948,6 @@ int main (int argc, char ** argv)
Benchmark::Memory();
}
if ( do_blas ) {
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Batched BLAS benchmark " <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
Benchmark::BLAS();
#endif
}
if ( do_su4 ) {
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " SU(4) benchmark " <<std::endl;
@ -941,28 +962,14 @@ int main (int argc, char ** argv)
Benchmark::Comms();
}
if ( do_blas ) {
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
std::cout<<GridLogMessage << " Batched BLAS benchmark " <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered (GF/s per node)" <<std::endl;
fprintf(FP,"Per node summary table\n");
fprintf(FP,"\n");
fprintf(FP,"L , Wilson, DWF4, Staggered, GF/s per node\n");
fprintf(FP,"\n");
for(int l=0;l<L_list.size();l++){
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<std::endl;
fprintf(FP,"%d , %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.);
}
fprintf(FP,"\n");
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Comparison point result: " << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl;
std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl;
std::cout<<std::setprecision(3);
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
Benchmark::BLAS<ComplexD>();
Benchmark::BLAS<ComplexF>();
}
Grid_finalize();
fclose(FP);
}


@ -1,16 +1,18 @@
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel -fsycl -fno-exceptions -fsycl-targets=spir64_gen -Xs -device -Xs pvc "
../../configure \
--enable-simd=GPU \
--enable-gen-simd-width=64 \
--enable-comms=mpi-auto \
--enable-debug \
--disable-gparity \
--disable-fermion-reps \
--with-lime=$CLIME \
--enable-shm=nvlink \
--enable-accelerator=sycl \
--enable-accelerator-aware-mpi=yes\
--enable-unified=no \
MPICXX=mpicxx \
CXX=icpx \
LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl" \
CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel"
CXX=icpx


@ -0,0 +1,23 @@
source ~/spack/share/spack/setup-env.sh
spack load c-lime
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
export TCMALLOC=`spack find --paths gperftools | grep ^gperftools | awk '{print $2}' `
export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH
../../configure \
--enable-debug \
--enable-simd=GPU \
--enable-gen-simd-width=64 \
--enable-comms=mpi-auto \
--disable-gparity \
--disable-fermion-reps \
--with-lime=$CLIME \
--enable-shm=nvlink \
--enable-accelerator=sycl \
--enable-accelerator-aware-mpi=yes\
--enable-unified=no \
MPICXX=mpicxx \
CXX=icpx \
LDFLAGS="-fiopenmp -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -Xarch_host -fsanitize=leak -fsycl-device-code-split=per_kernel" \
CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel -Xarch_host -fsycl -fsanitize=leak "


@ -0,0 +1,22 @@
# -fsycl-targets=spir64_gen -Xs\" -device pvc \"
# -fsycl-targets=intel_gpu_pvc_vg,intel_gpu_pvc
# -fsycl-targets=intel_gpu_pvc
unset DEVICE
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -Xarch_host -fsanitize=address"
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel -fsycl -fno-exceptions -Xarch_host -fsanitize=address -fsycl-targets=spir64_gen -Xs -device -Xs pvc "
../../configure \
--enable-simd=GPU \
--enable-gen-simd-width=64 \
--enable-comms=mpi-auto \
--enable-debug \
--disable-gparity \
--disable-fermion-reps \
--with-lime=$CLIME \
--enable-shm=nvlink \
--enable-accelerator=sycl \
--enable-accelerator-aware-mpi=yes\
--enable-unified=no \
MPICXX=mpicxx \
CXX=icpx


@ -1,14 +1,22 @@
source ~/spack/share/spack/setup-env.sh
spack load c-lime
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
#spack load libefence
#export EFENCE=`spack find --paths libefence | grep ^libefence | awk '{print $2}' `
#export LD_LIBRARY_PATH=${EFENCE}/lib:$LD_LIBRARY_PATH
#spack load gperftools
export TCMALLOC=/home/paboyle/gperftools/install
export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH
export INTELGT_AUTO_ATTACH_DISABLE=1
#export ONEAPI_DEVICE_SELECTOR=level_zero:0.0
module load oneapi/release/2023.12.15.001
#module load oneapi/release/2023.12.15.001
#module use /soft/modulefiles
#module load intel_compute_runtime/release/agama-devel-682.22
export FI_CXI_DEFAULT_CQ_SIZE=131072
export FI_CXI_CQ_FILL_PERCENT=20
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
#export FI_CXI_DEFAULT_CQ_SIZE=131072
#export FI_CXI_CQ_FILL_PERCENT=20
#export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
#export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode"
#
@ -16,13 +24,17 @@ export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
# -ftarget-register-alloc-mode=pvc:small
# -ftarget-register-alloc-mode=pvc:large
# -ftarget-register-alloc-mode=pvc:auto
#
#export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
export http_proxy=http://proxy.alcf.anl.gov:3128
export https_proxy=http://proxy.alcf.anl.gov:3128
#export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
git config --global http.proxy http://proxy.alcf.anl.gov:3128
#source ~/spack/share/spack/setup-env.sh
#spack load gperftools
#export TCMALLOC=`spack find --paths gperftools | grep ^gperftools | awk '{print $2}' `
#export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"


@ -0,0 +1,76 @@
Memory Bandwidth
Bytes, GB/s per node
6291456, 379.297050
100663296, 3754.674992
509607936, 6521.472413
1610612736, 8513.456479
3932160000, 9018.901766
GEMM
M, N, K, BATCH, GF/s per rank
16, 8, 16, 256, 0.564958
16, 16, 16, 256, 243.148058
16, 32, 16, 256, 440.346877
32, 8, 32, 256, 439.194136
32, 16, 32, 256, 847.334141
32, 32, 32, 256, 1430.892623
64, 8, 64, 256, 1242.756741
64, 16, 64, 256, 2196.689493
64, 32, 64, 256, 3697.458072
16, 8, 256, 256, 899.582627
16, 16, 256, 256, 1673.537756
16, 32, 256, 256, 2959.597089
32, 8, 256, 256, 1558.858630
32, 16, 256, 256, 2864.839445
32, 32, 256, 256, 4810.671254
64, 8, 256, 256, 2386.092942
64, 16, 256, 256, 4451.665937
64, 32, 256, 256, 5942.124095
8, 256, 16, 256, 799.867271
16, 256, 16, 256, 1584.624888
32, 256, 16, 256, 1949.422338
8, 256, 32, 256, 1389.417474
16, 256, 32, 256, 2668.344493
32, 256, 32, 256, 3234.162120
8, 256, 64, 256, 2150.925128
16, 256, 64, 256, 4012.488132
32, 256, 64, 256, 5154.785521
Communications
Packet bytes, direction, GB/s per node
4718592, 1, 245.026198
4718592, 2, 251.180996
4718592, 3, 361.110977
4718592, 5, 247.898447
4718592, 6, 249.867523
4718592, 7, 359.033061
15925248, 1, 255.030946
15925248, 2, 264.453890
15925248, 3, 392.949183
15925248, 5, 256.040644
15925248, 6, 264.681896
15925248, 7, 392.102622
37748736, 1, 258.823333
37748736, 2, 268.181577
37748736, 3, 401.478191
37748736, 5, 258.995363
37748736, 6, 268.206586
37748736, 7, 400.397611
Per node summary table
L , Wilson, DWF4, Staggered, GF/s per node
8 , 155, 1386, 50
12 , 694, 4208, 230
16 , 1841, 6675, 609
24 , 3934, 8573, 1641
32 , 5083, 9771, 3086


@ -0,0 +1,702 @@
RANK 1 using GPU 1
RANK 5 using GPU 6
RANK 0 using GPU 0
RANK 2 using GPU 2
RANK 3 using GPU 3
RANK 6 using GPU 5
RANK 7 using GPU 4
RANK 4 using GPU 7
world_rank 0 has 1 devices
AcceleratorHipInit: ========================
AcceleratorHipInit: Device Number : 0
AcceleratorHipInit: ========================
AcceleratorHipInit: Device identifier: AMD Instinct MI250X
AcceleratorHipInit: totalGlobalMem: 68702699520
AcceleratorHipInit: isMultiGpuBoard: 0
AcceleratorHipInit: warpSize: 64
AcceleratorHipInit: using default device
AcceleratorHipInit: assume user or srun sets ROCR_VISIBLE_DEVICES and numa binding
AcceleratorHipInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0000:c1:00.0
AcceleratorHipInit: ================================================
SharedMemoryMpi: World communicator of size 8
SharedMemoryMpi: Node communicator of size 8
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 4294967296bytes at 0x7ff651800000 - 7ff7517fffff for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=9a1ad6a5eb29a369d74784e7483c60e578323d76: (HEAD -> develop, origin/develop, origin/HEAD) clean
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : This rank is running on host frontier01320
Grid : Message : Requested 4294967296 byte stencil comms buffers
Grid : Message : MemoryManager Cache 54962159616 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent host allocations: SMALL 8 LARGE 2 HUGE 0
Grid : Message : MemoryManager::Init() cache pool for recent device allocations: SMALL 16 LARGE 8 Huge 0
Grid : Message : MemoryManager::Init() cache pool for recent shared allocations: SMALL 16 LARGE 8 Huge 0
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using hipMalloc
Grid : Message : 0.293720 s : ==================================================================================
Grid : Message : 0.293790 s : = Grid is setup to use 1 threads
Grid : Message : 0.293800 s : ==================================================================================
Grid : Message : 0.293810 s : Grid Default Decomposition patterns
Grid : Message : 0.293810 s : OpenMP threads : 1
Grid : Message : 0.293820 s : MPI tasks : 1 2 2 2
Grid : Message : 0.293870 s : vReal : 512bits ; 1 2 2 2
Grid : Message : 0.293890 s : vRealF : 512bits ; 2 2 2 2
Grid : Message : 0.293910 s : vRealD : 512bits ; 1 2 2 2
Grid : Message : 0.293920 s : vComplex : 512bits ; 1 1 2 2
Grid : Message : 0.293930 s : vComplexF : 512bits ; 1 2 2 2
Grid : Message : 0.293960 s : vComplexD : 512bits ; 1 1 2 2
Grid : Message : 0.293970 s : ==================================================================================
Grid : Message : 0.293980 s : ==================================================================================
Grid : Message : 0.293990 s : Clover dslash 4D vectorised (temporarily Wilson)
Grid : Message : 0.294000 s : ==================================================================================
Grid : Message : 0.301330 s : ==================================================================================
Grid : Message : 0.301360 s : Benchmark DWF on 8^4 local volume
Grid : Message : 0.301370 s : * Nc : 3
Grid : Message : 0.301380 s : * Global volume : 8 16 16 16
Grid : Message : 0.301410 s : * Ls : 1
Grid : Message : 0.301420 s : * ranks : 8
Grid : Message : 0.301430 s : * nodes : 1
Grid : Message : 0.301440 s : * ranks/node : 8
Grid : Message : 0.301450 s : * ranks geom : 1 2 2 2
Grid : Message : 0.301460 s : * Using 1 threads
Grid : Message : 0.301470 s : ==================================================================================
Grid : Message : 0.345030 s : Initialised RNGs
Grid : Message : 0.158302 s : ==================================================================================
Grid : Message : 0.158310 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 0.158311 s : * Using Overlapped Comms/Compute
Grid : Message : 0.158312 s : * SINGLE precision
Grid : Message : 0.158313 s : ==================================================================================
Grid : Message : 0.240681 s : Deo FlopsPerSite is 1344
Grid : Message : 0.240711 s : Deo mflop/s = 154914.0 (130.8) 139367.7-159565.9
Grid : Message : 0.240715 s : Deo mflop/s per rank 19364.3
Grid : Message : 0.240716 s : Deo mflop/s per node 154914.0
Grid : Message : 0.240718 s : ==================================================================================
Grid : Message : 0.240719 s : * Using UNROLLED WilsonKernels
Grid : Message : 0.240719 s : * Using Overlapped Comms/Compute
Grid : Message : 0.240719 s : * SINGLE precision
Grid : Message : 0.240719 s : ==================================================================================
Grid : Message : 0.315028 s : Deo FlopsPerSite is 1344.0
Grid : Message : 0.315033 s : Deo mflop/s = 151459.5 (142.0) 131856.9-157286.4
Grid : Message : 0.315036 s : Deo mflop/s per rank 18932.4
Grid : Message : 0.315037 s : Deo mflop/s per node 151459.5
Grid : Message : 0.315038 s : ==================================================================================
Grid : Message : 0.315040 s : 8^4 x 1 Deo Best mflop/s = 154914.0 ; 154914.0 per node
Grid : Message : 0.315042 s : 8^4 x 1 Deo Worst mflop/s = 151459.5 ; 151459.5 per node
Grid : Message : 0.315043 s : G/S/C ; G/O/C ; G/S/S ; G/O/S
Grid : Message : 0.315043 s : 154914.0 ; 151459.5 ;
Grid : Message : 0.315044 s : ==================================================================================
Grid : Message : 0.316507 s : ==================================================================================
Grid : Message : 0.316510 s : Benchmark DWF on 12^4 local volume
Grid : Message : 0.316511 s : * Nc : 3
Grid : Message : 0.316512 s : * Global volume : 12 24 24 24
Grid : Message : 0.316515 s : * Ls : 1
Grid : Message : 0.316516 s : * ranks : 8
Grid : Message : 0.316517 s : * nodes : 1
Grid : Message : 0.316518 s : * ranks/node : 8
Grid : Message : 0.316518 s : * ranks geom : 1 2 2 2
Grid : Message : 0.316519 s : * Using 1 threads
Grid : Message : 0.316520 s : ==================================================================================
Grid : Message : 0.327883 s : Initialised RNGs
Grid : Message : 0.786395 s : ==================================================================================
Grid : Message : 0.786404 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 0.786405 s : * Using Overlapped Comms/Compute
Grid : Message : 0.786406 s : * SINGLE precision
Grid : Message : 0.786406 s : ==================================================================================
Grid : Message : 0.871646 s : Deo FlopsPerSite is 1344.0
Grid : Message : 0.871659 s : Deo mflop/s = 684982.2 (632.4) 609162.5-714594.5
Grid : Message : 0.871663 s : Deo mflop/s per rank 85622.8
Grid : Message : 0.871664 s : Deo mflop/s per node 684982.2
Grid : Message : 0.871665 s : ==================================================================================
Grid : Message : 0.871665 s : * Using UNROLLED WilsonKernels
Grid : Message : 0.871665 s : * Using Overlapped Comms/Compute
Grid : Message : 0.871665 s : * SINGLE precision
Grid : Message : 0.871665 s : ==================================================================================
Grid : Message : 0.953697 s : Deo FlopsPerSite is 1344.0
Grid : Message : 0.953702 s : Deo mflop/s = 693556.6 (576.5) 663552.0-719204.7
Grid : Message : 0.953705 s : Deo mflop/s per rank 86694.6
Grid : Message : 0.953706 s : Deo mflop/s per node 693556.6
Grid : Message : 0.953707 s : ==================================================================================
Grid : Message : 0.953708 s : 12^4 x 1 Deo Best mflop/s = 693556.6 ; 693556.6 per node
Grid : Message : 0.953710 s : 12^4 x 1 Deo Worst mflop/s = 684982.2 ; 684982.2 per node
Grid : Message : 0.953712 s : G/S/C ; G/O/C ; G/S/S ; G/O/S
Grid : Message : 0.953712 s : 684982.2 ; 693556.6 ;
Grid : Message : 0.953713 s : ==================================================================================
Grid : Message : 0.957609 s : ==================================================================================
Grid : Message : 0.957613 s : Benchmark DWF on 16^4 local volume
Grid : Message : 0.957614 s : * Nc : 3
Grid : Message : 0.957615 s : * Global volume : 16 32 32 32
Grid : Message : 0.957620 s : * Ls : 1
Grid : Message : 0.957621 s : * ranks : 8
Grid : Message : 0.957622 s : * nodes : 1
Grid : Message : 0.957623 s : * ranks/node : 8
Grid : Message : 0.957623 s : * ranks geom : 1 2 2 2
Grid : Message : 0.957624 s : * Using 1 threads
Grid : Message : 0.957625 s : ==================================================================================
Grid : Message : 0.985828 s : Initialised RNGs
Grid : Message : 2.379761 s : ==================================================================================
Grid : Message : 2.379772 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 2.379773 s : * Using Overlapped Comms/Compute
Grid : Message : 2.379774 s : * SINGLE precision
Grid : Message : 2.379775 s : ==================================================================================
Grid : Message : 2.486712 s : Deo FlopsPerSite is 1344.0
Grid : Message : 2.486725 s : Deo mflop/s = 1803226.1 (1139.4) 1646362.3-1864135.1
Grid : Message : 2.486729 s : Deo mflop/s per rank 225403.3
Grid : Message : 2.486731 s : Deo mflop/s per node 1803226.1
Grid : Message : 2.486732 s : ==================================================================================
Grid : Message : 2.486732 s : * Using UNROLLED WilsonKernels
Grid : Message : 2.486732 s : * Using Overlapped Comms/Compute
Grid : Message : 2.486732 s : * SINGLE precision
Grid : Message : 2.486732 s : ==================================================================================
Grid : Message : 2.584407 s : Deo FlopsPerSite is 1344.0
Grid : Message : 2.584412 s : Deo mflop/s = 1840587.3 (1119.6) 1779401.7-1914791.0
Grid : Message : 2.584415 s : Deo mflop/s per rank 230073.4
Grid : Message : 2.584416 s : Deo mflop/s per node 1840587.3
Grid : Message : 2.584417 s : ==================================================================================
Grid : Message : 2.584418 s : 16^4 x 1 Deo Best mflop/s = 1840587.3 ; 1840587.3 per node
Grid : Message : 2.584420 s : 16^4 x 1 Deo Worst mflop/s = 1803226.1 ; 1803226.1 per node
Grid : Message : 2.584422 s : G/S/C ; G/O/C ; G/S/S ; G/O/S
Grid : Message : 2.584422 s : 1803226.1 ; 1840587.3 ;
Grid : Message : 2.584423 s : ==================================================================================
Grid : Message : 2.592858 s : ==================================================================================
Grid : Message : 2.592862 s : Benchmark DWF on 24^4 local volume
Grid : Message : 2.592863 s : * Nc : 3
Grid : Message : 2.592864 s : * Global volume : 24 48 48 48
Grid : Message : 2.592869 s : * Ls : 1
Grid : Message : 2.592870 s : * ranks : 8
Grid : Message : 2.592871 s : * nodes : 1
Grid : Message : 2.592872 s : * ranks/node : 8
Grid : Message : 2.592872 s : * ranks geom : 1 2 2 2
Grid : Message : 2.592873 s : * Using 1 threads
Grid : Message : 2.592874 s : ==================================================================================
Grid : Message : 2.715623 s : Initialised RNGs
Grid : Message : 9.608838 s : ==================================================================================
Grid : Message : 9.608852 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 9.608853 s : * Using Overlapped Comms/Compute
Grid : Message : 9.608854 s : * SINGLE precision
Grid : Message : 9.608855 s : ==================================================================================
Grid : Message : 9.870294 s : Deo FlopsPerSite is 1344.0
Grid : Message : 9.870309 s : Deo mflop/s = 3861903.3 (1708.9) 3511078.3-3937368.2
Grid : Message : 9.870313 s : Deo mflop/s per rank 482737.9
Grid : Message : 9.870314 s : Deo mflop/s per node 3861903.3
Grid : Message : 9.870315 s : ==================================================================================
Grid : Message : 9.870316 s : * Using UNROLLED WilsonKernels
Grid : Message : 9.870316 s : * Using Overlapped Comms/Compute
Grid : Message : 9.870317 s : * SINGLE precision
Grid : Message : 9.870317 s : ==================================================================================
Grid : Message : 10.101619 s : Deo FlopsPerSite is 1344.0
Grid : Message : 10.101624 s : Deo mflop/s = 3933599.5 (1412.7) 3835758.7-4008152.3
Grid : Message : 10.101627 s : Deo mflop/s per rank 491699.9
Grid : Message : 10.101628 s : Deo mflop/s per node 3933599.5
Grid : Message : 10.101629 s : ==================================================================================
Grid : Message : 10.101629 s : 24^4 x 1 Deo Best mflop/s = 3933599.5 ; 3933599.5 per node
Grid : Message : 10.101631 s : 24^4 x 1 Deo Worst mflop/s = 3861903.3 ; 3861903.3 per node
Grid : Message : 10.101633 s : G/S/C ; G/O/C ; G/S/S ; G/O/S
Grid : Message : 10.101633 s : 3861903.3 ; 3933599.5 ;
Grid : Message : 10.101634 s : ==================================================================================
Grid : Message : 10.139642 s : ==================================================================================
Grid : Message : 10.139652 s : Benchmark DWF on 32^4 local volume
Grid : Message : 10.139653 s : * Nc : 3
Grid : Message : 10.139654 s : * Global volume : 32 64 64 64
Grid : Message : 10.139661 s : * Ls : 1
Grid : Message : 10.139661 s : * ranks : 8
Grid : Message : 10.139662 s : * nodes : 1
Grid : Message : 10.139662 s : * ranks/node : 8
Grid : Message : 10.139662 s : * ranks geom : 1 2 2 2
Grid : Message : 10.139663 s : * Using 1 threads
Grid : Message : 10.139663 s : ==================================================================================
Grid : Message : 10.502161 s : Initialised RNGs
Grid : Message : 32.211092 s : ==================================================================================
Grid : Message : 32.211107 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 32.211108 s : * Using Overlapped Comms/Compute
Grid : Message : 32.211109 s : * SINGLE precision
Grid : Message : 32.211110 s : ==================================================================================
Grid : Message : 32.841718 s : Deo FlopsPerSite is 1344.0
Grid : Message : 32.841732 s : Deo mflop/s = 4988499.9 (2722.5) 4244837.8-5120022.3
Grid : Message : 32.841736 s : Deo mflop/s per rank 623562.5
Grid : Message : 32.841737 s : Deo mflop/s per node 4988499.9
Grid : Message : 32.841738 s : ==================================================================================
Grid : Message : 32.841739 s : * Using UNROLLED WilsonKernels
Grid : Message : 32.841739 s : * Using Overlapped Comms/Compute
Grid : Message : 32.841740 s : * SINGLE precision
Grid : Message : 32.841740 s : ==================================================================================
Grid : Message : 33.407434 s : Deo FlopsPerSite is 1344.0
Grid : Message : 33.407442 s : Deo mflop/s = 5082758.0 (1883.1) 4971027.0-5205119.6
Grid : Message : 33.407446 s : Deo mflop/s per rank 635344.7
Grid : Message : 33.407447 s : Deo mflop/s per node 5082758.0
Grid : Message : 33.407448 s : ==================================================================================
Grid : Message : 33.407448 s : 32^4 x 1 Deo Best mflop/s = 5082758.0 ; 5082758.0 per node
Grid : Message : 33.407450 s : 32^4 x 1 Deo Worst mflop/s = 4988499.9 ; 4988499.9 per node
Grid : Message : 33.407452 s : G/S/C ; G/O/C ; G/S/S ; G/O/S
Grid : Message : 33.407452 s : 4988499.9 ; 5082758.0 ;
Grid : Message : 33.407453 s : ==================================================================================
Grid : Message : 33.506785 s : ==================================================================================
Grid : Message : 33.506798 s : Domain wall dslash 4D vectorised
Grid : Message : 33.506799 s : ==================================================================================
Grid : Message : 33.530686 s : ==================================================================================
Grid : Message : 33.530689 s : Benchmark DWF on 8^4 local volume
Grid : Message : 33.530690 s : * Nc : 3
Grid : Message : 33.530691 s : * Global volume : 8 16 16 16
Grid : Message : 33.530698 s : * Ls : 12
Grid : Message : 33.530699 s : * ranks : 8
Grid : Message : 33.530700 s : * nodes : 1
Grid : Message : 33.530701 s : * ranks/node : 8
Grid : Message : 33.530702 s : * ranks geom : 1 2 2 2
Grid : Message : 33.530703 s : * Using 1 threads
Grid : Message : 33.530704 s : ==================================================================================
Grid : Message : 33.545465 s : Initialised RNGs
Grid : Message : 33.752384 s : ==================================================================================
Grid : Message : 33.752397 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 33.752398 s : * Using Overlapped Comms/Compute
Grid : Message : 33.752399 s : * SINGLE precision
Grid : Message : 33.752400 s : ==================================================================================
Grid : Message : 33.851964 s : Deo FlopsPerSite is 1344.0
Grid : Message : 33.851977 s : Deo mflop/s = 1383287.7 (849.8) 1321205.8-1420651.4
Grid : Message : 33.851981 s : Deo mflop/s per rank 172911.0
Grid : Message : 33.851983 s : Deo mflop/s per node 1383287.7
Grid : Message : 33.851984 s : ==================================================================================
Grid : Message : 33.851984 s : * Using UNROLLED WilsonKernels
Grid : Message : 33.851984 s : * Using Overlapped Comms/Compute
Grid : Message : 33.851984 s : * SINGLE precision
Grid : Message : 33.851984 s : ==================================================================================
Grid : Message : 33.949235 s : Deo FlopsPerSite is 1344.0
Grid : Message : 33.949240 s : Deo mflop/s = 1386335.8 (734.6) 1341325.6-1428330.6
Grid : Message : 33.949243 s : Deo mflop/s per rank 173292.0
Grid : Message : 33.949244 s : Deo mflop/s per node 1386335.8
Grid : Message : 33.949245 s : ==================================================================================
Grid : Message : 33.949245 s : 8^4 x 12 Deo Best mflop/s = 1386335.8 ; 1386335.8 per node
Grid : Message : 33.949247 s : 8^4 x 12 Deo Worst mflop/s = 1383287.7 ; 1383287.7 per node
Grid : Message : 33.949249 s : G/S/C ; G/O/C ; G/S/S ; G/O/S
Grid : Message : 33.949249 s : 1383287.7 ; 1386335.8 ;
Grid : Message : 33.949250 s : ==================================================================================
Grid : Message : 33.952789 s : ==================================================================================
Grid : Message : 33.952793 s : Benchmark DWF on 12^4 local volume
Grid : Message : 33.952794 s : * Nc : 3
Grid : Message : 33.952795 s : * Global volume : 12 24 24 24
Grid : Message : 33.952800 s : * Ls : 12
Grid : Message : 33.952801 s : * ranks : 8
Grid : Message : 33.952802 s : * nodes : 1
Grid : Message : 33.952803 s : * ranks/node : 8
Grid : Message : 33.952803 s : * ranks geom : 1 2 2 2
Grid : Message : 33.952804 s : * Using 1 threads
Grid : Message : 33.952805 s : ==================================================================================
Grid : Message : 34.362200 s : Initialised RNGs
Grid : Message : 34.969821 s : ==================================================================================
Grid : Message : 34.969832 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 34.969833 s : * Using Overlapped Comms/Compute
Grid : Message : 34.969834 s : * SINGLE precision
Grid : Message : 34.969835 s : ==================================================================================
Grid : Message : 35.135545 s : Deo FlopsPerSite is 1344.0
Grid : Message : 35.135558 s : Deo mflop/s = 4208495.6 (2165.0) 4053699.5-4315228.5
Grid : Message : 35.135562 s : Deo mflop/s per rank 526062.0
Grid : Message : 35.135563 s : Deo mflop/s per node 4208495.6
Grid : Message : 35.135564 s : ==================================================================================
Grid : Message : 35.135565 s : * Using UNROLLED WilsonKernels
Grid : Message : 35.135565 s : * Using Overlapped Comms/Compute
Grid : Message : 35.135565 s : * SINGLE precision
Grid : Message : 35.135565 s : ==================================================================================
Grid : Message : 35.299710 s : Deo FlopsPerSite is 1344.0
Grid : Message : 35.299715 s : Deo mflop/s = 4156968.7 (1450.2) 4053699.5-4219939.5
Grid : Message : 35.299718 s : Deo mflop/s per rank 519621.1
Grid : Message : 35.299719 s : Deo mflop/s per node 4156968.7
Grid : Message : 35.299721 s : ==================================================================================
Grid : Message : 35.299721 s : 12^4 x 12 Deo Best mflop/s = 4208495.6 ; 4208495.6 per node
Grid : Message : 35.299723 s : 12^4 x 12 Deo Worst mflop/s = 4156968.7 ; 4156968.7 per node
Grid : Message : 35.299725 s : G/S/C ; G/O/C ; G/S/S ; G/O/S
Grid : Message : 35.299725 s : 4208495.6 ; 4156968.7 ;
Grid : Message : 35.299726 s : ==================================================================================
Grid : Message : 35.309687 s : ==================================================================================
Grid : Message : 35.309693 s : Benchmark DWF on 16^4 local volume
Grid : Message : 35.309694 s : * Nc : 3
Grid : Message : 35.309695 s : * Global volume : 16 32 32 32
Grid : Message : 35.309701 s : * Ls : 12
Grid : Message : 35.309702 s : * ranks : 8
Grid : Message : 35.309703 s : * nodes : 1
Grid : Message : 35.309704 s : * ranks/node : 8
Grid : Message : 35.309704 s : * ranks geom : 1 2 2 2
Grid : Message : 35.309705 s : * Using 1 threads
Grid : Message : 35.309706 s : ==================================================================================
Grid : Message : 35.448780 s : Initialised RNGs
Grid : Message : 38.468764 s : ==================================================================================
Grid : Message : 38.468777 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 38.468778 s : * Using Overlapped Comms/Compute
Grid : Message : 38.468779 s : * SINGLE precision
Grid : Message : 38.468780 s : ==================================================================================
Grid : Message : 38.801024 s : Deo FlopsPerSite is 1344.0
Grid : Message : 38.801040 s : Deo mflop/s = 6674673.6 (2168.6) 6484445.4-6797200.1
Grid : Message : 38.801044 s : Deo mflop/s per rank 834334.2
Grid : Message : 38.801045 s : Deo mflop/s per node 6674673.6
Grid : Message : 38.801046 s : ==================================================================================
Grid : Message : 38.801047 s : * Using UNROLLED WilsonKernels
Grid : Message : 38.801048 s : * Using Overlapped Comms/Compute
Grid : Message : 38.801049 s : * SINGLE precision
Grid : Message : 38.801049 s : ==================================================================================
Grid : Message : 39.129777 s : Deo FlopsPerSite is 1344.0
Grid : Message : 39.129783 s : Deo mflop/s = 6560128.4 (2117.4) 6405846.1-6679081.3
Grid : Message : 39.129786 s : Deo mflop/s per rank 820016.1
Grid : Message : 39.129787 s : Deo mflop/s per node 6560128.4
Grid : Message : 39.129788 s : ==================================================================================
Grid : Message : 39.129788 s : 16^4 x 12 Deo Best mflop/s = 6674673.6 ; 6674673.6 per node
Grid : Message : 39.129790 s : 16^4 x 12 Deo Worst mflop/s = 6560128.4 ; 6560128.4 per node
Grid : Message : 39.129792 s : G/S/C ; G/O/C ; G/S/S ; G/O/S
Grid : Message : 39.129793 s : 6674673.6 ; 6560128.4 ;
Grid : Message : 39.129795 s : ==================================================================================
Grid : Message : 39.161251 s : ==================================================================================
Grid : Message : 39.161265 s : Benchmark DWF on 24^4 local volume
Grid : Message : 39.161266 s : * Nc : 3
Grid : Message : 39.161267 s : * Global volume : 24 48 48 48
Grid : Message : 39.161274 s : * Ls : 12
Grid : Message : 39.161275 s : * ranks : 8
Grid : Message : 39.161276 s : * nodes : 1
Grid : Message : 39.161277 s : * ranks/node : 8
Grid : Message : 39.161277 s : * ranks geom : 1 2 2 2
Grid : Message : 39.161278 s : * Using 1 threads
Grid : Message : 39.161279 s : ==================================================================================
Grid : Message : 39.911996 s : Initialised RNGs
Grid : Message : 54.971914 s : ==================================================================================
Grid : Message : 54.971928 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 54.971929 s : * Using Overlapped Comms/Compute
Grid : Message : 54.971930 s : * SINGLE precision
Grid : Message : 54.971931 s : ==================================================================================
Grid : Message : 56.309445 s : Deo FlopsPerSite is 1344.0
Grid : Message : 56.309462 s : Deo mflop/s = 8572660.7 (1374.9) 8483366.4-8644399.6
Grid : Message : 56.309467 s : Deo mflop/s per rank 1071582.6
Grid : Message : 56.309468 s : Deo mflop/s per node 8572660.7
Grid : Message : 56.309469 s : ==================================================================================
Grid : Message : 56.309471 s : * Using UNROLLED WilsonKernels
Grid : Message : 56.309472 s : * Using Overlapped Comms/Compute
Grid : Message : 56.309473 s : * SINGLE precision
Grid : Message : 56.309474 s : ==================================================================================
Grid : Message : 57.640707 s : Deo FlopsPerSite is 1344.0
Grid : Message : 57.640714 s : Deo mflop/s = 8200141.3 (1445.8) 8113545.6-8286307.9
Grid : Message : 57.640717 s : Deo mflop/s per rank 1025017.7
Grid : Message : 57.640718 s : Deo mflop/s per node 8200141.3
Grid : Message : 57.640719 s : ==================================================================================
Grid : Message : 57.640720 s : 24^4 x 12 Deo Best mflop/s = 8572660.7 ; 8572660.7 per node
Grid : Message : 57.640723 s : 24^4 x 12 Deo Worst mflop/s = 8200141.3 ; 8200141.3 per node
Grid : Message : 57.640725 s : G/S/C ; G/O/C ; G/S/S ; G/O/S
Grid : Message : 57.640725 s : 8572660.7 ; 8200141.3 ;
Grid : Message : 57.640727 s : ==================================================================================
Grid : Message : 57.806175 s : ==================================================================================
Grid : Message : 57.806190 s : Benchmark DWF on 32^4 local volume
Grid : Message : 57.806191 s : * Nc : 3
Grid : Message : 57.806192 s : * Global volume : 32 64 64 64
Grid : Message : 57.806200 s : * Ls : 12
Grid : Message : 57.806200 s : * ranks : 8
Grid : Message : 57.806200 s : * nodes : 1
Grid : Message : 57.806200 s : * ranks/node : 8
Grid : Message : 57.806200 s : * ranks geom : 1 2 2 2
Grid : Message : 57.806201 s : * Using 1 threads
Grid : Message : 57.806201 s : ==================================================================================
Grid : Message : 60.313153 s : Initialised RNGs
Grid : Message : 107.830286 s : ==================================================================================
Grid : Message : 107.830306 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 107.830307 s : * Using Overlapped Comms/Compute
Grid : Message : 107.830308 s : * SINGLE precision
Grid : Message : 107.830309 s : ==================================================================================
Grid : Message : 111.479603 s : Deo FlopsPerSite is 1344.0
Grid : Message : 111.479625 s : Deo mflop/s = 9771387.8 (1000.8) 9688589.9-9830800.0
Grid : Message : 111.479629 s : Deo mflop/s per rank 1221423.5
Grid : Message : 111.479630 s : Deo mflop/s per node 9771387.8
Grid : Message : 111.479631 s : ==================================================================================
Grid : Message : 111.479631 s : * Using UNROLLED WilsonKernels
Grid : Message : 111.479631 s : * Using Overlapped Comms/Compute
Grid : Message : 111.479631 s : * SINGLE precision
Grid : Message : 111.479631 s : ==================================================================================
Grid : Message : 115.406559 s : Deo FlopsPerSite is 1344.0
Grid : Message : 115.406573 s : Deo mflop/s = 8785297.3 (1739.6) 8628282.5-8911307.5
Grid : Message : 115.406576 s : Deo mflop/s per rank 1098162.2
Grid : Message : 115.406577 s : Deo mflop/s per node 8785297.3
Grid : Message : 115.406578 s : ==================================================================================
Grid : Message : 115.406578 s : 32^4 x 12 Deo Best mflop/s = 9771387.8 ; 9771387.8 per node
Grid : Message : 115.406580 s : 32^4 x 12 Deo Worst mflop/s = 8785297.3 ; 8785297.3 per node
Grid : Message : 115.406581 s : G/S/C ; G/O/C ; G/S/S ; G/O/S
Grid : Message : 115.406581 s : 9771387.8 ; 8785297.3 ;
Grid : Message : 115.406582 s : ==================================================================================
Grid : Message : 115.918888 s : ==================================================================================
Grid : Message : 115.918902 s : Improved Staggered dslash 4D vectorised
Grid : Message : 115.918903 s : ==================================================================================
Grid : Message : 115.920344 s : ==================================================================================
Grid : Message : 115.920346 s : Benchmark ImprovedStaggered on 8^4 local volume
Grid : Message : 115.920347 s : * Global volume : 8 16 16 16
Grid : Message : 115.920354 s : * ranks : 8
Grid : Message : 115.920355 s : * nodes : 1
Grid : Message : 115.920356 s : * ranks/node : 8
Grid : Message : 115.920357 s : * ranks geom : 1 2 2 2
Grid : Message : 115.920376 s : * Using 1 threads
Grid : Message : 115.920377 s : ==================================================================================
Grid : Message : 115.923522 s : Initialised RNGs
Grid : Message : 116.904870 s : ==================================================================================
Grid : Message : 116.904950 s : * Using GENERIC Nc StaggeredKernels
Grid : Message : 116.904960 s : * SINGLE precision
Grid : Message : 116.904970 s : ==================================================================================
Grid : Message : 116.288979 s : Deo mflop/s = 49708.9 (22.9) 44075.3-50609.3
Grid : Message : 116.289000 s : Deo mflop/s per rank 6213.6
Grid : Message : 116.289002 s : Deo mflop/s per node 49708.9
Grid : Message : 116.289003 s : ==================================================================================
Grid : Message : 116.289004 s : * SINGLE precision
Grid : Message : 116.289005 s : ==================================================================================
Grid : Message : 116.481632 s : Deo mflop/s = 49737.1 (13.5) 48517.0-50338.0
Grid : Message : 116.481639 s : Deo mflop/s per rank 6217.1
Grid : Message : 116.481640 s : Deo mflop/s per node 49737.1
Grid : Message : 116.481641 s : ==================================================================================
Grid : Message : 116.481642 s : 8^4 Deo Best mflop/s = 49737.1 ; 49737.1 per node
Grid : Message : 116.481644 s : 8^4 Deo Worst mflop/s = 49708.9 ; 49708.9 per node
Grid : Message : 116.481646 s : G/S/C ; G/O/C ; G/S/S ; G/O/S
Grid : Message : 116.481646 s : 49708.9 ; 49737.1 ;
Grid : Message : 116.481647 s : ==================================================================================
Grid : Message : 116.483458 s : ==================================================================================
Grid : Message : 116.483461 s : Benchmark ImprovedStaggered on 12^4 local volume
Grid : Message : 116.483462 s : * Global volume : 12 24 24 24
Grid : Message : 116.483465 s : * ranks : 8
Grid : Message : 116.483466 s : * nodes : 1
Grid : Message : 116.483466 s : * ranks/node : 8
Grid : Message : 116.483466 s : * ranks geom : 1 2 2 2
Grid : Message : 116.483467 s : * Using 1 threads
Grid : Message : 116.483468 s : ==================================================================================
Grid : Message : 116.489279 s : Initialised RNGs
Grid : Message : 116.945016 s : ==================================================================================
Grid : Message : 116.945025 s : * Using GENERIC Nc StaggeredKernels
Grid : Message : 116.945026 s : * SINGLE precision
Grid : Message : 116.945027 s : ==================================================================================
Grid : Message : 117.159821 s : Deo mflop/s = 229778.4 (89.5) 223656.1-233547.5
Grid : Message : 117.159835 s : Deo mflop/s per rank 28722.3
Grid : Message : 117.159837 s : Deo mflop/s per node 229778.4
Grid : Message : 117.159838 s : ==================================================================================
Grid : Message : 117.159838 s : * SINGLE precision
Grid : Message : 117.159838 s : ==================================================================================
Grid : Message : 117.371102 s : Deo mflop/s = 229516.6 (61.8) 225781.1-233547.5
Grid : Message : 117.371109 s : Deo mflop/s per rank 28689.6
Grid : Message : 117.371110 s : Deo mflop/s per node 229516.6
Grid : Message : 117.371111 s : ==================================================================================
Grid : Message : 117.371111 s : 12^4 Deo Best mflop/s = 229778.4 ; 229778.4 per node
Grid : Message : 117.371113 s : 12^4 Deo Worst mflop/s = 229516.6 ; 229516.6 per node
Grid : Message : 117.371115 s : G/S/C ; G/O/C ; G/S/S ; G/O/S
Grid : Message : 117.371115 s : 229778.4 ; 229516.6 ;
Grid : Message : 117.371116 s : ==================================================================================
Grid : Message : 117.373669 s : ==================================================================================
Grid : Message : 117.373673 s : Benchmark ImprovedStaggered on 16^4 local volume
Grid : Message : 117.373674 s : * Global volume : 16 32 32 32
Grid : Message : 117.373678 s : * ranks : 8
Grid : Message : 117.373679 s : * nodes : 1
Grid : Message : 117.373679 s : * ranks/node : 8
Grid : Message : 117.373679 s : * ranks geom : 1 2 2 2
Grid : Message : 117.373680 s : * Using 1 threads
Grid : Message : 117.373681 s : ==================================================================================
Grid : Message : 117.386495 s : Initialised RNGs
Grid : Message : 118.755695 s : ==================================================================================
Grid : Message : 118.755706 s : * Using GENERIC Nc StaggeredKernels
Grid : Message : 118.755707 s : * SINGLE precision
Grid : Message : 118.755708 s : ==================================================================================
Grid : Message : 119.178990 s : Deo mflop/s = 608844.0 (126.1) 596065.5-615608.7
Grid : Message : 119.179160 s : Deo mflop/s per rank 76105.5
Grid : Message : 119.179180 s : Deo mflop/s per node 608844.0
Grid : Message : 119.179190 s : ==================================================================================
Grid : Message : 119.179200 s : * SINGLE precision
Grid : Message : 119.179200 s : ==================================================================================
Grid : Message : 119.271093 s : Deo mflop/s = 605259.7 (188.7) 591372.1-614349.7
Grid : Message : 119.271101 s : Deo mflop/s per rank 75657.5
Grid : Message : 119.271103 s : Deo mflop/s per node 605259.7
Grid : Message : 119.271104 s : ==================================================================================
Grid : Message : 119.271105 s : 16^4 Deo Best mflop/s = 608844.0 ; 608844.0 per node
Grid : Message : 119.271107 s : 16^4 Deo Worst mflop/s = 605259.7 ; 605259.7 per node
Grid : Message : 119.271109 s : G/S/C ; G/O/C ; G/S/S ; G/O/S
Grid : Message : 119.271109 s : 608844.0 ; 605259.7 ;
Grid : Message : 119.271110 s : ==================================================================================
Grid : Message : 119.275303 s : ==================================================================================
Grid : Message : 119.275308 s : Benchmark ImprovedStaggered on 24^4 local volume
Grid : Message : 119.275309 s : * Global volume : 24 48 48 48
Grid : Message : 119.275315 s : * ranks : 8
Grid : Message : 119.275316 s : * nodes : 1
Grid : Message : 119.275317 s : * ranks/node : 8
Grid : Message : 119.275317 s : * ranks geom : 1 2 2 2
Grid : Message : 119.275318 s : * Using 1 threads
Grid : Message : 119.275319 s : ==================================================================================
Grid : Message : 119.328765 s : Initialised RNGs
Grid : Message : 126.866160 s : ==================================================================================
Grid : Message : 126.866270 s : * Using GENERIC Nc StaggeredKernels
Grid : Message : 126.866280 s : * SINGLE precision
Grid : Message : 126.866290 s : ==================================================================================
Grid : Message : 126.604376 s : Deo mflop/s = 1641161.6 (335.5) 1619660.5-1663961.9
Grid : Message : 126.604392 s : Deo mflop/s per rank 205145.2
Grid : Message : 126.604394 s : Deo mflop/s per node 1641161.6
Grid : Message : 126.604395 s : ==================================================================================
Grid : Message : 126.604396 s : * SINGLE precision
Grid : Message : 126.604396 s : ==================================================================================
Grid : Message : 127.829420 s : Deo mflop/s = 1620972.4 (344.9) 1602593.4-1644174.3
Grid : Message : 127.829520 s : Deo mflop/s per rank 202621.6
Grid : Message : 127.829530 s : Deo mflop/s per node 1620972.4
Grid : Message : 127.829540 s : ==================================================================================
Grid : Message : 127.829550 s : 24^4 Deo Best mflop/s = 1641161.6 ; 1641161.6 per node
Grid : Message : 127.829570 s : 24^4 Deo Worst mflop/s = 1620972.4 ; 1620972.4 per node
Grid : Message : 127.829590 s : G/S/C ; G/O/C ; G/S/S ; G/O/S
Grid : Message : 127.829590 s : 1641161.6 ; 1620972.4 ;
Grid : Message : 127.829600 s : ==================================================================================
Grid : Message : 127.107891 s : ==================================================================================
Grid : Message : 127.107903 s : Benchmark ImprovedStaggered on 32^4 local volume
Grid : Message : 127.107904 s : * Global volume : 32 64 64 64
Grid : Message : 127.107912 s : * ranks : 8
Grid : Message : 127.107913 s : * nodes : 1
Grid : Message : 127.107914 s : * ranks/node : 8
Grid : Message : 127.107914 s : * ranks geom : 1 2 2 2
Grid : Message : 127.107915 s : * Using 1 threads
Grid : Message : 127.107916 s : ==================================================================================
Grid : Message : 127.257116 s : Initialised RNGs
Grid : Message : 148.527930 s : ==================================================================================
Grid : Message : 148.527941 s : * Using GENERIC Nc StaggeredKernels
Grid : Message : 148.527942 s : * SINGLE precision
Grid : Message : 148.527943 s : ==================================================================================
Grid : Message : 149.401625 s : Deo mflop/s = 3085543.7 (956.0) 2934476.4-3115147.4
Grid : Message : 149.401643 s : Deo mflop/s per rank 385693.0
Grid : Message : 149.401645 s : Deo mflop/s per node 3085543.7
Grid : Message : 149.401646 s : ==================================================================================
Grid : Message : 149.401647 s : * SINGLE precision
Grid : Message : 149.401648 s : ==================================================================================
Grid : Message : 150.204533 s : Deo mflop/s = 3053468.5 (343.9) 3030688.8-3077255.0
Grid : Message : 150.204540 s : Deo mflop/s per rank 381683.6
Grid : Message : 150.204541 s : Deo mflop/s per node 3053468.5
Grid : Message : 150.204542 s : ==================================================================================
Grid : Message : 150.204543 s : 32^4 Deo Best mflop/s = 3085543.7 ; 3085543.7 per node
Grid : Message : 150.204545 s : 32^4 Deo Worst mflop/s = 3053468.5 ; 3053468.5 per node
Grid : Message : 150.204547 s : G/S/C ; G/O/C ; G/S/S ; G/O/S
Grid : Message : 150.204547 s : 3085543.7 ; 3053468.5 ;
Grid : Message : 150.204548 s : ==================================================================================
Grid : Message : 150.292848 s : ==================================================================================
Grid : Message : 150.292864 s : Summary table Ls=12
Grid : Message : 150.292866 s : ==================================================================================
Grid : Message : 150.292866 s : L Clover DWF4 Staggered
Grid : Message : 150.292867 s : 8 154914.0 1386335.8 49737.1
Grid : Message : 150.292880 s : 12 693556.6 4208495.6 229778.4
Grid : Message : 150.292882 s : 16 1840587.3 6674673.6 608844.0
Grid : Message : 150.292884 s : 24 3933599.5 8572660.7 1641161.6
Grid : Message : 150.292886 s : 32 5082758.0 9771387.8 3085543.7
Grid : Message : 150.292888 s : ==================================================================================
Grid : Message : 150.292888 s : ==================================================================================
Grid : Message : 150.292888 s : Memory benchmark
Grid : Message : 150.292888 s : ==================================================================================
Grid : Message : 150.295495 s : ==================================================================================
Grid : Message : 150.295497 s : = Benchmarking a*x + y bandwidth
Grid : Message : 150.295498 s : ==================================================================================
Grid : Message : 150.295499 s : L bytes GB/s Gflop/s seconds GB/s / node
Grid : Message : 150.295500 s : ----------------------------------------------------------
Grid : Message : 160.682233 s : 8 6291456.000 379.297 31.608 10.367 379.297
Grid : Message : 161.851979 s : 16 100663296.000 3754.675 312.890 1.047 3754.675
Grid : Message : 162.458098 s : 24 509607936.000 6521.472 543.456 0.603 6521.472
Grid : Message : 162.924116 s : 32 1610612736.000 8513.456 709.455 0.462 8513.456
Grid : Message : 163.363877 s : 40 3932160000.000 9018.902 751.575 0.436 9018.902
Grid : Message : 163.363976 s : ==================================================================================
Grid : Message : 163.363978 s : Batched BLAS benchmark
Grid : Message : 163.363979 s : ==================================================================================
hipblasCreate
Grid : Message : 163.364046 s : ==================================================================================
Grid : Message : 163.364048 s : = batched GEMM (double precision)
Grid : Message : 163.364048 s : ==================================================================================
Grid : Message : 163.364048 s : M N K Gflop/s / rank (coarse mrhs)
Grid : Message : 163.364049 s : ----------------------------------------------------------
Grid : Message : 163.438476 s : 16 8 16 256 0.565
Grid : Message : 163.438944 s : 16 16 16 256 243.148
Grid : Message : 163.439501 s : 16 32 16 256 440.347
Grid : Message : 163.440003 s : 32 8 32 256 439.194
Grid : Message : 163.440463 s : 32 16 32 256 847.334
Grid : Message : 163.441051 s : 32 32 32 256 1430.893
Grid : Message : 163.441679 s : 64 8 64 256 1242.757
Grid : Message : 163.442354 s : 64 16 64 256 2196.689
Grid : Message : 163.443196 s : 64 32 64 256 3697.458
Grid : Message : 163.443200 s : ----------------------------------------------------------
Grid : Message : 163.443201 s : M N K Gflop/s / rank (block project)
Grid : Message : 163.443202 s : ----------------------------------------------------------
Grid : Message : 163.444013 s : 16 8 256 256 899.583
Grid : Message : 163.444933 s : 16 16 256 256 1673.538
Grid : Message : 163.446013 s : 16 32 256 256 2959.597
Grid : Message : 163.446951 s : 32 8 256 256 1558.859
Grid : Message : 163.447970 s : 32 16 256 256 2864.839
Grid : Message : 163.449240 s : 32 32 256 256 4810.671
Grid : Message : 163.450524 s : 64 8 256 256 2386.093
Grid : Message : 163.451877 s : 64 16 256 256 4451.666
Grid : Message : 163.453806 s : 64 32 256 256 5942.124
Grid : Message : 163.453809 s : ----------------------------------------------------------
Grid : Message : 163.453810 s : M N K Gflop/s / rank (block promote)
Grid : Message : 163.453811 s : ----------------------------------------------------------
Grid : Message : 163.454716 s : 8 256 16 256 799.867
Grid : Message : 163.455690 s : 16 256 16 256 1584.625
Grid : Message : 163.457209 s : 32 256 16 256 1949.422
Grid : Message : 163.458254 s : 8 256 32 256 1389.417
Grid : Message : 163.459339 s : 16 256 32 256 2668.344
Grid : Message : 163.461158 s : 32 256 32 256 3234.162
Grid : Message : 163.462566 s : 8 256 64 256 2150.925
Grid : Message : 163.464066 s : 16 256 64 256 4012.488
Grid : Message : 163.466272 s : 32 256 64 256 5154.786
Grid : Message : 163.466276 s : ==================================================================================
Grid : Message : 163.466277 s : ==================================================================================
Grid : Message : 163.466278 s : Communications benchmark
Grid : Message : 163.466279 s : ==================================================================================
Grid : Message : 163.466280 s : ====================================================================================================
Grid : Message : 163.466280 s : = Benchmarking threaded STENCIL halo exchange in 3 dimensions
Grid : Message : 163.466281 s : ====================================================================================================
Grid : Message : 163.466281 s : L Ls bytes MB/s uni MB/s bidi
Grid : Message : 163.521339 s : 16 12 4718592 122513.099 245026.198
Grid : Message : 163.551417 s : 16 12 4718592 125590.498 251180.996
Grid : Message : 163.572339 s : 16 12 4718592 180555.489 361110.977
Grid : Message : 163.602810 s : 16 12 4718592 123949.223 247898.447
Grid : Message : 163.633041 s : 16 12 4718592 124933.761 249867.523
Grid : Message : 163.654084 s : 16 12 4718592 179516.530 359033.061
Grid : Message : 163.756280 s : 24 12 15925248 127515.473 255030.946
Grid : Message : 163.852651 s : 24 12 15925248 132226.945 264453.890
Grid : Message : 163.917510 s : 24 12 15925248 196474.591 392949.183
Grid : Message : 164.170390 s : 24 12 15925248 128020.322 256040.644
Grid : Message : 164.113321 s : 24 12 15925248 132340.948 264681.896
Grid : Message : 164.178314 s : 24 12 15925248 196051.311 392102.622
Grid : Message : 164.413983 s : 32 12 37748736 129411.666 258823.333
Grid : Message : 164.639218 s : 32 12 37748736 134090.789 268181.577
Grid : Message : 164.789675 s : 32 12 37748736 200739.096 401478.191
Grid : Message : 165.228910 s : 32 12 37748736 129497.681 258995.363
Grid : Message : 165.248096 s : 32 12 37748736 134103.293 268206.586
Grid : Message : 165.398958 s : 32 12 37748736 200198.805 400397.611
Grid : Message : 165.399411 s : ==================================================================================
Grid : Message : 165.399413 s : Per Node Summary table Ls=12
Grid : Message : 165.399414 s : ==================================================================================
Grid : Message : 165.399414 s : L Clover DWF4 Staggered (GF/s per node)
Grid : Message : 165.399417 s : 8 154914.003 1386335.817 49737.127
Grid : Message : 165.399423 s : 12 693556.579 4208495.611 229778.435
Grid : Message : 165.399426 s : 16 1840587.280 6674673.647 608844.000
Grid : Message : 165.399429 s : 24 3933599.545 8572660.656 1641161.613
Grid : Message : 165.399432 s : 32 5082757.996 9771387.820 3085543.742
Grid : Message : 165.399435 s : ==================================================================================
Grid : Message : 165.399435 s : ==================================================================================
Grid : Message : 165.399435 s : Comparison point result: 9172024.238 Mflop/s per node
Grid : Message : 165.399436 s : Comparison point is 0.5*(9771387.820+8572660.656)
Grid : Message : 165.399438 s : ==================================================================================
Grid : Message : 165.399438 s : *******************************************
Grid : Message : 165.399438 s : ******* Grid Finalize ******
Grid : Message : 165.399438 s : *******************************************
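Note on the "Comparison point" above: it is simply the mean of the two best DWF4 Mflop/s-per-node figures from the Ls=12 summary table (the 24^4 and 32^4 local volumes). A minimal stand-alone sketch of that arithmetic, separate from the Benchmark_usqcd code itself:

#include <cstdio>
int main(void)
{
  // Best DWF4 Mflop/s per node from the Ls=12 summary table above.
  const double dwf4_L32 = 9771387.820; // 32^4 local volume
  const double dwf4_L24 = 8572660.656; // 24^4 local volume
  // The quoted comparison point is the average of the two figures.
  std::printf("Comparison point: %.3f Mflop/s per node\n", 0.5*(dwf4_L32+dwf4_L24));
  return 0;
}

This reproduces the 9172024.238 Mflop/s per node reported in the log.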

View File

@@ -0,0 +1,38 @@
#!/bin/bash -l
#SBATCH --job-name=bench
##SBATCH --partition=small-g
##SBATCH -q debug
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=7
#SBATCH --gpus-per-node=8
#SBATCH --time=00:30:00
#SBATCH --account=phy157_dwf
#SBATCH --gpu-bind=none
#SBATCH --exclusive
#SBATCH --mem=0
cat << EOF > select_gpu
#!/bin/bash
export GPU_MAP=(0 1 2 3 7 6 5 4)
export NUMA_MAP=(3 3 1 1 2 2 0 0)
export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]}
export HIP_VISIBLE_DEVICES=\$GPU
unset ROCR_VISIBLE_DEVICES
echo RANK \$SLURM_LOCALID using GPU \$GPU
exec numactl -m \$NUMA -N \$NUMA \$*
EOF
chmod +x ./select_gpu
root=$HOME/Frontier/Grid/systems/Frontier/
source ${root}/sourceme.sh
export OMP_NUM_THREADS=7
export MPICH_GPU_SUPPORT_ENABLED=1
#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
srun ./select_gpu ./Benchmark_usqcd --grid 32.32.32.32 --mpi 1.2.2.2 --accelerator-threads 8 --comms-overlap --shm 4096 --shm-mpi 0 --grid $vol > Benchmark_usqcd.log

View File

@@ -3,7 +3,7 @@ spack load c-lime
module load emacs
module load PrgEnv-gnu
module load rocm
module load cray-mpich/8.1.23
module load cray-mpich
module load gmp
module load cray-fftw
module load craype-accel-amd-gfx90a

View File

@@ -2,11 +2,11 @@
--enable-comms=mpi \
--enable-simd=GPU \
--enable-shm=nvlink \
--enable-gen-simd-width=64 \
--enable-accelerator=cuda \
--enable-gen-simd-width=64 \
--disable-gparity \
--with-lime=/mnt/lustre/tursafs1/home/tc002/tc002/dc-boyl1/spack/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/c-lime-2-3-9-e6wxqrid6rqmd45z7n32dxkvkykpvyez \
--enable-accelerator-cshift \
--disable-unified \
CXX=nvcc \
LDFLAGS="-cudart shared " \
CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++14 -cudart shared"
LDFLAGS="-cudart shared -lcublas " \
CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++17 -cudart shared --diag-suppress 177,550,611"

View File

@@ -1,6 +1,7 @@
module load cuda/11.4.1 openmpi/4.1.1-cuda11.4.1 ucx/1.12.0-cuda11.4.1
#module load cuda/11.4.1 openmpi/4.1.1 ucx/1.10.1
export PREFIX=/home/tc002/tc002/shared/env/prefix/
export LD_LIBRARY_PATH=$PREFIX/lib/:$LD_LIBRARY_PATH
module load cuda/12.3
module load ucx/1.15.0-cuda12.3
module load openmpi/4.1.5-cuda12.3
source /home/dp207/dp207/shared/env/production/env-base.sh
source /home/dp207/dp207/shared/env/production/env-gpu.sh
unset SBATCH_EXPORT

View File

@@ -142,7 +142,9 @@ int main (int argc, char ** argv)
std:: cout << " CG site flops = "<< CGsiteflops <<std::endl;
int iters;
time_t now;
time_t start = time(NULL);
UGrid->Broadcast(0,(void *)&start,sizeof(start));
FlightRecorder::ContinueOnFail = 0;
FlightRecorder::PrintEntireLog = 0;
@@ -162,9 +164,9 @@ int main (int argc, char ** argv)
}
std::cerr << "******************* SINGLE PRECISION SOLVE "<<iter<<std::endl;
result_o = Zero();
t1=usecond();
t1=usecond();
mCG(src_o,result_o);
t2=usecond();
t2=usecond();
iters = mCG.TotalInnerIterations; //Number of inner CG iterations
flops = MdagMsiteflops*4*FrbGrid->gSites()*iters;
flops+= CGsiteflops*FrbGrid->gSites()*iters;
@@ -176,7 +178,8 @@ int main (int argc, char ** argv)
std::cout << " FlightRecorder is OK! "<<std::endl;
iter ++;
} while (time(NULL) < (start + nsecs/10) );
now = time(NULL); UGrid->Broadcast(0,(void *)&now,sizeof(now));
} while (now < (start + nsecs/10) );
std::cout << GridLogMessage << "::::::::::::: Starting double precision CG" << std::endl;
ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
@@ -189,7 +192,7 @@ int main (int argc, char ** argv)
}
std::cerr << "******************* DOUBLE PRECISION SOLVE "<<i<<std::endl;
result_o_2 = Zero();
t1=usecond();
t1=usecond();
CG(HermOpEO,src_o,result_o_2);
t2=usecond();
iters = CG.IterationsToComplete;
@@ -201,8 +204,9 @@ int main (int argc, char ** argv)
std::cout << " DoublePrecision error count "<< FlightRecorder::ErrorCount()<<std::endl;
assert(FlightRecorder::ErrorCount()==0);
std::cout << " FlightRecorder is OK! "<<std::endl;
now = time(NULL); UGrid->Broadcast(0,(void *)&now,sizeof(now));
i++;
} while (time(NULL) < (start + nsecs) );
} while (now < (start + nsecs) );
LatticeFermionD diff_o(FrbGrid);
RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2);
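The change above appears to replace independent per-rank calls to time(NULL) in the loop-exit tests with a value broadcast from rank 0 (UGrid->Broadcast), so that every rank sees the same clock and performs the same number of solves. A minimal MPI-only sketch of the same pattern, offered as a hypothetical stand-alone example rather than Grid code:

#include <mpi.h>
#include <ctime>
#include <cstdio>

int main(int argc, char **argv)
{
  MPI_Init(&argc, &argv);
  int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  const long nsecs = 10;                  // wall-clock budget for the loop
  std::time_t start = std::time(nullptr);
  MPI_Bcast(&start, (int)sizeof(start), MPI_BYTE, 0, MPI_COMM_WORLD); // rank 0 owns the clock

  std::time_t now = start;
  int iterations = 0;
  do {
    // ... one unit of work (e.g. one solve) here ...
    iterations++;
    now = std::time(nullptr);
    MPI_Bcast(&now, (int)sizeof(now), MPI_BYTE, 0, MPI_COMM_WORLD);   // all ranks agree on "now"
  } while (now < start + nsecs);

  std::printf("rank %d did %d iterations\n", rank, iterations);
  MPI_Finalize();
  return 0;
}

Because every rank tests the broadcast time rather than its local clock, the ranks cannot drift apart in iteration count and deadlock in a collective.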

tests/debug/Test_8888.cc Normal file
View File

@@ -0,0 +1,118 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_general_coarse_hdcg.cc
Copyright (C) 2023
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h>
#include <Grid/algorithms/iterative/AdefMrhs.h>
using namespace std;
using namespace Grid;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
const int Ls=8;
const int nbasis = 40;
const int cb = 0 ;
RealD mass=0.01;
RealD M5=1.8;
RealD b=1.0;
RealD c=0.0;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
///////////////////////// RNGs /////////////////////////////////
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
std::vector<int> cseeds({5,6,7,8});
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
///////////////////////// Configuration /////////////////////////////////
LatticeGaugeField Umu(UGrid);
FieldMetaData header;
std::string file("ckpoint_EODWF_lat.125");
NerscIO::readConfiguration(Umu,header,file);
//////////////////////// Fermion action //////////////////////////////////
MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c);
MdagMLinearOperator<MobiusFermionD, LatticeFermion> HermOp(Ddwf);
std::cout << "**************************************"<<std::endl;
std::cout << " Fine Power method "<<std::endl;
std::cout << "**************************************"<<std::endl;
LatticeFermionD pm_src(FGrid);
pm_src = ComplexD(1.0);
PowerMethod<LatticeFermionD> fPM;
fPM(HermOp,pm_src);
std::cout << "**************************************"<<std::endl;
std::cout << " Fine Lanczos (poly, low) "<<std::endl;
std::cout << "**************************************"<<std::endl;
int Nk=80;
int Nm=Nk*3;
int Nstop=8;
int Nconv_test_interval=1;
// Chebyshev<LatticeFermionD> IRLChebyLo(0.2,64.0,201); // 1 iter
Chebyshev<LatticeFermionD> IRLChebyLo(0.0,55.0,101); // 1 iter
FunctionHermOp<LatticeFermionD> PolyOp(IRLChebyLo,HermOp);
PlainHermOp<LatticeFermionD> Op(HermOp);
ImplicitlyRestartedLanczos IRL(PolyOp,
Op,
Nk, // sought vecs
Nk, // sought vecs
Nm, // spare vecs
1.0e-8,
10 // Max iterations
);
int Nconv;
std::vector<RealD> eval(Nm);
std::vector<LatticeFermionD> evec(Nm,FGrid);
LatticeFermionD irl_src(FGrid);
IRL.calc(eval,evec,irl_src,Nconv);
Grid_finalize();
return 0;
}
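For orientation: the "Fine Power method" step in this test presumably estimates the largest eigenvalue of the MdagM operator by repeated application, which is what makes an upper window edge such as 55 (or the 64 in the commented-out alternative) a plausible bound for the Chebyshev filter handed to the Lanczos. As a reminder only, and not a statement about Grid's exact PowerMethod implementation, the standard iteration is

\[
x_{k+1} = \frac{H x_k}{\lVert H x_k \rVert},
\qquad
\lambda^{(k)} = \frac{x_k^\dagger H x_k}{x_k^\dagger x_k} \;\longrightarrow\; \lambda_{\max}(H),
\]

with a convergence rate set by the ratio of the two largest eigenvalues.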

View File

@@ -392,9 +392,27 @@ void TestCGschur(What & Ddwf,
GridParallelRNG *RNG5)
{
LatticeFermion src (FGrid); random(*RNG5,src);
LatticeFermion result(FGrid); result=Zero();
LatticeFermion result1(FGrid); result1=Zero();
LatticeFermion result2(FGrid); result2=Zero();
LatticeFermion result3(FGrid); result3=Zero();
ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
SchurRedBlackDiagMooeeSolve<LatticeFermion> SchurSolver(CG);
SchurSolver(Ddwf,src,result);
SchurSolver(Ddwf,src,result1);
SchurRedBlackDiagOneSolve<LatticeFermion> SchurSolverSymm1(CG);
SchurSolverSymm1(Ddwf,src,result2);
SchurRedBlackDiagTwoSolve<LatticeFermion> SchurSolverSymm2(CG);
SchurSolverSymm2(Ddwf,src,result3);
std::cout << GridLogMessage << " Standard " <<norm2(result1)<<std::endl;
std::cout << GridLogMessage << " Symm1 " <<norm2(result2)<<std::endl;
result2=result2-result1;
std::cout << GridLogMessage << " diff " <<norm2(result2) <<std::endl;
std::cout << GridLogMessage << " Symm2 " <<norm2(result3)<<std::endl;
result3=result3-result1;
std::cout << GridLogMessage << " diff " <<norm2(result3) <<std::endl;
}
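All three solvers exercised above attack the same linear system through an even-odd (red-black) decomposition and should reconstruct the same full solution, which is exactly what the printed norms and differences check; they appear to differ only in how the Schur complement is normalised and preconditioned. For reference, the standard block factorisation this relies on (a generic identity, not a statement about Grid internals) is

\[
M = \begin{pmatrix} M_{ee} & M_{eo} \\ M_{oe} & M_{oo} \end{pmatrix}
  = \begin{pmatrix} 1 & 0 \\ M_{oe} M_{ee}^{-1} & 1 \end{pmatrix}
    \begin{pmatrix} M_{ee} & 0 \\ 0 & S \end{pmatrix}
    \begin{pmatrix} 1 & M_{ee}^{-1} M_{eo} \\ 0 & 1 \end{pmatrix},
\qquad
S = M_{oo} - M_{oe} M_{ee}^{-1} M_{eo} ,
\]

so the expensive solve is performed for S on one checkerboard only and the other checkerboard is reconstructed afterwards.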

View File

@@ -244,7 +244,7 @@ int main (int argc, char ** argv)
GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi);
#if 0
MultiGeneralCoarsenedMatrix mrhs(LittleDiracOp,CoarseMrhs);
typedef decltype(mrhs) MultiGeneralCoarsenedMatrix_t;
@@ -307,7 +307,8 @@ int main (int argc, char ** argv)
rh_res= Zero();
mrhsCG(MrhsCoarseOp,rh_src,rh_res);
}
#endif
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -26,84 +26,13 @@ Author: Peter Boyle <pboyle@bnl.gov>
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/lattice/PaddedCell.h>
#include <Grid/stencil/GeneralLocalStencil.h>
//#include <Grid/algorithms/GeneralCoarsenedMatrix.h>
#include <Grid/algorithms/iterative/AdefGeneric.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h>
#include <Grid/algorithms/iterative/AdefMrhs.h>
using namespace std;
using namespace Grid;
template<class Coarsened>
void SaveOperator(Coarsened &Operator,std::string file)
{
#ifdef HAVE_LIME
emptyUserRecord record;
ScidacWriter WR(Operator.Grid()->IsBoss());
assert(Operator._A.size()==Operator.geom.npoint);
WR.open(file);
for(int p=0;p<Operator._A.size();p++){
auto tmp = Operator.Cell.Extract(Operator._A[p]);
WR.writeScidacFieldRecord(tmp,record);
}
WR.close();
#endif
}
template<class Coarsened>
void LoadOperator(Coarsened &Operator,std::string file)
{
#ifdef HAVE_LIME
emptyUserRecord record;
Grid::ScidacReader RD ;
RD.open(file);
assert(Operator._A.size()==Operator.geom.npoint);
for(int p=0;p<Operator.geom.npoint;p++){
conformable(Operator._A[p].Grid(),Operator.CoarseGrid());
RD.readScidacFieldRecord(Operator._A[p],record);
}
RD.close();
Operator.ExchangeCoarseLinks();
#endif
}
template<class aggregation>
void SaveBasis(aggregation &Agg,std::string file)
{
#ifdef HAVE_LIME
emptyUserRecord record;
ScidacWriter WR(Agg.FineGrid->IsBoss());
WR.open(file);
for(int b=0;b<Agg.subspace.size();b++){
WR.writeScidacFieldRecord(Agg.subspace[b],record);
}
WR.close();
#endif
}
template<class aggregation>
void LoadBasis(aggregation &Agg, std::string file)
{
#ifdef HAVE_LIME
emptyUserRecord record;
ScidacReader RD ;
RD.open(file);
for(int b=0;b<Agg.subspace.size();b++){
RD.readScidacFieldRecord(Agg.subspace[b],record);
}
RD.close();
#endif
}
template<class Field> class TestSolver : public LinearFunction<Field> {
public:
TestSolver() {};
void operator() (const Field &in, Field &out){ out = Zero(); }
};
RealD InverseApproximation(RealD x){
return 1.0/x;
}
// Want Op in CoarsenOp to call MatPcDagMatPc
template<class Field>
class HermOpAdaptor : public LinearOperatorBase<Field>
@@ -119,33 +48,37 @@ public:
void OpDirAll (const Field &in, std::vector<Field> &out) { assert(0); };
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
};
template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field>
template<class Field> class CGSmoother : public LinearFunction<Field>
{
public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator;
FineOperator & _SmootherOperator;
Chebyshev<Field> Cheby;
ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator) :
int iters;
CGSmoother(int _iters, FineOperator &SmootherOperator) :
_SmootherOperator(SmootherOperator),
Cheby(_lo,_hi,_ord,InverseApproximation)
iters(_iters)
{
std::cout << GridLogMessage<<" Chebyshev smoother order "<<_ord<<" ["<<_lo<<","<<_hi<<"]"<<std::endl;
std::cout << GridLogMessage<<" Mirs smoother order "<<iters<<std::endl;
};
void operator() (const Field &in, Field &out)
{
Field tmp(in.Grid());
tmp = in;
Cheby(_SmootherOperator,tmp,out);
ConjugateGradient<Field> CG(0.0,iters,false); // non-converge is just fine in a smoother
out=Zero();
CG(_SmootherOperator,in,out);
}
};
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
const int Ls=24;
const int nbasis = 40;
const int nbasis = 60;
const int cb = 0 ;
RealD mass=0.00078;
RealD M5=1.8;
@@ -160,10 +93,12 @@ int main (int argc, char ** argv)
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
// Construct a coarsened grid with 4^4 cell
Coordinate Block({4,4,4,4});
Coordinate clatt = GridDefaultLatt();
for(int d=0;d<clatt.size();d++){
clatt[d] = clatt[d]/4;
clatt[d] = clatt[d]/Block[d];
}
GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt,
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());;
@@ -182,7 +117,7 @@ int main (int argc, char ** argv)
LatticeGaugeField Umu(UGrid);
FieldMetaData header;
std::string file("ckpoint_lat.4000");
std::string file("ckpoint_EODWF_lat.125");
NerscIO::readConfiguration(Umu,header,file);
//////////////////////// Fermion action //////////////////////////////////
@@ -192,15 +127,7 @@ int main (int argc, char ** argv)
typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix;
HermFineMatrix FineHermOp(HermOpEO);
LatticeFermion result(FrbGrid); result=Zero();
LatticeFermion src(FrbGrid); random(RNG5,src);
// Run power method on FineHermOp
PowerMethod<LatticeFermion> PM; PM(HermOpEO,src);
////////////////////////////////////////////////////////////
///////////// Coarse basis and Little Dirac Operator ///////
////////////////////////////////////////////////////////////
@@ -208,219 +135,170 @@ int main (int argc, char ** argv)
typedef LittleDiracOperator::CoarseVector CoarseVector;
NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d);
NearestStencilGeometry5D geom_nn(Coarse5d);
// Warning: This routine calls PVdagM.Op, not PVdagM.HermOp
typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
Subspace Aggregates(Coarse5d,FrbGrid,cb);
////////////////////////////////////////////////////////////
// Need to check about red-black grid coarsening
////////////////////////////////////////////////////////////
LittleDiracOperator LittleDiracOp(geom,FrbGrid,Coarse5d);
bool load=false;
if ( load ) {
LoadBasis(Aggregates,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.scidac");
LoadOperator(LittleDiracOp,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.scidac");
} else {
Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,
95.0,0.1,
// 400,200,200 -- 48 iters
// 600,200,200 -- 38 iters, 162s
// 600,200,100 -- 38 iters, 169s
// 600,200,50 -- 88 iters. 370s
800,
200,
100,
0.0);
LittleDiracOp.CoarsenOperator(FineHermOp,Aggregates);
SaveBasis(Aggregates,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.scidac");
SaveOperator(LittleDiracOp,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.scidac");
}
// Try projecting to one hop only
LittleDiracOperator LittleDiracOpProj(geom_nn,FrbGrid,Coarse5d);
LittleDiracOpProj.ProjectNearestNeighbour(0.01,LittleDiracOp); // smaller shift 0.02? n
int refine=1;
// Aggregates.CreateSubspaceMultishift(RNG5,HermOpEO,
// 0.0003,1.0e-5,2000); // Lo, tol, maxit
// Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,95.,0.01,1500);// <== last run
std::cout << "**************************************"<<std::endl;
std::cout << "Create Subspace"<<std::endl;
std::cout << "**************************************"<<std::endl;
Aggregates.CreateSubspaceChebyshevNew(RNG5,HermOpEO,95.);
typedef HermitianLinearOperator<LittleDiracOperator,CoarseVector> HermMatrix;
HermMatrix CoarseOp (LittleDiracOp);
HermMatrix CoarseOpProj (LittleDiracOpProj);
std::cout << "**************************************"<<std::endl;
std::cout << "Refine Subspace"<<std::endl;
std::cout << "**************************************"<<std::endl;
Aggregates.RefineSubspace(HermOpEO,0.001,1.0e-3,3000); // 172 iters
//////////////////////////////////////////
// Build a coarse lanczos
//////////////////////////////////////////
Chebyshev<CoarseVector> IRLCheby(0.2,40.0,71); // 1 iter
FunctionHermOp<CoarseVector> IRLOpCheby(IRLCheby,CoarseOp);
PlainHermOp<CoarseVector> IRLOp (CoarseOp);
int Nk=48;
int Nm=64;
std::cout << "**************************************"<<std::endl;
std::cout << "Coarsen after refine"<<std::endl;
std::cout << "**************************************"<<std::endl;
Aggregates.Orthogonalise();
std::cout << "**************************************"<<std::endl;
std::cout << "Building MultiRHS Coarse operator"<<std::endl;
std::cout << "**************************************"<<std::endl;
ConjugateGradient<CoarseVector> coarseCG(4.0e-2,20000,true);
const int nrhs=12;
Coordinate mpi=GridDefaultMpi();
Coordinate rhMpi ({1,1,mpi[0],mpi[1],mpi[2],mpi[3]});
Coordinate rhLatt({nrhs,1,clatt[0],clatt[1],clatt[2],clatt[3]});
Coordinate rhSimd({vComplex::Nsimd(),1, 1,1,1,1});
GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi);
typedef MultiGeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> MultiGeneralCoarsenedMatrix_t;
MultiGeneralCoarsenedMatrix_t mrhs(geom,CoarseMrhs);
mrhs.CoarsenOperator(FineHermOp,Aggregates,Coarse5d);
std::cout << "**************************************"<<std::endl;
std::cout << " Coarse Lanczos "<<std::endl;
std::cout << "**************************************"<<std::endl;
typedef HermitianLinearOperator<MultiGeneralCoarsenedMatrix_t,CoarseVector> MrhsHermMatrix;
Chebyshev<CoarseVector> IRLCheby(0.01,42.0,301); // 1 iter
MrhsHermMatrix MrhsCoarseOp (mrhs);
CoarseVector pm_src(CoarseMrhs);
pm_src = ComplexD(1.0);
PowerMethod<CoarseVector> cPM;
cPM(MrhsCoarseOp,pm_src);
int Nk=192;
int Nm=384;
int Nstop=Nk;
ImplicitlyRestartedLanczos<CoarseVector> IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-5,20);
int Nconv_test_interval=1;
ImplicitlyRestartedBlockLanczosCoarse<CoarseVector> IRL(MrhsCoarseOp,
Coarse5d,
CoarseMrhs,
nrhs,
IRLCheby,
Nstop,
Nconv_test_interval,
nrhs,
Nk,
Nm,
1e-5,10);
int Nconv;
std::vector<RealD> eval(Nm);
std::vector<CoarseVector> evec(Nm,Coarse5d);
CoarseVector c_src(Coarse5d);
//c_src=1.0;
random(CRNG,c_src);
CoarseVector c_res(Coarse5d);
CoarseVector c_ref(Coarse5d);
PowerMethod<CoarseVector> cPM; cPM(CoarseOp,c_src);
IRL.calc(eval,evec,c_src,Nconv);
DeflatedGuesser<CoarseVector> DeflCoarseGuesser(evec,eval);
std::vector<CoarseVector> c_src(nrhs,Coarse5d);
//////////////////////////////////////////
// Build a coarse space solver
// Block projector for coarse/fine
//////////////////////////////////////////
int maxit=20000;
ConjugateGradient<CoarseVector> CG(1.0e-8,maxit,false);
ConjugateGradient<LatticeFermionD> CGfine(1.0e-8,10000,false);
ZeroGuesser<CoarseVector> CoarseZeroGuesser;
// HPDSolver<CoarseVector> HPDSolve(CoarseOp,CG,CoarseZeroGuesser);
HPDSolver<CoarseVector> HPDSolve(CoarseOp,CG,DeflCoarseGuesser);
c_res=Zero();
HPDSolve(c_src,c_res); c_ref = c_res;
std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
std::cout << GridLogMessage<<"ref norm "<<norm2(c_ref)<<std::endl;
//////////////////////////////////////////////////////////////////////////
// Deflated (with real op EV's) solve for the projected coarse op
// Work towards ADEF1 in the coarse space
//////////////////////////////////////////////////////////////////////////
HPDSolver<CoarseVector> HPDSolveProj(CoarseOpProj,CG,DeflCoarseGuesser);
c_res=Zero();
HPDSolveProj(c_src,c_res);
std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
std::cout << GridLogMessage<<"res norm "<<norm2(c_res)<<std::endl;
c_res = c_res - c_ref;
std::cout << "Projected solver error "<<norm2(c_res)<<std::endl;
std::cout << "**************************************"<<std::endl;
std::cout << "Calling mRHS HDCG"<<std::endl;
std::cout << "**************************************"<<std::endl;
MultiRHSBlockProject<LatticeFermionD> MrhsProjector;
MrhsProjector.Allocate(nbasis,FrbGrid,Coarse5d);
MrhsProjector.ImportBasis(Aggregates.subspace);
//////////////////////////////////////////////////////////////////////
// Coarse ADEF1 with deflation space
//////////////////////////////////////////////////////////////////////
ChebyshevSmoother<CoarseVector,HermMatrix >
CoarseSmoother(1.0,37.,8,CoarseOpProj); // just go to sloppy 0.1 convergence
// CoarseSmoother(0.1,37.,8,CoarseOpProj); //
// CoarseSmoother(0.5,37.,6,CoarseOpProj); // 8 iter 0.36s
// CoarseSmoother(0.5,37.,12,CoarseOpProj); // 8 iter, 0.55s
// CoarseSmoother(0.5,37.,8,CoarseOpProj);// 7-9 iter
// CoarseSmoother(1.0,37.,8,CoarseOpProj); // 0.4 - 0.5s solve to 0.04, 7-9 iter
// ChebyshevSmoother<CoarseVector,HermMatrix > CoarseSmoother(0.5,36.,10,CoarseOpProj); // 311
////////////////////////////////////////////////////////
// CG, Cheby mode spacing 200,200
// Unprojected Coarse CG solve to 1e-8 : 190 iters, 4.9s
// Unprojected Coarse CG solve to 4e-2 : 33 iters, 0.8s
// Projected Coarse CG solve to 1e-8 : 100 iters, 0.36s
////////////////////////////////////////////////////////
// CoarseSmoother(1.0,48.,8,CoarseOpProj); 48 evecs
////////////////////////////////////////////////////////
// ADEF1 Coarse solve to 1e-8 : 44 iters, 2.34s 2.1x gain
// ADEF1 Coarse solve to 4e-2 : 7 iters, 0.4s
// HDCG 38 iters 162s
//
// CoarseSmoother(1.0,40.,8,CoarseOpProj); 48 evecs
// ADEF1 Coarse solve to 1e-8 : 37 iters, 2.0s 2.1x gain
// ADEF1 Coarse solve to 4e-2 : 6 iters, 0.36s
// HDCG 38 iters 169s
TwoLevelADEF1defl<CoarseVector>
cADEF1(1.0e-8, 500,
CoarseOp,
CoarseSmoother,
evec,eval);
c_res=Zero();
cADEF1(c_src,c_res);
std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
std::cout << GridLogMessage<<"cADEF1 res norm "<<norm2(c_res)<<std::endl;
c_res = c_res - c_ref;
std::cout << "cADEF1 solver error "<<norm2(c_res)<<std::endl;
// cADEF1.Tolerance = 4.0e-2;
// cADEF1.Tolerance = 1.0e-1;
cADEF1.Tolerance = 5.0e-2;
c_res=Zero();
cADEF1(c_src,c_res);
std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
std::cout << GridLogMessage<<"cADEF1 res norm "<<norm2(c_res)<<std::endl;
c_res = c_res - c_ref;
std::cout << "cADEF1 solver error "<<norm2(c_res)<<std::endl;
//////////////////////////////////////////
// Build a smoother
//////////////////////////////////////////
// ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(10.0,100.0,10,FineHermOp); //499
// ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(3.0,100.0,10,FineHermOp); //383
// ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(1.0,100.0,10,FineHermOp); //328
// std::vector<RealD> los({0.5,1.0,3.0}); // 147/142/146 nbasis 1
// std::vector<RealD> los({1.0,2.0}); // Nbasis 24: 88,86 iterations
// std::vector<RealD> los({2.0,4.0}); // Nbasis 32 == 52, iters
// std::vector<RealD> los({2.0,4.0}); // Nbasis 40 == 36,36 iters
//
// Turns approx 2700 iterations into 340 fine multiplies with Nbasis 40
// Need to measure cost of coarse space.
//
// -- i) Reduce coarse residual -- 0.04
// -- ii) Lanczos on coarse space -- done
// -- iii) Possible 1 hop project and/or preconditioning it - easy - PrecCG it and
// use a limited stencil. Reread BFM code to check on evecs / deflation strategy with prec
//
std::vector<RealD> los({3.0}); // Nbasis 40 == 36,36 iters
// std::vector<int> ords({7,8,10}); // Nbasis 40 == 40,38,36 iters (320,342,396 mults)
std::vector<int> ords({7}); // Nbasis 40 == 40 iters (320 mults)
for(int l=0;l<los.size();l++){
RealD lo = los[l];
for(int o=0;o<ords.size();o++){
ConjugateGradient<CoarseVector> CGsloppy(4.0e-2,maxit,false);
HPDSolver<CoarseVector> HPDSolveSloppy(CoarseOp,CGsloppy,DeflCoarseGuesser);
// ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(lo,92,10,FineHermOp); // 36 best case
ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(lo,92,ords[o],FineHermOp); // 311
//////////////////////////////////////////
// Build a HDCG solver
//////////////////////////////////////////
TwoLevelADEF2<LatticeFermion,CoarseVector,Subspace>
HDCG(1.0e-8, 100,
FineHermOp,
Smoother,
HPDSolveSloppy,
HPDSolve,
Aggregates);
TwoLevelADEF2<LatticeFermion,CoarseVector,Subspace>
HDCGdefl(1.0e-8, 100,
FineHermOp,
Smoother,
cADEF1,
HPDSolve,
Aggregates);
result=Zero();
HDCGdefl(src,result);
result=Zero();
HDCG(src,result);
}
std::cout << "**************************************"<<std::endl;
std::cout << " Recompute coarse evecs "<<std::endl;
std::cout << "**************************************"<<std::endl;
evec.resize(Nm,Coarse5d);
eval.resize(Nm);
for(int r=0;r<nrhs;r++){
random(CRNG,c_src[r]);
}
// Standard CG
result=Zero();
CGfine(HermOpEO, src, result);
IRL.calc(eval,evec,c_src,Nconv,LanczosType::irbl);
///////////////////////
// Deflation guesser object
///////////////////////
std::cout << "**************************************"<<std::endl;
std::cout << " Reimport coarse evecs "<<std::endl;
std::cout << "**************************************"<<std::endl;
MultiRHSDeflation<CoarseVector> MrhsGuesser;
MrhsGuesser.ImportEigenBasis(evec,eval);
//////////////////////////
// Extra HDCG parameters
//////////////////////////
int maxit=3000;
ConjugateGradient<CoarseVector> CG(5.0e-2,maxit,false);
RealD lo=2.0;
int ord = 7;
DoNothingGuesser<CoarseVector> DoNothing;
HPDSolver<CoarseVector> HPDSolveMrhs(MrhsCoarseOp,CG,DoNothing);
/////////////////////////////////////////////////
// Mirs smoother
/////////////////////////////////////////////////
RealD MirsShift = lo;
ShiftedHermOpLinearOperator<LatticeFermionD> ShiftedFineHermOp(HermOpEO,MirsShift);
CGSmoother<LatticeFermionD> CGsmooth(ord,ShiftedFineHermOp) ;
TwoLevelADEF2mrhs<LatticeFermion,CoarseVector>
HDCGmrhs(1.0e-8, 500,
FineHermOp,
CGsmooth,
HPDSolveMrhs, // Used in M1
HPDSolveMrhs, // Used in Vstart
MrhsProjector,
MrhsGuesser,
CoarseMrhs);
std::vector<LatticeFermionD> src_mrhs(nrhs,FrbGrid);
std::vector<LatticeFermionD> res_mrhs(nrhs,FrbGrid);
for(int r=0;r<nrhs;r++){
random(RNG5,src_mrhs[r]);
res_mrhs[r]=Zero();
}
HDCGmrhs(src_mrhs,res_mrhs);
// Standard CG
#if 1
{
std::cout << "**************************************"<<std::endl;
std::cout << "Calling red black CG"<<std::endl;
std::cout << "**************************************"<<std::endl;
LatticeFermion result(FrbGrid); result=Zero();
LatticeFermion src(FrbGrid); random(RNG5,src);
result=Zero();
ConjugateGradient<LatticeFermionD> CGfine(1.0e-8,30000,false);
CGfine(HermOpEO, src, result);
}
#endif
Grid_finalize();
return 0;
}
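The main algorithmic change in this file is the swap of the Chebyshev polynomial smoother for the CGSmoother defined near the top: a fixed, small number of CG iterations (tolerance 0.0, so non-convergence is expected and accepted, as the comment says) applied to a shifted fine operator. Assuming ShiftedHermOpLinearOperator applies \(\hat H + \lambda\), as its name suggests, with \(\hat H\) the even-odd preconditioned \(M^\dagger M\) operator (HermOpEO), the "Mirs" smoother is roughly a truncated evaluation of

\[
x \;\approx\; (\hat H + \lambda)^{-1} b, \qquad \lambda = \mathrm{MirsShift} = 2.0 ,
\]

cut off after ord = 7 CG iterations. The shift bounds the condition number seen inside the smoother, so a handful of iterations suffices to damp the high-mode error, while the low-mode error is left to the deflated coarse-grid correction inside TwoLevelADEF2mrhs.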

View File

@@ -145,7 +145,7 @@ int main (int argc, char ** argv)
Grid_init(&argc,&argv);
const int Ls=24;
const int nbasis = 60;
const int nbasis = 62;
const int cb = 0 ;
RealD mass=0.00078;
RealD M5=1.8;
@@ -160,7 +160,7 @@ GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
// Construct a coarsened grid with 4^4 cell
Coordinate Block({4,4,4,4});
Coordinate Block({4,4,6,4});
Coordinate clatt = GridDefaultLatt();
for(int d=0;d<clatt.size();d++){
clatt[d] = clatt[d]/Block[d];

View File

@@ -0,0 +1,396 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_general_coarse_hdcg.cc
Copyright (C) 2023
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h>
#include <Grid/algorithms/iterative/AdefMrhs.h>
using namespace std;
using namespace Grid;
template<class aggregation>
void SaveBasis(aggregation &Agg,std::string file)
{
#ifdef HAVE_LIME
emptyUserRecord record;
ScidacWriter WR(Agg.FineGrid->IsBoss());
WR.open(file);
for(int b=0;b<Agg.subspace.size();b++){
WR.writeScidacFieldRecord(Agg.subspace[b],record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
// WR.writeScidacFieldRecord(Agg.subspace[b],record);
}
WR.close();
#endif
}
template<class aggregation>
void LoadBasis(aggregation &Agg, std::string file)
{
#ifdef HAVE_LIME
emptyUserRecord record;
ScidacReader RD ;
RD.open(file);
for(int b=0;b<Agg.subspace.size();b++){
RD.readScidacFieldRecord(Agg.subspace[b],record,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
// RD.readScidacFieldRecord(Agg.subspace[b],record,0);
}
RD.close();
#endif
}
template<class CoarseVector>
void SaveEigenvectors(std::vector<RealD> &eval,
std::vector<CoarseVector> &evec,
std::string evec_file,
std::string eval_file)
{
#ifdef HAVE_LIME
emptyUserRecord record;
ScidacWriter WR(evec[0].Grid()->IsBoss());
WR.open(evec_file);
for(int b=0;b<evec.size();b++){
WR.writeScidacFieldRecord(evec[b],record,0,0);
}
WR.close();
XmlWriter WRx(eval_file);
write(WRx,"evals",eval);
#endif
}
template<class CoarseVector>
void LoadEigenvectors(std::vector<RealD> &eval,
std::vector<CoarseVector> &evec,
std::string evec_file,
std::string eval_file)
{
#ifdef HAVE_LIME
XmlReader RDx(eval_file);
read(RDx,"evals",eval);
emptyUserRecord record;
Grid::ScidacReader RD ;
RD.open(evec_file);
assert(evec.size()==eval.size());
for(int k=0;k<eval.size();k++) {
RD.readScidacFieldRecord(evec[k],record);
}
RD.close();
#endif
}
// Want Op in CoarsenOp to call MatPcDagMatPc
template<class Field>
class HermOpAdaptor : public LinearOperatorBase<Field>
{
LinearOperatorBase<Field> & wrapped;
public:
HermOpAdaptor(LinearOperatorBase<Field> &wrapme) : wrapped(wrapme) {};
void Op (const Field &in, Field &out) { wrapped.HermOp(in,out); }
void HermOp(const Field &in, Field &out) { wrapped.HermOp(in,out); }
void AdjOp (const Field &in, Field &out){ wrapped.HermOp(in,out); }
void OpDiag (const Field &in, Field &out) { assert(0); }
void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out) { assert(0); };
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
};
template<class Field> class CGSmoother : public LinearFunction<Field>
{
public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator;
FineOperator & _SmootherOperator;
int iters;
CGSmoother(int _iters, FineOperator &SmootherOperator) :
_SmootherOperator(SmootherOperator),
iters(_iters)
{
std::cout << GridLogMessage<<" Mirs smoother order "<<iters<<std::endl;
};
void operator() (const Field &in, Field &out)
{
ConjugateGradient<Field> CG(0.0,iters,false); // non-converge is just fine in a smoother
out=Zero();
CG(_SmootherOperator,in,out);
}
};
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
const int Ls=24;
const int nbasis = 60;
const int cb = 0 ;
RealD mass=0.00078;
RealD M5=1.8;
RealD b=1.5;
RealD c=0.5;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
// Construct a coarsened grid with 4^4 cell
// Coordinate Block({4,4,6,4});
Coordinate Block({4,4,4,4});
Coordinate clatt = GridDefaultLatt();
for(int d=0;d<clatt.size();d++){
clatt[d] = clatt[d]/Block[d];
}
GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt,
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());;
GridCartesian *Coarse5d = SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
///////////////////////// RNGs /////////////////////////////////
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
std::vector<int> cseeds({5,6,7,8});
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);
///////////////////////// Configuration /////////////////////////////////
LatticeGaugeField Umu(UGrid);
FieldMetaData header;
std::string file("/lustre/orion/phy157/proj-shared/phy157_dwf/lehner/ensemble-Ha/ckpoint_lat.2250");
NerscIO::readConfiguration(Umu,header,file);
//////////////////////// Fermion action //////////////////////////////////
MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c);
SchurDiagMooeeOperator<MobiusFermionD, LatticeFermion> HermOpEO(Ddwf);
typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix;
HermFineMatrix FineHermOp(HermOpEO);
////////////////////////////////////////////////////////////
///////////// Coarse basis and Little Dirac Operator ///////
////////////////////////////////////////////////////////////
typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
typedef LittleDiracOperator::CoarseVector CoarseVector;
NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d);
typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
Subspace Aggregates(Coarse5d,FrbGrid,cb);
////////////////////////////////////////////////////////////
// Need to check about red-black grid coarsening
////////////////////////////////////////////////////////////
// std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys96.mixed.2500.60");
std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Refine.phys96.mixed.2500.60");
std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Refine.phys96.mixed.2500.60_v2");
std::string ldop_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.phys96.mixed.60");
std::string evec_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/evecs.scidac");
std::string eval_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/eval.xml");
bool load_agg=true;
bool load_refine=true;
bool load_mat=false;
bool load_evec=false;
int refine=1;
if ( load_agg ) {
if ( !(refine) || (!load_refine) ) {
LoadBasis(Aggregates,subspace_file);
}
} else {
Aggregates.CreateSubspaceChebyshevNew(RNG5,HermOpEO,95.);
SaveBasis(Aggregates,subspace_file);
}
if ( load_refine ) {
std::cout << " Load Refine "<< refine_file <<std::endl;
LoadBasis(Aggregates,refine_file);
} else {
Aggregates.RefineSubspace(HermOpEO,0.001,3.0e-4,3000); // 172 iters
// Aggregates.RefineSubspace(HermOpEO,0.001,3.0e-4,2500); // 172 iters
SaveBasis(Aggregates,refine_file);
}
Aggregates.Orthogonalise();
std::cout << "**************************************"<<std::endl;
std::cout << "Building MultiRHS Coarse operator"<<std::endl;
std::cout << "**************************************"<<std::endl;
const int nrhs=12;
Coordinate mpi=GridDefaultMpi();
Coordinate rhMpi ({1,1,mpi[0],mpi[1],mpi[2],mpi[3]});
Coordinate rhLatt({nrhs,1,clatt[0],clatt[1],clatt[2],clatt[3]});
Coordinate rhSimd({vComplex::Nsimd(),1, 1,1,1,1});
GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi);
typedef MultiGeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> MultiGeneralCoarsenedMatrix_t;
MultiGeneralCoarsenedMatrix_t mrhs(geom,CoarseMrhs);
///////////////////////
// Deflation guesser object
///////////////////////
MultiRHSDeflation<CoarseVector> MrhsGuesser;
//////////////////////////////////////////
// Block projector for coarse/fine
//////////////////////////////////////////
MultiRHSBlockProject<LatticeFermionD> MrhsProjector;
std::cout << "**************************************"<<std::endl;
std::cout << "Coarsen after refine"<<std::endl;
std::cout << "**************************************"<<std::endl;
mrhs.CoarsenOperator(FineHermOp,Aggregates,Coarse5d);
std::cout << "**************************************"<<std::endl;
std::cout << " Coarse Lanczos "<<std::endl;
std::cout << "**************************************"<<std::endl;
typedef HermitianLinearOperator<MultiGeneralCoarsenedMatrix_t,CoarseVector> MrhsHermMatrix;
// Chebyshev<CoarseVector> IRLCheby(0.0012,42.0,301); // 4.4.6.4
// Chebyshev<CoarseVector> IRLCheby(0.0012,42.0,501); // for 4.4.4.4 blocking 350 evs
// Chebyshev<CoarseVector> IRLCheby(0.0014,42.0,501); // for 4.4.4.4 blocking 700 evs
// Chebyshev<CoarseVector> IRLCheby(0.002,42.0,501); // for 4.4.4.4 blocking 1226 evs
// Chebyshev<CoarseVector> IRLCheby(0.0025,42.0,501); // for 4.4.4.4 blocking 1059 evs
// 3e-4,2);
Chebyshev<CoarseVector> IRLCheby(0.0018,42.0,301); // for 4.4.4.4 blocking // 790 evs
MrhsHermMatrix MrhsCoarseOp (mrhs);
CoarseVector pm_src(CoarseMrhs);
pm_src = ComplexD(1.0);
PowerMethod<CoarseVector> cPM; cPM(MrhsCoarseOp,pm_src);
// int Nk=nrhs*30; // 4.4.6.4
// int Nk=nrhs*80;
int Nk=nrhs*60; // 720
int Nm=Nk*4; // 2880 ; generally finishes at 1440
int Nstop=512;
int Nconv_test_interval=1;
ImplicitlyRestartedBlockLanczosCoarse<CoarseVector> IRL(MrhsCoarseOp,
Coarse5d,
CoarseMrhs,
nrhs,
IRLCheby,
Nstop,
Nconv_test_interval,
nrhs,
Nk,
Nm,
3e-4,2);
std::vector<RealD> eval(Nm);
std::vector<CoarseVector> evec(Nm,Coarse5d);
std::vector<CoarseVector> c_src(nrhs,Coarse5d);
std::cout << "**************************************"<<std::endl;
std::cout << " Recompute coarse evecs "<<std::endl;
std::cout << "**************************************"<<std::endl;
evec.resize(Nm,Coarse5d);
eval.resize(Nm);
for(int r=0;r<nrhs;r++){
random(CRNG,c_src[r]);
}
int Nconv;
IRL.calc(eval,evec,c_src,Nconv,LanczosType::rbl);
Nconv = eval.size();
std::cout << "**************************************"<<std::endl;
std::cout << " import coarse evecs "<<std::endl;
std::cout << "**************************************"<<std::endl;
MrhsGuesser.ImportEigenBasis(evec,eval);
std::cout << "**************************************"<<std::endl;
std::cout << "Calling mRHS HDCG"<<std::endl;
std::cout << "**************************************"<<std::endl;
MrhsProjector.Allocate(nbasis,FrbGrid,Coarse5d);
MrhsProjector.ImportBasis(Aggregates.subspace);
//////////////////////////
// Extra HDCG parameters
//////////////////////////
int maxit=3000;
ConjugateGradient<CoarseVector> CG(7.5e-2,maxit,false);
RealD lo=2.0;
int ord = 7;
DoNothingGuesser<CoarseVector> DoNothing;
HPDSolver<CoarseVector> HPDSolveMrhs(MrhsCoarseOp,CG,DoNothing);
HPDSolver<CoarseVector> HPDSolveMrhsRefine(MrhsCoarseOp,CG,DoNothing);
/////////////////////////////////////////////////
// Mirs smoother
/////////////////////////////////////////////////
RealD MirsShift = lo;
ShiftedHermOpLinearOperator<LatticeFermionD> ShiftedFineHermOp(HermOpEO,MirsShift);
CGSmoother<LatticeFermionD> CGsmooth(ord,ShiftedFineHermOp) ;
TwoLevelADEF2mrhs<LatticeFermion,CoarseVector>
HDCGmrhs(1.0e-8, 500,
FineHermOp,
CGsmooth,
HPDSolveMrhs, // Used in M1
HPDSolveMrhs, // Used in Vstart
MrhsProjector,
MrhsGuesser,
CoarseMrhs);
std::vector<LatticeFermionD> src_mrhs(nrhs,FrbGrid);
std::vector<LatticeFermionD> res_mrhs(nrhs,FrbGrid);
for(int r=0;r<nrhs;r++){
random(RNG5,src_mrhs[r]);
res_mrhs[r]=Zero();
}
HDCGmrhs(src_mrhs,res_mrhs);
// Standard CG
#if 0
{
std::cout << "**************************************"<<std::endl;
std::cout << "Calling red black CG"<<std::endl;
std::cout << "**************************************"<<std::endl;
LatticeFermion result(FrbGrid); result=Zero();
LatticeFermion src(FrbGrid); random(RNG5,src);
result=Zero();
ConjugateGradient<LatticeFermionD> CGfine(1.0e-8,30000,false);
CGfine(HermOpEO, src, result);
}
#endif
Grid_finalize();
return 0;
}