mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-14 13:57:07 +01:00
Compare commits
45 Commits
fb9b1d76ca
...
feature/ft
Author | SHA1 | Date | |
---|---|---|---|
bffd30abec | |||
da919949f9 | |||
b12b4fdaff | |||
557fa483ff | |||
fc15d55df6 | |||
53573d7d94 | |||
bb3c177000 | |||
a3322b470f | |||
f8f408e7a9 | |||
baac1127d0 | |||
6f1328160c | |||
04cf902791 | |||
7a5b1c1a19 | |||
18d2d7da4a | |||
b461184797 | |||
4563b39305 | |||
c9d5674d5b | |||
486412635a | |||
8b23a1546a | |||
a901e4e369 | |||
804d9367d4 | |||
41d8adca95 | |||
059e8e5bb0 | |||
b3ee8ded96 | |||
cf3584ad15 | |||
a66973163f | |||
4502a8c8a1 | |||
9c902e4c2d | |||
f3eb36adcf | |||
7c246606c1 | |||
172c75029e | |||
6ae52da571 | |||
4ee9c68053 | |||
a15b4378a3 | |||
89fdd7f8dd | |||
c328be24b7 | |||
a73dc6dbf4 | |||
eee2a2657f | |||
12b8be7cb9 | |||
63c223ea5d | |||
2877fb4a2c | |||
d299c86633 | |||
6ce52092e8 | |||
b5926c1d21 | |||
9563238e9b |
1052
BLAS_benchmark/BatchBlasBench.cc
Normal file
1052
BLAS_benchmark/BatchBlasBench.cc
Normal file
File diff suppressed because it is too large
Load Diff
2
BLAS_benchmark/compile-command
Normal file
2
BLAS_benchmark/compile-command
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
|
||||||
|
mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench
|
@ -30,9 +30,14 @@ directory
|
|||||||
|
|
||||||
#include <type_traits>
|
#include <type_traits>
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
|
#include <exception>
|
||||||
|
|
||||||
#define NAMESPACE_BEGIN(A) namespace A {
|
#define NAMESPACE_BEGIN(A) namespace A {
|
||||||
#define NAMESPACE_END(A) }
|
#define NAMESPACE_END(A) }
|
||||||
#define GRID_NAMESPACE_BEGIN NAMESPACE_BEGIN(Grid)
|
#define GRID_NAMESPACE_BEGIN NAMESPACE_BEGIN(Grid)
|
||||||
#define GRID_NAMESPACE_END NAMESPACE_END(Grid)
|
#define GRID_NAMESPACE_END NAMESPACE_END(Grid)
|
||||||
#define NAMESPACE_CHECK(x) struct namespaceTEST##x {}; static_assert(std::is_same<namespaceTEST##x, ::namespaceTEST##x>::value,"Not in :: at" );
|
#define NAMESPACE_CHECK(x) struct namespaceTEST##x {}; static_assert(std::is_same<namespaceTEST##x, ::namespaceTEST##x>::value,"Not in :: at" );
|
||||||
|
|
||||||
|
#define EXCEPTION_CHECK_BEGIN(A) try {
|
||||||
|
#define EXCEPTION_CHECK_END(A) } catch ( std::exception e ) { BACKTRACEFP(stderr); std::cerr << __PRETTY_FUNCTION__ << " : " <<__LINE__<< " Caught exception "<<e.what()<<std::endl; throw; }
|
||||||
|
|
||||||
|
@ -89,9 +89,10 @@ public:
|
|||||||
gridblasHandle = theGridAccelerator;
|
gridblasHandle = theGridAccelerator;
|
||||||
#endif
|
#endif
|
||||||
#ifdef GRID_ONE_MKL
|
#ifdef GRID_ONE_MKL
|
||||||
cl::sycl::cpu_selector selector;
|
cl::sycl::gpu_selector selector;
|
||||||
cl::sycl::device selectedDevice { selector };
|
cl::sycl::device selectedDevice { selector };
|
||||||
gridblasHandle =new sycl::queue (selectedDevice);
|
cl::sycl::property_list q_prop{cl::sycl::property::queue::in_order()};
|
||||||
|
gridblasHandle =new sycl::queue (selectedDevice,q_prop);
|
||||||
#endif
|
#endif
|
||||||
gridblasInit=1;
|
gridblasInit=1;
|
||||||
}
|
}
|
||||||
@ -207,6 +208,9 @@ public:
|
|||||||
assert(Bkn.size()==batchCount);
|
assert(Bkn.size()==batchCount);
|
||||||
assert(Cmn.size()==batchCount);
|
assert(Cmn.size()==batchCount);
|
||||||
|
|
||||||
|
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
|
||||||
|
assert(OpB!=GridBLAS_OP_T);
|
||||||
|
|
||||||
int lda = m; // m x k column major
|
int lda = m; // m x k column major
|
||||||
int ldb = k; // k x n column major
|
int ldb = k; // k x n column major
|
||||||
int ldc = m; // m x b column major
|
int ldc = m; // m x b column major
|
||||||
@ -266,26 +270,130 @@ public:
|
|||||||
assert(err==CUBLAS_STATUS_SUCCESS);
|
assert(err==CUBLAS_STATUS_SUCCESS);
|
||||||
#endif
|
#endif
|
||||||
#ifdef GRID_SYCL
|
#ifdef GRID_SYCL
|
||||||
//MKL’s cblas_<T>gemm_batch & OneAPI
|
int64_t m64=m;
|
||||||
#warning "oneMKL implementation not built "
|
int64_t n64=n;
|
||||||
#endif
|
int64_t k64=k;
|
||||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
int64_t lda64=lda;
|
||||||
// Need a default/reference implementation
|
int64_t ldb64=ldb;
|
||||||
int sda = lda*k;
|
int64_t ldc64=ldc;
|
||||||
int sdb = ldb*k;
|
int64_t batchCount64=batchCount;
|
||||||
int sdc = ldc*n;
|
|
||||||
for (int p = 0; p < batchCount; ++p) {
|
oneapi::mkl::transpose iOpA;
|
||||||
|
oneapi::mkl::transpose iOpB;
|
||||||
|
|
||||||
|
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
|
||||||
|
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
|
||||||
|
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
|
||||||
|
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
|
||||||
|
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
|
||||||
|
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
|
||||||
|
|
||||||
|
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
|
||||||
|
&iOpA,
|
||||||
|
&iOpB,
|
||||||
|
&m64,&n64,&k64,
|
||||||
|
(ComplexD *) &alpha_p[0],
|
||||||
|
(const ComplexD **)&Amk[0], (const int64_t *)&lda64,
|
||||||
|
(const ComplexD **)&Bkn[0], (const int64_t *)&ldb64,
|
||||||
|
(ComplexD *) &beta_p[0],
|
||||||
|
(ComplexD **)&Cmn[0], (const int64_t *)&ldc64,
|
||||||
|
(int64_t)1,&batchCount64,std::vector<sycl::event>());
|
||||||
|
synchronise();
|
||||||
|
#if 0
|
||||||
|
// This code was used to check the mat mul on Sunspot/OneMKL
|
||||||
|
std::cerr << " Called SYCL batched ZGEMM OpA "<< OpA << " OpB "<<OpB <<std::endl;
|
||||||
|
std::vector<ComplexD> A(m*k); // pointer list to matrices
|
||||||
|
std::vector<ComplexD> B(k*n);
|
||||||
|
std::vector<ComplexD> C(m*n);
|
||||||
|
// int sda = lda*k;
|
||||||
|
// int sdb = ldb*k;
|
||||||
|
// int sdc = ldc*n;
|
||||||
|
std::cerr << " Checking the GEMM results "<<std::endl;
|
||||||
|
for (int p = 0; p < 1; ++p) {
|
||||||
|
ComplexD * Amk_p; // pointer list to matrices
|
||||||
|
ComplexD * Bkn_p; // pointer list to matrices
|
||||||
|
ComplexD * Cmn_p; // pointer list to matrices
|
||||||
|
acceleratorCopyFromDevice((void *)&Amk[p],(void *)&Amk_p,sizeof(ComplexD*));
|
||||||
|
acceleratorCopyFromDevice((void *)&Bkn[p],(void *)&Bkn_p,sizeof(ComplexD*));
|
||||||
|
acceleratorCopyFromDevice((void *)&Cmn[p],(void *)&Cmn_p,sizeof(ComplexD*));
|
||||||
|
std::cerr << " p " << p << " copied pointers "<<std::endl;
|
||||||
|
acceleratorCopyFromDevice((void *)Amk_p,(void *)&A[0],m*k*sizeof(ComplexD));
|
||||||
|
acceleratorCopyFromDevice((void *)Bkn_p,(void *)&B[0],k*n*sizeof(ComplexD));
|
||||||
|
acceleratorCopyFromDevice((void *)Cmn_p,(void *)&C[0],m*n*sizeof(ComplexD));
|
||||||
|
std::cerr << " p " << p << " copied matrices "<<std::endl;
|
||||||
|
std::cerr << " C[0] "<<C[0]<<std::endl;
|
||||||
|
std::cerr << " A[0] "<<A[0]<<std::endl;
|
||||||
|
std::cerr << " B[0] "<<B[0]<<std::endl;
|
||||||
|
std::cerr << " m "<<m<<std::endl;
|
||||||
|
std::cerr << " n "<<n<<std::endl;
|
||||||
|
std::cerr << " k "<<k<<std::endl;
|
||||||
for (int mm = 0; mm < m; ++mm) {
|
for (int mm = 0; mm < m; ++mm) {
|
||||||
for (int nn = 0; nn < n; ++nn) {
|
for (int nn = 0; nn < n; ++nn) {
|
||||||
ComplexD c_mn(0.0);
|
ComplexD c_mn(0.0);
|
||||||
for (int kk = 0; kk < k; ++kk)
|
for (int kk = 0; kk < k; ++kk) {
|
||||||
c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
|
int idx_a, idx_b;
|
||||||
Cmn[p][mm + nn*ldc] = (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
|
// int lda = m; // m x k column major
|
||||||
|
// int ldb = k; // k x n column major
|
||||||
|
// int ldc = m; // m x b column major
|
||||||
|
if(OpA!=GridBLAS_OP_N) {
|
||||||
|
idx_a =kk + mm*lda;
|
||||||
|
} else {
|
||||||
|
idx_a =mm + kk*lda;
|
||||||
|
}
|
||||||
|
if(OpB!=GridBLAS_OP_N) {
|
||||||
|
idx_b =nn + kk*ldb;
|
||||||
|
} else {
|
||||||
|
idx_b =kk + nn*ldb;
|
||||||
|
}
|
||||||
|
// std::cerr << " idx_a "<<idx_a<<" idx_b "<<idx_b<<std::endl;
|
||||||
|
|
||||||
|
ComplexD Ac = A[idx_a];
|
||||||
|
ComplexD Bc = B[idx_b];
|
||||||
|
if(OpA==GridBLAS_OP_C) Ac = conjugate(Ac);
|
||||||
|
if(OpB==GridBLAS_OP_C) Bc = conjugate(Bc);
|
||||||
|
|
||||||
|
c_mn += Ac*Bc;
|
||||||
|
}
|
||||||
|
std::cerr << " beta "<<beta<<" alpha "<<alpha<<" C_"<<mm<<","<<nn<<" "<<c_mn<<" "<<C[mm + nn*ldc]<<std::endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
// synchronise();
|
#endif
|
||||||
|
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
||||||
|
// Need a default/reference implementation; use Eigen
|
||||||
|
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
|
||||||
|
thread_for (p, batchCount, {
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||||
|
});
|
||||||
|
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
|
||||||
|
thread_for (p, batchCount, {
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
|
||||||
|
});
|
||||||
|
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
|
||||||
|
thread_for (p, batchCount, {
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
|
||||||
|
});
|
||||||
|
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
|
||||||
|
thread_for (p, batchCount, {
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||||
|
} );
|
||||||
|
} else {
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
RealD t1=usecond();
|
RealD t1=usecond();
|
||||||
RealD flops = 8.0*m*n*k*batchCount;
|
RealD flops = 8.0*m*n*k*batchCount;
|
||||||
RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount;
|
RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount;
|
||||||
@ -306,6 +414,9 @@ public:
|
|||||||
RealD t2=usecond();
|
RealD t2=usecond();
|
||||||
int32_t batchCount = Amk.size();
|
int32_t batchCount = Amk.size();
|
||||||
|
|
||||||
|
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
|
||||||
|
assert(OpB!=GridBLAS_OP_T);
|
||||||
|
|
||||||
int lda = m; // m x k column major
|
int lda = m; // m x k column major
|
||||||
int ldb = k; // k x n column major
|
int ldb = k; // k x n column major
|
||||||
int ldc = m; // m x b column major
|
int ldc = m; // m x b column major
|
||||||
@ -366,25 +477,68 @@ public:
|
|||||||
assert(err==CUBLAS_STATUS_SUCCESS);
|
assert(err==CUBLAS_STATUS_SUCCESS);
|
||||||
#endif
|
#endif
|
||||||
#ifdef GRID_SYCL
|
#ifdef GRID_SYCL
|
||||||
//MKL’s cblas_<T>gemm_batch & OneAPI
|
int64_t m64=m;
|
||||||
#warning "oneMKL implementation not built "
|
int64_t n64=n;
|
||||||
|
int64_t k64=k;
|
||||||
|
int64_t lda64=lda;
|
||||||
|
int64_t ldb64=ldb;
|
||||||
|
int64_t ldc64=ldc;
|
||||||
|
int64_t batchCount64=batchCount;
|
||||||
|
|
||||||
|
oneapi::mkl::transpose iOpA;
|
||||||
|
oneapi::mkl::transpose iOpB;
|
||||||
|
|
||||||
|
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
|
||||||
|
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
|
||||||
|
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
|
||||||
|
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
|
||||||
|
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
|
||||||
|
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
|
||||||
|
|
||||||
|
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
|
||||||
|
&iOpA,
|
||||||
|
&iOpB,
|
||||||
|
&m64,&n64,&k64,
|
||||||
|
(ComplexF *) &alpha_p[0],
|
||||||
|
(const ComplexF **)&Amk[0], (const int64_t *)&lda64,
|
||||||
|
(const ComplexF **)&Bkn[0], (const int64_t *)&ldb64,
|
||||||
|
(ComplexF *) &beta_p[0],
|
||||||
|
(ComplexF **)&Cmn[0], (const int64_t *)&ldc64,
|
||||||
|
(int64_t)1,&batchCount64,std::vector<sycl::event>());
|
||||||
|
synchronise();
|
||||||
#endif
|
#endif
|
||||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
||||||
int sda = lda*k;
|
// Need a default/reference implementation; use Eigen
|
||||||
int sdb = ldb*k;
|
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
|
||||||
int sdc = ldc*n;
|
thread_for (p, batchCount, {
|
||||||
ComplexF alphaf(real(alpha),imag(alpha));
|
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
|
||||||
ComplexF betaf(real(beta),imag(beta));
|
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
|
||||||
// Need a default/reference implementation
|
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||||
for (int p = 0; p < batchCount; ++p) {
|
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||||
for (int mm = 0; mm < m; ++mm) {
|
});
|
||||||
for (int nn = 0; nn < n; ++nn) {
|
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
|
||||||
ComplexF c_mn(0.0);
|
thread_for (p, batchCount, {
|
||||||
for (int kk = 0; kk < k; ++kk)
|
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
|
||||||
c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
|
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
|
||||||
Cmn[p][mm + nn*ldc] = (alphaf)*c_mn + (betaf)*Cmn[p][mm + nn*ldc ];
|
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||||
}
|
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
|
||||||
}
|
});
|
||||||
|
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
|
||||||
|
thread_for (p, batchCount, {
|
||||||
|
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
|
||||||
|
});
|
||||||
|
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
|
||||||
|
thread_for (p, batchCount, {
|
||||||
|
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
|
||||||
|
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||||
|
} );
|
||||||
|
} else {
|
||||||
|
assert(0);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
RealD t1=usecond();
|
RealD t1=usecond();
|
||||||
@ -408,6 +562,9 @@ public:
|
|||||||
RealD t2=usecond();
|
RealD t2=usecond();
|
||||||
int32_t batchCount = Amk.size();
|
int32_t batchCount = Amk.size();
|
||||||
|
|
||||||
|
assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
|
||||||
|
assert(OpB!=GridBLAS_OP_C);
|
||||||
|
|
||||||
int lda = m; // m x k column major
|
int lda = m; // m x k column major
|
||||||
int ldb = k; // k x n column major
|
int ldb = k; // k x n column major
|
||||||
int ldc = m; // m x b column major
|
int ldc = m; // m x b column major
|
||||||
@ -467,23 +624,68 @@ public:
|
|||||||
assert(err==CUBLAS_STATUS_SUCCESS);
|
assert(err==CUBLAS_STATUS_SUCCESS);
|
||||||
#endif
|
#endif
|
||||||
#ifdef GRID_SYCL
|
#ifdef GRID_SYCL
|
||||||
//MKL’s cblas_<T>gemm_batch & OneAPI
|
int64_t m64=m;
|
||||||
#warning "oneMKL implementation not built "
|
int64_t n64=n;
|
||||||
|
int64_t k64=k;
|
||||||
|
int64_t lda64=lda;
|
||||||
|
int64_t ldb64=ldb;
|
||||||
|
int64_t ldc64=ldc;
|
||||||
|
int64_t batchCount64=batchCount;
|
||||||
|
|
||||||
|
oneapi::mkl::transpose iOpA;
|
||||||
|
oneapi::mkl::transpose iOpB;
|
||||||
|
|
||||||
|
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
|
||||||
|
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
|
||||||
|
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
|
||||||
|
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
|
||||||
|
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
|
||||||
|
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
|
||||||
|
|
||||||
|
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
|
||||||
|
&iOpA,
|
||||||
|
&iOpB,
|
||||||
|
&m64,&n64,&k64,
|
||||||
|
(float *) &alpha_p[0],
|
||||||
|
(const float **)&Amk[0], (const int64_t *)&lda64,
|
||||||
|
(const float **)&Bkn[0], (const int64_t *)&ldb64,
|
||||||
|
(float *) &beta_p[0],
|
||||||
|
(float **)&Cmn[0], (const int64_t *)&ldc64,
|
||||||
|
(int64_t)1,&batchCount64,std::vector<sycl::event>());
|
||||||
|
synchronise();
|
||||||
#endif
|
#endif
|
||||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
||||||
int sda = lda*k;
|
// Need a default/reference implementation; use Eigen
|
||||||
int sdb = ldb*k;
|
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
|
||||||
int sdc = ldc*n;
|
thread_for (p, batchCount, {
|
||||||
// Need a default/reference implementation
|
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
|
||||||
for (int p = 0; p < batchCount; ++p) {
|
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
|
||||||
for (int mm = 0; mm < m; ++mm) {
|
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||||
for (int nn = 0; nn < n; ++nn) {
|
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||||
RealD c_mn(0.0);
|
});
|
||||||
for (int kk = 0; kk < k; ++kk)
|
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
||||||
c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
|
thread_for (p, batchCount, {
|
||||||
Cmn[p][mm + nn*ldc] = (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
|
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
|
||||||
}
|
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
|
||||||
}
|
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||||
|
});
|
||||||
|
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
||||||
|
thread_for (p, batchCount, {
|
||||||
|
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||||
|
});
|
||||||
|
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
||||||
|
thread_for (p, batchCount, {
|
||||||
|
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
|
||||||
|
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||||
|
} );
|
||||||
|
} else {
|
||||||
|
assert(0);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
RealD t1=usecond();
|
RealD t1=usecond();
|
||||||
@ -495,7 +697,6 @@ public:
|
|||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
// Double precision real GEMM
|
// Double precision real GEMM
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
void gemmBatched(GridBLASOperation_t OpA,
|
void gemmBatched(GridBLASOperation_t OpA,
|
||||||
GridBLASOperation_t OpB,
|
GridBLASOperation_t OpB,
|
||||||
int m,int n, int k,
|
int m,int n, int k,
|
||||||
@ -508,6 +709,9 @@ public:
|
|||||||
RealD t2=usecond();
|
RealD t2=usecond();
|
||||||
int32_t batchCount = Amk.size();
|
int32_t batchCount = Amk.size();
|
||||||
|
|
||||||
|
assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
|
||||||
|
assert(OpB!=GridBLAS_OP_C);
|
||||||
|
|
||||||
int lda = m; // m x k column major
|
int lda = m; // m x k column major
|
||||||
int ldb = k; // k x n column major
|
int ldb = k; // k x n column major
|
||||||
int ldc = m; // m x b column major
|
int ldc = m; // m x b column major
|
||||||
@ -568,39 +772,68 @@ public:
|
|||||||
assert(err==CUBLAS_STATUS_SUCCESS);
|
assert(err==CUBLAS_STATUS_SUCCESS);
|
||||||
#endif
|
#endif
|
||||||
#ifdef GRID_SYCL
|
#ifdef GRID_SYCL
|
||||||
/*
|
|
||||||
int64_t m64=m;
|
int64_t m64=m;
|
||||||
int64_t n64=n;
|
int64_t n64=n;
|
||||||
int64_t k64=k;
|
int64_t k64=k;
|
||||||
|
int64_t lda64=lda;
|
||||||
|
int64_t ldb64=ldb;
|
||||||
|
int64_t ldc64=ldc;
|
||||||
int64_t batchCount64=batchCount;
|
int64_t batchCount64=batchCount;
|
||||||
oneapi::mkl::blas::column_major::gemm_batch(*theGridAccelerator,
|
|
||||||
onemkl::transpose::N,
|
oneapi::mkl::transpose iOpA;
|
||||||
onemkl::transpose::N,
|
oneapi::mkl::transpose iOpB;
|
||||||
|
|
||||||
|
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
|
||||||
|
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
|
||||||
|
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
|
||||||
|
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
|
||||||
|
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
|
||||||
|
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
|
||||||
|
|
||||||
|
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
|
||||||
|
&iOpA,
|
||||||
|
&iOpB,
|
||||||
&m64,&n64,&k64,
|
&m64,&n64,&k64,
|
||||||
(double *) &alpha_p[0],
|
(double *) &alpha_p[0],
|
||||||
(double **)&Amk[0], lda,
|
(const double **)&Amk[0], (const int64_t *)&lda64,
|
||||||
(double **)&Bkn[0], ldb,
|
(const double **)&Bkn[0], (const int64_t *)&ldb64,
|
||||||
(double *) &beta_p[0],
|
(double *) &beta_p[0],
|
||||||
(double **)&Cmn[0], ldc,
|
(double **)&Cmn[0], (const int64_t *)&ldc64,
|
||||||
1,&batchCount64);
|
(int64_t)1,&batchCount64,std::vector<sycl::event>());
|
||||||
*/
|
synchronise();
|
||||||
//MKL’s cblas_<T>gemm_batch & OneAPI
|
|
||||||
#warning "oneMKL implementation not built "
|
|
||||||
#endif
|
#endif
|
||||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
||||||
int sda = lda*k;
|
// Need a default/reference implementation; use Eigen
|
||||||
int sdb = ldb*k;
|
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
|
||||||
int sdc = ldc*n;
|
thread_for (p, batchCount, {
|
||||||
// Need a default/reference implementation
|
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
|
||||||
for (int p = 0; p < batchCount; ++p) {
|
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
|
||||||
for (int mm = 0; mm < m; ++mm) {
|
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||||
for (int nn = 0; nn < n; ++nn) {
|
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||||
RealD c_mn(0.0);
|
});
|
||||||
for (int kk = 0; kk < k; ++kk)
|
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
||||||
c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
|
thread_for (p, batchCount, {
|
||||||
Cmn[p][mm + nn*ldc] = (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
|
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
|
||||||
}
|
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
|
||||||
}
|
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||||
|
});
|
||||||
|
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
||||||
|
thread_for (p, batchCount, {
|
||||||
|
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||||
|
});
|
||||||
|
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
||||||
|
thread_for (p, batchCount, {
|
||||||
|
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
|
||||||
|
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
assert(0);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
RealD t1=usecond();
|
RealD t1=usecond();
|
||||||
@ -608,120 +841,55 @@ public:
|
|||||||
RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
|
RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<class CComplex>
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Strided case used by benchmark, but generally unused in Grid
|
|
||||||
// Keep a code example in double complex, but don't generate the single and real variants for now
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
void gemmStridedBatched(int m,int n, int k,
|
|
||||||
ComplexD alpha,
|
|
||||||
ComplexD* Amk, // pointer list to matrices
|
|
||||||
ComplexD* Bkn,
|
|
||||||
ComplexD beta,
|
|
||||||
ComplexD* Cmn,
|
|
||||||
int batchCount)
|
|
||||||
{
|
|
||||||
// Use C-row major storage, so transpose calls
|
|
||||||
int lda = m; // m x k column major
|
|
||||||
int ldb = k; // k x n column major
|
|
||||||
int ldc = m; // m x b column major
|
|
||||||
int sda = m*k;
|
|
||||||
int sdb = k*n;
|
|
||||||
int sdc = m*n;
|
|
||||||
deviceVector<ComplexD> alpha_p(1);
|
|
||||||
deviceVector<ComplexD> beta_p(1);
|
|
||||||
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
|
|
||||||
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
|
|
||||||
// std::cout << "blasZgemmStridedBatched mnk "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
|
|
||||||
// std::cout << "blasZgemmStridedBatched ld "<<lda<<","<<ldb<<","<<ldc<<std::endl;
|
|
||||||
// std::cout << "blasZgemmStridedBatched sd "<<sda<<","<<sdb<<","<<sdc<<std::endl;
|
|
||||||
#ifdef GRID_HIP
|
|
||||||
auto err = hipblasZgemmStridedBatched(gridblasHandle,
|
|
||||||
HIPBLAS_OP_N,
|
|
||||||
HIPBLAS_OP_N,
|
|
||||||
m,n,k,
|
|
||||||
(hipblasDoubleComplex *) &alpha_p[0],
|
|
||||||
(hipblasDoubleComplex *) Amk, lda, sda,
|
|
||||||
(hipblasDoubleComplex *) Bkn, ldb, sdb,
|
|
||||||
(hipblasDoubleComplex *) &beta_p[0],
|
|
||||||
(hipblasDoubleComplex *) Cmn, ldc, sdc,
|
|
||||||
batchCount);
|
|
||||||
assert(err==HIPBLAS_STATUS_SUCCESS);
|
|
||||||
#endif
|
|
||||||
#ifdef GRID_CUDA
|
|
||||||
cublasZgemmStridedBatched(gridblasHandle,
|
|
||||||
CUBLAS_OP_N,
|
|
||||||
CUBLAS_OP_N,
|
|
||||||
m,n,k,
|
|
||||||
(cuDoubleComplex *) &alpha_p[0],
|
|
||||||
(cuDoubleComplex *) Amk, lda, sda,
|
|
||||||
(cuDoubleComplex *) Bkn, ldb, sdb,
|
|
||||||
(cuDoubleComplex *) &beta_p[0],
|
|
||||||
(cuDoubleComplex *) Cmn, ldc, sdc,
|
|
||||||
batchCount);
|
|
||||||
#endif
|
|
||||||
#if defined(GRID_SYCL) || defined(GRID_ONE_MKL)
|
|
||||||
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
|
|
||||||
oneapi::mkl::transpose::N,
|
|
||||||
oneapi::mkl::transpose::N,
|
|
||||||
m,n,k,
|
|
||||||
alpha,
|
|
||||||
(const ComplexD *)Amk,lda,sda,
|
|
||||||
(const ComplexD *)Bkn,ldb,sdb,
|
|
||||||
beta,
|
|
||||||
(ComplexD *)Cmn,ldc,sdc,
|
|
||||||
batchCount);
|
|
||||||
#endif
|
|
||||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
|
|
||||||
// Need a default/reference implementation
|
|
||||||
for (int p = 0; p < batchCount; ++p) {
|
|
||||||
for (int mm = 0; mm < m; ++mm) {
|
|
||||||
for (int nn = 0; nn < n; ++nn) {
|
|
||||||
ComplexD c_mn(0.0);
|
|
||||||
for (int kk = 0; kk < k; ++kk)
|
|
||||||
c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
|
|
||||||
Cmn[mm + nn*ldc + p*sdc] = (alpha)*c_mn + (beta)*Cmn[mm + nn*ldc + p*sdc];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
double benchmark(int M, int N, int K, int BATCH)
|
double benchmark(int M, int N, int K, int BATCH)
|
||||||
{
|
{
|
||||||
int32_t N_A = M*K*BATCH;
|
int32_t N_A = M*K*BATCH;
|
||||||
int32_t N_B = K*N*BATCH;
|
int32_t N_B = K*N*BATCH;
|
||||||
int32_t N_C = M*N*BATCH;
|
int32_t N_C = M*N*BATCH;
|
||||||
deviceVector<ComplexD> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD));
|
deviceVector<CComplex> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(CComplex));
|
||||||
deviceVector<ComplexD> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD));
|
deviceVector<CComplex> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(CComplex));
|
||||||
deviceVector<ComplexD> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD));
|
deviceVector<CComplex> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(CComplex));
|
||||||
ComplexD alpha(1.0);
|
CComplex alpha(1.0);
|
||||||
ComplexD beta (1.0);
|
CComplex beta (1.0);
|
||||||
RealD flops = 8.0*M*N*K*BATCH;
|
RealD flops = 8.0*M*N*K*BATCH;
|
||||||
int ncall=10;
|
int ncall=1000;
|
||||||
|
deviceVector<CComplex *> As(BATCH);
|
||||||
|
deviceVector<CComplex *> Bs(BATCH);
|
||||||
|
deviceVector<CComplex *> Cs(BATCH);
|
||||||
|
for(int b = 0 ; b < BATCH;b++) {
|
||||||
|
CComplex *ptr;
|
||||||
|
ptr = &A[b*M*K]; acceleratorPut(As[b],ptr);
|
||||||
|
ptr = &B[b*K*N]; acceleratorPut(Bs[b],ptr);
|
||||||
|
ptr = &C[b*M*N]; acceleratorPut(Cs[b],ptr);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Warm up call
|
||||||
|
gemmBatched(M,N,K,
|
||||||
|
alpha,
|
||||||
|
As, // m x k
|
||||||
|
Bs, // k x n
|
||||||
|
beta,
|
||||||
|
Cs);
|
||||||
|
synchronise();
|
||||||
|
|
||||||
RealD t0 = usecond();
|
RealD t0 = usecond();
|
||||||
for(int i=0;i<ncall;i++){
|
for(int i=0;i<ncall;i++){
|
||||||
gemmStridedBatched(M,N,K,
|
gemmBatched(M,N,K,
|
||||||
alpha,
|
alpha,
|
||||||
&A[0], // m x k
|
As, // m x k
|
||||||
&B[0], // k x n
|
Bs, // k x n
|
||||||
beta,
|
beta,
|
||||||
&C[0], // m x n
|
Cs);
|
||||||
BATCH);
|
|
||||||
}
|
|
||||||
synchronise();
|
synchronise();
|
||||||
|
}
|
||||||
RealD t1 = usecond();
|
RealD t1 = usecond();
|
||||||
RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K)*BATCH;
|
RealD bytes = 1.0*sizeof(CComplex)*(M*N*2+N*K+M*K)*BATCH;
|
||||||
flops = 8.0*M*N*K*BATCH*ncall;
|
flops = 8.0*M*N*K*BATCH*ncall;
|
||||||
flops = flops/(t1-t0)/1.e3;
|
flops = flops/(t1-t0)/1.e3;
|
||||||
return flops; // Returns gigaflops
|
return flops; // Returns gigaflops
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
@ -279,11 +279,11 @@ public:
|
|||||||
Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
|
Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
|
||||||
diagonalize(eval2,lmd2,lme2,Nu,Nm,Nm,Qt,grid);
|
diagonalize(eval2,lmd2,lme2,Nu,Nm,Nm,Qt,grid);
|
||||||
_sort.push(eval2,Nm);
|
_sort.push(eval2,Nm);
|
||||||
// Glog << "#Ritz value before shift: "<< std::endl;
|
Glog << "#Ritz value before shift: "<< std::endl;
|
||||||
for(int i=0; i<Nm; ++i){
|
for(int i=0; i<Nm; ++i){
|
||||||
// std::cout.precision(13);
|
std::cout.precision(13);
|
||||||
// std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
|
std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
|
||||||
// std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
|
std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
//----------------------------------------------------------------------
|
//----------------------------------------------------------------------
|
||||||
@ -298,6 +298,7 @@ public:
|
|||||||
unpackHermitBlockTriDiagMatToEigen(lmd,lme,Nu,Nblock_m,Nm,Nm,BTDM);
|
unpackHermitBlockTriDiagMatToEigen(lmd,lme,Nu,Nblock_m,Nm,Nm,BTDM);
|
||||||
|
|
||||||
for(int ip=Nk; ip<Nm; ++ip){
|
for(int ip=Nk; ip<Nm; ++ip){
|
||||||
|
Glog << " ip "<<ip<<" / "<<Nm<<std::endl;
|
||||||
shiftedQRDecompEigen(BTDM,Nu,Nm,eval2[ip],Q);
|
shiftedQRDecompEigen(BTDM,Nu,Nm,eval2[ip],Q);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -325,7 +326,7 @@ public:
|
|||||||
Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
|
Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
|
||||||
diagonalize(eval2,lmd2,lme2,Nu,Nk,Nm,Qt,grid);
|
diagonalize(eval2,lmd2,lme2,Nu,Nk,Nm,Qt,grid);
|
||||||
_sort.push(eval2,Nk);
|
_sort.push(eval2,Nk);
|
||||||
// Glog << "#Ritz value after shift: "<< std::endl;
|
Glog << "#Ritz value after shift: "<< std::endl;
|
||||||
for(int i=0; i<Nk; ++i){
|
for(int i=0; i<Nk; ++i){
|
||||||
// std::cout.precision(13);
|
// std::cout.precision(13);
|
||||||
// std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
|
// std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
|
||||||
@ -467,10 +468,10 @@ public:
|
|||||||
|
|
||||||
// set initial vector
|
// set initial vector
|
||||||
for (int i=0; i<Nu; ++i) {
|
for (int i=0; i<Nu; ++i) {
|
||||||
// Glog << "norm2(src[" << i << "])= "<< norm2(src[i]) << std::endl;
|
Glog << "norm2(src[" << i << "])= "<< norm2(src[i]) << std::endl;
|
||||||
evec[i] = src[i];
|
evec[i] = src[i];
|
||||||
orthogonalize(evec[i],evec,i);
|
orthogonalize(evec[i],evec,i);
|
||||||
// Glog << "norm2(evec[" << i << "])= "<< norm2(evec[i]) << std::endl;
|
Glog << "norm2(evec[" << i << "])= "<< norm2(evec[i]) << std::endl;
|
||||||
}
|
}
|
||||||
// exit(-43);
|
// exit(-43);
|
||||||
|
|
||||||
@ -506,11 +507,11 @@ public:
|
|||||||
Qt = Eigen::MatrixXcd::Identity(Nr,Nr);
|
Qt = Eigen::MatrixXcd::Identity(Nr,Nr);
|
||||||
diagonalize(eval2,lmd2,lme2,Nu,Nr,Nr,Qt,grid);
|
diagonalize(eval2,lmd2,lme2,Nu,Nr,Nr,Qt,grid);
|
||||||
_sort.push(eval2,Nr);
|
_sort.push(eval2,Nr);
|
||||||
// Glog << "#Ritz value: "<< std::endl;
|
Glog << "#Ritz value: "<< std::endl;
|
||||||
for(int i=0; i<Nr; ++i){
|
for(int i=0; i<Nr; ++i){
|
||||||
// std::cout.precision(13);
|
std::cout.precision(13);
|
||||||
// std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
|
std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
|
||||||
// std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
|
std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Convergence test
|
// Convergence test
|
||||||
@ -570,6 +571,7 @@ public:
|
|||||||
Glog << fname + " NOT converged ; Summary :\n";
|
Glog << fname + " NOT converged ; Summary :\n";
|
||||||
} else {
|
} else {
|
||||||
Glog << fname + " CONVERGED ; Summary :\n";
|
Glog << fname + " CONVERGED ; Summary :\n";
|
||||||
|
Nstop = Nconv_guess; // Just take them all
|
||||||
// Sort convered eigenpairs.
|
// Sort convered eigenpairs.
|
||||||
std::vector<Field> Btmp(Nstop,grid); // waste of space replicating
|
std::vector<Field> Btmp(Nstop,grid); // waste of space replicating
|
||||||
|
|
||||||
@ -642,7 +644,7 @@ private:
|
|||||||
// for (int u=0; u<mrhs; ++u) Glog << " out["<<u<<"] = "<<norm2(out[u])<<std::endl;
|
// for (int u=0; u<mrhs; ++u) Glog << " out["<<u<<"] = "<<norm2(out[u])<<std::endl;
|
||||||
k_start +=mrhs;
|
k_start +=mrhs;
|
||||||
}
|
}
|
||||||
// Glog << "LinAlg "<< std::endl;
|
Glog << "LinAlg "<< std::endl;
|
||||||
|
|
||||||
if (b>0) {
|
if (b>0) {
|
||||||
for (int u=0; u<Nu; ++u) {
|
for (int u=0; u<Nu; ++u) {
|
||||||
@ -676,7 +678,7 @@ private:
|
|||||||
}
|
}
|
||||||
w_copy[u] = w[u];
|
w_copy[u] = w[u];
|
||||||
}
|
}
|
||||||
// Glog << "LinAlg done"<< std::endl;
|
Glog << "LinAlg done"<< std::endl;
|
||||||
|
|
||||||
// In block version, the steps 6 and 7 in Lanczos construction is
|
// In block version, the steps 6 and 7 in Lanczos construction is
|
||||||
// replaced by the QR decomposition of new basis block.
|
// replaced by the QR decomposition of new basis block.
|
||||||
@ -689,15 +691,15 @@ private:
|
|||||||
}
|
}
|
||||||
|
|
||||||
// re-orthogonalization for numerical stability
|
// re-orthogonalization for numerical stability
|
||||||
// Glog << "Gram Schmidt"<< std::endl;
|
Glog << "Gram Schmidt"<< std::endl;
|
||||||
orthogonalize(w,Nu,evec,R);
|
orthogonalize(w,Nu,evec,R);
|
||||||
// QR part
|
// QR part
|
||||||
for (int u=1; u<Nu; ++u) {
|
for (int u=1; u<Nu; ++u) {
|
||||||
orthogonalize(w[u],w,u);
|
orthogonalize(w[u],w,u);
|
||||||
}
|
}
|
||||||
// Glog << "Gram Schmidt done "<< std::endl;
|
Glog << "Gram Schmidt done "<< std::endl;
|
||||||
|
|
||||||
// Glog << "LinAlg "<< std::endl;
|
Glog << "LinAlg "<< std::endl;
|
||||||
for (int u=0; u<Nu; ++u) {
|
for (int u=0; u<Nu; ++u) {
|
||||||
//for (int v=0; v<Nu; ++v) {
|
//for (int v=0; v<Nu; ++v) {
|
||||||
for (int v=u; v<Nu; ++v) {
|
for (int v=u; v<Nu; ++v) {
|
||||||
@ -714,7 +716,7 @@ private:
|
|||||||
// Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl;
|
// Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Glog << "LinAlg done "<< std::endl;
|
Glog << "LinAlg done "<< std::endl;
|
||||||
|
|
||||||
if (b < Nm/Nu-1) {
|
if (b < Nm/Nu-1) {
|
||||||
for (int u=0; u<Nu; ++u) {
|
for (int u=0; u<Nu; ++u) {
|
||||||
@ -933,7 +935,7 @@ if (1){
|
|||||||
int Nu, int Nb, int Nk, int Nm,
|
int Nu, int Nb, int Nk, int Nm,
|
||||||
Eigen::MatrixXcd& M)
|
Eigen::MatrixXcd& M)
|
||||||
{
|
{
|
||||||
//Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n';
|
Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n';
|
||||||
assert( Nk%Nu == 0 && Nm%Nu == 0 );
|
assert( Nk%Nu == 0 && Nm%Nu == 0 );
|
||||||
assert( Nk <= Nm );
|
assert( Nk <= Nm );
|
||||||
M = Eigen::MatrixXcd::Zero(Nk,Nk);
|
M = Eigen::MatrixXcd::Zero(Nk,Nk);
|
||||||
@ -951,7 +953,7 @@ if (1){
|
|||||||
M(u+(k/Nu)*Nu,k-Nu) = lme[u][k-Nu];
|
M(u+(k/Nu)*Nu,k-Nu) = lme[u][k-Nu];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
//Glog << "unpackHermitBlockTriDiagMatToEigen() end" << endl;
|
Glog << "unpackHermitBlockTriDiagMatToEigen() end" << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -961,7 +963,7 @@ if (1){
|
|||||||
int Nu, int Nb, int Nk, int Nm,
|
int Nu, int Nb, int Nk, int Nm,
|
||||||
Eigen::MatrixXcd& M)
|
Eigen::MatrixXcd& M)
|
||||||
{
|
{
|
||||||
//Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n';
|
Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n';
|
||||||
assert( Nk%Nu == 0 && Nm%Nu == 0 );
|
assert( Nk%Nu == 0 && Nm%Nu == 0 );
|
||||||
assert( Nk <= Nm );
|
assert( Nk <= Nm );
|
||||||
|
|
||||||
@ -977,7 +979,7 @@ if (1){
|
|||||||
lme[u][k-Nu] = M(u+(k/Nu)*Nu,k-Nu);
|
lme[u][k-Nu] = M(u+(k/Nu)*Nu,k-Nu);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
//Glog << "packHermitBlockTriDiagMatfromEigen() end" << endl;
|
Glog << "packHermitBlockTriDiagMatfromEigen() end" <<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -986,7 +988,7 @@ if (1){
|
|||||||
RealD Dsh,
|
RealD Dsh,
|
||||||
Eigen::MatrixXcd& Qprod)
|
Eigen::MatrixXcd& Qprod)
|
||||||
{
|
{
|
||||||
//Glog << "shiftedQRDecompEigen() begin" << '\n';
|
Glog << "shiftedQRDecompEigen() begin" << '\n';
|
||||||
Eigen::MatrixXcd Q = Eigen::MatrixXcd::Zero(Nm,Nm);
|
Eigen::MatrixXcd Q = Eigen::MatrixXcd::Zero(Nm,Nm);
|
||||||
Eigen::MatrixXcd R = Eigen::MatrixXcd::Zero(Nm,Nm);
|
Eigen::MatrixXcd R = Eigen::MatrixXcd::Zero(Nm,Nm);
|
||||||
Eigen::MatrixXcd Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
|
Eigen::MatrixXcd Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
|
||||||
@ -1002,6 +1004,7 @@ if (1){
|
|||||||
// lower triangular part used to represent series
|
// lower triangular part used to represent series
|
||||||
// of Q sequence.
|
// of Q sequence.
|
||||||
|
|
||||||
|
Glog << "shiftedQRDecompEigen() Housholder & QR" << '\n';
|
||||||
// equivalent operation of Qprod *= Q
|
// equivalent operation of Qprod *= Q
|
||||||
//M = Eigen::MatrixXcd::Zero(Nm,Nm);
|
//M = Eigen::MatrixXcd::Zero(Nm,Nm);
|
||||||
|
|
||||||
@ -1022,6 +1025,7 @@ if (1){
|
|||||||
|
|
||||||
Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
|
Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
|
||||||
|
|
||||||
|
Glog << "shiftedQRDecompEigen() Mtmp create" << '\n';
|
||||||
for (int i=0; i<Nm; ++i) {
|
for (int i=0; i<Nm; ++i) {
|
||||||
for (int j=0; j<Nm-(Nu+1); ++j) {
|
for (int j=0; j<Nm-(Nu+1); ++j) {
|
||||||
for (int k=0; k<Nu+1+j; ++k) {
|
for (int k=0; k<Nu+1+j; ++k) {
|
||||||
@ -1029,6 +1033,7 @@ if (1){
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Glog << "shiftedQRDecompEigen() Mtmp loop1" << '\n';
|
||||||
for (int i=0; i<Nm; ++i) {
|
for (int i=0; i<Nm; ++i) {
|
||||||
for (int j=Nm-(Nu+1); j<Nm; ++j) {
|
for (int j=Nm-(Nu+1); j<Nm; ++j) {
|
||||||
for (int k=0; k<Nm; ++k) {
|
for (int k=0; k<Nm; ++k) {
|
||||||
@ -1036,6 +1041,7 @@ if (1){
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Glog << "shiftedQRDecompEigen() Mtmp loop2" << '\n';
|
||||||
|
|
||||||
//static int ntimes = 2;
|
//static int ntimes = 2;
|
||||||
//for (int j=0; j<Nm-(ntimes*Nu); ++j) {
|
//for (int j=0; j<Nm-(ntimes*Nu); ++j) {
|
||||||
@ -1061,11 +1067,13 @@ if (1){
|
|||||||
Mtmp(j,i) = conj(Mtmp(i,j));
|
Mtmp(j,i) = conj(Mtmp(i,j));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Glog << "shiftedQRDecompEigen() Mtmp loop3" << '\n';
|
||||||
|
|
||||||
for (int i=0; i<Nm; ++i) {
|
for (int i=0; i<Nm; ++i) {
|
||||||
Mtmp(i,i) = real(Mtmp(i,i)) + Dsh;
|
Mtmp(i,i) = real(Mtmp(i,i)) + Dsh;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Glog << "shiftedQRDecompEigen() Mtmp loop4" << '\n';
|
||||||
M = Mtmp;
|
M = Mtmp;
|
||||||
|
|
||||||
//M = Q.adjoint()*(M*Q);
|
//M = Q.adjoint()*(M*Q);
|
||||||
@ -1077,7 +1085,7 @@ if (1){
|
|||||||
// }
|
// }
|
||||||
//}
|
//}
|
||||||
|
|
||||||
//Glog << "shiftedQRDecompEigen() end" << endl;
|
Glog << "shiftedQRDecompEigen() end" <<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
void exampleQRDecompEigen(void)
|
void exampleQRDecompEigen(void)
|
||||||
|
@ -499,6 +499,87 @@ namespace Grid {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Site diagonal is identity, left preconditioned by Mee^inv
|
||||||
|
// ( 1 - Mee^inv Meo Moo^inv Moe ) phi = Mee_inv ( Mee - Meo Moo^inv Moe Mee^inv ) phi = Mee_inv eta
|
||||||
|
//
|
||||||
|
// Solve:
|
||||||
|
// ( 1 - Mee^inv Meo Moo^inv Moe )^dag ( 1 - Mee^inv Meo Moo^inv Moe ) phi = ( 1 - Mee^inv Meo Moo^inv Moe )^dag Mee_inv eta
|
||||||
|
//
|
||||||
|
// Old notation e<->o
|
||||||
|
//
|
||||||
|
// Left precon by Moo^-1
|
||||||
|
// b) (Doo^{dag} M_oo^-dag) (Moo^-1 Doo) psi_o = [ (D_oo)^dag M_oo^-dag ] Moo^-1 L^{-1} eta_o
|
||||||
|
// eta_o' = (D_oo)^dag M_oo^-dag Moo^-1 (eta_o - Moe Mee^{-1} eta_e)
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
template<class Field> class SchurRedBlackDiagOneSolve : public SchurRedBlackBase<Field> {
|
||||||
|
public:
|
||||||
|
typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////
|
||||||
|
// Wrap the usual normal equations Schur trick
|
||||||
|
/////////////////////////////////////////////////////
|
||||||
|
SchurRedBlackDiagOneSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false,
|
||||||
|
const bool _solnAsInitGuess = false)
|
||||||
|
: SchurRedBlackBase<Field>(HermitianRBSolver,initSubGuess,_solnAsInitGuess) {};
|
||||||
|
|
||||||
|
virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
|
||||||
|
{
|
||||||
|
GridBase *grid = _Matrix.RedBlackGrid();
|
||||||
|
GridBase *fgrid= _Matrix.Grid();
|
||||||
|
|
||||||
|
SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
|
||||||
|
|
||||||
|
Field tmp(grid);
|
||||||
|
Field Mtmp(grid);
|
||||||
|
|
||||||
|
pickCheckerboard(Even,src_e,src);
|
||||||
|
pickCheckerboard(Odd ,src_o,src);
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////
|
||||||
|
// src_o = Mpcdag *MooeeInv * (source_o - Moe MeeInv source_e)
|
||||||
|
/////////////////////////////////////////////////////
|
||||||
|
_Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
|
||||||
|
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
|
||||||
|
Mtmp=src_o-Mtmp;
|
||||||
|
_Matrix.MooeeInv(Mtmp,tmp); assert( tmp.Checkerboard() ==Odd);
|
||||||
|
|
||||||
|
// get the right MpcDag
|
||||||
|
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.Checkerboard() ==Odd);
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
|
||||||
|
{
|
||||||
|
GridBase *grid = _Matrix.RedBlackGrid();
|
||||||
|
GridBase *fgrid= _Matrix.Grid();
|
||||||
|
|
||||||
|
Field tmp(grid);
|
||||||
|
Field sol_e(grid);
|
||||||
|
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////
|
||||||
|
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
|
||||||
|
///////////////////////////////////////////////////
|
||||||
|
_Matrix.Meooe(sol_o,tmp); assert( tmp.Checkerboard() ==Even);
|
||||||
|
tmp = src_e-tmp; assert( src_e.Checkerboard() ==Even);
|
||||||
|
_Matrix.MooeeInv(tmp,sol_e); assert( sol_e.Checkerboard() ==Even);
|
||||||
|
|
||||||
|
setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
|
||||||
|
setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd );
|
||||||
|
};
|
||||||
|
|
||||||
|
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
|
||||||
|
{
|
||||||
|
SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
|
||||||
|
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
|
||||||
|
};
|
||||||
|
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
|
||||||
|
{
|
||||||
|
SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
|
||||||
|
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Site diagonal is identity, right preconditioned by Mee^inv
|
// Site diagonal is identity, right preconditioned by Mee^inv
|
||||||
// ( 1 - Meo Moo^inv Moe Mee^inv ) phi =( 1 - Meo Moo^inv Moe Mee^inv ) Mee psi = = eta = eta
|
// ( 1 - Meo Moo^inv Moe Mee^inv ) phi =( 1 - Meo Moo^inv Moe Mee^inv ) Mee psi = = eta = eta
|
||||||
|
@ -54,6 +54,9 @@ public:
|
|||||||
size_type bytes = __n*sizeof(_Tp);
|
size_type bytes = __n*sizeof(_Tp);
|
||||||
profilerAllocate(bytes);
|
profilerAllocate(bytes);
|
||||||
_Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes);
|
_Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes);
|
||||||
|
if ( (_Tp*)ptr == (_Tp *) NULL ) {
|
||||||
|
printf("Grid CPU Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
|
||||||
|
}
|
||||||
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
|
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
@ -100,6 +103,9 @@ public:
|
|||||||
size_type bytes = __n*sizeof(_Tp);
|
size_type bytes = __n*sizeof(_Tp);
|
||||||
profilerAllocate(bytes);
|
profilerAllocate(bytes);
|
||||||
_Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes);
|
_Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes);
|
||||||
|
if ( (_Tp*)ptr == (_Tp *) NULL ) {
|
||||||
|
printf("Grid Shared Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
|
||||||
|
}
|
||||||
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
|
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
@ -145,6 +151,9 @@ public:
|
|||||||
size_type bytes = __n*sizeof(_Tp);
|
size_type bytes = __n*sizeof(_Tp);
|
||||||
profilerAllocate(bytes);
|
profilerAllocate(bytes);
|
||||||
_Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes);
|
_Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes);
|
||||||
|
if ( (_Tp*)ptr == (_Tp *) NULL ) {
|
||||||
|
printf("Grid Device Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
|
||||||
|
}
|
||||||
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
|
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
|
@ -16,6 +16,44 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
uint64_t total_shared;
|
uint64_t total_shared;
|
||||||
uint64_t total_device;
|
uint64_t total_device;
|
||||||
uint64_t total_host;;
|
uint64_t total_host;;
|
||||||
|
|
||||||
|
#if defined(__has_feature)
|
||||||
|
#if __has_feature(leak_sanitizer)
|
||||||
|
#define ASAN_LEAK_CHECK
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef ASAN_LEAK_CHECK
|
||||||
|
#include <sanitizer/asan_interface.h>
|
||||||
|
#include <sanitizer/common_interface_defs.h>
|
||||||
|
#include <sanitizer/lsan_interface.h>
|
||||||
|
#define LEAK_CHECK(A) { __lsan_do_recoverable_leak_check(); }
|
||||||
|
#else
|
||||||
|
#define LEAK_CHECK(A) { }
|
||||||
|
#endif
|
||||||
|
|
||||||
|
void MemoryManager::DisplayMallinfo(void)
|
||||||
|
{
|
||||||
|
#ifdef __linux__
|
||||||
|
struct mallinfo mi; // really want mallinfo2, but glibc version isn't uniform
|
||||||
|
|
||||||
|
mi = mallinfo();
|
||||||
|
|
||||||
|
std::cout << "MemoryManager: Total non-mmapped bytes (arena): "<< (size_t)mi.arena<<std::endl;
|
||||||
|
std::cout << "MemoryManager: # of free chunks (ordblks): "<< (size_t)mi.ordblks<<std::endl;
|
||||||
|
std::cout << "MemoryManager: # of free fastbin blocks (smblks): "<< (size_t)mi.smblks<<std::endl;
|
||||||
|
std::cout << "MemoryManager: # of mapped regions (hblks): "<< (size_t)mi.hblks<<std::endl;
|
||||||
|
std::cout << "MemoryManager: Bytes in mapped regions (hblkhd): "<< (size_t)mi.hblkhd<<std::endl;
|
||||||
|
std::cout << "MemoryManager: Max. total allocated space (usmblks): "<< (size_t)mi.usmblks<<std::endl;
|
||||||
|
std::cout << "MemoryManager: Free bytes held in fastbins (fsmblks): "<< (size_t)mi.fsmblks<<std::endl;
|
||||||
|
std::cout << "MemoryManager: Total allocated space (uordblks): "<< (size_t)mi.uordblks<<std::endl;
|
||||||
|
std::cout << "MemoryManager: Total free space (fordblks): "<< (size_t)mi.fordblks<<std::endl;
|
||||||
|
std::cout << "MemoryManager: Topmost releasable block (keepcost): "<< (size_t)mi.keepcost<<std::endl;
|
||||||
|
#endif
|
||||||
|
LEAK_CHECK();
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
void MemoryManager::PrintBytes(void)
|
void MemoryManager::PrintBytes(void)
|
||||||
{
|
{
|
||||||
std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
|
std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
|
||||||
@ -35,7 +73,7 @@ void MemoryManager::PrintBytes(void)
|
|||||||
#ifdef GRID_CUDA
|
#ifdef GRID_CUDA
|
||||||
cuda_mem();
|
cuda_mem();
|
||||||
#endif
|
#endif
|
||||||
|
DisplayMallinfo();
|
||||||
}
|
}
|
||||||
|
|
||||||
uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; }
|
uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; }
|
||||||
|
@ -211,6 +211,7 @@ private:
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
static void DisplayMallinfo(void);
|
||||||
static void NotifyDeletion(void * CpuPtr);
|
static void NotifyDeletion(void * CpuPtr);
|
||||||
static void Print(void);
|
static void Print(void);
|
||||||
static void PrintAll(void);
|
static void PrintAll(void);
|
||||||
|
@ -91,6 +91,7 @@ public:
|
|||||||
////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////
|
||||||
virtual int CheckerBoarded(int dim)=0;
|
virtual int CheckerBoarded(int dim)=0;
|
||||||
virtual int CheckerBoard(const Coordinate &site)=0;
|
virtual int CheckerBoard(const Coordinate &site)=0;
|
||||||
|
virtual int CheckerDim(void){ return 0; };
|
||||||
virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
|
virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
|
||||||
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
|
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
|
||||||
virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0;
|
virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0;
|
||||||
|
@ -60,6 +60,7 @@ public:
|
|||||||
int _checker_dim;
|
int _checker_dim;
|
||||||
std::vector<int> _checker_board;
|
std::vector<int> _checker_board;
|
||||||
|
|
||||||
|
virtual int CheckerDim(void){ return _checker_dim; };
|
||||||
virtual int CheckerBoarded(int dim){
|
virtual int CheckerBoarded(int dim){
|
||||||
if( dim==_checker_dim) return 1;
|
if( dim==_checker_dim) return 1;
|
||||||
else return 0;
|
else return 0;
|
||||||
|
@ -264,24 +264,8 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
|
|||||||
const uint64_t sites = grid->oSites();
|
const uint64_t sites = grid->oSites();
|
||||||
|
|
||||||
// Might make all code paths go this way.
|
// Might make all code paths go this way.
|
||||||
#if 0
|
|
||||||
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
|
|
||||||
Vector<inner_t> inner_tmp(sites);
|
|
||||||
auto inner_tmp_v = &inner_tmp[0];
|
|
||||||
{
|
|
||||||
autoView( left_v , left, AcceleratorRead);
|
|
||||||
autoView( right_v,right, AcceleratorRead);
|
|
||||||
// This code could read coalesce
|
|
||||||
// GPU - SIMT lane compliance...
|
|
||||||
accelerator_for( ss, sites, nsimd,{
|
|
||||||
auto x_l = left_v(ss);
|
|
||||||
auto y_l = right_v(ss);
|
|
||||||
coalescedWrite(inner_tmp_v[ss],innerProductD(x_l,y_l));
|
|
||||||
});
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
typedef decltype(innerProduct(vobj(),vobj())) inner_t;
|
typedef decltype(innerProduct(vobj(),vobj())) inner_t;
|
||||||
Vector<inner_t> inner_tmp(sites);
|
deviceVector<inner_t> inner_tmp(sites);
|
||||||
auto inner_tmp_v = &inner_tmp[0];
|
auto inner_tmp_v = &inner_tmp[0];
|
||||||
|
|
||||||
{
|
{
|
||||||
@ -295,7 +279,6 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
|
|||||||
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
|
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
// This is in single precision and fails some tests
|
// This is in single precision and fails some tests
|
||||||
auto anrm = sumD(inner_tmp_v,sites);
|
auto anrm = sumD(inner_tmp_v,sites);
|
||||||
nrm = anrm;
|
nrm = anrm;
|
||||||
@ -373,7 +356,8 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
|
|||||||
nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
|
nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
|
||||||
#else
|
#else
|
||||||
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
|
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
|
||||||
Vector<inner_t> inner_tmp(sites);
|
deviceVector<inner_t> inner_tmp;
|
||||||
|
inner_tmp.resize(sites);
|
||||||
auto inner_tmp_v = &inner_tmp[0];
|
auto inner_tmp_v = &inner_tmp[0];
|
||||||
|
|
||||||
accelerator_for( ss, sites, nsimd,{
|
accelerator_for( ss, sites, nsimd,{
|
||||||
|
@ -13,6 +13,7 @@ inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer os
|
|||||||
mysum.resize(1);
|
mysum.resize(1);
|
||||||
sobj *mysum_p = & mysum[0];
|
sobj *mysum_p = & mysum[0];
|
||||||
sobj identity; zeroit(identity);
|
sobj identity; zeroit(identity);
|
||||||
|
mysum[0] = identity;
|
||||||
sobj ret ;
|
sobj ret ;
|
||||||
|
|
||||||
Integer nsimd= vobj::Nsimd();
|
Integer nsimd= vobj::Nsimd();
|
||||||
@ -80,6 +81,7 @@ template<class Word> Word svm_xor(Word *vec,uint64_t L)
|
|||||||
d_sum.resize(1);
|
d_sum.resize(1);
|
||||||
Word *d_sum_p=&d_sum[0];
|
Word *d_sum_p=&d_sum[0];
|
||||||
Word identity; identity=0;
|
Word identity; identity=0;
|
||||||
|
d_sum[0] = identity;
|
||||||
const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() });
|
const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() });
|
||||||
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
|
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
|
||||||
auto Reduction = cl::sycl::reduction(d_sum_p,identity,std::bit_xor<>(),PropList);
|
auto Reduction = cl::sycl::reduction(d_sum_p,identity,std::bit_xor<>(),PropList);
|
||||||
|
@ -43,49 +43,20 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// remove and insert a half checkerboard
|
// remove and insert a half checkerboard
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full)
|
template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full)
|
||||||
{
|
{
|
||||||
half.Checkerboard() = cb;
|
acceleratorPickCheckerboard(cb,half,full);
|
||||||
|
|
||||||
autoView( half_v, half, CpuWrite);
|
|
||||||
autoView( full_v, full, CpuRead);
|
|
||||||
thread_for(ss, full.Grid()->oSites(),{
|
|
||||||
int cbos;
|
|
||||||
Coordinate coor;
|
|
||||||
full.Grid()->oCoorFromOindex(coor,ss);
|
|
||||||
cbos=half.Grid()->CheckerBoard(coor);
|
|
||||||
|
|
||||||
if (cbos==cb) {
|
|
||||||
int ssh=half.Grid()->oIndex(coor);
|
|
||||||
half_v[ssh] = full_v[ss];
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half)
|
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half)
|
||||||
{
|
{
|
||||||
int cb = half.Checkerboard();
|
acceleratorSetCheckerboard(full,half);
|
||||||
autoView( half_v , half, CpuRead);
|
|
||||||
autoView( full_v , full, CpuWrite);
|
|
||||||
thread_for(ss,full.Grid()->oSites(),{
|
|
||||||
|
|
||||||
Coordinate coor;
|
|
||||||
int cbos;
|
|
||||||
|
|
||||||
full.Grid()->oCoorFromOindex(coor,ss);
|
|
||||||
cbos=half.Grid()->CheckerBoard(coor);
|
|
||||||
|
|
||||||
if (cbos==cb) {
|
|
||||||
int ssh=half.Grid()->oIndex(coor);
|
|
||||||
full_v[ss]=half_v[ssh];
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int checker_dim_half=0)
|
template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int dummy=0)
|
||||||
{
|
{
|
||||||
half.Checkerboard() = cb;
|
half.Checkerboard() = cb;
|
||||||
autoView(half_v, half, AcceleratorWrite);
|
autoView(half_v, half, AcceleratorWrite);
|
||||||
@ -95,6 +66,7 @@ template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj
|
|||||||
unsigned long ndim_half = half.Grid()->_ndimension;
|
unsigned long ndim_half = half.Grid()->_ndimension;
|
||||||
Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
|
Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
|
||||||
Coordinate ostride_half = half.Grid()->_ostride;
|
Coordinate ostride_half = half.Grid()->_ostride;
|
||||||
|
int checker_dim_half = half.Grid()->CheckerDim();
|
||||||
accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{
|
accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{
|
||||||
|
|
||||||
Coordinate coor;
|
Coordinate coor;
|
||||||
@ -119,7 +91,7 @@ template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int checker_dim_half=0)
|
template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int dummy=0)
|
||||||
{
|
{
|
||||||
int cb = half.Checkerboard();
|
int cb = half.Checkerboard();
|
||||||
autoView(half_v , half, AcceleratorRead);
|
autoView(half_v , half, AcceleratorRead);
|
||||||
@ -129,6 +101,7 @@ template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,
|
|||||||
unsigned long ndim_half = half.Grid()->_ndimension;
|
unsigned long ndim_half = half.Grid()->_ndimension;
|
||||||
Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
|
Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
|
||||||
Coordinate ostride_half = half.Grid()->_ostride;
|
Coordinate ostride_half = half.Grid()->_ostride;
|
||||||
|
int checker_dim_half = half.Grid()->CheckerDim();
|
||||||
accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{
|
accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{
|
||||||
|
|
||||||
Coordinate coor;
|
Coordinate coor;
|
||||||
|
@ -90,6 +90,7 @@ public:
|
|||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
Parameters.StartingType = arg;
|
Parameters.StartingType = arg;
|
||||||
|
std::cout <<GridLogMessage << " GenericHMCrunner --StartingType "<<arg<<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (GridCmdOptionExists(argv, argv + argc, "--StartingTrajectory")) {
|
if (GridCmdOptionExists(argv, argv + argc, "--StartingTrajectory")) {
|
||||||
@ -97,6 +98,7 @@ public:
|
|||||||
std::vector<int> ivec(0);
|
std::vector<int> ivec(0);
|
||||||
GridCmdOptionIntVector(arg, ivec);
|
GridCmdOptionIntVector(arg, ivec);
|
||||||
Parameters.StartTrajectory = ivec[0];
|
Parameters.StartTrajectory = ivec[0];
|
||||||
|
std::cout <<GridLogMessage << " GenericHMCrunner --StartingTrajectory "<<ivec[0]<<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (GridCmdOptionExists(argv, argv + argc, "--Trajectories")) {
|
if (GridCmdOptionExists(argv, argv + argc, "--Trajectories")) {
|
||||||
@ -104,6 +106,7 @@ public:
|
|||||||
std::vector<int> ivec(0);
|
std::vector<int> ivec(0);
|
||||||
GridCmdOptionIntVector(arg, ivec);
|
GridCmdOptionIntVector(arg, ivec);
|
||||||
Parameters.Trajectories = ivec[0];
|
Parameters.Trajectories = ivec[0];
|
||||||
|
std::cout << GridLogMessage<<" GenericHMCrunner Command Line --Trajectories "<<ivec[0]<<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (GridCmdOptionExists(argv, argv + argc, "--Thermalizations")) {
|
if (GridCmdOptionExists(argv, argv + argc, "--Thermalizations")) {
|
||||||
@ -111,6 +114,7 @@ public:
|
|||||||
std::vector<int> ivec(0);
|
std::vector<int> ivec(0);
|
||||||
GridCmdOptionIntVector(arg, ivec);
|
GridCmdOptionIntVector(arg, ivec);
|
||||||
Parameters.NoMetropolisUntil = ivec[0];
|
Parameters.NoMetropolisUntil = ivec[0];
|
||||||
|
std::cout << GridLogMessage<<" GenericHMCrunner --Thermalizations "<<ivec[0]<<std::endl;
|
||||||
}
|
}
|
||||||
if (GridCmdOptionExists(argv, argv + argc, "--ParameterFile")) {
|
if (GridCmdOptionExists(argv, argv + argc, "--ParameterFile")) {
|
||||||
arg = GridCmdOptionPayload(argv, argv + argc, "--ParameterFile");
|
arg = GridCmdOptionPayload(argv, argv + argc, "--ParameterFile");
|
||||||
|
@ -137,9 +137,11 @@ public:
|
|||||||
|
|
||||||
double start_force = usecond();
|
double start_force = usecond();
|
||||||
|
|
||||||
|
MemoryManager::Print();
|
||||||
as[level].actions.at(a)->deriv_timer_start();
|
as[level].actions.at(a)->deriv_timer_start();
|
||||||
as[level].actions.at(a)->deriv(Smearer, force); // deriv should NOT include Ta
|
as[level].actions.at(a)->deriv(Smearer, force); // deriv should NOT include Ta
|
||||||
as[level].actions.at(a)->deriv_timer_stop();
|
as[level].actions.at(a)->deriv_timer_stop();
|
||||||
|
MemoryManager::Print();
|
||||||
|
|
||||||
auto name = as[level].actions.at(a)->action_name();
|
auto name = as[level].actions.at(a)->action_name();
|
||||||
|
|
||||||
@ -246,7 +248,11 @@ public:
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
virtual ~Integrator() {}
|
virtual ~Integrator()
|
||||||
|
{
|
||||||
|
// Pain in the ass to clean up the Level pointers
|
||||||
|
// Guido's design is at fault as per comment above in constructor
|
||||||
|
}
|
||||||
|
|
||||||
virtual std::string integrator_name() = 0;
|
virtual std::string integrator_name() = 0;
|
||||||
|
|
||||||
@ -460,6 +466,7 @@ public:
|
|||||||
for (int level = 0; level < as.size(); ++level) {
|
for (int level = 0; level < as.size(); ++level) {
|
||||||
for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
|
for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
|
||||||
|
|
||||||
|
MemoryManager::Print();
|
||||||
// get gauge field from the SmearingPolicy and
|
// get gauge field from the SmearingPolicy and
|
||||||
// based on the boolean is_smeared in actionID
|
// based on the boolean is_smeared in actionID
|
||||||
std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl;
|
std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl;
|
||||||
@ -468,6 +475,7 @@ public:
|
|||||||
as[level].actions.at(actionID)->S_timer_stop();
|
as[level].actions.at(actionID)->S_timer_stop();
|
||||||
std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl;
|
std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl;
|
||||||
H += Hterm;
|
H += Hterm;
|
||||||
|
MemoryManager::Print();
|
||||||
|
|
||||||
}
|
}
|
||||||
as[level].apply(S_hireps, Representations, level, H);
|
as[level].apply(S_hireps, Representations, level, H);
|
||||||
|
@ -32,7 +32,9 @@ private:
|
|||||||
// Smear_Stout<Gimpl> *StoutSmearing;
|
// Smear_Stout<Gimpl> *StoutSmearing;
|
||||||
// std::vector<GaugeField> SmearedSet;
|
// std::vector<GaugeField> SmearedSet;
|
||||||
|
|
||||||
|
GridRedBlackCartesian * UrbGrid; // keep a copy of the redblack grid for life of object
|
||||||
std::vector<LatticeLorentzComplex> masks;
|
std::vector<LatticeLorentzComplex> masks;
|
||||||
|
std::vector<int> cbs;
|
||||||
|
|
||||||
typedef typename SU3Adjoint::AMatrix AdjMatrix;
|
typedef typename SU3Adjoint::AMatrix AdjMatrix;
|
||||||
typedef typename SU3Adjoint::LatticeAdjMatrix AdjMatrixField;
|
typedef typename SU3Adjoint::LatticeAdjMatrix AdjMatrixField;
|
||||||
@ -147,6 +149,25 @@ private:
|
|||||||
}
|
}
|
||||||
pokeLorentz(Fdet, Fdet_pol, nu);
|
pokeLorentz(Fdet, Fdet_pol, nu);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void Compute_MpInvJx_dNxxdSy(int cb,
|
||||||
|
const GaugeLinkField &PlaqL,
|
||||||
|
const GaugeLinkField &PlaqR,
|
||||||
|
AdjMatrixField MpInvJx,
|
||||||
|
AdjVectorField &Fdet2 )
|
||||||
|
{
|
||||||
|
GaugeLinkField PlaqLeo(UrbGrid);
|
||||||
|
GaugeLinkField PlaqReo(UrbGrid);
|
||||||
|
AdjMatrixField MpInvJxeo(UrbGrid);
|
||||||
|
AdjVectorField Fdet2eo(UrbGrid);
|
||||||
|
pickCheckerboard(cb,PlaqLeo,PlaqL);
|
||||||
|
pickCheckerboard(cb,PlaqReo,PlaqR);
|
||||||
|
pickCheckerboard(cb,MpInvJxeo,MpInvJx);
|
||||||
|
Fdet2eo.Checkerboard()=cb;
|
||||||
|
Compute_MpInvJx_dNxxdSy(PlaqLeo,PlaqReo,MpInvJxeo,Fdet2eo);
|
||||||
|
setCheckerboard(Fdet2,Fdet2eo);
|
||||||
|
}
|
||||||
|
|
||||||
void Compute_MpInvJx_dNxxdSy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR, AdjMatrixField MpInvJx,AdjVectorField &Fdet2 )
|
void Compute_MpInvJx_dNxxdSy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR, AdjMatrixField MpInvJx,AdjVectorField &Fdet2 )
|
||||||
{
|
{
|
||||||
GaugeLinkField UtaU(PlaqL.Grid());
|
GaugeLinkField UtaU(PlaqL.Grid());
|
||||||
@ -278,6 +299,7 @@ public:
|
|||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// Mask the gauge field
|
// Mask the gauge field
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
int cb = cbs[smr];
|
||||||
auto mask=PeekIndex<LorentzIndex>(masks[smr],mu); // the cb mask
|
auto mask=PeekIndex<LorentzIndex>(masks[smr],mu); // the cb mask
|
||||||
|
|
||||||
Umsk = U;
|
Umsk = U;
|
||||||
@ -442,7 +464,7 @@ public:
|
|||||||
AdjMatrixField MpInvJx_nu(grid);
|
AdjMatrixField MpInvJx_nu(grid);
|
||||||
MpInvJx = (-1.0)*MpAdInv * JxAd;// rho is on the plaq factor
|
MpInvJx = (-1.0)*MpAdInv * JxAd;// rho is on the plaq factor
|
||||||
|
|
||||||
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx,FdetV);
|
Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx,FdetV);
|
||||||
Fdet2_mu=FdetV;
|
Fdet2_mu=FdetV;
|
||||||
Fdet1_mu=Zero();
|
Fdet1_mu=Zero();
|
||||||
|
|
||||||
@ -499,7 +521,7 @@ public:
|
|||||||
|
|
||||||
time=-usecond();
|
time=-usecond();
|
||||||
PlaqR=(-1.0)*PlaqR;
|
PlaqR=(-1.0)*PlaqR;
|
||||||
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx,FdetV);
|
Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx,FdetV);
|
||||||
Fdet2_nu = FdetV;
|
Fdet2_nu = FdetV;
|
||||||
time+=usecond();
|
time+=usecond();
|
||||||
std::cout << GridLogMessage << "Compute_MpInvJx_dNxxSy (occurs 6x) took "<<time<< " us"<<std::endl;
|
std::cout << GridLogMessage << "Compute_MpInvJx_dNxxSy (occurs 6x) took "<<time<< " us"<<std::endl;
|
||||||
@ -520,7 +542,7 @@ public:
|
|||||||
|
|
||||||
|
|
||||||
MpInvJx_nu = Cshift(MpInvJx,mu,-1);
|
MpInvJx_nu = Cshift(MpInvJx,mu,-1);
|
||||||
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
||||||
Fdet2_nu = Fdet2_nu+FdetV;
|
Fdet2_nu = Fdet2_nu+FdetV;
|
||||||
|
|
||||||
///////////////// -ve nu /////////////////
|
///////////////// -ve nu /////////////////
|
||||||
@ -539,7 +561,7 @@ public:
|
|||||||
Fdet1_nu = Fdet1_nu + transpose(Nxy)*dJdXe_nMpInv_y;
|
Fdet1_nu = Fdet1_nu + transpose(Nxy)*dJdXe_nMpInv_y;
|
||||||
|
|
||||||
MpInvJx_nu = Cshift(MpInvJx,nu,1);
|
MpInvJx_nu = Cshift(MpInvJx,nu,1);
|
||||||
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
||||||
Fdet2_nu = Fdet2_nu+FdetV;
|
Fdet2_nu = Fdet2_nu+FdetV;
|
||||||
|
|
||||||
// x==
|
// x==
|
||||||
@ -560,7 +582,7 @@ public:
|
|||||||
|
|
||||||
MpInvJx_nu = Cshift(MpInvJx,mu,-1);
|
MpInvJx_nu = Cshift(MpInvJx,mu,-1);
|
||||||
MpInvJx_nu = Cshift(MpInvJx_nu,nu,1);
|
MpInvJx_nu = Cshift(MpInvJx_nu,nu,1);
|
||||||
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
||||||
Fdet2_nu = Fdet2_nu+FdetV;
|
Fdet2_nu = Fdet2_nu+FdetV;
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////
|
||||||
@ -589,7 +611,7 @@ public:
|
|||||||
|
|
||||||
MpInvJx_nu = Cshift(MpInvJx,nu,-1);
|
MpInvJx_nu = Cshift(MpInvJx,nu,-1);
|
||||||
|
|
||||||
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
||||||
Fdet2_mu = Fdet2_mu+FdetV;
|
Fdet2_mu = Fdet2_mu+FdetV;
|
||||||
|
|
||||||
// __
|
// __
|
||||||
@ -609,7 +631,7 @@ public:
|
|||||||
|
|
||||||
MpInvJx_nu = Cshift(MpInvJx,nu,1);
|
MpInvJx_nu = Cshift(MpInvJx,nu,1);
|
||||||
|
|
||||||
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
||||||
Fdet2_mu = Fdet2_mu+FdetV;
|
Fdet2_mu = Fdet2_mu+FdetV;
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -931,6 +953,10 @@ private:
|
|||||||
public:
|
public:
|
||||||
|
|
||||||
/* Standard constructor */
|
/* Standard constructor */
|
||||||
|
virtual ~SmearedConfigurationMasked()
|
||||||
|
{
|
||||||
|
delete UrbGrid;
|
||||||
|
}
|
||||||
SmearedConfigurationMasked(GridCartesian* _UGrid, unsigned int Nsmear, Smear_Stout<Gimpl>& Stout)
|
SmearedConfigurationMasked(GridCartesian* _UGrid, unsigned int Nsmear, Smear_Stout<Gimpl>& Stout)
|
||||||
: SmearedConfiguration<Gimpl>(_UGrid, Nsmear,Stout)
|
: SmearedConfiguration<Gimpl>(_UGrid, Nsmear,Stout)
|
||||||
{
|
{
|
||||||
@ -939,7 +965,6 @@ public:
|
|||||||
// was resized in base class
|
// was resized in base class
|
||||||
assert(this->SmearedSet.size()==Nsmear);
|
assert(this->SmearedSet.size()==Nsmear);
|
||||||
|
|
||||||
GridRedBlackCartesian * UrbGrid;
|
|
||||||
UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(_UGrid);
|
UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(_UGrid);
|
||||||
LatticeComplex one(_UGrid); one = ComplexD(1.0,0.0);
|
LatticeComplex one(_UGrid); one = ComplexD(1.0,0.0);
|
||||||
LatticeComplex tmp(_UGrid);
|
LatticeComplex tmp(_UGrid);
|
||||||
@ -947,11 +972,12 @@ public:
|
|||||||
for (unsigned int i = 0; i < this->smearingLevels; ++i) {
|
for (unsigned int i = 0; i < this->smearingLevels; ++i) {
|
||||||
|
|
||||||
masks.push_back(*(new LatticeLorentzComplex(_UGrid)));
|
masks.push_back(*(new LatticeLorentzComplex(_UGrid)));
|
||||||
|
|
||||||
int mu= (i/2) %Nd;
|
int mu= (i/2) %Nd;
|
||||||
int cb= (i%2);
|
int cb= (i%2);
|
||||||
LatticeComplex tmpcb(UrbGrid);
|
LatticeComplex tmpcb(UrbGrid);
|
||||||
|
|
||||||
|
cbs.push_back(cb);
|
||||||
|
|
||||||
masks[i]=Zero();
|
masks[i]=Zero();
|
||||||
////////////////////
|
////////////////////
|
||||||
// Setup the mask
|
// Setup the mask
|
||||||
@ -962,7 +988,6 @@ public:
|
|||||||
PokeIndex<LorentzIndex>(masks[i],tmp, mu);
|
PokeIndex<LorentzIndex>(masks[i],tmp, mu);
|
||||||
|
|
||||||
}
|
}
|
||||||
delete UrbGrid;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual void smeared_force(GaugeField &SigmaTilde)
|
virtual void smeared_force(GaugeField &SigmaTilde)
|
||||||
|
@ -418,33 +418,33 @@ static void LieAlgebraProject(LatticeAlgebraMatrix &out,const LatticeMatrix &in,
|
|||||||
int hNNm1= NNm1/2;
|
int hNNm1= NNm1/2;
|
||||||
RealD sqrt_2 = sqrt(2.0);
|
RealD sqrt_2 = sqrt(2.0);
|
||||||
Complex ci(0.0,1.0);
|
Complex ci(0.0,1.0);
|
||||||
|
|
||||||
|
const int nsimd= Matrix::Nsimd();
|
||||||
|
accelerator_for(ss,grid->oSites(),nsimd,{
|
||||||
for(int su2Index=0;su2Index<hNNm1;su2Index++){
|
for(int su2Index=0;su2Index<hNNm1;su2Index++){
|
||||||
int i1, i2;
|
int i1, i2;
|
||||||
su2SubGroupIndex(i1, i2, su2Index);
|
su2SubGroupIndex(i1, i2, su2Index);
|
||||||
int ax = su2Index*2;
|
int ax = su2Index*2;
|
||||||
int ay = su2Index*2+1;
|
int ay = su2Index*2+1;
|
||||||
accelerator_for(ss,grid->oSites(),1,{
|
|
||||||
// in is traceless ANTI-hermitian whereas Grid generators are Hermitian.
|
// in is traceless ANTI-hermitian whereas Grid generators are Hermitian.
|
||||||
// trace( Ta x Ci in)
|
// trace( Ta x Ci in)
|
||||||
// Bet I need to move to real part with mult by -i
|
// Bet I need to move to real part with mult by -i
|
||||||
out_v[ss]()()(ax,b) = 0.5*(real(in_v[ss]()()(i2,i1)) - real(in_v[ss]()()(i1,i2)));
|
coalescedWrite(out_v[ss]()()(ax,b),0.5*(real(in_v(ss)()()(i2,i1)) - real(in_v(ss)()()(i1,i2))));
|
||||||
out_v[ss]()()(ay,b) = 0.5*(imag(in_v[ss]()()(i1,i2)) + imag(in_v[ss]()()(i2,i1)));
|
coalescedWrite(out_v[ss]()()(ay,b),0.5*(imag(in_v(ss)()()(i1,i2)) + imag(in_v(ss)()()(i2,i1))));
|
||||||
});
|
|
||||||
}
|
}
|
||||||
for(int diagIndex=0;diagIndex<N-1;diagIndex++){
|
for(int diagIndex=0;diagIndex<N-1;diagIndex++){
|
||||||
int k = diagIndex + 1; // diagIndex starts from 0
|
int k = diagIndex + 1; // diagIndex starts from 0
|
||||||
int a = NNm1+diagIndex;
|
int a = NNm1+diagIndex;
|
||||||
RealD scale = 1.0/sqrt(2.0*k*(k+1));
|
RealD scale = 1.0/sqrt(2.0*k*(k+1));
|
||||||
accelerator_for(ss,grid->oSites(),vComplex::Nsimd(),{
|
auto tmp = in_v(ss)()()(0,0);
|
||||||
auto tmp = in_v[ss]()()(0,0);
|
|
||||||
for(int i=1;i<k;i++){
|
for(int i=1;i<k;i++){
|
||||||
tmp=tmp+in_v[ss]()()(i,i);
|
tmp=tmp+in_v(ss)()()(i,i);
|
||||||
|
}
|
||||||
|
tmp = tmp - in_v(ss)()()(k,k)*k;
|
||||||
|
coalescedWrite(out_v[ss]()()(a,b),imag(tmp) * scale);
|
||||||
}
|
}
|
||||||
tmp = tmp - in_v[ss]()()(k,k)*k;
|
|
||||||
out_v[ss]()()(a,b) =imag(tmp) * scale;
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
@ -118,7 +118,7 @@ static void generatorDiagonal(int diagIndex, iGroupMatrix<cplx> &ta) {
|
|||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
// Map a su2 subgroup number to the pair of rows that are non zero
|
// Map a su2 subgroup number to the pair of rows that are non zero
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
static void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) {
|
static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) {
|
||||||
assert((su2_index >= 0) && (su2_index < (ncolour * (ncolour - 1)) / 2));
|
assert((su2_index >= 0) && (su2_index < (ncolour * (ncolour - 1)) / 2));
|
||||||
|
|
||||||
int spare = su2_index;
|
int spare = su2_index;
|
||||||
|
@ -99,6 +99,8 @@ using std::log;
|
|||||||
using std::exp;
|
using std::exp;
|
||||||
using std::sin;
|
using std::sin;
|
||||||
using std::cos;
|
using std::cos;
|
||||||
|
using std::asin;
|
||||||
|
using std::acos;
|
||||||
|
|
||||||
|
|
||||||
accelerator_inline RealF conjugate(const RealF & r){ return r; }
|
accelerator_inline RealF conjugate(const RealF & r){ return r; }
|
||||||
|
@ -460,3 +460,9 @@ void vprefetch(const iMatrix<v, N> &vv) {
|
|||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef GRID_SYCL
|
||||||
|
template<class vec> struct sycl::is_device_copyable<Grid::iScalar<vec> > : public std::true_type {};
|
||||||
|
template<class vec,int N> struct sycl::is_device_copyable<Grid::iVector<vec,N> > : public std::true_type {};
|
||||||
|
template<class vec,int N> struct sycl::is_device_copyable<Grid::iMatrix<vec,N> > : public std::true_type {};
|
||||||
|
#endif
|
||||||
|
@ -58,7 +58,7 @@ int main(int argc, char **argv) {
|
|||||||
HMCparameters HMCparams;
|
HMCparameters HMCparams;
|
||||||
HMCparams.StartTrajectory = 0;
|
HMCparams.StartTrajectory = 0;
|
||||||
HMCparams.Trajectories = 200;
|
HMCparams.Trajectories = 200;
|
||||||
HMCparams.NoMetropolisUntil= 20;
|
HMCparams.NoMetropolisUntil= 0;
|
||||||
// "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
|
// "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
|
||||||
HMCparams.StartingType =std::string("ColdStart");
|
HMCparams.StartingType =std::string("ColdStart");
|
||||||
HMCparams.MD = MD;
|
HMCparams.MD = MD;
|
||||||
@ -70,7 +70,7 @@ int main(int argc, char **argv) {
|
|||||||
CheckpointerParameters CPparams;
|
CheckpointerParameters CPparams;
|
||||||
CPparams.config_prefix = "ckpoint_EODWF_lat";
|
CPparams.config_prefix = "ckpoint_EODWF_lat";
|
||||||
CPparams.rng_prefix = "ckpoint_EODWF_rng";
|
CPparams.rng_prefix = "ckpoint_EODWF_rng";
|
||||||
CPparams.saveInterval = 10;
|
CPparams.saveInterval = 1;
|
||||||
CPparams.format = "IEEE64BIG";
|
CPparams.format = "IEEE64BIG";
|
||||||
TheHMC.Resources.LoadNerscCheckpointer(CPparams);
|
TheHMC.Resources.LoadNerscCheckpointer(CPparams);
|
||||||
|
|
||||||
@ -186,6 +186,8 @@ int main(int argc, char **argv) {
|
|||||||
|
|
||||||
/////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////
|
||||||
// HMC parameters are serialisable
|
// HMC parameters are serialisable
|
||||||
|
TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file
|
||||||
|
TheHMC.initializeGaugeFieldAndRNGs(U);
|
||||||
|
|
||||||
std::cout << GridLogMessage << " Running the HMC "<< std::endl;
|
std::cout << GridLogMessage << " Running the HMC "<< std::endl;
|
||||||
TheHMC.Run(); // no smearing
|
TheHMC.Run(); // no smearing
|
||||||
|
@ -261,23 +261,25 @@ public:
|
|||||||
fprintf(FP,"\n\n");
|
fprintf(FP,"\n\n");
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template<class CComplex>
|
||||||
static void BLAS(void)
|
static void BLAS(void)
|
||||||
{
|
{
|
||||||
//int nbasis, int nrhs, int coarseVol
|
//int nbasis, int nrhs, int coarseVol
|
||||||
int basis[] = { 16,32,64 };
|
int basis[] = { 16,32,64 };
|
||||||
int rhs[] = { 8,16,32 };
|
int rhs[] = { 8,12,16 };
|
||||||
int vol = 4*4*4*4;
|
int vol = 8*8*8*8;
|
||||||
|
int blk = 4*4*4*4;
|
||||||
|
|
||||||
GridBLAS blas;
|
GridBLAS blas;
|
||||||
|
|
||||||
|
int fpbits = sizeof(CComplex)*4;
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << "= batched GEMM (double precision) "<<std::endl;
|
std::cout<<GridLogMessage << "= batched GEMM fp"<<fpbits<<std::endl;
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << " M "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (coarse mrhs)"<<std::endl;
|
std::cout<<GridLogMessage << " M "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (coarse mrhs)"<<std::endl;
|
||||||
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
|
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
|
||||||
|
|
||||||
fprintf(FP,"GEMM\n\n M, N, K, BATCH, GF/s per rank\n");
|
fprintf(FP,"GEMM\n\n M, N, K, BATCH, GF/s per rank fp%d\n",fpbits);
|
||||||
|
|
||||||
for(int b=0;b<3;b++){
|
for(int b=0;b<3;b++){
|
||||||
for(int r=0;r<3;r++){
|
for(int r=0;r<3;r++){
|
||||||
@ -285,7 +287,7 @@ public:
|
|||||||
int N=rhs[r];
|
int N=rhs[r];
|
||||||
int K=basis[b];
|
int K=basis[b];
|
||||||
int BATCH=vol;
|
int BATCH=vol;
|
||||||
double p=blas.benchmark(M,N,K,BATCH);
|
double p=blas.benchmark<CComplex>(M,N,K,BATCH);
|
||||||
|
|
||||||
fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
|
fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
|
||||||
|
|
||||||
@ -299,9 +301,9 @@ public:
|
|||||||
for(int r=0;r<3;r++){
|
for(int r=0;r<3;r++){
|
||||||
int M=basis[b];
|
int M=basis[b];
|
||||||
int N=rhs[r];
|
int N=rhs[r];
|
||||||
int K=vol;
|
int K=blk;
|
||||||
int BATCH=vol;
|
int BATCH=vol;
|
||||||
double p=blas.benchmark(M,N,K,BATCH);
|
double p=blas.benchmark<CComplex>(M,N,K,BATCH);
|
||||||
|
|
||||||
fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
|
fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
|
||||||
std::cout<<GridLogMessage<<std::setprecision(3)
|
std::cout<<GridLogMessage<<std::setprecision(3)
|
||||||
@ -313,10 +315,10 @@ public:
|
|||||||
for(int b=0;b<3;b++){
|
for(int b=0;b<3;b++){
|
||||||
for(int r=0;r<3;r++){
|
for(int r=0;r<3;r++){
|
||||||
int M=rhs[r];
|
int M=rhs[r];
|
||||||
int N=vol;
|
int N=blk;
|
||||||
int K=basis[b];
|
int K=basis[b];
|
||||||
int BATCH=vol;
|
int BATCH=vol;
|
||||||
double p=blas.benchmark(M,N,K,BATCH);
|
double p=blas.benchmark<CComplex>(M,N,K,BATCH);
|
||||||
|
|
||||||
fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
|
fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
|
||||||
std::cout<<GridLogMessage<<std::setprecision(3)
|
std::cout<<GridLogMessage<<std::setprecision(3)
|
||||||
@ -867,6 +869,7 @@ int main (int argc, char ** argv)
|
|||||||
int do_memory=1;
|
int do_memory=1;
|
||||||
int do_comms =1;
|
int do_comms =1;
|
||||||
int do_blas =1;
|
int do_blas =1;
|
||||||
|
int do_dslash=1;
|
||||||
|
|
||||||
int sel=4;
|
int sel=4;
|
||||||
std::vector<int> L_list({8,12,16,24,32});
|
std::vector<int> L_list({8,12,16,24,32});
|
||||||
@ -877,6 +880,7 @@ int main (int argc, char ** argv)
|
|||||||
std::vector<double> staggered;
|
std::vector<double> staggered;
|
||||||
|
|
||||||
int Ls=1;
|
int Ls=1;
|
||||||
|
if (do_dslash){
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << " Clover dslash 4D vectorised (temporarily Wilson)" <<std::endl;
|
std::cout<<GridLogMessage << " Clover dslash 4D vectorised (temporarily Wilson)" <<std::endl;
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
@ -901,6 +905,7 @@ int main (int argc, char ** argv)
|
|||||||
staggered.push_back(result);
|
staggered.push_back(result);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
|
std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
@ -909,8 +914,33 @@ int main (int argc, char ** argv)
|
|||||||
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<std::endl;
|
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<std::endl;
|
||||||
}
|
}
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
int NN=NN_global;
|
int NN=NN_global;
|
||||||
|
if(do_dslash){
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered (GF/s per node)" <<std::endl;
|
||||||
|
fprintf(FP,"Per node summary table\n");
|
||||||
|
fprintf(FP,"\n");
|
||||||
|
fprintf(FP,"L , Wilson, DWF4, Staggered, GF/s per node\n");
|
||||||
|
fprintf(FP,"\n");
|
||||||
|
for(int l=0;l<L_list.size();l++){
|
||||||
|
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<std::endl;
|
||||||
|
fprintf(FP,"%d , %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.);
|
||||||
|
}
|
||||||
|
fprintf(FP,"\n");
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << " Comparison point result: " << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl;
|
||||||
|
std::cout<<std::setprecision(3);
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
if ( do_memory ) {
|
if ( do_memory ) {
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << " Memory benchmark " <<std::endl;
|
std::cout<<GridLogMessage << " Memory benchmark " <<std::endl;
|
||||||
@ -918,15 +948,6 @@ int main (int argc, char ** argv)
|
|||||||
Benchmark::Memory();
|
Benchmark::Memory();
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( do_blas ) {
|
|
||||||
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
|
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << " Batched BLAS benchmark " <<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
|
||||||
Benchmark::BLAS();
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( do_su4 ) {
|
if ( do_su4 ) {
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << " SU(4) benchmark " <<std::endl;
|
std::cout<<GridLogMessage << " SU(4) benchmark " <<std::endl;
|
||||||
@ -941,27 +962,13 @@ int main (int argc, char ** argv)
|
|||||||
Benchmark::Comms();
|
Benchmark::Comms();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if ( do_blas ) {
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
|
std::cout<<GridLogMessage << " Batched BLAS benchmark " <<std::endl;
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered (GF/s per node)" <<std::endl;
|
Benchmark::BLAS<ComplexD>();
|
||||||
fprintf(FP,"Per node summary table\n");
|
Benchmark::BLAS<ComplexF>();
|
||||||
fprintf(FP,"\n");
|
|
||||||
fprintf(FP,"L , Wilson, DWF4, Staggered, GF/s per node\n");
|
|
||||||
fprintf(FP,"\n");
|
|
||||||
for(int l=0;l<L_list.size();l++){
|
|
||||||
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<std::endl;
|
|
||||||
fprintf(FP,"%d , %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.);
|
|
||||||
}
|
}
|
||||||
fprintf(FP,"\n");
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << " Comparison point result: " << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl;
|
|
||||||
std::cout<<std::setprecision(3);
|
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
|
||||||
|
|
||||||
Grid_finalize();
|
Grid_finalize();
|
||||||
fclose(FP);
|
fclose(FP);
|
||||||
|
@ -1,16 +1,18 @@
|
|||||||
|
|
||||||
|
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
|
||||||
|
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel -fsycl -fno-exceptions -fsycl-targets=spir64_gen -Xs -device -Xs pvc "
|
||||||
../../configure \
|
../../configure \
|
||||||
--enable-simd=GPU \
|
--enable-simd=GPU \
|
||||||
--enable-gen-simd-width=64 \
|
--enable-gen-simd-width=64 \
|
||||||
--enable-comms=mpi-auto \
|
--enable-comms=mpi-auto \
|
||||||
|
--enable-debug \
|
||||||
--disable-gparity \
|
--disable-gparity \
|
||||||
--disable-fermion-reps \
|
--disable-fermion-reps \
|
||||||
|
--with-lime=$CLIME \
|
||||||
--enable-shm=nvlink \
|
--enable-shm=nvlink \
|
||||||
--enable-accelerator=sycl \
|
--enable-accelerator=sycl \
|
||||||
--enable-accelerator-aware-mpi=yes\
|
--enable-accelerator-aware-mpi=yes\
|
||||||
--enable-unified=no \
|
--enable-unified=no \
|
||||||
MPICXX=mpicxx \
|
MPICXX=mpicxx \
|
||||||
CXX=icpx \
|
CXX=icpx
|
||||||
LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl" \
|
|
||||||
CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel"
|
|
||||||
|
|
||||||
|
23
systems/Aurora/config-command-leak
Normal file
23
systems/Aurora/config-command-leak
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
source ~/spack/share/spack/setup-env.sh
|
||||||
|
spack load c-lime
|
||||||
|
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
|
||||||
|
export TCMALLOC=`spack find --paths gperftools | grep ^gperftools | awk '{print $2}' `
|
||||||
|
export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH
|
||||||
|
|
||||||
|
../../configure \
|
||||||
|
--enable-debug \
|
||||||
|
--enable-simd=GPU \
|
||||||
|
--enable-gen-simd-width=64 \
|
||||||
|
--enable-comms=mpi-auto \
|
||||||
|
--disable-gparity \
|
||||||
|
--disable-fermion-reps \
|
||||||
|
--with-lime=$CLIME \
|
||||||
|
--enable-shm=nvlink \
|
||||||
|
--enable-accelerator=sycl \
|
||||||
|
--enable-accelerator-aware-mpi=yes\
|
||||||
|
--enable-unified=no \
|
||||||
|
MPICXX=mpicxx \
|
||||||
|
CXX=icpx \
|
||||||
|
LDFLAGS="-fiopenmp -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -Xarch_host -fsanitize=leak -fsycl-device-code-split=per_kernel" \
|
||||||
|
CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel -Xarch_host -fsycl -fsanitize=leak "
|
||||||
|
|
22
systems/Aurora/config-command-sanitize
Normal file
22
systems/Aurora/config-command-sanitize
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
# -fsycl-targets=spir64_gen -Xs\" -device pvc \"
|
||||||
|
# -fsycl-targets=intel_gpu_pvc_vg,intel_gpu_pvc
|
||||||
|
# -fsycl-targets=intel_gpu_pvc
|
||||||
|
|
||||||
|
unset DEVICE
|
||||||
|
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -Xarch_host -fsanitize=address"
|
||||||
|
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel -fsycl -fno-exceptions -Xarch_host -fsanitize=address -fsycl-targets=spir64_gen -Xs -device -Xs pvc "
|
||||||
|
../../configure \
|
||||||
|
--enable-simd=GPU \
|
||||||
|
--enable-gen-simd-width=64 \
|
||||||
|
--enable-comms=mpi-auto \
|
||||||
|
--enable-debug \
|
||||||
|
--disable-gparity \
|
||||||
|
--disable-fermion-reps \
|
||||||
|
--with-lime=$CLIME \
|
||||||
|
--enable-shm=nvlink \
|
||||||
|
--enable-accelerator=sycl \
|
||||||
|
--enable-accelerator-aware-mpi=yes\
|
||||||
|
--enable-unified=no \
|
||||||
|
MPICXX=mpicxx \
|
||||||
|
CXX=icpx
|
||||||
|
|
@ -1,14 +1,22 @@
|
|||||||
|
source ~/spack/share/spack/setup-env.sh
|
||||||
|
spack load c-lime
|
||||||
|
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
|
||||||
|
#spack load libefence
|
||||||
|
#export EFENCE=`spack find --paths libefence | grep ^libefence | awk '{print $2}' `
|
||||||
|
#export LD_LIBRARY_PATH=${EFENCE}/lib:$LD_LIBRARY_PATH
|
||||||
|
#spack load gperftools
|
||||||
|
export TCMALLOC=/home/paboyle/gperftools/install
|
||||||
|
export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH
|
||||||
|
export INTELGT_AUTO_ATTACH_DISABLE=1
|
||||||
|
|
||||||
#export ONEAPI_DEVICE_SELECTOR=level_zero:0.0
|
#export ONEAPI_DEVICE_SELECTOR=level_zero:0.0
|
||||||
|
#module load oneapi/release/2023.12.15.001
|
||||||
module load oneapi/release/2023.12.15.001
|
|
||||||
|
|
||||||
#module use /soft/modulefiles
|
#module use /soft/modulefiles
|
||||||
#module load intel_compute_runtime/release/agama-devel-682.22
|
#module load intel_compute_runtime/release/agama-devel-682.22
|
||||||
|
|
||||||
export FI_CXI_DEFAULT_CQ_SIZE=131072
|
#export FI_CXI_DEFAULT_CQ_SIZE=131072
|
||||||
export FI_CXI_CQ_FILL_PERCENT=20
|
#export FI_CXI_CQ_FILL_PERCENT=20
|
||||||
|
#export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
|
||||||
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
|
|
||||||
#export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode"
|
#export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode"
|
||||||
|
|
||||||
#
|
#
|
||||||
@ -16,13 +24,17 @@ export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
|
|||||||
# -ftarget-register-alloc-mode=pvc:small
|
# -ftarget-register-alloc-mode=pvc:small
|
||||||
# -ftarget-register-alloc-mode=pvc:large
|
# -ftarget-register-alloc-mode=pvc:large
|
||||||
# -ftarget-register-alloc-mode=pvc:auto
|
# -ftarget-register-alloc-mode=pvc:auto
|
||||||
#
|
#export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
|
||||||
|
|
||||||
export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
|
export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
|
||||||
export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
|
export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
|
||||||
export http_proxy=http://proxy.alcf.anl.gov:3128
|
export http_proxy=http://proxy.alcf.anl.gov:3128
|
||||||
export https_proxy=http://proxy.alcf.anl.gov:3128
|
export https_proxy=http://proxy.alcf.anl.gov:3128
|
||||||
#export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
|
|
||||||
git config --global http.proxy http://proxy.alcf.anl.gov:3128
|
git config --global http.proxy http://proxy.alcf.anl.gov:3128
|
||||||
|
|
||||||
|
#source ~/spack/share/spack/setup-env.sh
|
||||||
|
#spack load gperftools
|
||||||
|
#export TCMALLOC=`spack find --paths gperftools | grep ^gperftools | awk '{print $2}' `
|
||||||
|
#export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH
|
||||||
|
|
||||||
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
|
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
|
||||||
|
@ -3,7 +3,7 @@ spack load c-lime
|
|||||||
module load emacs
|
module load emacs
|
||||||
module load PrgEnv-gnu
|
module load PrgEnv-gnu
|
||||||
module load rocm
|
module load rocm
|
||||||
module load cray-mpich/8.1.23
|
module load cray-mpich
|
||||||
module load gmp
|
module load gmp
|
||||||
module load cray-fftw
|
module load cray-fftw
|
||||||
module load craype-accel-amd-gfx90a
|
module load craype-accel-amd-gfx90a
|
||||||
|
@ -142,7 +142,9 @@ int main (int argc, char ** argv)
|
|||||||
std:: cout << " CG site flops = "<< CGsiteflops <<std::endl;
|
std:: cout << " CG site flops = "<< CGsiteflops <<std::endl;
|
||||||
int iters;
|
int iters;
|
||||||
|
|
||||||
|
time_t now;
|
||||||
time_t start = time(NULL);
|
time_t start = time(NULL);
|
||||||
|
UGrid->Broadcast(0,(void *)&start,sizeof(start));
|
||||||
|
|
||||||
FlightRecorder::ContinueOnFail = 0;
|
FlightRecorder::ContinueOnFail = 0;
|
||||||
FlightRecorder::PrintEntireLog = 0;
|
FlightRecorder::PrintEntireLog = 0;
|
||||||
@ -176,7 +178,8 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
std::cout << " FlightRecorder is OK! "<<std::endl;
|
std::cout << " FlightRecorder is OK! "<<std::endl;
|
||||||
iter ++;
|
iter ++;
|
||||||
} while (time(NULL) < (start + nsecs/10) );
|
now = time(NULL); UGrid->Broadcast(0,(void *)&now,sizeof(now));
|
||||||
|
} while (now < (start + nsecs/10) );
|
||||||
|
|
||||||
std::cout << GridLogMessage << "::::::::::::: Starting double precision CG" << std::endl;
|
std::cout << GridLogMessage << "::::::::::::: Starting double precision CG" << std::endl;
|
||||||
ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
|
ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
|
||||||
@ -201,8 +204,9 @@ int main (int argc, char ** argv)
|
|||||||
std::cout << " DoublePrecision error count "<< FlightRecorder::ErrorCount()<<std::endl;
|
std::cout << " DoublePrecision error count "<< FlightRecorder::ErrorCount()<<std::endl;
|
||||||
assert(FlightRecorder::ErrorCount()==0);
|
assert(FlightRecorder::ErrorCount()==0);
|
||||||
std::cout << " FlightRecorder is OK! "<<std::endl;
|
std::cout << " FlightRecorder is OK! "<<std::endl;
|
||||||
|
now = time(NULL); UGrid->Broadcast(0,(void *)&now,sizeof(now));
|
||||||
i++;
|
i++;
|
||||||
} while (time(NULL) < (start + nsecs) );
|
} while (now < (start + nsecs) );
|
||||||
|
|
||||||
LatticeFermionD diff_o(FrbGrid);
|
LatticeFermionD diff_o(FrbGrid);
|
||||||
RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2);
|
RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2);
|
||||||
|
118
tests/debug/Test_8888.cc
Normal file
118
tests/debug/Test_8888.cc
Normal file
@ -0,0 +1,118 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./tests/Test_general_coarse_hdcg.cc
|
||||||
|
|
||||||
|
Copyright (C) 2023
|
||||||
|
|
||||||
|
Author: Peter Boyle <pboyle@bnl.gov>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#include <Grid/Grid.h>
|
||||||
|
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
|
||||||
|
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h>
|
||||||
|
#include <Grid/algorithms/iterative/AdefMrhs.h>
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
using namespace Grid;
|
||||||
|
|
||||||
|
int main (int argc, char ** argv)
|
||||||
|
{
|
||||||
|
Grid_init(&argc,&argv);
|
||||||
|
|
||||||
|
const int Ls=8;
|
||||||
|
const int nbasis = 40;
|
||||||
|
const int cb = 0 ;
|
||||||
|
RealD mass=0.01;
|
||||||
|
RealD M5=1.8;
|
||||||
|
RealD b=1.0;
|
||||||
|
RealD c=0.0;
|
||||||
|
|
||||||
|
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
|
||||||
|
GridDefaultSimd(Nd,vComplex::Nsimd()),
|
||||||
|
GridDefaultMpi());
|
||||||
|
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||||
|
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||||
|
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
||||||
|
|
||||||
|
///////////////////////// RNGs /////////////////////////////////
|
||||||
|
std::vector<int> seeds4({1,2,3,4});
|
||||||
|
std::vector<int> seeds5({5,6,7,8});
|
||||||
|
std::vector<int> cseeds({5,6,7,8});
|
||||||
|
|
||||||
|
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
|
||||||
|
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
|
||||||
|
|
||||||
|
///////////////////////// Configuration /////////////////////////////////
|
||||||
|
LatticeGaugeField Umu(UGrid);
|
||||||
|
|
||||||
|
FieldMetaData header;
|
||||||
|
std::string file("ckpoint_EODWF_lat.125");
|
||||||
|
NerscIO::readConfiguration(Umu,header,file);
|
||||||
|
|
||||||
|
//////////////////////// Fermion action //////////////////////////////////
|
||||||
|
MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c);
|
||||||
|
|
||||||
|
MdagMLinearOperator<MobiusFermionD, LatticeFermion> HermOp(Ddwf);
|
||||||
|
|
||||||
|
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
std::cout << " Fine Power method "<<std::endl;
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
|
||||||
|
LatticeFermionD pm_src(FGrid);
|
||||||
|
pm_src = ComplexD(1.0);
|
||||||
|
PowerMethod<LatticeFermionD> fPM;
|
||||||
|
fPM(HermOp,pm_src);
|
||||||
|
|
||||||
|
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
std::cout << " Fine Lanczos (poly, low) "<<std::endl;
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
|
||||||
|
int Nk=80;
|
||||||
|
int Nm=Nk*3;
|
||||||
|
int Nstop=8;
|
||||||
|
int Nconv_test_interval=1;
|
||||||
|
|
||||||
|
// Chebyshev<LatticeFermionD> IRLChebyLo(0.2,64.0,201); // 1 iter
|
||||||
|
Chebyshev<LatticeFermionD> IRLChebyLo(0.0,55.0,101); // 1 iter
|
||||||
|
FunctionHermOp<LatticeFermionD> PolyOp(IRLChebyLo,HermOp);
|
||||||
|
PlainHermOp<LatticeFermionD> Op(HermOp);
|
||||||
|
|
||||||
|
ImplicitlyRestartedLanczos IRL(PolyOp,
|
||||||
|
Op,
|
||||||
|
Nk, // sought vecs
|
||||||
|
Nk, // sought vecs
|
||||||
|
Nm, // spare vecs
|
||||||
|
1.0e-8,
|
||||||
|
10 // Max iterations
|
||||||
|
);
|
||||||
|
|
||||||
|
int Nconv;
|
||||||
|
std::vector<RealD> eval(Nm);
|
||||||
|
std::vector<LatticeFermionD> evec(Nm,FGrid);
|
||||||
|
LatticeFermionD irl_src(FGrid);
|
||||||
|
|
||||||
|
IRL.calc(eval,evec,irl_src,Nconv);
|
||||||
|
|
||||||
|
Grid_finalize();
|
||||||
|
return 0;
|
||||||
|
}
|
@ -392,9 +392,27 @@ void TestCGschur(What & Ddwf,
|
|||||||
GridParallelRNG *RNG5)
|
GridParallelRNG *RNG5)
|
||||||
{
|
{
|
||||||
LatticeFermion src (FGrid); random(*RNG5,src);
|
LatticeFermion src (FGrid); random(*RNG5,src);
|
||||||
LatticeFermion result(FGrid); result=Zero();
|
LatticeFermion result1(FGrid); result1=Zero();
|
||||||
|
LatticeFermion result2(FGrid); result2=Zero();
|
||||||
|
LatticeFermion result3(FGrid); result3=Zero();
|
||||||
|
|
||||||
ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
|
ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
|
||||||
SchurRedBlackDiagMooeeSolve<LatticeFermion> SchurSolver(CG);
|
SchurRedBlackDiagMooeeSolve<LatticeFermion> SchurSolver(CG);
|
||||||
SchurSolver(Ddwf,src,result);
|
SchurSolver(Ddwf,src,result1);
|
||||||
|
|
||||||
|
SchurRedBlackDiagOneSolve<LatticeFermion> SchurSolverSymm1(CG);
|
||||||
|
SchurSolverSymm1(Ddwf,src,result2);
|
||||||
|
|
||||||
|
SchurRedBlackDiagTwoSolve<LatticeFermion> SchurSolverSymm2(CG);
|
||||||
|
SchurSolverSymm2(Ddwf,src,result3);
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << " Standard " <<norm2(result1)<<std::endl;
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << " Symm1 " <<norm2(result2)<<std::endl;
|
||||||
|
result2=result2-result1;
|
||||||
|
std::cout << GridLogMessage << " diff " <<norm2(result2) <<std::endl;
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << " Symm2 " <<norm2(result3)<<std::endl;
|
||||||
|
result3=result3-result1;
|
||||||
|
std::cout << GridLogMessage << " diff " <<norm2(result3) <<std::endl;
|
||||||
}
|
}
|
||||||
|
@ -244,7 +244,7 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi);
|
GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi);
|
||||||
|
|
||||||
|
#if 0
|
||||||
MultiGeneralCoarsenedMatrix mrhs(LittleDiracOp,CoarseMrhs);
|
MultiGeneralCoarsenedMatrix mrhs(LittleDiracOp,CoarseMrhs);
|
||||||
typedef decltype(mrhs) MultiGeneralCoarsenedMatrix_t;
|
typedef decltype(mrhs) MultiGeneralCoarsenedMatrix_t;
|
||||||
|
|
||||||
@ -308,6 +308,7 @@ int main (int argc, char ** argv)
|
|||||||
mrhsCG(MrhsCoarseOp,rh_src,rh_res);
|
mrhsCG(MrhsCoarseOp,rh_src,rh_res);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
std::cout<<GridLogMessage<<std::endl;
|
std::cout<<GridLogMessage<<std::endl;
|
||||||
std::cout<<GridLogMessage<<std::endl;
|
std::cout<<GridLogMessage<<std::endl;
|
||||||
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
|
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
|
||||||
|
@ -145,7 +145,7 @@ int main (int argc, char ** argv)
|
|||||||
Grid_init(&argc,&argv);
|
Grid_init(&argc,&argv);
|
||||||
|
|
||||||
const int Ls=24;
|
const int Ls=24;
|
||||||
const int nbasis = 60;
|
const int nbasis = 62;
|
||||||
const int cb = 0 ;
|
const int cb = 0 ;
|
||||||
RealD mass=0.00078;
|
RealD mass=0.00078;
|
||||||
RealD M5=1.8;
|
RealD M5=1.8;
|
||||||
@ -160,7 +160,7 @@ int main (int argc, char ** argv)
|
|||||||
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
||||||
|
|
||||||
// Construct a coarsened grid with 4^4 cell
|
// Construct a coarsened grid with 4^4 cell
|
||||||
Coordinate Block({4,4,4,4});
|
Coordinate Block({4,4,6,4});
|
||||||
Coordinate clatt = GridDefaultLatt();
|
Coordinate clatt = GridDefaultLatt();
|
||||||
for(int d=0;d<clatt.size();d++){
|
for(int d=0;d<clatt.size();d++){
|
||||||
clatt[d] = clatt[d]/Block[d];
|
clatt[d] = clatt[d]/Block[d];
|
||||||
|
@ -160,7 +160,8 @@ int main (int argc, char ** argv)
|
|||||||
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
||||||
|
|
||||||
// Construct a coarsened grid with 4^4 cell
|
// Construct a coarsened grid with 4^4 cell
|
||||||
Coordinate Block({4,4,6,6});
|
// Coordinate Block({4,4,6,4});
|
||||||
|
Coordinate Block({4,4,4,4});
|
||||||
Coordinate clatt = GridDefaultLatt();
|
Coordinate clatt = GridDefaultLatt();
|
||||||
for(int d=0;d<clatt.size();d++){
|
for(int d=0;d<clatt.size();d++){
|
||||||
clatt[d] = clatt[d]/Block[d];
|
clatt[d] = clatt[d]/Block[d];
|
||||||
@ -217,7 +218,7 @@ int main (int argc, char ** argv)
|
|||||||
std::string evec_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/evecs.scidac");
|
std::string evec_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/evecs.scidac");
|
||||||
std::string eval_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/eval.xml");
|
std::string eval_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/eval.xml");
|
||||||
bool load_agg=true;
|
bool load_agg=true;
|
||||||
bool load_refine=false;
|
bool load_refine=true;
|
||||||
bool load_mat=false;
|
bool load_mat=false;
|
||||||
bool load_evec=false;
|
bool load_evec=false;
|
||||||
|
|
||||||
@ -276,17 +277,25 @@ int main (int argc, char ** argv)
|
|||||||
std::cout << "**************************************"<<std::endl;
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
|
||||||
typedef HermitianLinearOperator<MultiGeneralCoarsenedMatrix_t,CoarseVector> MrhsHermMatrix;
|
typedef HermitianLinearOperator<MultiGeneralCoarsenedMatrix_t,CoarseVector> MrhsHermMatrix;
|
||||||
Chebyshev<CoarseVector> IRLCheby(0.0012,42.0,301); // 1 iter
|
// Chebyshev<CoarseVector> IRLCheby(0.0012,42.0,301); // 4.4.6.4
|
||||||
|
// Chebyshev<CoarseVector> IRLCheby(0.0012,42.0,501); // for 4.4.4.4 blocking 350 evs
|
||||||
|
// Chebyshev<CoarseVector> IRLCheby(0.0014,42.0,501); // for 4.4.4.4 blocking 700 evs
|
||||||
|
// Chebyshev<CoarseVector> IRLCheby(0.002,42.0,501); // for 4.4.4.4 blocking 1226 evs
|
||||||
|
// Chebyshev<CoarseVector> IRLCheby(0.0025,42.0,501); // for 4.4.4.4 blocking 1059 evs
|
||||||
|
// 3e-4,2);
|
||||||
|
Chebyshev<CoarseVector> IRLCheby(0.0018,42.0,301); // for 4.4.4.4 blocking // 790 evs
|
||||||
|
|
||||||
MrhsHermMatrix MrhsCoarseOp (mrhs);
|
MrhsHermMatrix MrhsCoarseOp (mrhs);
|
||||||
|
|
||||||
CoarseVector pm_src(CoarseMrhs);
|
CoarseVector pm_src(CoarseMrhs);
|
||||||
pm_src = ComplexD(1.0);
|
pm_src = ComplexD(1.0);
|
||||||
PowerMethod<CoarseVector> cPM; cPM(MrhsCoarseOp,pm_src);
|
PowerMethod<CoarseVector> cPM; cPM(MrhsCoarseOp,pm_src);
|
||||||
|
|
||||||
int Nk=nrhs*30;
|
// int Nk=nrhs*30; // 4.4.6.4
|
||||||
// int Nk=nrhs*80;
|
// int Nk=nrhs*80;
|
||||||
int Nm=Nk*4;
|
int Nk=nrhs*60; // 720
|
||||||
int Nstop=Nk;
|
int Nm=Nk*4; // 2880 ; generally finishes at 1440
|
||||||
|
int Nstop=512;
|
||||||
int Nconv_test_interval=1;
|
int Nconv_test_interval=1;
|
||||||
|
|
||||||
ImplicitlyRestartedBlockLanczosCoarse<CoarseVector> IRL(MrhsCoarseOp,
|
ImplicitlyRestartedBlockLanczosCoarse<CoarseVector> IRL(MrhsCoarseOp,
|
||||||
@ -299,7 +308,7 @@ int main (int argc, char ** argv)
|
|||||||
nrhs,
|
nrhs,
|
||||||
Nk,
|
Nk,
|
||||||
Nm,
|
Nm,
|
||||||
1e-4,20);
|
3e-4,2);
|
||||||
|
|
||||||
std::vector<RealD> eval(Nm);
|
std::vector<RealD> eval(Nm);
|
||||||
std::vector<CoarseVector> evec(Nm,Coarse5d);
|
std::vector<CoarseVector> evec(Nm,Coarse5d);
|
||||||
@ -331,7 +340,7 @@ int main (int argc, char ** argv)
|
|||||||
// Extra HDCG parameters
|
// Extra HDCG parameters
|
||||||
//////////////////////////
|
//////////////////////////
|
||||||
int maxit=3000;
|
int maxit=3000;
|
||||||
ConjugateGradient<CoarseVector> CG(5.0e-2,maxit,false);
|
ConjugateGradient<CoarseVector> CG(7.5e-2,maxit,false);
|
||||||
RealD lo=2.0;
|
RealD lo=2.0;
|
||||||
int ord = 7;
|
int ord = 7;
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user