mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-14 13:57:07 +01:00
Compare commits
49 Commits
feature/de
...
199818bd6c
Author | SHA1 | Date | |
---|---|---|---|
199818bd6c | |||
fe66c7ca30 | |||
e9177e4af3 | |||
d15a6c5933 | |||
25ab9325e7 | |||
19f9378b98 | |||
9ffd1ed4ce | |||
3d014864e2 | |||
1d22841811 | |||
a1cdda833f | |||
ad6db92690 | |||
e8ff9d8e50 | |||
795769c636 | |||
267a39d943 | |||
3624bd3d22 | |||
bc12dbbb38 | |||
eb8a008a8f | |||
c4d9aa1a21 | |||
6ae809ed40 | |||
311e2aab3f | |||
438dfbdb83 | |||
b2ce760cf4 | |||
ba9bbe0221 | |||
4c3dd82d84 | |||
44e911b5b7 | |||
a7a16df9d0 | |||
382e0abefd | |||
6fdefe5b90 | |||
4788dd8e2e | |||
1cc5f221f3 | |||
93251bfba0 | |||
18b79508b8 | |||
4de5ed1613 | |||
0baaddbe98 | |||
b50fb34e71 | |||
de84d730ff | |||
c74d11e3d7 | |||
84cab5e6e7 | |||
c4fc972fec | |||
3f3661a86f | |||
5a4f9bf2e3 | |||
f617468e04 | |||
ee4046fe92 | |||
2a9cfeb9ea | |||
1147b8ea40 | |||
3f9119b39d | |||
35e8225abd | |||
bdbfbb7a14 | |||
f7d4be8d96 |
@ -191,7 +191,7 @@ public:
|
|||||||
|
|
||||||
Lattice<sobj> pgbuf(&pencil_g);
|
Lattice<sobj> pgbuf(&pencil_g);
|
||||||
autoView(pgbuf_v , pgbuf, CpuWrite);
|
autoView(pgbuf_v , pgbuf, CpuWrite);
|
||||||
std::cout << "CPU view" << std::endl;
|
//std::cout << "CPU view" << std::endl;
|
||||||
|
|
||||||
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
|
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
|
||||||
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
|
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
|
||||||
@ -215,7 +215,7 @@ public:
|
|||||||
else if ( sign == forward ) div = 1.0;
|
else if ( sign == forward ) div = 1.0;
|
||||||
else assert(0);
|
else assert(0);
|
||||||
|
|
||||||
std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
|
//std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
|
||||||
FFTW_plan p;
|
FFTW_plan p;
|
||||||
{
|
{
|
||||||
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
|
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
|
||||||
@ -229,7 +229,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Barrel shift and collect global pencil
|
// Barrel shift and collect global pencil
|
||||||
std::cout << GridLogPerformance<<"Making pencil" << std::endl;
|
//std::cout << GridLogPerformance<<"Making pencil" << std::endl;
|
||||||
Coordinate lcoor(Nd), gcoor(Nd);
|
Coordinate lcoor(Nd), gcoor(Nd);
|
||||||
result = source;
|
result = source;
|
||||||
int pc = processor_coor[dim];
|
int pc = processor_coor[dim];
|
||||||
@ -251,7 +251,7 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
|
//std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
|
||||||
// Loop over orthog coords
|
// Loop over orthog coords
|
||||||
int NN=pencil_g.lSites();
|
int NN=pencil_g.lSites();
|
||||||
GridStopWatch timer;
|
GridStopWatch timer;
|
||||||
@ -274,7 +274,7 @@ public:
|
|||||||
usec += timer.useconds();
|
usec += timer.useconds();
|
||||||
flops+= flops_call*NN;
|
flops+= flops_call*NN;
|
||||||
|
|
||||||
std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
|
//std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
|
||||||
// writing out result
|
// writing out result
|
||||||
{
|
{
|
||||||
autoView(pgbuf_v,pgbuf,CpuRead);
|
autoView(pgbuf_v,pgbuf,CpuRead);
|
||||||
@ -291,7 +291,7 @@ public:
|
|||||||
}
|
}
|
||||||
result = result*div;
|
result = result*div;
|
||||||
|
|
||||||
std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
|
//std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
|
||||||
// destroying plan
|
// destroying plan
|
||||||
FFTW<scalar>::fftw_destroy_plan(p);
|
FFTW<scalar>::fftw_destroy_plan(p);
|
||||||
#endif
|
#endif
|
||||||
|
@ -208,8 +208,8 @@ public:
|
|||||||
assert(Bkn.size()==batchCount);
|
assert(Bkn.size()==batchCount);
|
||||||
assert(Cmn.size()==batchCount);
|
assert(Cmn.size()==batchCount);
|
||||||
|
|
||||||
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
|
//assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
|
||||||
assert(OpB!=GridBLAS_OP_T);
|
//assert(OpB!=GridBLAS_OP_T);
|
||||||
|
|
||||||
int lda = m; // m x k column major
|
int lda = m; // m x k column major
|
||||||
int ldb = k; // k x n column major
|
int ldb = k; // k x n column major
|
||||||
@ -367,28 +367,67 @@ public:
|
|||||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
|
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
|
||||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
|
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
|
||||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||||
|
if (std::abs(beta) != 0.0)
|
||||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||||
|
else
|
||||||
|
eCmn = alpha * eAmk * eBkn ;
|
||||||
});
|
});
|
||||||
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
|
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
|
||||||
thread_for (p, batchCount, {
|
thread_for (p, batchCount, {
|
||||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
|
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
|
||||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
|
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
|
||||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||||
|
if (std::abs(beta) != 0.0)
|
||||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
|
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
|
||||||
|
else
|
||||||
|
eCmn = alpha * eAmk.adjoint() * eBkn ;
|
||||||
|
});
|
||||||
|
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
||||||
|
thread_for (p, batchCount, {
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||||
|
if (std::abs(beta) != 0.0)
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||||
|
else
|
||||||
|
eCmn = alpha * eAmk.transpose() * eBkn ;
|
||||||
});
|
});
|
||||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
|
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
|
||||||
thread_for (p, batchCount, {
|
thread_for (p, batchCount, {
|
||||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
|
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
|
||||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
|
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
|
||||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||||
|
if (std::abs(beta) != 0.0)
|
||||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
|
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
|
||||||
|
else
|
||||||
|
eCmn = alpha * eAmk * eBkn.adjoint() ;
|
||||||
|
});
|
||||||
|
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
||||||
|
thread_for (p, batchCount, {
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||||
});
|
});
|
||||||
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
|
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
|
||||||
thread_for (p, batchCount, {
|
thread_for (p, batchCount, {
|
||||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
|
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
|
||||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
|
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
|
||||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||||
|
if (std::abs(beta) != 0.0)
|
||||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||||
|
else
|
||||||
|
eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||||
|
} );
|
||||||
|
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
||||||
|
thread_for (p, batchCount, {
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||||
|
if (std::abs(beta) != 0.0)
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||||
|
else
|
||||||
|
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||||
} );
|
} );
|
||||||
} else {
|
} else {
|
||||||
assert(0);
|
assert(0);
|
||||||
@ -414,8 +453,8 @@ public:
|
|||||||
RealD t2=usecond();
|
RealD t2=usecond();
|
||||||
int32_t batchCount = Amk.size();
|
int32_t batchCount = Amk.size();
|
||||||
|
|
||||||
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
|
//assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
|
||||||
assert(OpB!=GridBLAS_OP_T);
|
//assert(OpB!=GridBLAS_OP_T);
|
||||||
|
|
||||||
int lda = m; // m x k column major
|
int lda = m; // m x k column major
|
||||||
int ldb = k; // k x n column major
|
int ldb = k; // k x n column major
|
||||||
@ -514,28 +553,70 @@ public:
|
|||||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
|
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
|
||||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
|
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
|
||||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||||
|
if (std::abs(beta) != 0.0)
|
||||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||||
|
else
|
||||||
|
eCmn = alpha * eAmk * eBkn ;
|
||||||
});
|
});
|
||||||
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
|
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
|
||||||
thread_for (p, batchCount, {
|
thread_for (p, batchCount, {
|
||||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
|
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
|
||||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
|
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
|
||||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||||
|
if (std::abs(beta) != 0.0)
|
||||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
|
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
|
||||||
|
else
|
||||||
|
eCmn = alpha * eAmk.adjoint() * eBkn ;
|
||||||
|
});
|
||||||
|
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
||||||
|
thread_for (p, batchCount, {
|
||||||
|
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
|
||||||
|
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
|
||||||
|
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||||
|
if (std::abs(beta) != 0.0)
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||||
|
else
|
||||||
|
eCmn = alpha * eAmk.transpose() * eBkn ;
|
||||||
});
|
});
|
||||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
|
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
|
||||||
thread_for (p, batchCount, {
|
thread_for (p, batchCount, {
|
||||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
|
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
|
||||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
|
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
|
||||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||||
|
if (std::abs(beta) != 0.0)
|
||||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
|
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
|
||||||
|
else
|
||||||
|
eCmn = alpha * eAmk * eBkn.adjoint() ;
|
||||||
|
});
|
||||||
|
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
||||||
|
thread_for (p, batchCount, {
|
||||||
|
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||||
|
if (std::abs(beta) != 0.0)
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||||
|
else
|
||||||
|
eCmn = alpha * eAmk * eBkn.transpose() ;
|
||||||
});
|
});
|
||||||
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
|
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
|
||||||
thread_for (p, batchCount, {
|
thread_for (p, batchCount, {
|
||||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
|
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
|
||||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
|
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
|
||||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||||
|
if (std::abs(beta) != 0.0)
|
||||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||||
|
else
|
||||||
|
eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||||
|
} );
|
||||||
|
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
||||||
|
thread_for (p, batchCount, {
|
||||||
|
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
|
||||||
|
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||||
|
if (std::abs(beta) != 0.0)
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||||
|
else
|
||||||
|
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||||
} );
|
} );
|
||||||
} else {
|
} else {
|
||||||
assert(0);
|
assert(0);
|
||||||
@ -661,28 +742,40 @@ public:
|
|||||||
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
|
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
|
||||||
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
|
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
|
||||||
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||||
|
if (std::abs(beta) != 0.0)
|
||||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||||
|
else
|
||||||
|
eCmn = alpha * eAmk * eBkn ;
|
||||||
});
|
});
|
||||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
||||||
thread_for (p, batchCount, {
|
thread_for (p, batchCount, {
|
||||||
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
|
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
|
||||||
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
|
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
|
||||||
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||||
|
if (std::abs(beta) != 0.0)
|
||||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||||
|
else
|
||||||
|
eCmn = alpha * eAmk.transpose() * eBkn ;
|
||||||
});
|
});
|
||||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
||||||
thread_for (p, batchCount, {
|
thread_for (p, batchCount, {
|
||||||
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
|
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
|
||||||
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
|
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
|
||||||
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||||
|
if (std::abs(beta) != 0.0)
|
||||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||||
|
else
|
||||||
|
eCmn = alpha * eAmk * eBkn.transpose() ;
|
||||||
});
|
});
|
||||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
||||||
thread_for (p, batchCount, {
|
thread_for (p, batchCount, {
|
||||||
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
|
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
|
||||||
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
|
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
|
||||||
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||||
|
if (std::abs(beta) != 0.0)
|
||||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||||
|
else
|
||||||
|
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
assert(0);
|
assert(0);
|
||||||
@ -809,28 +902,40 @@ public:
|
|||||||
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
|
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
|
||||||
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
|
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
|
||||||
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||||
|
if (std::abs(beta) != 0.0)
|
||||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||||
|
else
|
||||||
|
eCmn = alpha * eAmk * eBkn ;
|
||||||
});
|
});
|
||||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
||||||
thread_for (p, batchCount, {
|
thread_for (p, batchCount, {
|
||||||
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
|
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
|
||||||
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
|
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
|
||||||
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||||
|
if (std::abs(beta) != 0.0)
|
||||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||||
|
else
|
||||||
|
eCmn = alpha * eAmk.transpose() * eBkn ;
|
||||||
});
|
});
|
||||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
||||||
thread_for (p, batchCount, {
|
thread_for (p, batchCount, {
|
||||||
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
|
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
|
||||||
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
|
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
|
||||||
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||||
|
if (std::abs(beta) != 0.0)
|
||||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||||
|
else
|
||||||
|
eCmn = alpha * eAmk * eBkn.transpose() ;
|
||||||
});
|
});
|
||||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
||||||
thread_for (p, batchCount, {
|
thread_for (p, batchCount, {
|
||||||
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
|
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
|
||||||
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
|
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
|
||||||
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||||
|
if (std::abs(beta) != 0.0)
|
||||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||||
|
else
|
||||||
|
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
assert(0);
|
assert(0);
|
||||||
|
@ -144,11 +144,11 @@ public:
|
|||||||
acceleratorCopyDeviceToDevice(&BLAS_Y[offset],&y_v[0],sizeof(scalar_object)*vol);
|
acceleratorCopyDeviceToDevice(&BLAS_Y[offset],&y_v[0],sizeof(scalar_object)*vol);
|
||||||
}
|
}
|
||||||
RealD t4 = usecond();
|
RealD t4 = usecond();
|
||||||
std::cout << "MulMatrix alloc took "<< t1-t0<<" us"<<std::endl;
|
std::cout <<GridLogPerformance << "MulMatrix alloc took "<< t1-t0<<" us"<<std::endl;
|
||||||
std::cout << "MulMatrix preamble took "<< t2-t1<<" us"<<std::endl;
|
std::cout <<GridLogPerformance<< "MulMatrix preamble took "<< t2-t1<<" us"<<std::endl;
|
||||||
std::cout << "MulMatrix blas took "<< t3-t2<<" us"<<std::endl;
|
std::cout <<GridLogPerformance<< "MulMatrix blas took "<< t3-t2<<" us"<<std::endl;
|
||||||
std::cout << "MulMatrix copy took "<< t4-t3<<" us"<<std::endl;
|
std::cout <<GridLogPerformance<< "MulMatrix copy took "<< t4-t3<<" us"<<std::endl;
|
||||||
std::cout << "MulMatrix total "<< t4-t0<<" us"<<std::endl;
|
std::cout <<GridLogPerformance<< "MulMatrix total "<< t4-t0<<" us"<<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y)
|
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y)
|
||||||
@ -242,16 +242,16 @@ public:
|
|||||||
RealD flops = 8.0*M*N*K;
|
RealD flops = 8.0*M*N*K;
|
||||||
flops = flops/(t4-t3)/1.e3;
|
flops = flops/(t4-t3)/1.e3;
|
||||||
bytes = bytes/(t4-t3)/1.e3;
|
bytes = bytes/(t4-t3)/1.e3;
|
||||||
std::cout << "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
|
std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
|
||||||
std::cout << "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
|
std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
|
||||||
std::cout << "InnerProductMatrix cp t2 "<< t2-t1<<" us"<<std::endl;
|
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us"<<std::endl;
|
||||||
std::cout << "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
|
std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
|
||||||
std::cout << "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
|
std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
|
||||||
std::cout << "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
|
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
|
||||||
std::cout << "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
|
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
|
||||||
std::cout << "InnerProductMatrix gsum t5 "<< t5-t4<<" us"<<std::endl;
|
std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t5 "<< t5-t4<<" us"<<std::endl;
|
||||||
std::cout << "InnerProductMatrix cp t6 "<< t6-t5<<" us"<<std::endl;
|
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t6 "<< t6-t5<<" us"<<std::endl;
|
||||||
std::cout << "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
|
std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
|
||||||
#else
|
#else
|
||||||
int nrhs;
|
int nrhs;
|
||||||
GridBase *grid;
|
GridBase *grid;
|
||||||
@ -358,17 +358,17 @@ public:
|
|||||||
flops = flops/(t4-t3)/1.e3;
|
flops = flops/(t4-t3)/1.e3;
|
||||||
bytes = bytes/(t4-t3)/1.e3;
|
bytes = bytes/(t4-t3)/1.e3;
|
||||||
xybytes = 4*xybytes/(t2-t1)/1.e3;
|
xybytes = 4*xybytes/(t2-t1)/1.e3;
|
||||||
std::cout << "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
|
std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
|
||||||
std::cout << "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
|
std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
|
||||||
std::cout << "InnerProductMatrix cp t2 "<< t2-t1<<" us "<<xybytes<<" GB/s"<<std::endl;
|
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us "<<xybytes<<" GB/s"<<std::endl;
|
||||||
std::cout << "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
|
std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
|
||||||
std::cout << "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
|
std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
|
||||||
std::cout << "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
|
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
|
||||||
std::cout << "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
|
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
|
||||||
std::cout << "InnerProductMatrix cp t5 "<< t5-t4<<" us"<<std::endl;
|
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t5 "<< t5-t4<<" us"<<std::endl;
|
||||||
std::cout << "InnerProductMatrix lsum t6l "<< t6l-t5<<" us"<<std::endl;
|
std::cout <<GridLogPerformance<< "InnerProductMatrix lsum t6l "<< t6l-t5<<" us"<<std::endl;
|
||||||
std::cout << "InnerProductMatrix gsum t6 "<< t6-t6l<<" us"<<std::endl;
|
std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t6 "<< t6-t6l<<" us"<<std::endl;
|
||||||
std::cout << "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
|
std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -63,7 +63,12 @@ class TwoLevelCGmrhs
|
|||||||
GridStopWatch SmoothTimer;
|
GridStopWatch SmoothTimer;
|
||||||
GridStopWatch InsertTimer;
|
GridStopWatch InsertTimer;
|
||||||
|
|
||||||
|
/*
|
||||||
|
Field rrr;
|
||||||
|
Field sss;
|
||||||
|
Field qqq;
|
||||||
|
Field zzz;
|
||||||
|
*/
|
||||||
// more most opertor functions
|
// more most opertor functions
|
||||||
TwoLevelCGmrhs(RealD tol,
|
TwoLevelCGmrhs(RealD tol,
|
||||||
Integer maxit,
|
Integer maxit,
|
||||||
@ -74,6 +79,12 @@ class TwoLevelCGmrhs
|
|||||||
MaxIterations(maxit),
|
MaxIterations(maxit),
|
||||||
_FineLinop(FineLinop),
|
_FineLinop(FineLinop),
|
||||||
_Smoother(Smoother)
|
_Smoother(Smoother)
|
||||||
|
/*
|
||||||
|
rrr(fine),
|
||||||
|
sss(fine),
|
||||||
|
qqq(fine),
|
||||||
|
zzz(fine)
|
||||||
|
*/
|
||||||
{
|
{
|
||||||
grid = fine;
|
grid = fine;
|
||||||
};
|
};
|
||||||
@ -81,8 +92,8 @@ class TwoLevelCGmrhs
|
|||||||
// Vector case
|
// Vector case
|
||||||
virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
|
virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
|
||||||
{
|
{
|
||||||
SolveSingleSystem(src,x);
|
// SolveSingleSystem(src,x);
|
||||||
// SolvePrecBlockCG(src,x);
|
SolvePrecBlockCG(src,x);
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
@ -657,6 +668,8 @@ public:
|
|||||||
CoarseField PleftProjMrhs(this->coarsegridmrhs);
|
CoarseField PleftProjMrhs(this->coarsegridmrhs);
|
||||||
CoarseField PleftMss_projMrhs(this->coarsegridmrhs);
|
CoarseField PleftMss_projMrhs(this->coarsegridmrhs);
|
||||||
|
|
||||||
|
// this->rrr=in[0];
|
||||||
|
|
||||||
#undef SMOOTHER_BLOCK_SOLVE
|
#undef SMOOTHER_BLOCK_SOLVE
|
||||||
#if SMOOTHER_BLOCK_SOLVE
|
#if SMOOTHER_BLOCK_SOLVE
|
||||||
this->SmoothTimer.Start();
|
this->SmoothTimer.Start();
|
||||||
@ -669,6 +682,7 @@ public:
|
|||||||
this->SmoothTimer.Stop();
|
this->SmoothTimer.Stop();
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
// this->sss=Min[0];
|
||||||
|
|
||||||
for(int rhs=0;rhs<nrhs;rhs++) {
|
for(int rhs=0;rhs<nrhs;rhs++) {
|
||||||
|
|
||||||
@ -705,9 +719,11 @@ public:
|
|||||||
this->_Projector.blockPromote(tmp,PleftMss_proj);// tmp= Q[in - A Min]
|
this->_Projector.blockPromote(tmp,PleftMss_proj);// tmp= Q[in - A Min]
|
||||||
this->PromoteTimer.Stop();
|
this->PromoteTimer.Stop();
|
||||||
this->FineTimer.Start();
|
this->FineTimer.Start();
|
||||||
|
// this->qqq=tmp[0];
|
||||||
for(int rhs=0;rhs<nrhs;rhs++) {
|
for(int rhs=0;rhs<nrhs;rhs++) {
|
||||||
axpy(out[rhs],1.0,Min[rhs],tmp[rhs]); // Min+tmp
|
axpy(out[rhs],1.0,Min[rhs],tmp[rhs]); // Min+tmp
|
||||||
}
|
}
|
||||||
|
// this->zzz=out[0];
|
||||||
this->FineTimer.Stop();
|
this->FineTimer.Stop();
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -74,7 +74,7 @@ public:
|
|||||||
|
|
||||||
void operator() (const Field &src, Field &psi){
|
void operator() (const Field &src, Field &psi){
|
||||||
|
|
||||||
psi=Zero();
|
// psi=Zero();
|
||||||
RealD cp, ssq,rsq;
|
RealD cp, ssq,rsq;
|
||||||
ssq=norm2(src);
|
ssq=norm2(src);
|
||||||
rsq=Tolerance*Tolerance*ssq;
|
rsq=Tolerance*Tolerance*ssq;
|
||||||
|
@ -30,6 +30,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
|
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
inline RealD AggregatePowerLaw(RealD x)
|
inline RealD AggregatePowerLaw(RealD x)
|
||||||
@ -124,6 +126,53 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
virtual void CreateSubspaceGCR(GridParallelRNG &RNG,LinearOperatorBase<FineField> &DiracOp,int nn=nbasis)
|
||||||
|
{
|
||||||
|
RealD scale;
|
||||||
|
|
||||||
|
TrivialPrecon<FineField> simple_fine;
|
||||||
|
PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,30,DiracOp,simple_fine,12,12);
|
||||||
|
FineField noise(FineGrid);
|
||||||
|
FineField src(FineGrid);
|
||||||
|
FineField guess(FineGrid);
|
||||||
|
FineField Mn(FineGrid);
|
||||||
|
|
||||||
|
for(int b=0;b<nn;b++){
|
||||||
|
|
||||||
|
subspace[b] = Zero();
|
||||||
|
gaussian(RNG,noise);
|
||||||
|
scale = std::pow(norm2(noise),-0.5);
|
||||||
|
noise=noise*scale;
|
||||||
|
|
||||||
|
DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;
|
||||||
|
|
||||||
|
for(int i=0;i<3;i++){
|
||||||
|
// void operator() (const Field &src, Field &psi){
|
||||||
|
#if 1
|
||||||
|
std::cout << GridLogMessage << " inverting on noise "<<std::endl;
|
||||||
|
src = noise;
|
||||||
|
guess=Zero();
|
||||||
|
GCR(src,guess);
|
||||||
|
subspace[b] = guess;
|
||||||
|
#else
|
||||||
|
std::cout << GridLogMessage << " inverting on zero "<<std::endl;
|
||||||
|
src=Zero();
|
||||||
|
guess = noise;
|
||||||
|
GCR(src,guess);
|
||||||
|
subspace[b] = guess;
|
||||||
|
#endif
|
||||||
|
noise = subspace[b];
|
||||||
|
scale = std::pow(norm2(noise),-0.5);
|
||||||
|
noise=noise*scale;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|Op|f> "<<innerProduct(noise,Mn)<<std::endl;
|
||||||
|
subspace[b] = noise;
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
|
// World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
|
||||||
// and this is the best I found
|
// and this is the best I found
|
||||||
@ -160,14 +209,21 @@ public:
|
|||||||
|
|
||||||
int b =0;
|
int b =0;
|
||||||
{
|
{
|
||||||
|
ComplexD ip;
|
||||||
// Filter
|
// Filter
|
||||||
Chebyshev<FineField> Cheb(lo,hi,orderfilter);
|
Chebyshev<FineField> Cheb(lo,hi,orderfilter);
|
||||||
Cheb(hermop,noise,Mn);
|
Cheb(hermop,noise,Mn);
|
||||||
// normalise
|
// normalise
|
||||||
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
|
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
|
||||||
subspace[b] = Mn;
|
subspace[b] = Mn;
|
||||||
|
|
||||||
hermop.Op(Mn,tmp);
|
hermop.Op(Mn,tmp);
|
||||||
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
|
ip= innerProduct(Mn,tmp);
|
||||||
|
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
|
||||||
|
|
||||||
|
hermop.AdjOp(Mn,tmp);
|
||||||
|
ip = innerProduct(Mn,tmp);
|
||||||
|
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
|
||||||
b++;
|
b++;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -213,8 +269,18 @@ public:
|
|||||||
Mn=*Tnp;
|
Mn=*Tnp;
|
||||||
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
|
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
|
||||||
subspace[b] = Mn;
|
subspace[b] = Mn;
|
||||||
|
|
||||||
|
|
||||||
|
ComplexD ip;
|
||||||
|
|
||||||
hermop.Op(Mn,tmp);
|
hermop.Op(Mn,tmp);
|
||||||
std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
|
ip= innerProduct(Mn,tmp);
|
||||||
|
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
|
||||||
|
|
||||||
|
hermop.AdjOp(Mn,tmp);
|
||||||
|
ip = innerProduct(Mn,tmp);
|
||||||
|
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
|
||||||
|
|
||||||
b++;
|
b++;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -228,6 +294,70 @@ public:
|
|||||||
}
|
}
|
||||||
assert(b==nn);
|
assert(b==nn);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
virtual void CreateSubspacePolyCheby(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
|
||||||
|
int nn,
|
||||||
|
double hi,
|
||||||
|
double lo1,
|
||||||
|
int orderfilter,
|
||||||
|
double lo2,
|
||||||
|
int orderstep)
|
||||||
|
{
|
||||||
|
RealD scale;
|
||||||
|
|
||||||
|
FineField noise(FineGrid);
|
||||||
|
FineField Mn(FineGrid);
|
||||||
|
FineField tmp(FineGrid);
|
||||||
|
|
||||||
|
// New normalised noise
|
||||||
|
gaussian(RNG,noise);
|
||||||
|
scale = std::pow(norm2(noise),-0.5);
|
||||||
|
noise=noise*scale;
|
||||||
|
|
||||||
|
std::cout << GridLogMessage<<" CreateSubspacePolyCheby "<<std::endl;
|
||||||
|
// Initial matrix element
|
||||||
|
hermop.Op(noise,Mn);
|
||||||
|
std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
|
||||||
|
|
||||||
|
int b =0;
|
||||||
|
{
|
||||||
|
// Filter
|
||||||
|
std::cout << GridLogMessage << "Cheby "<<lo1<<","<<hi<<" "<<orderstep<<std::endl;
|
||||||
|
Chebyshev<FineField> Cheb(lo1,hi,orderfilter);
|
||||||
|
Cheb(hermop,noise,Mn);
|
||||||
|
// normalise
|
||||||
|
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
|
||||||
|
subspace[b] = Mn;
|
||||||
|
hermop.Op(Mn,tmp);
|
||||||
|
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|n> "<<norm2(Mn)<<std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Generate a full sequence of Chebyshevs
|
||||||
|
for(int n=1;n<nn;n++){
|
||||||
|
std::cout << GridLogMessage << "Cheby "<<lo2<<","<<hi<<" "<<orderstep<<std::endl;
|
||||||
|
Chebyshev<FineField> Cheb(lo2,hi,orderstep);
|
||||||
|
Cheb(hermop,subspace[n-1],Mn);
|
||||||
|
|
||||||
|
for(int m=0;m<n;m++){
|
||||||
|
ComplexD c = innerProduct(subspace[m],Mn);
|
||||||
|
Mn = Mn - c*subspace[m];
|
||||||
|
}
|
||||||
|
|
||||||
|
// normalise
|
||||||
|
scale = std::pow(norm2(Mn),-0.5);
|
||||||
|
Mn=Mn*scale;
|
||||||
|
|
||||||
|
subspace[n]=Mn;
|
||||||
|
|
||||||
|
hermop.Op(Mn,tmp);
|
||||||
|
std::cout<<GridLogMessage << "filt ["<<n<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "filt ["<<n<<"] <n|n> "<<norm2(Mn)<<std::endl;
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
|
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
|
||||||
int nn,
|
int nn,
|
||||||
double hi,
|
double hi,
|
||||||
|
@ -69,7 +69,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
// FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
|
// FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
|
||||||
void construct(pointer __p, const _Tp& __val) { assert(0);};
|
void construct(pointer __p, const _Tp& __val) { };
|
||||||
void construct(pointer __p) { };
|
void construct(pointer __p) { };
|
||||||
void destroy(pointer __p) { };
|
void destroy(pointer __p) { };
|
||||||
};
|
};
|
||||||
@ -175,10 +175,11 @@ template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const d
|
|||||||
// Template typedefs
|
// Template typedefs
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
template<class T> using hostVector = std::vector<T,alignedAllocator<T> >; // Needs autoview
|
template<class T> using hostVector = std::vector<T,alignedAllocator<T> >; // Needs autoview
|
||||||
template<class T> using Vector = std::vector<T,uvmAllocator<T> >; //
|
template<class T> using Vector = std::vector<T,uvmAllocator<T> >; // Really want to deprecate
|
||||||
template<class T> using uvmVector = std::vector<T,uvmAllocator<T> >; // auto migrating page
|
template<class T> using uvmVector = std::vector<T,uvmAllocator<T> >; // auto migrating page
|
||||||
template<class T> using deviceVector = std::vector<T,devAllocator<T> >; // device vector
|
template<class T> using deviceVector = std::vector<T,devAllocator<T> >; // device vector
|
||||||
|
|
||||||
|
/*
|
||||||
template<class T> class vecView
|
template<class T> class vecView
|
||||||
{
|
{
|
||||||
protected:
|
protected:
|
||||||
@ -214,6 +215,7 @@ template<class T> vecView<T> VectorView(Vector<T> &vec,ViewMode _mode)
|
|||||||
#define autoVecView(v_v,v,mode) \
|
#define autoVecView(v_v,v,mode) \
|
||||||
auto v_v = VectorView(v,mode); \
|
auto v_v = VectorView(v,mode); \
|
||||||
ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
|
ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
|
||||||
|
*/
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
@ -9,6 +9,7 @@ static char print_buffer [ MAXLINE ];
|
|||||||
#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer << std::endl;
|
#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer << std::endl;
|
||||||
#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer << std::endl;
|
#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer << std::endl;
|
||||||
//#define dprintf(...)
|
//#define dprintf(...)
|
||||||
|
//#define mprintf(...)
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////
|
||||||
// For caching copies of data on device
|
// For caching copies of data on device
|
||||||
@ -109,7 +110,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
|
|||||||
///////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////
|
||||||
assert(AccCache.state!=Empty);
|
assert(AccCache.state!=Empty);
|
||||||
|
|
||||||
dprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
|
dprintf("MemoryManager: Discard(%lx) %lx",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
|
||||||
assert(AccCache.accLock==0);
|
assert(AccCache.accLock==0);
|
||||||
assert(AccCache.cpuLock==0);
|
assert(AccCache.cpuLock==0);
|
||||||
assert(AccCache.CpuPtr!=(uint64_t)NULL);
|
assert(AccCache.CpuPtr!=(uint64_t)NULL);
|
||||||
@ -119,7 +120,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
|
|||||||
DeviceBytes -=AccCache.bytes;
|
DeviceBytes -=AccCache.bytes;
|
||||||
LRUremove(AccCache);
|
LRUremove(AccCache);
|
||||||
AccCache.AccPtr=(uint64_t) NULL;
|
AccCache.AccPtr=(uint64_t) NULL;
|
||||||
dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
|
dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
|
||||||
}
|
}
|
||||||
uint64_t CpuPtr = AccCache.CpuPtr;
|
uint64_t CpuPtr = AccCache.CpuPtr;
|
||||||
EntryErase(CpuPtr);
|
EntryErase(CpuPtr);
|
||||||
@ -139,7 +140,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
|
|||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
assert(AccCache.state!=Empty);
|
assert(AccCache.state!=Empty);
|
||||||
|
|
||||||
mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld\n",
|
mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld",
|
||||||
(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
|
(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
|
||||||
(uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock);
|
(uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock);
|
||||||
if (AccCache.accLock!=0) return;
|
if (AccCache.accLock!=0) return;
|
||||||
@ -153,7 +154,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
|
|||||||
AccCache.AccPtr=(uint64_t)NULL;
|
AccCache.AccPtr=(uint64_t)NULL;
|
||||||
AccCache.state=CpuDirty; // CPU primary now
|
AccCache.state=CpuDirty; // CPU primary now
|
||||||
DeviceBytes -=AccCache.bytes;
|
DeviceBytes -=AccCache.bytes;
|
||||||
dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
|
dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld ",(uint64_t)AccCache.AccPtr,DeviceBytes);
|
||||||
}
|
}
|
||||||
// uint64_t CpuPtr = AccCache.CpuPtr;
|
// uint64_t CpuPtr = AccCache.CpuPtr;
|
||||||
DeviceEvictions++;
|
DeviceEvictions++;
|
||||||
@ -167,7 +168,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
|
|||||||
assert(AccCache.AccPtr!=(uint64_t)NULL);
|
assert(AccCache.AccPtr!=(uint64_t)NULL);
|
||||||
assert(AccCache.CpuPtr!=(uint64_t)NULL);
|
assert(AccCache.CpuPtr!=(uint64_t)NULL);
|
||||||
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
|
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
|
||||||
mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
|
mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
|
||||||
DeviceToHostBytes+=AccCache.bytes;
|
DeviceToHostBytes+=AccCache.bytes;
|
||||||
DeviceToHostXfer++;
|
DeviceToHostXfer++;
|
||||||
AccCache.state=Consistent;
|
AccCache.state=Consistent;
|
||||||
@ -182,7 +183,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
|
|||||||
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
|
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
|
||||||
DeviceBytes+=AccCache.bytes;
|
DeviceBytes+=AccCache.bytes;
|
||||||
}
|
}
|
||||||
mprintf("MemoryManager: acceleratorCopyToDevice Clone size %ld AccPtr %lx <- CpuPtr %lx\n",
|
mprintf("MemoryManager: acceleratorCopyToDevice Clone size %ld AccPtr %lx <- CpuPtr %lx",
|
||||||
(uint64_t)AccCache.bytes,
|
(uint64_t)AccCache.bytes,
|
||||||
(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
|
(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
|
||||||
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
|
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
|
||||||
@ -210,7 +211,7 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
|
|||||||
void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
|
void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
|
||||||
{
|
{
|
||||||
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
|
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
|
||||||
dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr);
|
dprintf("AcceleratorViewClose %lx",(uint64_t)Ptr);
|
||||||
AcceleratorViewClose((uint64_t)Ptr);
|
AcceleratorViewClose((uint64_t)Ptr);
|
||||||
} else if( (mode==CpuRead)||(mode==CpuWrite)){
|
} else if( (mode==CpuRead)||(mode==CpuWrite)){
|
||||||
CpuViewClose((uint64_t)Ptr);
|
CpuViewClose((uint64_t)Ptr);
|
||||||
@ -222,7 +223,7 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
|
|||||||
{
|
{
|
||||||
uint64_t CpuPtr = (uint64_t)_CpuPtr;
|
uint64_t CpuPtr = (uint64_t)_CpuPtr;
|
||||||
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
|
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
|
||||||
dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr);
|
dprintf("AcceleratorViewOpen %lx",(uint64_t)CpuPtr);
|
||||||
return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
|
return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
|
||||||
} else if( (mode==CpuRead)||(mode==CpuWrite)){
|
} else if( (mode==CpuRead)||(mode==CpuWrite)){
|
||||||
return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
|
return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
|
||||||
@ -233,6 +234,9 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
|
|||||||
}
|
}
|
||||||
void MemoryManager::EvictVictims(uint64_t bytes)
|
void MemoryManager::EvictVictims(uint64_t bytes)
|
||||||
{
|
{
|
||||||
|
if(bytes>=DeviceMaxBytes) {
|
||||||
|
printf("EvictVictims bytes %ld DeviceMaxBytes %ld\n",bytes,DeviceMaxBytes);
|
||||||
|
}
|
||||||
assert(bytes<DeviceMaxBytes);
|
assert(bytes<DeviceMaxBytes);
|
||||||
while(bytes+DeviceLRUBytes > DeviceMaxBytes){
|
while(bytes+DeviceLRUBytes > DeviceMaxBytes){
|
||||||
if ( DeviceLRUBytes > 0){
|
if ( DeviceLRUBytes > 0){
|
||||||
@ -265,7 +269,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
|
|||||||
assert(AccCache.cpuLock==0); // Programming error
|
assert(AccCache.cpuLock==0); // Programming error
|
||||||
|
|
||||||
if(AccCache.state!=Empty) {
|
if(AccCache.state!=Empty) {
|
||||||
dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld\n",
|
dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld",
|
||||||
(uint64_t)AccCache.CpuPtr,
|
(uint64_t)AccCache.CpuPtr,
|
||||||
(uint64_t)CpuPtr,
|
(uint64_t)CpuPtr,
|
||||||
(uint64_t)AccCache.bytes,
|
(uint64_t)AccCache.bytes,
|
||||||
@ -305,7 +309,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
|
|||||||
AccCache.state = Consistent; // Empty + AccRead => Consistent
|
AccCache.state = Consistent; // Empty + AccRead => Consistent
|
||||||
}
|
}
|
||||||
AccCache.accLock= 1;
|
AccCache.accLock= 1;
|
||||||
dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock);
|
dprintf("Copied Empty entry into device accLock= %d",AccCache.accLock);
|
||||||
} else if(AccCache.state==CpuDirty ){
|
} else if(AccCache.state==CpuDirty ){
|
||||||
if(mode==AcceleratorWriteDiscard) {
|
if(mode==AcceleratorWriteDiscard) {
|
||||||
CpuDiscard(AccCache);
|
CpuDiscard(AccCache);
|
||||||
@ -318,21 +322,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
|
|||||||
AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
|
AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
|
||||||
}
|
}
|
||||||
AccCache.accLock++;
|
AccCache.accLock++;
|
||||||
dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock);
|
dprintf("CpuDirty entry into device ++accLock= %d",AccCache.accLock);
|
||||||
} else if(AccCache.state==Consistent) {
|
} else if(AccCache.state==Consistent) {
|
||||||
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
|
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
|
||||||
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
|
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
|
||||||
else
|
else
|
||||||
AccCache.state = Consistent; // Consistent + AccRead => Consistent
|
AccCache.state = Consistent; // Consistent + AccRead => Consistent
|
||||||
AccCache.accLock++;
|
AccCache.accLock++;
|
||||||
dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock);
|
dprintf("Consistent entry into device ++accLock= %d",AccCache.accLock);
|
||||||
} else if(AccCache.state==AccDirty) {
|
} else if(AccCache.state==AccDirty) {
|
||||||
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
|
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
|
||||||
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
|
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
|
||||||
else
|
else
|
||||||
AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
|
AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
|
||||||
AccCache.accLock++;
|
AccCache.accLock++;
|
||||||
dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock);
|
dprintf("AccDirty entry ++accLock= %d",AccCache.accLock);
|
||||||
} else {
|
} else {
|
||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
@ -341,7 +345,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
|
|||||||
// If view is opened on device must remove from LRU
|
// If view is opened on device must remove from LRU
|
||||||
if(AccCache.LRU_valid==1){
|
if(AccCache.LRU_valid==1){
|
||||||
// must possibly remove from LRU as now locked on GPU
|
// must possibly remove from LRU as now locked on GPU
|
||||||
dprintf("AccCache entry removed from LRU \n");
|
dprintf("AccCache entry removed from LRU ");
|
||||||
LRUremove(AccCache);
|
LRUremove(AccCache);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -364,10 +368,10 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
|
|||||||
AccCache.accLock--;
|
AccCache.accLock--;
|
||||||
// Move to LRU queue if not locked and close on device
|
// Move to LRU queue if not locked and close on device
|
||||||
if(AccCache.accLock==0) {
|
if(AccCache.accLock==0) {
|
||||||
dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
|
dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
|
||||||
LRUinsert(AccCache);
|
LRUinsert(AccCache);
|
||||||
} else {
|
} else {
|
||||||
dprintf("AccleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
|
dprintf("AccleratorViewClose %lx AccLock decremented to %ld",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
void MemoryManager::CpuViewClose(uint64_t CpuPtr)
|
void MemoryManager::CpuViewClose(uint64_t CpuPtr)
|
||||||
|
@ -33,6 +33,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
///////////////////////////////////
|
///////////////////////////////////
|
||||||
#include <Grid/communicator/SharedMemory.h>
|
#include <Grid/communicator/SharedMemory.h>
|
||||||
|
|
||||||
|
#define NVLINK_GET
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
extern bool Stencil_force_mpi ;
|
extern bool Stencil_force_mpi ;
|
||||||
@ -147,6 +149,7 @@ public:
|
|||||||
sizeof(obj),d*100+p);
|
sizeof(obj),d*100+p);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
if (!list.empty()) // avoid triggering assert in comms == none
|
||||||
CommsComplete(list);
|
CommsComplete(list);
|
||||||
for(int p=1;p<_processors[d];p++){
|
for(int p=1;p<_processors[d];p++){
|
||||||
accum = accum + column[p];
|
accum = accum + column[p];
|
||||||
@ -192,6 +195,11 @@ public:
|
|||||||
void *recv,
|
void *recv,
|
||||||
int recv_from_rank,int do_recv,
|
int recv_from_rank,int do_recv,
|
||||||
int xbytes,int rbytes,int dir);
|
int xbytes,int rbytes,int dir);
|
||||||
|
|
||||||
|
// Could do a PollHtoD and have a CommsMerge dependence
|
||||||
|
void StencilSendToRecvFromPollDtoH (std::vector<CommsRequest_t> &list);
|
||||||
|
void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
|
||||||
|
|
||||||
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||||
void *xmit,
|
void *xmit,
|
||||||
int xmit_to_rank,int do_xmit,
|
int xmit_to_rank,int do_xmit,
|
||||||
|
@ -30,6 +30,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
|
||||||
Grid_MPI_Comm CartesianCommunicator::communicator_world;
|
Grid_MPI_Comm CartesianCommunicator::communicator_world;
|
||||||
|
|
||||||
////////////////////////////////////////////
|
////////////////////////////////////////////
|
||||||
@ -362,8 +363,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
|||||||
int bytes)
|
int bytes)
|
||||||
{
|
{
|
||||||
std::vector<MpiCommsRequest_t> reqs(0);
|
std::vector<MpiCommsRequest_t> reqs(0);
|
||||||
unsigned long xcrc = crc32(0L, Z_NULL, 0);
|
|
||||||
unsigned long rcrc = crc32(0L, Z_NULL, 0);
|
|
||||||
|
|
||||||
int myrank = _processor;
|
int myrank = _processor;
|
||||||
int ierr;
|
int ierr;
|
||||||
@ -379,9 +378,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
|||||||
communicator,MPI_STATUS_IGNORE);
|
communicator,MPI_STATUS_IGNORE);
|
||||||
assert(ierr==0);
|
assert(ierr==0);
|
||||||
|
|
||||||
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
|
|
||||||
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
|
|
||||||
// printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
|
|
||||||
}
|
}
|
||||||
// Basic Halo comms primitive
|
// Basic Halo comms primitive
|
||||||
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
||||||
@ -399,6 +395,8 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
|||||||
|
|
||||||
|
|
||||||
#ifdef ACCELERATOR_AWARE_MPI
|
#ifdef ACCELERATOR_AWARE_MPI
|
||||||
|
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
|
||||||
|
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
|
||||||
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
|
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
|
||||||
void *xmit,
|
void *xmit,
|
||||||
int dest,int dox,
|
int dest,int dox,
|
||||||
@ -561,53 +559,105 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
|
|||||||
|
|
||||||
if (dox) {
|
if (dox) {
|
||||||
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
|
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
|
||||||
#undef DEVICE_TO_HOST_CONCURRENT // pipeline
|
|
||||||
#ifdef DEVICE_TO_HOST_CONCURRENT
|
|
||||||
tag= dir+_processor*32;
|
tag= dir+_processor*32;
|
||||||
|
|
||||||
host_xmit = this->HostBufferMalloc(xbytes);
|
host_xmit = this->HostBufferMalloc(xbytes);
|
||||||
acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
|
CommsRequest_t srq;
|
||||||
|
|
||||||
|
srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
|
||||||
|
|
||||||
// ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
|
// ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
|
||||||
// assert(ierr==0);
|
// assert(ierr==0);
|
||||||
// off_node_bytes+=xbytes;
|
// off_node_bytes+=xbytes;
|
||||||
|
|
||||||
CommsRequest_t srq;
|
|
||||||
srq.PacketType = InterNodeXmit;
|
srq.PacketType = InterNodeXmit;
|
||||||
srq.bytes = xbytes;
|
srq.bytes = xbytes;
|
||||||
// srq.req = xrq;
|
// srq.req = xrq;
|
||||||
srq.host_buf = host_xmit;
|
srq.host_buf = host_xmit;
|
||||||
srq.device_buf = xmit;
|
srq.device_buf = xmit;
|
||||||
|
srq.tag = tag;
|
||||||
|
srq.dest = dest;
|
||||||
|
srq.commdir = commdir;
|
||||||
list.push_back(srq);
|
list.push_back(srq);
|
||||||
#else
|
|
||||||
tag= dir+_processor*32;
|
|
||||||
|
|
||||||
host_xmit = this->HostBufferMalloc(xbytes);
|
|
||||||
const int chunks=1;
|
|
||||||
for(int n=0;n<chunks;n++){
|
|
||||||
void * host_xmitc = (void *)( (uint64_t) host_xmit + n*xbytes/chunks);
|
|
||||||
void * xmitc = (void *)( (uint64_t) xmit + n*xbytes/chunks);
|
|
||||||
acceleratorCopyFromDeviceAsynch(xmitc, host_xmitc,xbytes/chunks); // Make this Asynch
|
|
||||||
}
|
|
||||||
acceleratorCopySynchronise(); // Complete all pending copy transfers
|
|
||||||
|
|
||||||
ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
|
|
||||||
assert(ierr==0);
|
|
||||||
off_node_bytes+=xbytes;
|
|
||||||
|
|
||||||
CommsRequest_t srq;
|
|
||||||
srq.PacketType = InterNodeXmit;
|
|
||||||
srq.bytes = xbytes;
|
|
||||||
srq.req = xrq;
|
|
||||||
srq.host_buf = host_xmit;
|
|
||||||
srq.device_buf = xmit;
|
|
||||||
list.push_back(srq);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return off_node_bytes;
|
return off_node_bytes;
|
||||||
}
|
}
|
||||||
|
/*
|
||||||
|
* In the interest of better pipelining, poll for completion on each DtoH and
|
||||||
|
* start MPI_ISend in the meantime
|
||||||
|
*/
|
||||||
|
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list)
|
||||||
|
{
|
||||||
|
int pending = 0;
|
||||||
|
do {
|
||||||
|
|
||||||
|
pending = 0;
|
||||||
|
|
||||||
|
for(int idx = 0; idx<list.size();idx++){
|
||||||
|
|
||||||
|
if ( list[idx].PacketType==InterNodeRecv ) {
|
||||||
|
|
||||||
|
int flag = 0;
|
||||||
|
MPI_Status status;
|
||||||
|
int ierr = MPI_Test(&list[idx].req,&flag,&status);
|
||||||
|
assert(ierr==0);
|
||||||
|
|
||||||
|
if ( flag ) {
|
||||||
|
// std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl;
|
||||||
|
acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes);
|
||||||
|
list[idx].PacketType=InterNodeReceiveHtoD;
|
||||||
|
} else {
|
||||||
|
pending ++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// std::cout << " PollIrecv "<<pending<<" pending requests"<<std::endl;
|
||||||
|
} while ( pending );
|
||||||
|
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list)
|
||||||
|
{
|
||||||
|
int pending = 0;
|
||||||
|
do {
|
||||||
|
|
||||||
|
pending = 0;
|
||||||
|
|
||||||
|
for(int idx = 0; idx<list.size();idx++){
|
||||||
|
|
||||||
|
if ( list[idx].PacketType==InterNodeXmit ) {
|
||||||
|
|
||||||
|
if ( acceleratorEventIsComplete(list[idx].ev) ) {
|
||||||
|
|
||||||
|
void *host_xmit = list[idx].host_buf;
|
||||||
|
uint32_t xbytes = list[idx].bytes;
|
||||||
|
int dest = list[idx].dest;
|
||||||
|
int tag = list[idx].tag;
|
||||||
|
int commdir = list[idx].commdir;
|
||||||
|
///////////////////
|
||||||
|
// Send packet
|
||||||
|
///////////////////
|
||||||
|
|
||||||
|
// std::cout << " DtoH is complete for index "<<idx<<" calling MPI_Isend "<<std::endl;
|
||||||
|
|
||||||
|
MPI_Request xrq;
|
||||||
|
int ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
|
||||||
|
assert(ierr==0);
|
||||||
|
|
||||||
|
list[idx].req = xrq; // Update the MPI request in the list
|
||||||
|
|
||||||
|
list[idx].PacketType=InterNodeXmitISend;
|
||||||
|
|
||||||
|
} else {
|
||||||
|
// not done, so return to polling loop
|
||||||
|
pending++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} while (pending);
|
||||||
|
}
|
||||||
|
|
||||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||||
void *xmit,
|
void *xmit,
|
||||||
@ -644,69 +694,89 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
|||||||
* - complete all copies
|
* - complete all copies
|
||||||
* - post MPI send asynch
|
* - post MPI send asynch
|
||||||
*/
|
*/
|
||||||
|
#ifdef NVLINK_GET
|
||||||
|
if ( dor ) {
|
||||||
|
|
||||||
// static int printed;
|
if ( ! ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) ) {
|
||||||
// if((printed<8) && this->IsBoss() ) {
|
// Intranode
|
||||||
// printf("dir %d doX %d doR %d Face size %ld %ld\n",dir,dox,dor,xbytes,rbytes);
|
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
|
||||||
// printed++;
|
assert(shm!=NULL);
|
||||||
// }
|
|
||||||
|
|
||||||
|
CommsRequest_t srq;
|
||||||
|
|
||||||
|
srq.ev = acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
|
||||||
|
|
||||||
|
srq.PacketType = IntraNodeRecv;
|
||||||
|
srq.bytes = xbytes;
|
||||||
|
// srq.req = xrq;
|
||||||
|
srq.host_buf = NULL;
|
||||||
|
srq.device_buf = xmit;
|
||||||
|
srq.tag = -1;
|
||||||
|
srq.dest = dest;
|
||||||
|
srq.commdir = dir;
|
||||||
|
list.push_back(srq);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#else
|
||||||
if (dox) {
|
if (dox) {
|
||||||
|
|
||||||
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
|
if ( !( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) ) {
|
||||||
#ifdef DEVICE_TO_HOST_CONCURRENT
|
// Intranode
|
||||||
tag= dir+_processor*32;
|
|
||||||
// Find the send in the prepared list
|
|
||||||
int list_idx=-1;
|
|
||||||
for(int idx = 0; idx<list.size();idx++){
|
|
||||||
|
|
||||||
if ( (list[idx].device_buf==xmit)
|
|
||||||
&&(list[idx].PacketType==InterNodeXmit)
|
|
||||||
&&(list[idx].bytes==xbytes) ) {
|
|
||||||
|
|
||||||
list_idx = idx;
|
|
||||||
host_xmit = list[idx].host_buf;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
assert(list_idx != -1); // found it
|
|
||||||
ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
|
|
||||||
assert(ierr==0);
|
|
||||||
list[list_idx].req = xrq; // Update the MPI request in the list
|
|
||||||
off_node_bytes+=xbytes;
|
|
||||||
#endif
|
|
||||||
} else {
|
|
||||||
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
|
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
|
||||||
assert(shm!=NULL);
|
assert(shm!=NULL);
|
||||||
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
|
|
||||||
|
CommsRequest_t srq;
|
||||||
|
|
||||||
|
srq.ev = acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
|
||||||
|
|
||||||
|
srq.PacketType = IntraNodeXmit;
|
||||||
|
srq.bytes = xbytes;
|
||||||
|
// srq.req = xrq;
|
||||||
|
srq.host_buf = NULL;
|
||||||
|
srq.device_buf = xmit;
|
||||||
|
srq.tag = -1;
|
||||||
|
srq.dest = dest;
|
||||||
|
srq.commdir = dir;
|
||||||
|
list.push_back(srq);
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
return off_node_bytes;
|
return off_node_bytes;
|
||||||
}
|
}
|
||||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
|
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
|
||||||
{
|
{
|
||||||
int nreq=list.size();
|
acceleratorCopySynchronise(); // Complete all pending copy transfers D2D
|
||||||
|
|
||||||
if (nreq==0) return;
|
std::vector<MPI_Status> status;
|
||||||
std::vector<MPI_Status> status(nreq);
|
std::vector<MPI_Request> MpiRequests;
|
||||||
std::vector<MPI_Request> MpiRequests(nreq);
|
|
||||||
|
|
||||||
for(int r=0;r<nreq;r++){
|
for(int r=0;r<list.size();r++){
|
||||||
MpiRequests[r] = list[r].req;
|
// Must check each Send buf is clear to reuse
|
||||||
|
if ( list[r].PacketType == InterNodeXmitISend ) MpiRequests.push_back(list[r].req);
|
||||||
|
// if ( list[r].PacketType == InterNodeRecv ) MpiRequests.push_back(list[r].req); // Already "Test" passed
|
||||||
}
|
}
|
||||||
|
|
||||||
int ierr = MPI_Waitall(nreq,&MpiRequests[0],&status[0]);
|
int nreq=MpiRequests.size();
|
||||||
|
|
||||||
|
if (nreq>0) {
|
||||||
|
status.resize(MpiRequests.size());
|
||||||
|
int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
|
||||||
assert(ierr==0);
|
assert(ierr==0);
|
||||||
|
|
||||||
for(int r=0;r<nreq;r++){
|
|
||||||
if ( list[r].PacketType==InterNodeRecv ) {
|
|
||||||
acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
acceleratorCopySynchronise(); // Complete all pending copy transfers
|
// for(int r=0;r<nreq;r++){
|
||||||
|
// if ( list[r].PacketType==InterNodeRecv ) {
|
||||||
|
// acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
|
||||||
|
|
||||||
list.resize(0); // Delete the list
|
list.resize(0); // Delete the list
|
||||||
this->HostBufferFreeAll(); // Clean up the buffer allocs
|
this->HostBufferFreeAll(); // Clean up the buffer allocs
|
||||||
this->StencilBarrier();
|
#ifndef NVLINK_GET
|
||||||
|
this->StencilBarrier(); // if PUT must check our nbrs have filled our receive buffers.
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
////////////////////////////////////////////
|
////////////////////////////////////////////
|
||||||
|
@ -91,7 +91,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
|||||||
{
|
{
|
||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(0);}
|
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(list.size()==0);}
|
||||||
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||||
void *xmit,
|
void *xmit,
|
||||||
int dest,
|
int dest,
|
||||||
@ -132,6 +132,8 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
|||||||
{
|
{
|
||||||
return 2.0*bytes;
|
return 2.0*bytes;
|
||||||
}
|
}
|
||||||
|
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
|
||||||
|
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
|
||||||
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
|
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
|
||||||
void *xmit,
|
void *xmit,
|
||||||
int xmit_to_rank,int dox,
|
int xmit_to_rank,int dox,
|
||||||
@ -139,7 +141,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
|
|||||||
int recv_from_rank,int dor,
|
int recv_from_rank,int dor,
|
||||||
int xbytes,int rbytes, int dir)
|
int xbytes,int rbytes, int dir)
|
||||||
{
|
{
|
||||||
return xbytes+rbytes;
|
return 0.0;
|
||||||
}
|
}
|
||||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||||
void *xmit,
|
void *xmit,
|
||||||
|
@ -50,12 +50,30 @@ typedef MPI_Request MpiCommsRequest_t;
|
|||||||
#ifdef ACCELERATOR_AWARE_MPI
|
#ifdef ACCELERATOR_AWARE_MPI
|
||||||
typedef MPI_Request CommsRequest_t;
|
typedef MPI_Request CommsRequest_t;
|
||||||
#else
|
#else
|
||||||
enum PacketType_t { InterNodeXmit, InterNodeRecv, IntraNodeXmit, IntraNodeRecv };
|
/*
|
||||||
|
* Enable state transitions as each packet flows.
|
||||||
|
*/
|
||||||
|
enum PacketType_t {
|
||||||
|
FaceGather,
|
||||||
|
InterNodeXmit,
|
||||||
|
InterNodeRecv,
|
||||||
|
IntraNodeXmit,
|
||||||
|
IntraNodeRecv,
|
||||||
|
InterNodeXmitISend,
|
||||||
|
InterNodeReceiveHtoD
|
||||||
|
};
|
||||||
|
/*
|
||||||
|
*Package arguments needed for various actions along packet flow
|
||||||
|
*/
|
||||||
typedef struct {
|
typedef struct {
|
||||||
PacketType_t PacketType;
|
PacketType_t PacketType;
|
||||||
void *host_buf;
|
void *host_buf;
|
||||||
void *device_buf;
|
void *device_buf;
|
||||||
|
int dest;
|
||||||
|
int tag;
|
||||||
|
int commdir;
|
||||||
unsigned long bytes;
|
unsigned long bytes;
|
||||||
|
acceleratorEvent_t ev;
|
||||||
MpiCommsRequest_t req;
|
MpiCommsRequest_t req;
|
||||||
} CommsRequest_t;
|
} CommsRequest_t;
|
||||||
#endif
|
#endif
|
||||||
|
@ -542,7 +542,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
// Each MPI rank should allocate our own buffer
|
// Each MPI rank should allocate our own buffer
|
||||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
#ifndef ACCELERATOR_AWARE_MPI
|
#ifndef ACCELERATOR_AWARE_MPI
|
||||||
printf("Host buffer allocate for GPU non-aware MPI\n");
|
// printf("Host buffer allocate for GPU non-aware MPI\n");
|
||||||
#if 0
|
#if 0
|
||||||
HostCommBuf= acceleratorAllocHost(bytes);
|
HostCommBuf= acceleratorAllocHost(bytes);
|
||||||
#else
|
#else
|
||||||
|
@ -68,7 +68,7 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
|
|||||||
if(Cshift_verbose) std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
|
if(Cshift_verbose) std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
#if 1
|
|
||||||
template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
|
template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
|
||||||
{
|
{
|
||||||
int sshift[2];
|
int sshift[2];
|
||||||
@ -125,6 +125,10 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
|||||||
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
|
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
|
||||||
static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
|
static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
|
||||||
static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
|
static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
|
||||||
|
#ifndef ACCELERATOR_AWARE_MPI
|
||||||
|
static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
|
||||||
|
static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
|
||||||
|
#endif
|
||||||
|
|
||||||
int cb= (cbmask==0x2)? Odd : Even;
|
int cb= (cbmask==0x2)? Odd : Even;
|
||||||
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
||||||
@ -156,16 +160,29 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
|||||||
// int rank = grid->_processor;
|
// int rank = grid->_processor;
|
||||||
int recv_from_rank;
|
int recv_from_rank;
|
||||||
int xmit_to_rank;
|
int xmit_to_rank;
|
||||||
|
|
||||||
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
|
||||||
tcomms-=usecond();
|
tcomms-=usecond();
|
||||||
grid->Barrier();
|
grid->Barrier();
|
||||||
|
|
||||||
|
#ifdef ACCELERATOR_AWARE_MPI
|
||||||
grid->SendToRecvFrom((void *)&send_buf[0],
|
grid->SendToRecvFrom((void *)&send_buf[0],
|
||||||
xmit_to_rank,
|
xmit_to_rank,
|
||||||
(void *)&recv_buf[0],
|
(void *)&recv_buf[0],
|
||||||
recv_from_rank,
|
recv_from_rank,
|
||||||
bytes);
|
bytes);
|
||||||
|
#else
|
||||||
|
// bouncy bouncy
|
||||||
|
acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
|
||||||
|
grid->SendToRecvFrom((void *)&hsend_buf[0],
|
||||||
|
xmit_to_rank,
|
||||||
|
(void *)&hrecv_buf[0],
|
||||||
|
recv_from_rank,
|
||||||
|
bytes);
|
||||||
|
acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
|
||||||
|
#endif
|
||||||
|
|
||||||
xbytes+=bytes;
|
xbytes+=bytes;
|
||||||
grid->Barrier();
|
grid->Barrier();
|
||||||
tcomms+=usecond();
|
tcomms+=usecond();
|
||||||
@ -231,6 +248,10 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
|||||||
send_buf_extract[s].resize(buffer_size);
|
send_buf_extract[s].resize(buffer_size);
|
||||||
recv_buf_extract[s].resize(buffer_size);
|
recv_buf_extract[s].resize(buffer_size);
|
||||||
}
|
}
|
||||||
|
#ifndef ACCELERATOR_AWARE_MPI
|
||||||
|
hostVector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
|
||||||
|
hostVector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
|
||||||
|
#endif
|
||||||
|
|
||||||
int bytes = buffer_size*sizeof(scalar_object);
|
int bytes = buffer_size*sizeof(scalar_object);
|
||||||
|
|
||||||
@ -283,11 +304,22 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
|||||||
|
|
||||||
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
|
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
|
||||||
recv_buf_extract_mpi = &recv_buf_extract[i][0];
|
recv_buf_extract_mpi = &recv_buf_extract[i][0];
|
||||||
|
#ifdef ACCELERATOR_AWARE_MPI
|
||||||
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
|
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
|
||||||
xmit_to_rank,
|
xmit_to_rank,
|
||||||
(void *)recv_buf_extract_mpi,
|
(void *)recv_buf_extract_mpi,
|
||||||
recv_from_rank,
|
recv_from_rank,
|
||||||
bytes);
|
bytes);
|
||||||
|
#else
|
||||||
|
// bouncy bouncy
|
||||||
|
acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes);
|
||||||
|
grid->SendToRecvFrom((void *)&hsend_buf[0],
|
||||||
|
xmit_to_rank,
|
||||||
|
(void *)&hrecv_buf[0],
|
||||||
|
recv_from_rank,
|
||||||
|
bytes);
|
||||||
|
acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
|
||||||
|
#endif
|
||||||
|
|
||||||
xbytes+=bytes;
|
xbytes+=bytes;
|
||||||
grid->Barrier();
|
grid->Barrier();
|
||||||
@ -311,234 +343,6 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
|||||||
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
|
||||||
{
|
|
||||||
typedef typename vobj::vector_type vector_type;
|
|
||||||
typedef typename vobj::scalar_type scalar_type;
|
|
||||||
|
|
||||||
GridBase *grid=rhs.Grid();
|
|
||||||
Lattice<vobj> temp(rhs.Grid());
|
|
||||||
|
|
||||||
int fd = rhs.Grid()->_fdimensions[dimension];
|
|
||||||
int rd = rhs.Grid()->_rdimensions[dimension];
|
|
||||||
int pd = rhs.Grid()->_processors[dimension];
|
|
||||||
int simd_layout = rhs.Grid()->_simd_layout[dimension];
|
|
||||||
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
|
|
||||||
assert(simd_layout==1);
|
|
||||||
assert(comm_dim==1);
|
|
||||||
assert(shift>=0);
|
|
||||||
assert(shift<fd);
|
|
||||||
RealD tcopy=0.0;
|
|
||||||
RealD tgather=0.0;
|
|
||||||
RealD tscatter=0.0;
|
|
||||||
RealD tcomms=0.0;
|
|
||||||
uint64_t xbytes=0;
|
|
||||||
|
|
||||||
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
|
|
||||||
static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
|
|
||||||
static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
|
|
||||||
vobj *send_buf;
|
|
||||||
vobj *recv_buf;
|
|
||||||
{
|
|
||||||
grid->ShmBufferFreeAll();
|
|
||||||
size_t bytes = buffer_size*sizeof(vobj);
|
|
||||||
send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
|
|
||||||
recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
|
|
||||||
}
|
|
||||||
|
|
||||||
int cb= (cbmask==0x2)? Odd : Even;
|
|
||||||
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
|
||||||
|
|
||||||
for(int x=0;x<rd;x++){
|
|
||||||
|
|
||||||
int sx = (x+sshift)%rd;
|
|
||||||
int comm_proc = ((x+sshift)/rd)%pd;
|
|
||||||
|
|
||||||
if (comm_proc==0) {
|
|
||||||
|
|
||||||
tcopy-=usecond();
|
|
||||||
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
|
|
||||||
tcopy+=usecond();
|
|
||||||
|
|
||||||
} else {
|
|
||||||
|
|
||||||
int words = buffer_size;
|
|
||||||
if (cbmask != 0x3) words=words>>1;
|
|
||||||
|
|
||||||
int bytes = words * sizeof(vobj);
|
|
||||||
|
|
||||||
tgather-=usecond();
|
|
||||||
Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
|
|
||||||
tgather+=usecond();
|
|
||||||
|
|
||||||
// int rank = grid->_processor;
|
|
||||||
int recv_from_rank;
|
|
||||||
int xmit_to_rank;
|
|
||||||
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
|
||||||
|
|
||||||
|
|
||||||
tcomms-=usecond();
|
|
||||||
// grid->Barrier();
|
|
||||||
|
|
||||||
acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
|
|
||||||
grid->SendToRecvFrom((void *)&send_buf[0],
|
|
||||||
xmit_to_rank,
|
|
||||||
(void *)&recv_buf[0],
|
|
||||||
recv_from_rank,
|
|
||||||
bytes);
|
|
||||||
xbytes+=bytes;
|
|
||||||
acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
|
|
||||||
|
|
||||||
// grid->Barrier();
|
|
||||||
tcomms+=usecond();
|
|
||||||
|
|
||||||
tscatter-=usecond();
|
|
||||||
Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
|
|
||||||
tscatter+=usecond();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if(Cshift_verbose){
|
|
||||||
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
|
|
||||||
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
|
|
||||||
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
|
||||||
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
|
|
||||||
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
|
||||||
{
|
|
||||||
GridBase *grid=rhs.Grid();
|
|
||||||
const int Nsimd = grid->Nsimd();
|
|
||||||
typedef typename vobj::vector_type vector_type;
|
|
||||||
typedef typename vobj::scalar_object scalar_object;
|
|
||||||
typedef typename vobj::scalar_type scalar_type;
|
|
||||||
|
|
||||||
int fd = grid->_fdimensions[dimension];
|
|
||||||
int rd = grid->_rdimensions[dimension];
|
|
||||||
int ld = grid->_ldimensions[dimension];
|
|
||||||
int pd = grid->_processors[dimension];
|
|
||||||
int simd_layout = grid->_simd_layout[dimension];
|
|
||||||
int comm_dim = grid->_processors[dimension] >1 ;
|
|
||||||
|
|
||||||
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
|
|
||||||
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
|
|
||||||
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
|
|
||||||
|
|
||||||
assert(comm_dim==1);
|
|
||||||
assert(simd_layout==2);
|
|
||||||
assert(shift>=0);
|
|
||||||
assert(shift<fd);
|
|
||||||
RealD tcopy=0.0;
|
|
||||||
RealD tgather=0.0;
|
|
||||||
RealD tscatter=0.0;
|
|
||||||
RealD tcomms=0.0;
|
|
||||||
uint64_t xbytes=0;
|
|
||||||
|
|
||||||
int permute_type=grid->PermuteType(dimension);
|
|
||||||
|
|
||||||
///////////////////////////////////////////////
|
|
||||||
// Simd direction uses an extract/merge pair
|
|
||||||
///////////////////////////////////////////////
|
|
||||||
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
|
|
||||||
// int words = sizeof(vobj)/sizeof(vector_type);
|
|
||||||
|
|
||||||
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
|
|
||||||
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
|
|
||||||
scalar_object * recv_buf_extract_mpi;
|
|
||||||
scalar_object * send_buf_extract_mpi;
|
|
||||||
{
|
|
||||||
size_t bytes = sizeof(scalar_object)*buffer_size;
|
|
||||||
grid->ShmBufferFreeAll();
|
|
||||||
send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
|
|
||||||
recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
|
|
||||||
}
|
|
||||||
for(int s=0;s<Nsimd;s++){
|
|
||||||
send_buf_extract[s].resize(buffer_size);
|
|
||||||
recv_buf_extract[s].resize(buffer_size);
|
|
||||||
}
|
|
||||||
|
|
||||||
int bytes = buffer_size*sizeof(scalar_object);
|
|
||||||
|
|
||||||
ExtractPointerArray<scalar_object> pointers(Nsimd); //
|
|
||||||
ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
|
|
||||||
|
|
||||||
///////////////////////////////////////////
|
|
||||||
// Work out what to send where
|
|
||||||
///////////////////////////////////////////
|
|
||||||
int cb = (cbmask==0x2)? Odd : Even;
|
|
||||||
int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
|
||||||
|
|
||||||
// loop over outer coord planes orthog to dim
|
|
||||||
for(int x=0;x<rd;x++){
|
|
||||||
|
|
||||||
// FIXME call local permute copy if none are offnode.
|
|
||||||
for(int i=0;i<Nsimd;i++){
|
|
||||||
pointers[i] = &send_buf_extract[i][0];
|
|
||||||
}
|
|
||||||
tgather-=usecond();
|
|
||||||
int sx = (x+sshift)%rd;
|
|
||||||
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
|
|
||||||
tgather+=usecond();
|
|
||||||
|
|
||||||
for(int i=0;i<Nsimd;i++){
|
|
||||||
|
|
||||||
int inner_bit = (Nsimd>>(permute_type+1));
|
|
||||||
int ic= (i&inner_bit)? 1:0;
|
|
||||||
|
|
||||||
int my_coor = rd*ic + x;
|
|
||||||
int nbr_coor = my_coor+sshift;
|
|
||||||
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
|
|
||||||
|
|
||||||
int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer
|
|
||||||
int nbr_ox = (nbr_coor%rd); // outer coord of peer
|
|
||||||
int nbr_lane = (i&(~inner_bit));
|
|
||||||
|
|
||||||
int recv_from_rank;
|
|
||||||
int xmit_to_rank;
|
|
||||||
|
|
||||||
if (nbr_ic) nbr_lane|=inner_bit;
|
|
||||||
|
|
||||||
assert (sx == nbr_ox);
|
|
||||||
|
|
||||||
if(nbr_proc){
|
|
||||||
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
|
||||||
|
|
||||||
tcomms-=usecond();
|
|
||||||
// grid->Barrier();
|
|
||||||
|
|
||||||
acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
|
|
||||||
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
|
|
||||||
xmit_to_rank,
|
|
||||||
(void *)recv_buf_extract_mpi,
|
|
||||||
recv_from_rank,
|
|
||||||
bytes);
|
|
||||||
acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
|
|
||||||
xbytes+=bytes;
|
|
||||||
|
|
||||||
// grid->Barrier();
|
|
||||||
tcomms+=usecond();
|
|
||||||
rpointers[i] = &recv_buf_extract[i][0];
|
|
||||||
} else {
|
|
||||||
rpointers[i] = &send_buf_extract[nbr_lane][0];
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
tscatter-=usecond();
|
|
||||||
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
|
|
||||||
tscatter+=usecond();
|
|
||||||
|
|
||||||
}
|
|
||||||
if(Cshift_verbose){
|
|
||||||
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
|
|
||||||
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
|
|
||||||
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
|
||||||
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
|
|
||||||
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
@ -466,6 +466,12 @@ public:
|
|||||||
static deviceVector<vobj> recv_buf;
|
static deviceVector<vobj> recv_buf;
|
||||||
send_buf.resize(buffer_size*2*depth);
|
send_buf.resize(buffer_size*2*depth);
|
||||||
recv_buf.resize(buffer_size*2*depth);
|
recv_buf.resize(buffer_size*2*depth);
|
||||||
|
#ifndef ACCELERATOR_AWARE_MPI
|
||||||
|
static hostVector<vobj> hsend_buf;
|
||||||
|
static hostVector<vobj> hrecv_buf;
|
||||||
|
hsend_buf.resize(buffer_size*2*depth);
|
||||||
|
hrecv_buf.resize(buffer_size*2*depth);
|
||||||
|
#endif
|
||||||
|
|
||||||
std::vector<MpiCommsRequest_t> fwd_req;
|
std::vector<MpiCommsRequest_t> fwd_req;
|
||||||
std::vector<MpiCommsRequest_t> bwd_req;
|
std::vector<MpiCommsRequest_t> bwd_req;
|
||||||
@ -495,9 +501,17 @@ public:
|
|||||||
t_gather+=usecond()-t;
|
t_gather+=usecond()-t;
|
||||||
|
|
||||||
t=usecond();
|
t=usecond();
|
||||||
|
#ifdef ACCELERATOR_AWARE_MPI
|
||||||
grid->SendToRecvFromBegin(fwd_req,
|
grid->SendToRecvFromBegin(fwd_req,
|
||||||
(void *)&send_buf[d*buffer_size], xmit_to_rank,
|
(void *)&send_buf[d*buffer_size], xmit_to_rank,
|
||||||
(void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
|
(void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
|
||||||
|
#else
|
||||||
|
acceleratorCopyFromDevice(&send_buf[d*buffer_size],&hsend_buf[d*buffer_size],bytes);
|
||||||
|
grid->SendToRecvFromBegin(fwd_req,
|
||||||
|
(void *)&hsend_buf[d*buffer_size], xmit_to_rank,
|
||||||
|
(void *)&hrecv_buf[d*buffer_size], recv_from_rank, bytes, tag);
|
||||||
|
acceleratorCopyToDevice(&hrecv_buf[d*buffer_size],&recv_buf[d*buffer_size],bytes);
|
||||||
|
#endif
|
||||||
t_comms+=usecond()-t;
|
t_comms+=usecond()-t;
|
||||||
}
|
}
|
||||||
for ( int d=0;d < depth ; d ++ ) {
|
for ( int d=0;d < depth ; d ++ ) {
|
||||||
@ -508,9 +522,17 @@ public:
|
|||||||
t_gather+= usecond() - t;
|
t_gather+= usecond() - t;
|
||||||
|
|
||||||
t=usecond();
|
t=usecond();
|
||||||
|
#ifdef ACCELERATOR_AWARE_MPI
|
||||||
grid->SendToRecvFromBegin(bwd_req,
|
grid->SendToRecvFromBegin(bwd_req,
|
||||||
(void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
|
(void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
|
||||||
(void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
|
(void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
|
||||||
|
#else
|
||||||
|
acceleratorCopyFromDevice(&send_buf[(d+depth)*buffer_size],&hsend_buf[(d+depth)*buffer_size],bytes);
|
||||||
|
grid->SendToRecvFromBegin(bwd_req,
|
||||||
|
(void *)&hsend_buf[(d+depth)*buffer_size], recv_from_rank,
|
||||||
|
(void *)&hrecv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
|
||||||
|
acceleratorCopyToDevice(&hrecv_buf[(d+depth)*buffer_size],&recv_buf[(d+depth)*buffer_size],bytes);
|
||||||
|
#endif
|
||||||
t_comms+=usecond()-t;
|
t_comms+=usecond()-t;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -484,6 +484,11 @@ public:
|
|||||||
this->face_table_computed=1;
|
this->face_table_computed=1;
|
||||||
assert(this->u_comm_offset==this->_unified_buffer_size);
|
assert(this->u_comm_offset==this->_unified_buffer_size);
|
||||||
accelerator_barrier();
|
accelerator_barrier();
|
||||||
|
#ifdef NVLINK_GET
|
||||||
|
this->_grid->StencilBarrier(); // He can now get mu local gather, I can get his
|
||||||
|
// Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check
|
||||||
|
// Or issue barrier AFTER the DMA is running
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
|
@ -504,7 +504,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
|
|||||||
autoView(st_v , st,AcceleratorRead);
|
autoView(st_v , st,AcceleratorRead);
|
||||||
|
|
||||||
if( interior && exterior ) {
|
if( interior && exterior ) {
|
||||||
acceleratorFenceComputeStream();
|
// acceleratorFenceComputeStream();
|
||||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
|
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
|
||||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
|
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
|
||||||
#ifndef GRID_CUDA
|
#ifndef GRID_CUDA
|
||||||
@ -517,7 +517,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
|
|||||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
|
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
|
||||||
#endif
|
#endif
|
||||||
} else if( exterior ) {
|
} else if( exterior ) {
|
||||||
// dependent on result of merge
|
// // dependent on result of merge
|
||||||
acceleratorFenceComputeStream();
|
acceleratorFenceComputeStream();
|
||||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL_EXT(GenericDhopSiteExt); return;}
|
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL_EXT(GenericDhopSiteExt); return;}
|
||||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteExt); return;}
|
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteExt); return;}
|
||||||
|
@ -363,12 +363,16 @@ public:
|
|||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
|
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
|
||||||
{
|
{
|
||||||
|
// std::cout << "Communicate Begin "<<std::endl;
|
||||||
|
// _grid->Barrier();
|
||||||
FlightRecorder::StepLog("Communicate begin");
|
FlightRecorder::StepLog("Communicate begin");
|
||||||
// All GPU kernel tasks must complete
|
// All GPU kernel tasks must complete
|
||||||
// accelerator_barrier(); // All kernels should ALREADY be complete
|
// accelerator_barrier(); // All kernels should ALREADY be complete
|
||||||
// _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
|
// _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
|
||||||
// But the HaloGather had a barrier too.
|
// But the HaloGather had a barrier too.
|
||||||
for(int i=0;i<Packets.size();i++){
|
for(int i=0;i<Packets.size();i++){
|
||||||
|
// std::cout << "Communicate prepare "<<i<<std::endl;
|
||||||
|
// _grid->Barrier();
|
||||||
_grid->StencilSendToRecvFromPrepare(MpiReqs,
|
_grid->StencilSendToRecvFromPrepare(MpiReqs,
|
||||||
Packets[i].send_buf,
|
Packets[i].send_buf,
|
||||||
Packets[i].to_rank,Packets[i].do_send,
|
Packets[i].to_rank,Packets[i].do_send,
|
||||||
@ -376,8 +380,15 @@ public:
|
|||||||
Packets[i].from_rank,Packets[i].do_recv,
|
Packets[i].from_rank,Packets[i].do_recv,
|
||||||
Packets[i].xbytes,Packets[i].rbytes,i);
|
Packets[i].xbytes,Packets[i].rbytes,i);
|
||||||
}
|
}
|
||||||
|
// std::cout << "Communicate PollDtoH "<<std::endl;
|
||||||
|
// _grid->Barrier();
|
||||||
|
_grid->StencilSendToRecvFromPollDtoH (MpiReqs); /* Starts MPI*/
|
||||||
|
// std::cout << "Communicate CopySynch "<<std::endl;
|
||||||
|
// _grid->Barrier();
|
||||||
acceleratorCopySynchronise();
|
acceleratorCopySynchronise();
|
||||||
|
// Starts intranode
|
||||||
for(int i=0;i<Packets.size();i++){
|
for(int i=0;i<Packets.size();i++){
|
||||||
|
// std::cout << "Communicate Begin "<<i<<std::endl;
|
||||||
_grid->StencilSendToRecvFromBegin(MpiReqs,
|
_grid->StencilSendToRecvFromBegin(MpiReqs,
|
||||||
Packets[i].send_buf,
|
Packets[i].send_buf,
|
||||||
Packets[i].to_rank,Packets[i].do_send,
|
Packets[i].to_rank,Packets[i].do_send,
|
||||||
@ -395,7 +406,14 @@ public:
|
|||||||
|
|
||||||
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
|
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
|
||||||
{
|
{
|
||||||
|
// std::cout << "Communicate Complete "<<std::endl;
|
||||||
|
// _grid->Barrier();
|
||||||
FlightRecorder::StepLog("Start communicate complete");
|
FlightRecorder::StepLog("Start communicate complete");
|
||||||
|
// std::cout << "Communicate Complete PollIRecv "<<std::endl;
|
||||||
|
// _grid->Barrier();
|
||||||
|
_grid->StencilSendToRecvFromPollIRecv(MpiReqs);
|
||||||
|
// std::cout << "Communicate Complete Complete "<<std::endl;
|
||||||
|
// _grid->Barrier();
|
||||||
_grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
|
_grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
|
||||||
if ( this->partialDirichlet ) DslashLogPartial();
|
if ( this->partialDirichlet ) DslashLogPartial();
|
||||||
else if ( this->fullDirichlet ) DslashLogDirichlet();
|
else if ( this->fullDirichlet ) DslashLogDirichlet();
|
||||||
@ -483,6 +501,9 @@ public:
|
|||||||
void HaloGather(const Lattice<vobj> &source,compressor &compress)
|
void HaloGather(const Lattice<vobj> &source,compressor &compress)
|
||||||
{
|
{
|
||||||
// accelerator_barrier();
|
// accelerator_barrier();
|
||||||
|
//////////////////////////////////
|
||||||
|
// I will overwrite my send buffers
|
||||||
|
//////////////////////////////////
|
||||||
_grid->StencilBarrier();// Synch shared memory on a single nodes
|
_grid->StencilBarrier();// Synch shared memory on a single nodes
|
||||||
|
|
||||||
assert(source.Grid()==_grid);
|
assert(source.Grid()==_grid);
|
||||||
@ -496,7 +517,11 @@ public:
|
|||||||
HaloGatherDir(source,compress,point,face_idx);
|
HaloGatherDir(source,compress,point,face_idx);
|
||||||
}
|
}
|
||||||
accelerator_barrier(); // All my local gathers are complete
|
accelerator_barrier(); // All my local gathers are complete
|
||||||
// _grid->StencilBarrier();// Synch shared memory on a single nodes
|
#ifdef NVLINK_GET
|
||||||
|
_grid->StencilBarrier(); // He can now get mu local gather, I can get his
|
||||||
|
// Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check
|
||||||
|
// Or issue barrier AFTER the DMA is running
|
||||||
|
#endif
|
||||||
face_table_computed=1;
|
face_table_computed=1;
|
||||||
assert(u_comm_offset==_unified_buffer_size);
|
assert(u_comm_offset==_unified_buffer_size);
|
||||||
}
|
}
|
||||||
@ -535,6 +560,7 @@ public:
|
|||||||
coalescedWrite(to[j] ,coalescedRead(from [j]));
|
coalescedWrite(to[j] ,coalescedRead(from [j]));
|
||||||
});
|
});
|
||||||
acceleratorFenceComputeStream();
|
acceleratorFenceComputeStream();
|
||||||
|
// Also fenced in WilsonKernels
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -663,7 +689,6 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
|
|
||||||
surface_list.resize(surface_list_size);
|
surface_list.resize(surface_list_size);
|
||||||
std::vector<int> surface_list_host(surface_list_size);
|
std::vector<int> surface_list_host(surface_list_size);
|
||||||
int32_t ss=0;
|
int32_t ss=0;
|
||||||
@ -683,6 +708,7 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
acceleratorCopyToDevice(&surface_list_host[0],&surface_list[0],surface_list_size*sizeof(int));
|
acceleratorCopyToDevice(&surface_list_host[0],&surface_list[0],surface_list_size*sizeof(int));
|
||||||
|
std::cout << GridLogMessage<<"BuildSurfaceList size is "<<surface_list_size<<std::endl;
|
||||||
}
|
}
|
||||||
/// Introduce a block structure and switch off comms on boundaries
|
/// Introduce a block structure and switch off comms on boundaries
|
||||||
void DirichletBlock(const Coordinate &dirichlet_block)
|
void DirichletBlock(const Coordinate &dirichlet_block)
|
||||||
|
@ -245,12 +245,12 @@ inline void *acceleratorAllocDevice(size_t bytes)
|
|||||||
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
|
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
|
||||||
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
|
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
|
||||||
inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);};
|
inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);};
|
||||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
|
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
|
||||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
|
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
|
||||||
inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
|
inline void acceleratorCopyToDeviceAsync(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
|
||||||
inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
|
inline void acceleratorCopyFromDeviceAsync(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
|
||||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
|
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
|
||||||
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
|
inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch
|
||||||
{
|
{
|
||||||
cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
|
cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
|
||||||
}
|
}
|
||||||
@ -343,11 +343,28 @@ inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
|
|||||||
|
|
||||||
inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); }
|
inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); }
|
||||||
|
|
||||||
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes);}
|
|
||||||
inline void acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); }
|
///////
|
||||||
inline void acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); }
|
// Asynch event interface
|
||||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
|
///////
|
||||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
|
typedef sycl::event acceleratorEvent_t;
|
||||||
|
|
||||||
|
inline void acceleratorEventWait(acceleratorEvent_t ev)
|
||||||
|
{
|
||||||
|
ev.wait();
|
||||||
|
}
|
||||||
|
|
||||||
|
inline int acceleratorEventIsComplete(acceleratorEvent_t ev)
|
||||||
|
{
|
||||||
|
return (ev.get_info<sycl::info::event::command_execution_status>() == sycl::info::event_command_status::complete);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes);}
|
||||||
|
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(const void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
|
||||||
|
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(const void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
|
||||||
|
|
||||||
|
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
|
||||||
|
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
|
||||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}
|
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}
|
||||||
|
|
||||||
inline int acceleratorIsCommunicable(void *ptr)
|
inline int acceleratorIsCommunicable(void *ptr)
|
||||||
@ -358,8 +375,10 @@ inline int acceleratorIsCommunicable(void *ptr)
|
|||||||
else return 0;
|
else return 0;
|
||||||
#endif
|
#endif
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
//////////////////////////////////////////////
|
//////////////////////////////////////////////
|
||||||
@ -492,19 +511,19 @@ inline void *acceleratorAllocDevice(size_t bytes)
|
|||||||
inline void acceleratorFreeHost(void *ptr){ auto discard=hipFree(ptr);};
|
inline void acceleratorFreeHost(void *ptr){ auto discard=hipFree(ptr);};
|
||||||
inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);};
|
inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);};
|
||||||
inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
|
inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
|
||||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
|
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
|
||||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
|
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
|
||||||
|
|
||||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}
|
inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}
|
||||||
|
|
||||||
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
|
inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch
|
||||||
{
|
{
|
||||||
auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream);
|
auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream);
|
||||||
}
|
}
|
||||||
inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
|
inline void acceleratorCopyToDeviceAsync(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
|
||||||
auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream);
|
auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream);
|
||||||
}
|
}
|
||||||
inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
|
inline void acceleratorCopyFromDeviceAsync(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
|
||||||
auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream);
|
auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream);
|
||||||
}
|
}
|
||||||
inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); };
|
inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); };
|
||||||
@ -564,9 +583,9 @@ inline void acceleratorMem(void)
|
|||||||
|
|
||||||
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
|
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
|
||||||
|
|
||||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); }
|
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); }
|
||||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
|
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
|
||||||
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes);}
|
inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes);}
|
||||||
inline void acceleratorCopySynchronise(void) {};
|
inline void acceleratorCopySynchronise(void) {};
|
||||||
|
|
||||||
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
|
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
|
||||||
@ -649,15 +668,15 @@ accelerator_inline void acceleratorFence(void)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
|
inline void acceleratorCopyDeviceToDevice(const void *from,void *to,size_t bytes)
|
||||||
{
|
{
|
||||||
acceleratorCopyDeviceToDeviceAsynch(from,to,bytes);
|
acceleratorCopyDeviceToDeviceAsynch(from,to,bytes);
|
||||||
acceleratorCopySynchronise();
|
acceleratorCopySynchronise();
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class T> void acceleratorPut(T& dev,T&host)
|
template<class T> void acceleratorPut(T& dev,const T&host)
|
||||||
{
|
{
|
||||||
acceleratorCopyToDevice(&host,&dev,sizeof(T));
|
acceleratorCopyToDevice((void *)&host,&dev,sizeof(T));
|
||||||
}
|
}
|
||||||
template<class T> T acceleratorGet(T& dev)
|
template<class T> T acceleratorGet(T& dev)
|
||||||
{
|
{
|
||||||
|
@ -73,9 +73,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#define thread_critical DO_PRAGMA(omp critical)
|
#define thread_critical DO_PRAGMA(omp critical)
|
||||||
|
|
||||||
#ifdef GRID_OMP
|
#ifdef GRID_OMP
|
||||||
inline void thread_bcopy(void *from, void *to,size_t bytes)
|
inline void thread_bcopy(const void *from, void *to,size_t bytes)
|
||||||
{
|
{
|
||||||
uint64_t *ufrom = (uint64_t *)from;
|
const uint64_t *ufrom = (const uint64_t *)from;
|
||||||
uint64_t *uto = (uint64_t *)to;
|
uint64_t *uto = (uint64_t *)to;
|
||||||
assert(bytes%8==0);
|
assert(bytes%8==0);
|
||||||
uint64_t words=bytes/8;
|
uint64_t words=bytes/8;
|
||||||
@ -84,7 +84,7 @@ inline void thread_bcopy(void *from, void *to,size_t bytes)
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
inline void thread_bcopy(void *from, void *to,size_t bytes)
|
inline void thread_bcopy(const void *from, void *to,size_t bytes)
|
||||||
{
|
{
|
||||||
bcopy(from,to,bytes);
|
bcopy(from,to,bytes);
|
||||||
}
|
}
|
||||||
|
@ -509,6 +509,13 @@ void Grid_init(int *argc,char ***argv)
|
|||||||
Grid_default_latt,
|
Grid_default_latt,
|
||||||
Grid_default_mpi);
|
Grid_default_mpi);
|
||||||
|
|
||||||
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--flightrecorder") ){
|
||||||
|
std::cout << GridLogMessage <<" Enabling flight recorder " <<std::endl;
|
||||||
|
FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
|
||||||
|
FlightRecorder::PrintEntireLog = 1;
|
||||||
|
FlightRecorder::ChecksumComms = 1;
|
||||||
|
FlightRecorder::ChecksumCommsSend=1;
|
||||||
|
}
|
||||||
|
|
||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
|
||||||
std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
|
std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
|
||||||
@ -651,3 +658,4 @@ void Grid_debug_handler_init(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
@ -50,7 +50,7 @@ namespace Grid{
|
|||||||
int64_t index64;
|
int64_t index64;
|
||||||
IndexFromCoorReversed(coor,index64,dims);
|
IndexFromCoorReversed(coor,index64,dims);
|
||||||
if ( index64>=2*1024*1024*1024LL ){
|
if ( index64>=2*1024*1024*1024LL ){
|
||||||
std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
|
// std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
|
||||||
}
|
}
|
||||||
assert(index64<2*1024*1024*1024LL);
|
assert(index64<2*1024*1024*1024LL);
|
||||||
index = (int) index64;
|
index = (int) index64;
|
||||||
|
@ -52,7 +52,7 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
int threads = GridThread::GetThreads();
|
int threads = GridThread::GetThreads();
|
||||||
|
|
||||||
int Ls=8;
|
int Ls=16;
|
||||||
for(int i=0;i<argc;i++) {
|
for(int i=0;i<argc;i++) {
|
||||||
if(std::string(argv[i]) == "-Ls"){
|
if(std::string(argv[i]) == "-Ls"){
|
||||||
std::stringstream ss(argv[i+1]); ss >> Ls;
|
std::stringstream ss(argv[i+1]); ss >> Ls;
|
||||||
|
@ -175,8 +175,8 @@ public:
|
|||||||
timestat.statistics(t_time);
|
timestat.statistics(t_time);
|
||||||
|
|
||||||
dbytes=dbytes*ppn;
|
dbytes=dbytes*ppn;
|
||||||
double xbytes = dbytes*0.5;
|
double xbytes = dbytes;
|
||||||
double bidibytes = dbytes;
|
double bidibytes = dbytes*2.0;
|
||||||
|
|
||||||
std::cout<<GridLogMessage << lat<<"\t"<<Ls<<"\t "
|
std::cout<<GridLogMessage << lat<<"\t"<<Ls<<"\t "
|
||||||
<< bytes << " \t "
|
<< bytes << " \t "
|
||||||
@ -492,17 +492,18 @@ public:
|
|||||||
}
|
}
|
||||||
FGrid->Barrier();
|
FGrid->Barrier();
|
||||||
double t1=usecond();
|
double t1=usecond();
|
||||||
uint64_t ncall = 500;
|
uint64_t no = 50;
|
||||||
|
uint64_t ni = 100;
|
||||||
FGrid->Broadcast(0,&ncall,sizeof(ncall));
|
|
||||||
|
|
||||||
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
|
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
|
||||||
|
|
||||||
time_statistics timestat;
|
time_statistics timestat;
|
||||||
std::vector<double> t_time(ncall);
|
std::vector<double> t_time(no);
|
||||||
for(uint64_t i=0;i<ncall;i++){
|
for(uint64_t i=0;i<no;i++){
|
||||||
t0=usecond();
|
t0=usecond();
|
||||||
|
for(uint64_t j=0;j<ni;j++){
|
||||||
Dw.DhopEO(src_o,r_e,DaggerNo);
|
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||||
|
}
|
||||||
t1=usecond();
|
t1=usecond();
|
||||||
t_time[i] = t1-t0;
|
t_time[i] = t1-t0;
|
||||||
}
|
}
|
||||||
@ -520,11 +521,11 @@ public:
|
|||||||
double mf_hi, mf_lo, mf_err;
|
double mf_hi, mf_lo, mf_err;
|
||||||
|
|
||||||
timestat.statistics(t_time);
|
timestat.statistics(t_time);
|
||||||
mf_hi = flops/timestat.min;
|
mf_hi = flops/timestat.min*ni;
|
||||||
mf_lo = flops/timestat.max;
|
mf_lo = flops/timestat.max*ni;
|
||||||
mf_err= flops/timestat.min * timestat.err/timestat.mean;
|
mf_err= flops/timestat.min * timestat.err/timestat.mean;
|
||||||
|
|
||||||
mflops = flops/timestat.mean;
|
mflops = flops/timestat.mean*ni;
|
||||||
mflops_all.push_back(mflops);
|
mflops_all.push_back(mflops);
|
||||||
if ( mflops_best == 0 ) mflops_best = mflops;
|
if ( mflops_best == 0 ) mflops_best = mflops;
|
||||||
if ( mflops_worst== 0 ) mflops_worst= mflops;
|
if ( mflops_worst== 0 ) mflops_worst= mflops;
|
||||||
@ -535,6 +536,7 @@ public:
|
|||||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
|
||||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
|
||||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call "<< timestat.mean/ni<<std::endl;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -654,17 +656,19 @@ public:
|
|||||||
}
|
}
|
||||||
FGrid->Barrier();
|
FGrid->Barrier();
|
||||||
double t1=usecond();
|
double t1=usecond();
|
||||||
uint64_t ncall = 500;
|
|
||||||
|
|
||||||
FGrid->Broadcast(0,&ncall,sizeof(ncall));
|
uint64_t no = 50;
|
||||||
|
uint64_t ni = 100;
|
||||||
|
|
||||||
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
|
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
|
||||||
|
|
||||||
time_statistics timestat;
|
time_statistics timestat;
|
||||||
std::vector<double> t_time(ncall);
|
std::vector<double> t_time(no);
|
||||||
for(uint64_t i=0;i<ncall;i++){
|
for(uint64_t i=0;i<no;i++){
|
||||||
t0=usecond();
|
t0=usecond();
|
||||||
|
for(uint64_t j=0;j<ni;j++){
|
||||||
Ds.DhopEO(src_o,r_e,DaggerNo);
|
Ds.DhopEO(src_o,r_e,DaggerNo);
|
||||||
|
}
|
||||||
t1=usecond();
|
t1=usecond();
|
||||||
t_time[i] = t1-t0;
|
t_time[i] = t1-t0;
|
||||||
}
|
}
|
||||||
@ -675,11 +679,11 @@ public:
|
|||||||
double mf_hi, mf_lo, mf_err;
|
double mf_hi, mf_lo, mf_err;
|
||||||
|
|
||||||
timestat.statistics(t_time);
|
timestat.statistics(t_time);
|
||||||
mf_hi = flops/timestat.min;
|
mf_hi = flops/timestat.min*ni;
|
||||||
mf_lo = flops/timestat.max;
|
mf_lo = flops/timestat.max*ni;
|
||||||
mf_err= flops/timestat.min * timestat.err/timestat.mean;
|
mf_err= flops/timestat.min * timestat.err/timestat.mean;
|
||||||
|
|
||||||
mflops = flops/timestat.mean;
|
mflops = flops/timestat.mean*ni;
|
||||||
mflops_all.push_back(mflops);
|
mflops_all.push_back(mflops);
|
||||||
if ( mflops_best == 0 ) mflops_best = mflops;
|
if ( mflops_best == 0 ) mflops_best = mflops;
|
||||||
if ( mflops_worst== 0 ) mflops_worst= mflops;
|
if ( mflops_worst== 0 ) mflops_worst= mflops;
|
||||||
@ -689,6 +693,7 @@ public:
|
|||||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
|
||||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
|
||||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call "<< timestat.mean/ni<<std::endl;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -792,19 +797,18 @@ public:
|
|||||||
Dc.M(src,r);
|
Dc.M(src,r);
|
||||||
}
|
}
|
||||||
FGrid->Barrier();
|
FGrid->Barrier();
|
||||||
double t1=usecond();
|
uint64_t ni = 100;
|
||||||
uint64_t ncall = 500;
|
uint64_t no = 50;
|
||||||
|
|
||||||
FGrid->Broadcast(0,&ncall,sizeof(ncall));
|
|
||||||
|
|
||||||
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
|
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
|
||||||
|
|
||||||
time_statistics timestat;
|
time_statistics timestat;
|
||||||
std::vector<double> t_time(ncall);
|
std::vector<double> t_time(no);
|
||||||
for(uint64_t i=0;i<ncall;i++){
|
for(uint64_t i=0;i<no;i++){
|
||||||
t0=usecond();
|
double t0=usecond();
|
||||||
|
for(uint64_t j=0;j<ni;j++){
|
||||||
Dc.M(src,r);
|
Dc.M(src,r);
|
||||||
t1=usecond();
|
}
|
||||||
|
double t1=usecond();
|
||||||
t_time[i] = t1-t0;
|
t_time[i] = t1-t0;
|
||||||
}
|
}
|
||||||
FGrid->Barrier();
|
FGrid->Barrier();
|
||||||
@ -814,20 +818,21 @@ public:
|
|||||||
double mf_hi, mf_lo, mf_err;
|
double mf_hi, mf_lo, mf_err;
|
||||||
|
|
||||||
timestat.statistics(t_time);
|
timestat.statistics(t_time);
|
||||||
mf_hi = flops/timestat.min;
|
mf_hi = flops/timestat.min*ni;
|
||||||
mf_lo = flops/timestat.max;
|
mf_lo = flops/timestat.max*ni;
|
||||||
mf_err= flops/timestat.min * timestat.err/timestat.mean;
|
mf_err= flops/timestat.min * timestat.err/timestat.mean;
|
||||||
|
|
||||||
mflops = flops/timestat.mean;
|
mflops = flops/timestat.mean*ni;
|
||||||
mflops_all.push_back(mflops);
|
mflops_all.push_back(mflops);
|
||||||
if ( mflops_best == 0 ) mflops_best = mflops;
|
if ( mflops_best == 0 ) mflops_best = mflops;
|
||||||
if ( mflops_worst== 0 ) mflops_worst= mflops;
|
if ( mflops_worst== 0 ) mflops_worst= mflops;
|
||||||
if ( mflops>mflops_best ) mflops_best = mflops;
|
if ( mflops>mflops_best ) mflops_best = mflops;
|
||||||
if ( mflops<mflops_worst) mflops_worst= mflops;
|
if ( mflops<mflops_worst) mflops_worst= mflops;
|
||||||
|
|
||||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<" "<<timestat.mean<<" us"<<std::endl;
|
||||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per rank "<< mflops/NP<<std::endl;
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per rank "<< mflops/NP<<std::endl;
|
||||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per node "<< mflops/NN<<std::endl;
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per node "<< mflops/NN<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov us per call "<< timestat.mean/ni<<std::endl;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -872,7 +877,7 @@ int main (int argc, char ** argv)
|
|||||||
int do_dslash=1;
|
int do_dslash=1;
|
||||||
|
|
||||||
int sel=4;
|
int sel=4;
|
||||||
std::vector<int> L_list({8,12,16,24});
|
std::vector<int> L_list({8,12,16,24,32});
|
||||||
int selm1=sel-1;
|
int selm1=sel-1;
|
||||||
|
|
||||||
std::vector<double> clover;
|
std::vector<double> clover;
|
||||||
|
74
systems/Aurora/benchmarks/bench16.pbs
Normal file
74
systems/Aurora/benchmarks/bench16.pbs
Normal file
@ -0,0 +1,74 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
##PBS -q LatticeQCD_aesp_CNDA
|
||||||
|
#PBS -q debug-scaling
|
||||||
|
##PBS -q prod
|
||||||
|
#PBS -l select=16
|
||||||
|
#PBS -l walltime=00:20:00
|
||||||
|
#PBS -A LatticeQCD_aesp_CNDA
|
||||||
|
|
||||||
|
cd $PBS_O_WORKDIR
|
||||||
|
|
||||||
|
source ../sourceme.sh
|
||||||
|
|
||||||
|
cp $PBS_NODEFILE nodefile
|
||||||
|
|
||||||
|
export OMP_NUM_THREADS=4
|
||||||
|
export MPICH_OFI_NIC_POLICY=GPU
|
||||||
|
|
||||||
|
#export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
|
||||||
|
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
|
||||||
|
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
|
||||||
|
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
|
||||||
|
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
|
||||||
|
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
|
||||||
|
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
|
||||||
|
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
|
||||||
|
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
|
||||||
|
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
|
||||||
|
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
|
||||||
|
|
||||||
|
#
|
||||||
|
# Local vol 16.16.16.32
|
||||||
|
#
|
||||||
|
|
||||||
|
LX=16
|
||||||
|
LY=16
|
||||||
|
LZ=16
|
||||||
|
LT=32
|
||||||
|
|
||||||
|
NX=2
|
||||||
|
NY=2
|
||||||
|
NZ=4
|
||||||
|
NT=1
|
||||||
|
|
||||||
|
GX=2
|
||||||
|
GY=2
|
||||||
|
GZ=1
|
||||||
|
GT=3
|
||||||
|
|
||||||
|
PX=$((NX * GX ))
|
||||||
|
PY=$((NY * GY ))
|
||||||
|
PZ=$((NZ * GZ ))
|
||||||
|
PT=$((NT * GT ))
|
||||||
|
|
||||||
|
VX=$((PX * LX ))
|
||||||
|
VY=$((PY * LY ))
|
||||||
|
VZ=$((PZ * LZ ))
|
||||||
|
VT=$((PT * LT ))
|
||||||
|
|
||||||
|
NP=$((PX*PY*PZ*PT))
|
||||||
|
VOL=${VX}.${VY}.${VZ}.${VT}
|
||||||
|
AT=8
|
||||||
|
MPI=${PX}.${PY}.${PZ}.${PT}
|
||||||
|
|
||||||
|
CMD="mpiexec -np $NP -ppn 12 -envall \
|
||||||
|
./gpu_tile.sh ./Benchmark_dwf_fp32 --mpi $MPI --grid $VOL \
|
||||||
|
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads $AT --comms-overlap "
|
||||||
|
|
||||||
|
echo VOL $VOL
|
||||||
|
echo MPI $MPI
|
||||||
|
echo NPROC $NP
|
||||||
|
echo $CMD
|
||||||
|
$CMD
|
||||||
|
|
@ -19,7 +19,7 @@ export ONEAPI_DEVICE_FILTER=gpu,level_zero
|
|||||||
|
|
||||||
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
|
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
|
||||||
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
|
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
|
||||||
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:3
|
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:4
|
||||||
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
|
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
|
||||||
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
|
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
|
||||||
#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
|
#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
|
||||||
@ -30,8 +30,8 @@ echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_A
|
|||||||
|
|
||||||
if [ $PALS_RANKID = "0" ]
|
if [ $PALS_RANKID = "0" ]
|
||||||
then
|
then
|
||||||
numactl -p $NUMAP -N $NUMAP unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@"
|
# numactl -p $NUMAP -N $NUMAP unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@"
|
||||||
# numactl -p $NUMAP -N $NUMAP "$@"
|
numactl -p $NUMAP -N $NUMAP "$@"
|
||||||
else
|
else
|
||||||
numactl -p $NUMAP -N $NUMAP "$@"
|
numactl -p $NUMAP -N $NUMAP "$@"
|
||||||
fi
|
fi
|
||||||
|
@ -1,18 +1,19 @@
|
|||||||
#Ahead of time compile for PVC
|
#Ahead of time compile for PVC
|
||||||
|
|
||||||
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib"
|
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc"
|
||||||
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/"
|
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/ -fPIC"
|
||||||
|
|
||||||
#JIT compile
|
#JIT compile
|
||||||
#export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
|
#export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
|
||||||
#export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions "
|
#export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions "
|
||||||
|
|
||||||
../../configure \
|
../configure \
|
||||||
--enable-simd=GPU \
|
--enable-simd=GPU \
|
||||||
--enable-reduction=grid \
|
--enable-reduction=grid \
|
||||||
--enable-gen-simd-width=64 \
|
--enable-gen-simd-width=64 \
|
||||||
--enable-comms=mpi-auto \
|
--enable-comms=mpi-auto \
|
||||||
--enable-debug \
|
--enable-debug \
|
||||||
|
--prefix $HOME/gpt-install \
|
||||||
--disable-gparity \
|
--disable-gparity \
|
||||||
--disable-fermion-reps \
|
--disable-fermion-reps \
|
||||||
--with-lime=$CLIME \
|
--with-lime=$CLIME \
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
spack load c-lime
|
spack load c-lime
|
||||||
module load emacs
|
module load emacs
|
||||||
module load PrgEnv-gnu
|
module load PrgEnv-gnu
|
||||||
module load rocm
|
module load rocm/6.0.0
|
||||||
module load cray-mpich
|
module load cray-mpich
|
||||||
module load gmp
|
module load gmp
|
||||||
module load cray-fftw
|
module load cray-fftw
|
||||||
|
206
systems/WorkArounds.txt
Normal file
206
systems/WorkArounds.txt
Normal file
@ -0,0 +1,206 @@
|
|||||||
|
The purpose of this file is to collate all non-obvious known magic shell variables
|
||||||
|
and compiler flags required for either correctness or performance on various systems.
|
||||||
|
|
||||||
|
A repository of work-arounds.
|
||||||
|
|
||||||
|
Contents:
|
||||||
|
1. Interconnect + MPI
|
||||||
|
2. Compilation
|
||||||
|
3. Profiling
|
||||||
|
|
||||||
|
************************
|
||||||
|
* 1. INTERCONNECT + MPI
|
||||||
|
************************
|
||||||
|
|
||||||
|
--------------------------------------------------------------------
|
||||||
|
MPI2-IO correctness: force OpenMPI to use the MPICH romio implementation for parallel I/O
|
||||||
|
--------------------------------------------------------------------
|
||||||
|
export OMPI_MCA_io=romio321
|
||||||
|
|
||||||
|
--------------------------------------
|
||||||
|
ROMIO fail with > 2GB per node read (32 bit issue)
|
||||||
|
--------------------------------------
|
||||||
|
|
||||||
|
Use later MPICH
|
||||||
|
|
||||||
|
https://github.com/paboyle/Grid/issues/381
|
||||||
|
|
||||||
|
https://github.com/pmodels/mpich/commit/3a479ab0
|
||||||
|
|
||||||
|
--------------------------------------------------------------------
|
||||||
|
Slingshot: Frontier and Perlmutter libfabric slow down
|
||||||
|
and physical memory fragmentation
|
||||||
|
--------------------------------------------------------------------
|
||||||
|
export FI_MR_CACHE_MONITOR=disabled
|
||||||
|
or
|
||||||
|
export FI_MR_CACHE_MONITOR=kdreg2
|
||||||
|
|
||||||
|
--------------------------------------------------------------------
|
||||||
|
Perlmutter
|
||||||
|
--------------------------------------------------------------------
|
||||||
|
|
||||||
|
export MPICH_RDMA_ENABLED_CUDA=1
|
||||||
|
export MPICH_GPU_IPC_ENABLED=1
|
||||||
|
export MPICH_GPU_EAGER_REGISTER_HOST_MEM=0
|
||||||
|
export MPICH_GPU_NO_ASYNC_MEMCPY=0
|
||||||
|
|
||||||
|
--------------------------------------------------------------------
|
||||||
|
Frontier/LumiG
|
||||||
|
--------------------------------------------------------------------
|
||||||
|
|
||||||
|
Hiding ROCR_VISIBLE_DEVICES triggers SDMA engines to be used for GPU-GPU
|
||||||
|
|
||||||
|
cat << EOF > select_gpu
|
||||||
|
#!/bin/bash
|
||||||
|
export MPICH_GPU_SUPPORT_ENABLED=1
|
||||||
|
export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
|
||||||
|
export GPU_MAP=(0 1 2 3 7 6 5 4)
|
||||||
|
export NUMA_MAP=(3 3 1 1 2 2 0 0)
|
||||||
|
export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
|
||||||
|
export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]}
|
||||||
|
export HIP_VISIBLE_DEVICES=\$GPU
|
||||||
|
unset ROCR_VISIBLE_DEVICES
|
||||||
|
echo RANK \$SLURM_LOCALID using GPU \$GPU
|
||||||
|
exec numactl -m \$NUMA -N \$NUMA \$*
|
||||||
|
EOF
|
||||||
|
chmod +x ./select_gpu
|
||||||
|
|
||||||
|
srun ./select_gpu BINARY
|
||||||
|
|
||||||
|
|
||||||
|
--------------------------------------------------------------------
|
||||||
|
Mellanox performance with A100 GPU (Tursa, Booster, Leonardo)
|
||||||
|
--------------------------------------------------------------------
|
||||||
|
export OMPI_MCA_btl=^uct,openib
|
||||||
|
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
|
||||||
|
export UCX_RNDV_SCHEME=put_zcopy
|
||||||
|
export UCX_RNDV_THRESH=16384
|
||||||
|
export UCX_IB_GPU_DIRECT_RDMA=yes
|
||||||
|
|
||||||
|
--------------------------------------------------------------------
|
||||||
|
Mellanox + A100 correctness (Tursa, Booster, Leonardo)
|
||||||
|
--------------------------------------------------------------------
|
||||||
|
export UCX_MEMTYPE_CACHE=n
|
||||||
|
|
||||||
|
--------------------------------------------------------------------
|
||||||
|
MPICH/Aurora/PVC correctness and performance
|
||||||
|
--------------------------------------------------------------------
|
||||||
|
|
||||||
|
https://github.com/pmodels/mpich/issues/7302
|
||||||
|
|
||||||
|
--enable-cuda-aware-mpi=no
|
||||||
|
--enable-unified=no
|
||||||
|
|
||||||
|
Grid's internal D-H-H-D pipeline mode, avoid device memory in MPI
|
||||||
|
Do not use SVM
|
||||||
|
|
||||||
|
Ideally use MPICH with fix to issue 7302:
|
||||||
|
|
||||||
|
https://github.com/pmodels/mpich/pull/7312
|
||||||
|
|
||||||
|
Ideally:
|
||||||
|
MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE=generic
|
||||||
|
|
||||||
|
Alternatives:
|
||||||
|
export MPIR_CVAR_NOLOCAL=1
|
||||||
|
export MPIR_CVAR_CH4_IPC_GPU_P2P_THRESHOLD=1000000000
|
||||||
|
|
||||||
|
--------------------------------------------------------------------
|
||||||
|
MPICH/Aurora/PVC correctness and performance
|
||||||
|
--------------------------------------------------------------------
|
||||||
|
|
||||||
|
Broken:
|
||||||
|
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
|
||||||
|
|
||||||
|
This gives good peformance without requiring
|
||||||
|
--enable-cuda-aware-mpi=no
|
||||||
|
|
||||||
|
But is an open issue reported by James Osborn
|
||||||
|
https://github.com/pmodels/mpich/issues/7139
|
||||||
|
|
||||||
|
Possibly resolved but unclear if in the installed software yet.
|
||||||
|
|
||||||
|
************************
|
||||||
|
* 2. COMPILATION
|
||||||
|
************************
|
||||||
|
|
||||||
|
--------------------------------------------------------------------
|
||||||
|
G++ compiler breakage / graveyard
|
||||||
|
--------------------------------------------------------------------
|
||||||
|
|
||||||
|
9.3.0, 10.3.1,
|
||||||
|
https://github.com/paboyle/Grid/issues/290
|
||||||
|
https://github.com/paboyle/Grid/issues/264
|
||||||
|
|
||||||
|
Working (-) Broken (X):
|
||||||
|
|
||||||
|
4.9.0 -
|
||||||
|
4.9.1 -
|
||||||
|
5.1.0 X
|
||||||
|
5.2.0 X
|
||||||
|
5.3.0 X
|
||||||
|
5.4.0 X
|
||||||
|
6.1.0 X
|
||||||
|
6.2.0 X
|
||||||
|
6.3.0 -
|
||||||
|
7.1.0 -
|
||||||
|
8.0.0 (HEAD) -
|
||||||
|
|
||||||
|
https://github.com/paboyle/Grid/issues/100
|
||||||
|
|
||||||
|
--------------------------------------------------------------------
|
||||||
|
AMD GPU nodes :
|
||||||
|
--------------------------------------------------------------------
|
||||||
|
|
||||||
|
multiple ROCM versions broken; use 5.3.0
|
||||||
|
manifests itself as wrong results in fp32
|
||||||
|
|
||||||
|
https://github.com/paboyle/Grid/issues/464
|
||||||
|
|
||||||
|
--------------------------------------------------------------------
|
||||||
|
Aurora/PVC
|
||||||
|
--------------------------------------------------------------------
|
||||||
|
|
||||||
|
SYCL ahead of time compilation (fixes rare runtime JIT errors and faster runtime, PB)
|
||||||
|
SYCL slow link and relocatable code issues (Christoph Lehner)
|
||||||
|
Opt large register file required for good performance in fp64
|
||||||
|
|
||||||
|
|
||||||
|
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
|
||||||
|
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc"
|
||||||
|
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -fPIC"
|
||||||
|
|
||||||
|
--------------------------------------------------------------------
|
||||||
|
Aurora/PVC useful extra options
|
||||||
|
--------------------------------------------------------------------
|
||||||
|
|
||||||
|
Host only sanitizer:
|
||||||
|
-Xarch_host -fsanitize=leak
|
||||||
|
-Xarch_host -fsanitize=address
|
||||||
|
|
||||||
|
Deterministic MPI reduction:
|
||||||
|
export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
|
||||||
|
export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
|
||||||
|
export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
|
||||||
|
unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
|
||||||
|
unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
|
||||||
|
unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
************************
|
||||||
|
* 3. Visual profile tools
|
||||||
|
************************
|
||||||
|
|
||||||
|
--------------------------------------------------------------------
|
||||||
|
Frontier/rocprof
|
||||||
|
--------------------------------------------------------------------
|
||||||
|
|
||||||
|
--------------------------------------------------------------------
|
||||||
|
Aurora/unitrace
|
||||||
|
--------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
--------------------------------------------------------------------
|
||||||
|
Tursa/nsight-sys
|
||||||
|
--------------------------------------------------------------------
|
32
systems/sdcc-genoa/bench.slurm
Normal file
32
systems/sdcc-genoa/bench.slurm
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#SBATCH --partition lqcd
|
||||||
|
#SBATCH --time=00:50:00
|
||||||
|
#SBATCH -A lqcdtest
|
||||||
|
#SBATCH -q lqcd
|
||||||
|
#SBATCH --exclusive
|
||||||
|
#SBATCH --nodes=1
|
||||||
|
#SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054
|
||||||
|
#SBATCH --ntasks=1
|
||||||
|
#SBATCH --cpus-per-task=64
|
||||||
|
#SBATCH --qos lqcd
|
||||||
|
|
||||||
|
source sourceme.sh
|
||||||
|
|
||||||
|
export PLACES=(1:16:4 1:32:2 0:64:1);
|
||||||
|
export THR=(16 32 64)
|
||||||
|
|
||||||
|
for t in 2
|
||||||
|
do
|
||||||
|
|
||||||
|
export OMP_NUM_THREADS=${THR[$t]}
|
||||||
|
export OMP_PLACES=${PLACES[$t]}
|
||||||
|
export thr=${THR[$t]}
|
||||||
|
|
||||||
|
#for vol in 24.24.24.24 32.32.32.32 48.48.48.96
|
||||||
|
for vol in 48.48.48.96
|
||||||
|
do
|
||||||
|
srun -N1 -n1 ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid $vol --dslash-asm --shm 8192 > $vol.1node.thr$thr
|
||||||
|
done
|
||||||
|
#srun -N1 -n1 ./benchmarks/Benchmark_usqcd --mpi 1.1.1.1 --grid $vol > usqcd.1node.thr$thr
|
||||||
|
done
|
||||||
|
|
36
systems/sdcc-genoa/bench2.slurm
Normal file
36
systems/sdcc-genoa/bench2.slurm
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#SBATCH --partition lqcd
|
||||||
|
#SBATCH --time=00:50:00
|
||||||
|
#SBATCH -A lqcdtest
|
||||||
|
#SBATCH -q lqcd
|
||||||
|
#SBATCH --exclusive
|
||||||
|
#SBATCH --nodes=2
|
||||||
|
#SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054
|
||||||
|
#SBATCH --ntasks=2
|
||||||
|
#SBATCH --cpus-per-task=64
|
||||||
|
#SBATCH --qos lqcd
|
||||||
|
|
||||||
|
source sourceme.sh
|
||||||
|
|
||||||
|
export PLACES=(1:16:4 1:32:2 0:64:1);
|
||||||
|
export THR=(16 32 64)
|
||||||
|
|
||||||
|
nodes=2
|
||||||
|
mpi=1.1.1.2
|
||||||
|
|
||||||
|
for t in 2
|
||||||
|
do
|
||||||
|
|
||||||
|
export OMP_NUM_THREADS=${THR[$t]}
|
||||||
|
export OMP_PLACES=${PLACES[$t]}
|
||||||
|
export thr=${THR[$t]}
|
||||||
|
|
||||||
|
#srun -N$nodes -n$nodes ./benchmarks/Benchmark_usqcd --mpi $mpi --grid 32.32.32.32 > usqcd.n$nodes.thr$thr
|
||||||
|
|
||||||
|
for vol in 64.64.64.128
|
||||||
|
do
|
||||||
|
srun -N$nodes -n$nodes ./benchmarks/Benchmark_dwf_fp32 --mpi $mpi --grid $vol --dslash-asm --comms-overlap --shm 8192 > $vol.n$nodes.overlap.thr$thr
|
||||||
|
done
|
||||||
|
|
||||||
|
done
|
||||||
|
|
16
systems/sdcc-genoa/config-command
Normal file
16
systems/sdcc-genoa/config-command
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
../../configure \
|
||||||
|
--enable-comms=mpi-auto \
|
||||||
|
--enable-unified=yes \
|
||||||
|
--enable-shm=shmopen \
|
||||||
|
--enable-shm-fast-path=shmopen \
|
||||||
|
--enable-accelerator=none \
|
||||||
|
--enable-simd=AVX512 \
|
||||||
|
--disable-accelerator-cshift \
|
||||||
|
--disable-fermion-reps \
|
||||||
|
--disable-gparity \
|
||||||
|
CXX=clang++ \
|
||||||
|
MPICXX=mpicxx \
|
||||||
|
CXXFLAGS="-std=c++17"
|
||||||
|
|
||||||
|
|
||||||
|
|
4
systems/sdcc-genoa/sourceme.sh
Normal file
4
systems/sdcc-genoa/sourceme.sh
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
source $HOME/spack/share/spack/setup-env.sh
|
||||||
|
spack load llvm@17.0.4
|
||||||
|
export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-17.0.4-laufdrcip63ivkadmtgoepwmj3dtztdu/lib:$LD_LIBRARY_PATH
|
||||||
|
module load openmpi
|
781
tests/debug/Test_general_coarse_hdcg_phys48_blockcg.cc
Normal file
781
tests/debug/Test_general_coarse_hdcg_phys48_blockcg.cc
Normal file
@ -0,0 +1,781 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./tests/Test_general_coarse_hdcg.cc
|
||||||
|
|
||||||
|
Copyright (C) 2023
|
||||||
|
|
||||||
|
Author: Peter Boyle <pboyle@bnl.gov>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#include <Grid/Grid.h>
|
||||||
|
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
|
||||||
|
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h>
|
||||||
|
#include <Grid/algorithms/iterative/AdefMrhs.h>
|
||||||
|
#include <Grid/algorithms/iterative/PowerSpectrum.h>
|
||||||
|
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
using namespace Grid;
|
||||||
|
|
||||||
|
template<class aggregation>
|
||||||
|
void SaveFineEvecs(aggregation &Agg,std::string file)
|
||||||
|
{
|
||||||
|
#ifdef HAVE_LIME
|
||||||
|
emptyUserRecord record;
|
||||||
|
ScidacWriter WR(Agg[0].Grid()->IsBoss());
|
||||||
|
WR.open(file);
|
||||||
|
for(int b=0;b<Agg.size();b++){
|
||||||
|
WR.writeScidacFieldRecord(Agg[b],record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
|
||||||
|
}
|
||||||
|
WR.close();
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
template<class aggregation>
|
||||||
|
void SaveBasis(aggregation &Agg,std::string file)
|
||||||
|
{
|
||||||
|
#ifdef HAVE_LIME
|
||||||
|
emptyUserRecord record;
|
||||||
|
ScidacWriter WR(Agg.FineGrid->IsBoss());
|
||||||
|
WR.open(file);
|
||||||
|
for(int b=0;b<Agg.subspace.size();b++){
|
||||||
|
WR.writeScidacFieldRecord(Agg.subspace[b],record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
|
||||||
|
// WR.writeScidacFieldRecord(Agg.subspace[b],record);
|
||||||
|
}
|
||||||
|
WR.close();
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
template<class aggregation>
|
||||||
|
void LoadBasis(aggregation &Agg, std::string file)
|
||||||
|
{
|
||||||
|
#ifdef HAVE_LIME
|
||||||
|
emptyUserRecord record;
|
||||||
|
ScidacReader RD ;
|
||||||
|
RD.open(file);
|
||||||
|
for(int b=0;b<Agg.subspace.size();b++){
|
||||||
|
RD.readScidacFieldRecord(Agg.subspace[b],record,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
|
||||||
|
// RD.readScidacFieldRecord(Agg.subspace[b],record,0);
|
||||||
|
}
|
||||||
|
RD.close();
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class aggregation>
|
||||||
|
void LoadBasisSkip(aggregation &Agg, std::string file,int N,LatticeFermionF & tmp)
|
||||||
|
{
|
||||||
|
#ifdef HAVE_LIME
|
||||||
|
emptyUserRecord record;
|
||||||
|
ScidacReader RD ;
|
||||||
|
|
||||||
|
RD.open(file);
|
||||||
|
for(int b=0;b<Agg.subspace.size();b++){
|
||||||
|
for(int n=0;n<N;n++){
|
||||||
|
RD.readScidacFieldRecord(tmp,record,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
|
||||||
|
if(n==0) precisionChange(Agg.subspace[b],tmp);
|
||||||
|
}
|
||||||
|
// RD.readScidacFieldRecord(Agg.subspace[b],record,0);
|
||||||
|
}
|
||||||
|
RD.close();
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
template<class aggregation>
|
||||||
|
void LoadBasisSum(aggregation &Agg, std::string file,int N,LatticeFermionF & tmp)
|
||||||
|
{
|
||||||
|
#ifdef HAVE_LIME
|
||||||
|
emptyUserRecord record;
|
||||||
|
ScidacReader RD ;
|
||||||
|
|
||||||
|
LatticeFermionF sum(tmp.Grid());
|
||||||
|
RD.open(file);
|
||||||
|
for(int b=0;b<Agg.subspace.size();b++){
|
||||||
|
sum=Zero();
|
||||||
|
for(int n=0;n<N;n++){
|
||||||
|
RD.readScidacFieldRecord(tmp,record,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
|
||||||
|
sum=sum+tmp;
|
||||||
|
}
|
||||||
|
precisionChange(Agg.subspace[b],sum);
|
||||||
|
// RD.readScidacFieldRecord(Agg.subspace[b],record,0);
|
||||||
|
}
|
||||||
|
RD.close();
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class CoarseVector>
|
||||||
|
void SaveEigenvectors(std::vector<RealD> &eval,
|
||||||
|
std::vector<CoarseVector> &evec,
|
||||||
|
std::string evec_file,
|
||||||
|
std::string eval_file)
|
||||||
|
{
|
||||||
|
#ifdef HAVE_LIME
|
||||||
|
emptyUserRecord record;
|
||||||
|
ScidacWriter WR(evec[0].Grid()->IsBoss());
|
||||||
|
WR.open(evec_file);
|
||||||
|
for(int b=0;b<evec.size();b++){
|
||||||
|
WR.writeScidacFieldRecord(evec[b],record,0,0);
|
||||||
|
}
|
||||||
|
WR.close();
|
||||||
|
XmlWriter WRx(eval_file);
|
||||||
|
write(WRx,"evals",eval);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
template<class CoarseVector>
|
||||||
|
void LoadEigenvectors(std::vector<RealD> &eval,
|
||||||
|
std::vector<CoarseVector> &evec,
|
||||||
|
std::string evec_file,
|
||||||
|
std::string eval_file)
|
||||||
|
{
|
||||||
|
#ifdef HAVE_LIME
|
||||||
|
XmlReader RDx(eval_file);
|
||||||
|
read(RDx,"evals",eval);
|
||||||
|
emptyUserRecord record;
|
||||||
|
|
||||||
|
Grid::ScidacReader RD ;
|
||||||
|
RD.open(evec_file);
|
||||||
|
assert(evec.size()==eval.size());
|
||||||
|
for(int k=0;k<eval.size();k++) {
|
||||||
|
RD.readScidacFieldRecord(evec[k],record);
|
||||||
|
}
|
||||||
|
RD.close();
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
// Want Op in CoarsenOp to call MatPcDagMatPc
|
||||||
|
template<class Field>
|
||||||
|
class HermOpAdaptor : public LinearOperatorBase<Field>
|
||||||
|
{
|
||||||
|
LinearOperatorBase<Field> & wrapped;
|
||||||
|
public:
|
||||||
|
HermOpAdaptor(LinearOperatorBase<Field> &wrapme) : wrapped(wrapme) {};
|
||||||
|
void Op (const Field &in, Field &out) { wrapped.HermOp(in,out); }
|
||||||
|
void HermOp(const Field &in, Field &out) { wrapped.HermOp(in,out); }
|
||||||
|
void AdjOp (const Field &in, Field &out){ wrapped.HermOp(in,out); }
|
||||||
|
void OpDiag (const Field &in, Field &out) { assert(0); }
|
||||||
|
void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
|
||||||
|
void OpDirAll (const Field &in, std::vector<Field> &out) { assert(0); };
|
||||||
|
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
|
||||||
|
};
|
||||||
|
|
||||||
|
template<class Field> class FixedCGPolynomial : public LinearFunction<Field>
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
using LinearFunction<Field>::operator();
|
||||||
|
typedef LinearOperatorBase<Field> FineOperator;
|
||||||
|
FineOperator & _SmootherOperator;
|
||||||
|
ConjugateGradientPolynomial<Field> CG;
|
||||||
|
int iters;
|
||||||
|
bool record;
|
||||||
|
int replay_count;
|
||||||
|
FixedCGPolynomial(int _iters, FineOperator &SmootherOperator) :
|
||||||
|
_SmootherOperator(SmootherOperator),
|
||||||
|
iters(_iters),
|
||||||
|
record(true),
|
||||||
|
CG(0.0,_iters,false)
|
||||||
|
{
|
||||||
|
std::cout << GridLogMessage<<" FixedCGPolynomial order "<<iters<<std::endl;
|
||||||
|
replay_count = 0;
|
||||||
|
};
|
||||||
|
void operator() (const Field &in, Field &out)
|
||||||
|
{
|
||||||
|
#if 1
|
||||||
|
GridBase *grid = in.Grid();
|
||||||
|
Field Mx0(grid);
|
||||||
|
Field r0(grid);
|
||||||
|
Field Minvr0(grid);
|
||||||
|
|
||||||
|
_SmootherOperator.HermOp(out,Mx0);
|
||||||
|
|
||||||
|
r0 = in - Mx0;
|
||||||
|
|
||||||
|
Minvr0 = Zero();
|
||||||
|
Minvr0.Checkerboard()=in.Checkerboard();
|
||||||
|
|
||||||
|
if ( record ) {
|
||||||
|
std::cout << " FixedCGPolynomial recording polynomial "<<std::endl;
|
||||||
|
CG.Solve(_SmootherOperator,r0,Minvr0);
|
||||||
|
record = false;
|
||||||
|
/*
|
||||||
|
std::cout << "P(x) = 0 "<<std::endl;
|
||||||
|
for(int i=0;i<CG.polynomial.size();i++){
|
||||||
|
std::cout<<" + "<< CG.polynomial[i]<<" * (x**"<<i<<")"<<std::endl;
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
Field tmp(Minvr0.Grid());
|
||||||
|
CG.CGsequenceHermOp(_SmootherOperator,r0,tmp);
|
||||||
|
tmp = tmp - Minvr0;
|
||||||
|
std::cout << " CGsequence error "<<norm2(tmp)<<" / "<<norm2(out)<<std::endl;
|
||||||
|
} else {
|
||||||
|
std::cout << " FixedCGPolynomial replaying polynomial "<<std::endl;
|
||||||
|
CG.CGsequenceHermOp(_SmootherOperator,r0,Minvr0);
|
||||||
|
if ( replay_count %5== 0 ) record=true;
|
||||||
|
replay_count++;
|
||||||
|
}
|
||||||
|
out = out + Minvr0;
|
||||||
|
_SmootherOperator.HermOp(out,r0);
|
||||||
|
r0 = r0 - in;
|
||||||
|
RealD rr=norm2(r0);
|
||||||
|
RealD ss=norm2(in);
|
||||||
|
std::cout << " FixedCGPolynomial replayed polynomial resid "<<::sqrt(rr/ss)<<std::endl;
|
||||||
|
#else
|
||||||
|
out = Zero();
|
||||||
|
out.Checkerboard()=in.Checkerboard();
|
||||||
|
if ( record ) {
|
||||||
|
std::cout << " FixedCGPolynomial recording polynomial "<<std::endl;
|
||||||
|
CG.Solve(_SmootherOperator,in,out);
|
||||||
|
record = false;
|
||||||
|
std::cout << "P(x) = 0 "<<std::endl;
|
||||||
|
for(int i=0;i<CG.polynomial.size();i++){
|
||||||
|
std::cout<<" + "<< CG.polynomial[i]<<" * (x**"<<i<<")"<<std::endl;
|
||||||
|
}
|
||||||
|
Field tmp(in.Grid());
|
||||||
|
CG.CGsequenceHermOp(_SmootherOperator,in,tmp);
|
||||||
|
tmp = tmp - out;
|
||||||
|
std::cout << " CGsequence error "<<norm2(tmp)<<" / "<<norm2(out)<<std::endl;
|
||||||
|
} else {
|
||||||
|
std::cout << " FixedCGPolynomial replaying polynomial "<<std::endl;
|
||||||
|
CG.CGsequenceHermOp(_SmootherOperator,in,out);
|
||||||
|
if ( replay_count %5== 5 ) record=true;
|
||||||
|
replay_count++;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
}
|
||||||
|
void operator() (const std::vector<Field> &in, std::vector<Field> &out)
|
||||||
|
{
|
||||||
|
for(int i=0;i<out.size();i++){
|
||||||
|
out[i]=Zero();
|
||||||
|
}
|
||||||
|
int blockDim = 0;//not used for BlockCGVec
|
||||||
|
BlockConjugateGradient<Field> BCGV (BlockCGrQVec,blockDim,0.0,iters,false);
|
||||||
|
BCGV(_SmootherOperator,in,out);
|
||||||
|
}
|
||||||
|
|
||||||
|
};
|
||||||
|
template<class Field> class CGSmoother : public LinearFunction<Field>
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
using LinearFunction<Field>::operator();
|
||||||
|
typedef LinearOperatorBase<Field> FineOperator;
|
||||||
|
FineOperator & _SmootherOperator;
|
||||||
|
int iters;
|
||||||
|
CGSmoother(int _iters, FineOperator &SmootherOperator) :
|
||||||
|
_SmootherOperator(SmootherOperator),
|
||||||
|
iters(_iters)
|
||||||
|
{
|
||||||
|
std::cout << GridLogMessage<<" Mirs smoother order "<<iters<<std::endl;
|
||||||
|
};
|
||||||
|
void operator() (const Field &in, Field &out)
|
||||||
|
{
|
||||||
|
ConjugateGradient<Field> CG(0.0,iters,false); // non-converge is just fine in a smoother
|
||||||
|
|
||||||
|
out=Zero();
|
||||||
|
|
||||||
|
CG(_SmootherOperator,in,out);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
RealD InverseApproximation(RealD x){
|
||||||
|
return 1.0/x;
|
||||||
|
}
|
||||||
|
template<class Field> class ChebyshevSmoother : public LinearFunction<Field>
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
using LinearFunction<Field>::operator();
|
||||||
|
typedef LinearOperatorBase<Field> FineOperator;
|
||||||
|
FineOperator & _SmootherOperator;
|
||||||
|
Chebyshev<Field> Cheby;
|
||||||
|
ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator) :
|
||||||
|
_SmootherOperator(SmootherOperator),
|
||||||
|
Cheby(_lo,_hi,_ord,InverseApproximation)
|
||||||
|
{
|
||||||
|
std::cout << GridLogMessage<<" Chebyshev smoother order "<<_ord<<" ["<<_lo<<","<<_hi<<"]"<<std::endl;
|
||||||
|
};
|
||||||
|
void operator() (const Field &in, Field &out)
|
||||||
|
{
|
||||||
|
// Field r(out.Grid());
|
||||||
|
Cheby(_SmootherOperator,in,out);
|
||||||
|
// _SmootherOperator.HermOp(out,r);
|
||||||
|
// r=r-in;
|
||||||
|
// RealD rr=norm2(r);
|
||||||
|
// RealD ss=norm2(in);
|
||||||
|
// std::cout << GridLogMessage<<" Chebyshev smoother resid "<<::sqrt(rr/ss)<<std::endl;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
template<class Field> class ChebyshevInverter : public LinearFunction<Field>
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
using LinearFunction<Field>::operator();
|
||||||
|
typedef LinearOperatorBase<Field> FineOperator;
|
||||||
|
FineOperator & _Operator;
|
||||||
|
Chebyshev<Field> Cheby;
|
||||||
|
ChebyshevInverter(RealD _lo,RealD _hi,int _ord, FineOperator &Operator) :
|
||||||
|
_Operator(Operator),
|
||||||
|
Cheby(_lo,_hi,_ord,InverseApproximation)
|
||||||
|
{
|
||||||
|
std::cout << GridLogMessage<<" Chebyshev Inverter order "<<_ord<<" ["<<_lo<<","<<_hi<<"]"<<std::endl;
|
||||||
|
};
|
||||||
|
void operator() (const Field &in, Field &out)
|
||||||
|
{
|
||||||
|
Field r(in.Grid());
|
||||||
|
Field AinvR(in.Grid());
|
||||||
|
_Operator.HermOp(out,r);
|
||||||
|
r = in - r; // b - A x
|
||||||
|
Cheby(_Operator,r,AinvR); // A^{-1} ( b - A x ) ~ A^{-1} b - x
|
||||||
|
out = out + AinvR;
|
||||||
|
_Operator.HermOp(out,r);
|
||||||
|
r = in - r; // b - A x
|
||||||
|
RealD rr = norm2(r);
|
||||||
|
RealD ss = norm2(in);
|
||||||
|
std::cout << "ChebshevInverse resid " <<::sqrt(rr/ss)<<std::endl;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
int main (int argc, char ** argv)
|
||||||
|
{
|
||||||
|
Grid_init(&argc,&argv);
|
||||||
|
|
||||||
|
int sample=1;
|
||||||
|
if( GridCmdOptionExists(argv,argv+argc,"--sample") ){
|
||||||
|
std::string arg;
|
||||||
|
arg = GridCmdOptionPayload(argv,argv+argc,"--sample");
|
||||||
|
GridCmdOptionInt(arg,sample);
|
||||||
|
}
|
||||||
|
|
||||||
|
const int Ls=24;
|
||||||
|
const int nbasis = 62;
|
||||||
|
const int cb = 0 ;
|
||||||
|
RealD mass=0.00078;
|
||||||
|
|
||||||
|
if( GridCmdOptionExists(argv,argv+argc,"--mass") ){
|
||||||
|
std::string arg;
|
||||||
|
arg = GridCmdOptionPayload(argv,argv+argc,"--mass");
|
||||||
|
GridCmdOptionFloat(arg,mass);
|
||||||
|
}
|
||||||
|
|
||||||
|
RealD M5=1.8;
|
||||||
|
RealD b=1.5;
|
||||||
|
RealD c=0.5;
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << " *************************** " <<std::endl;
|
||||||
|
std::cout << GridLogMessage << " Mass " <<mass<<std::endl;
|
||||||
|
std::cout << GridLogMessage << " M5 " <<M5<<std::endl;
|
||||||
|
std::cout << GridLogMessage << " Ls " <<Ls<<std::endl;
|
||||||
|
std::cout << GridLogMessage << " b " <<b<<std::endl;
|
||||||
|
std::cout << GridLogMessage << " c " <<c<<std::endl;
|
||||||
|
std::cout << GridLogMessage << " *************************** " <<std::endl;
|
||||||
|
|
||||||
|
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
|
||||||
|
GridDefaultSimd(Nd,vComplex::Nsimd()),
|
||||||
|
GridDefaultMpi());
|
||||||
|
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||||
|
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||||
|
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
||||||
|
|
||||||
|
//////////////////////////////////////////
|
||||||
|
// Single precision grids -- lanczos + smoother
|
||||||
|
//////////////////////////////////////////
|
||||||
|
GridCartesian * UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
|
||||||
|
GridDefaultSimd(Nd,vComplexF::Nsimd()),
|
||||||
|
GridDefaultMpi());
|
||||||
|
GridRedBlackCartesian * UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
|
||||||
|
GridCartesian * FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
|
||||||
|
GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);
|
||||||
|
|
||||||
|
///////////////////////// Configuration /////////////////////////////////
|
||||||
|
LatticeGaugeField Umu(UGrid);
|
||||||
|
|
||||||
|
FieldMetaData header;
|
||||||
|
std::string file("ckpoint_lat.1000");
|
||||||
|
NerscIO::readConfiguration(Umu,header,file);
|
||||||
|
|
||||||
|
//////////////////////// Fermion action //////////////////////////////////
|
||||||
|
MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c);
|
||||||
|
SchurDiagMooeeOperator<MobiusFermionD, LatticeFermion> HermOpEO(Ddwf);
|
||||||
|
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
std::cout << " Fine Power method "<<std::endl;
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
|
||||||
|
{
|
||||||
|
LatticeFermionD pm_src(FrbGrid);
|
||||||
|
pm_src = ComplexD(1.0);
|
||||||
|
PowerMethod<LatticeFermionD> fPM;
|
||||||
|
fPM(HermOpEO,pm_src);
|
||||||
|
}
|
||||||
|
|
||||||
|
if(0)
|
||||||
|
{
|
||||||
|
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
std::cout << " Fine Lanczos "<<std::endl;
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
|
||||||
|
typedef LatticeFermionF FermionField;
|
||||||
|
LatticeGaugeFieldF UmuF(UGridF);
|
||||||
|
precisionChange(UmuF,Umu);
|
||||||
|
MobiusFermionF DdwfF(UmuF,*FGridF,*FrbGridF,*UGridF,*UrbGridF,mass,M5,b,c);
|
||||||
|
SchurDiagMooeeOperator<MobiusFermionF, LatticeFermionF> HermOpEOF(DdwfF);
|
||||||
|
|
||||||
|
const int Fine_Nstop = 200;
|
||||||
|
const int Fine_Nk = 200;
|
||||||
|
const int Fine_Np = 200;
|
||||||
|
const int Fine_Nm = Fine_Nk+Fine_Np;
|
||||||
|
const int Fine_MaxIt= 10;
|
||||||
|
|
||||||
|
RealD Fine_resid = 1.0e-4;
|
||||||
|
std::cout << GridLogMessage << "Fine Lanczos "<<std::endl;
|
||||||
|
std::cout << GridLogMessage << "Nstop "<<Fine_Nstop<<std::endl;
|
||||||
|
std::cout << GridLogMessage << "Nk "<<Fine_Nk<<std::endl;
|
||||||
|
std::cout << GridLogMessage << "Np "<<Fine_Np<<std::endl;
|
||||||
|
std::cout << GridLogMessage << "resid "<<Fine_resid<<std::endl;
|
||||||
|
|
||||||
|
Chebyshev<FermionField> Cheby(0.002,92.0,401);
|
||||||
|
// Chebyshev<FermionField> Cheby(0.1,92.0,401);
|
||||||
|
FunctionHermOp<FermionField> OpCheby(Cheby,HermOpEOF);
|
||||||
|
PlainHermOp<FermionField> Op (HermOpEOF);
|
||||||
|
ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby,Op,Fine_Nstop,Fine_Nk,Fine_Nm,Fine_resid,Fine_MaxIt);
|
||||||
|
std::vector<RealD> Fine_eval(Fine_Nm);
|
||||||
|
FermionField Fine_src(FrbGridF);
|
||||||
|
Fine_src = ComplexF(1.0);
|
||||||
|
std::vector<FermionField> Fine_evec(Fine_Nm,FrbGridF);
|
||||||
|
|
||||||
|
int Fine_Nconv;
|
||||||
|
std::cout << GridLogMessage <<" Calling IRL.calc single prec"<<std::endl;
|
||||||
|
IRL.calc(Fine_eval,Fine_evec,Fine_src,Fine_Nconv);
|
||||||
|
|
||||||
|
std::string evec_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.evecF");
|
||||||
|
SaveFineEvecs(Fine_evec,evec_file);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
//////////////////////////////////////////
|
||||||
|
// Construct a coarsened grid with 4^4 cell
|
||||||
|
//////////////////////////////////////////
|
||||||
|
Coordinate Block({4,4,6,4});
|
||||||
|
Coordinate clatt = GridDefaultLatt();
|
||||||
|
for(int d=0;d<clatt.size();d++){
|
||||||
|
clatt[d] = clatt[d]/Block[d];
|
||||||
|
}
|
||||||
|
|
||||||
|
GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt,
|
||||||
|
GridDefaultSimd(Nd,vComplex::Nsimd()),
|
||||||
|
GridDefaultMpi());;
|
||||||
|
GridCartesian *Coarse5d = SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
|
||||||
|
|
||||||
|
///////////////////////// RNGs /////////////////////////////////
|
||||||
|
std::vector<int> seeds4({1,2,3,4});
|
||||||
|
std::vector<int> seeds5({5,6,7,8});
|
||||||
|
std::vector<int> cseeds({5,6,7,8});
|
||||||
|
|
||||||
|
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
|
||||||
|
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
|
||||||
|
GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);
|
||||||
|
|
||||||
|
|
||||||
|
typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix;
|
||||||
|
HermFineMatrix FineHermOp(HermOpEO);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////
|
||||||
|
///////////// Coarse basis and Little Dirac Operator ///////
|
||||||
|
////////////////////////////////////////////////////////////
|
||||||
|
typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
|
||||||
|
typedef LittleDiracOperator::CoarseVector CoarseVector;
|
||||||
|
|
||||||
|
NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d);
|
||||||
|
|
||||||
|
typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
|
||||||
|
Subspace Aggregates(Coarse5d,FrbGrid,cb);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////
|
||||||
|
// Need to check about red-black grid coarsening
|
||||||
|
////////////////////////////////////////////////////////////
|
||||||
|
std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.mixed.2500.60");
|
||||||
|
// // std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.new.62");
|
||||||
|
// std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.evecF");
|
||||||
|
std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Refine.phys48.mixed.2500.60");
|
||||||
|
std::string ldop_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.phys48.mixed.60");
|
||||||
|
std::string evec_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/evecs.scidac");
|
||||||
|
std::string eval_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/eval.xml");
|
||||||
|
bool load_agg=true;
|
||||||
|
bool load_refine=true;
|
||||||
|
bool load_mat=false;
|
||||||
|
bool load_evec=false;
|
||||||
|
|
||||||
|
int refine=1;
|
||||||
|
if ( load_agg ) {
|
||||||
|
if ( !(refine) || (!load_refine) ) {
|
||||||
|
LoadBasis(Aggregates,subspace_file);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Aggregates.CreateSubspaceMultishift(RNG5,HermOpEO,
|
||||||
|
// 0.0003,1.0e-5,2000); // Lo, tol, maxit
|
||||||
|
// Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,95.,0.01,1500);// <== last run
|
||||||
|
Aggregates.CreateSubspaceChebyshevNew(RNG5,HermOpEO,95.);
|
||||||
|
SaveBasis(Aggregates,subspace_file);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
std::cout << "Building MultiRHS Coarse operator"<<std::endl;
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
ConjugateGradient<CoarseVector> coarseCG(4.0e-2,20000,true);
|
||||||
|
|
||||||
|
const int nrhs=24;
|
||||||
|
|
||||||
|
Coordinate mpi=GridDefaultMpi();
|
||||||
|
Coordinate rhMpi ({1,1,mpi[0],mpi[1],mpi[2],mpi[3]});
|
||||||
|
Coordinate rhLatt({nrhs,1,clatt[0],clatt[1],clatt[2],clatt[3]});
|
||||||
|
Coordinate rhSimd({vComplex::Nsimd(),1, 1,1,1,1});
|
||||||
|
|
||||||
|
GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi);
|
||||||
|
typedef MultiGeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> MultiGeneralCoarsenedMatrix_t;
|
||||||
|
MultiGeneralCoarsenedMatrix_t mrhs(geom,CoarseMrhs);
|
||||||
|
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
std::cout << " Coarse Lanczos "<<std::endl;
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
|
||||||
|
typedef HermitianLinearOperator<MultiGeneralCoarsenedMatrix_t,CoarseVector> MrhsHermMatrix;
|
||||||
|
Chebyshev<CoarseVector> IRLCheby(0.005,42.0,301); // 1 iter
|
||||||
|
MrhsHermMatrix MrhsCoarseOp (mrhs);
|
||||||
|
|
||||||
|
// CoarseVector pm_src(CoarseMrhs);
|
||||||
|
// pm_src = ComplexD(1.0);
|
||||||
|
// PowerMethod<CoarseVector> cPM; cPM(MrhsCoarseOp,pm_src);
|
||||||
|
|
||||||
|
int Nk=192;
|
||||||
|
int Nm=384;
|
||||||
|
int Nstop=Nk;
|
||||||
|
int Nconv_test_interval=1;
|
||||||
|
|
||||||
|
ImplicitlyRestartedBlockLanczosCoarse<CoarseVector> IRL(MrhsCoarseOp,
|
||||||
|
Coarse5d,
|
||||||
|
CoarseMrhs,
|
||||||
|
nrhs,
|
||||||
|
IRLCheby,
|
||||||
|
Nstop,
|
||||||
|
Nconv_test_interval,
|
||||||
|
nrhs,
|
||||||
|
Nk,
|
||||||
|
Nm,
|
||||||
|
1e-5,10);
|
||||||
|
|
||||||
|
int Nconv;
|
||||||
|
std::vector<RealD> eval(Nm);
|
||||||
|
std::vector<CoarseVector> evec(Nm,Coarse5d);
|
||||||
|
std::vector<CoarseVector> c_src(nrhs,Coarse5d);
|
||||||
|
|
||||||
|
///////////////////////
|
||||||
|
// Deflation guesser object
|
||||||
|
///////////////////////
|
||||||
|
MultiRHSDeflation<CoarseVector> MrhsGuesser;
|
||||||
|
|
||||||
|
//////////////////////////////////////////
|
||||||
|
// Block projector for coarse/fine
|
||||||
|
//////////////////////////////////////////
|
||||||
|
MultiRHSBlockProject<LatticeFermionD> MrhsProjector;
|
||||||
|
|
||||||
|
//////////////////////////
|
||||||
|
// Extra HDCG parameters
|
||||||
|
//////////////////////////
|
||||||
|
int maxit=300;
|
||||||
|
ConjugateGradient<CoarseVector> CG(5.0e-2,maxit,false);
|
||||||
|
ConjugateGradient<CoarseVector> CGstart(5.0e-2,maxit,false);
|
||||||
|
RealD lo=2.0;
|
||||||
|
int ord = 7;
|
||||||
|
// int ord = 11;
|
||||||
|
|
||||||
|
int blockDim = 0;//not used for BlockCG
|
||||||
|
BlockConjugateGradient<CoarseVector> BCG (BlockCGrQ,blockDim,5.0e-5,maxit,true);
|
||||||
|
|
||||||
|
DoNothingGuesser<CoarseVector> DoNothing;
|
||||||
|
// HPDSolver<CoarseVector> HPDSolveMrhs(MrhsCoarseOp,CG,DoNothing);
|
||||||
|
// HPDSolver<CoarseVector> HPDSolveMrhsStart(MrhsCoarseOp,CGstart,DoNothing);
|
||||||
|
// HPDSolver<CoarseVector> HPDSolveMrhs(MrhsCoarseOp,BCG,DoNothing);
|
||||||
|
// HPDSolver<CoarseVector> HPDSolveMrhsRefine(MrhsCoarseOp,BCG,DoNothing);
|
||||||
|
// FixedCGPolynomial<CoarseVector> HPDSolveMrhs(maxit,MrhsCoarseOp);
|
||||||
|
|
||||||
|
ChebyshevInverter<CoarseVector> HPDSolveMrhs(1.0e-2,40.0,120,MrhsCoarseOp); //
|
||||||
|
// ChebyshevInverter<CoarseVector> HPDSolveMrhs(1.0e-2,40.0,110,MrhsCoarseOp); // 114 iter with Chebysmooth and BlockCG
|
||||||
|
// ChebyshevInverter<CoarseVector> HPDSolveMrhs(1.0e-2,40.0,120,MrhsCoarseOp); // 138 iter with Chebysmooth
|
||||||
|
// ChebyshevInverter<CoarseVector> HPDSolveMrhs(1.0e-2,40.0,200,MrhsCoarseOp); // 139 iter
|
||||||
|
// ChebyshevInverter<CoarseVector> HPDSolveMrhs(3.0e-3,40.0,200,MrhsCoarseOp); // 137 iter, CG smooth, flex
|
||||||
|
// ChebyshevInverter<CoarseVector> HPDSolveMrhs(1.0e-3,40.0,200,MrhsCoarseOp); // 146 iter, CG smooth, flex
|
||||||
|
// ChebyshevInverter<CoarseVector> HPDSolveMrhs(3.0e-4,40.0,200,MrhsCoarseOp); // 156 iter, CG smooth, flex
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////
|
||||||
|
// Mirs smoother
|
||||||
|
/////////////////////////////////////////////////
|
||||||
|
ShiftedHermOpLinearOperator<LatticeFermionD> ShiftedFineHermOp(HermOpEO,lo);
|
||||||
|
// FixedCGPolynomial<LatticeFermionD> CGsmooth(ord,ShiftedFineHermOp) ;
|
||||||
|
// CGSmoother<LatticeFermionD> CGsmooth(ord,ShiftedFineHermOp) ;
|
||||||
|
ChebyshevSmoother<LatticeFermionD> CGsmooth(2.0,92.0,8,HermOpEO) ;
|
||||||
|
|
||||||
|
if ( load_refine ) {
|
||||||
|
LoadBasis(Aggregates,refine_file);
|
||||||
|
// LatticeFermionF conv_tmp(FrbGridF);
|
||||||
|
// LoadBasisSum(Aggregates,refine_file,sample,conv_tmp);
|
||||||
|
} else {
|
||||||
|
Aggregates.RefineSubspace(HermOpEO,0.001,1.0e-3,3000); // 172 iters
|
||||||
|
SaveBasis(Aggregates,refine_file);
|
||||||
|
}
|
||||||
|
Aggregates.Orthogonalise();
|
||||||
|
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
std::cout << "Coarsen after refine"<<std::endl;
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
mrhs.CoarsenOperator(FineHermOp,Aggregates,Coarse5d);
|
||||||
|
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
std::cout << " Recompute coarse evecs "<<std::endl;
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
evec.resize(Nm,Coarse5d);
|
||||||
|
eval.resize(Nm);
|
||||||
|
for(int r=0;r<nrhs;r++){
|
||||||
|
random(CRNG,c_src[r]);
|
||||||
|
}
|
||||||
|
IRL.calc(eval,evec,c_src,Nconv,LanczosType::irbl);
|
||||||
|
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
std::cout << " Reimport coarse evecs "<<std::endl;
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
MrhsGuesser.ImportEigenBasis(evec,eval);
|
||||||
|
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
std::cout << " Setting up mRHS HDCG"<<std::endl;
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
MrhsProjector.Allocate(nbasis,FrbGrid,Coarse5d);
|
||||||
|
MrhsProjector.ImportBasis(Aggregates.subspace);
|
||||||
|
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
std::cout << "Calling mRHS HDCG"<<std::endl;
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
TwoLevelADEF2mrhs<LatticeFermion,CoarseVector>
|
||||||
|
HDCGmrhs(1.0e-8, 300,
|
||||||
|
FineHermOp,
|
||||||
|
CGsmooth,
|
||||||
|
HPDSolveMrhs, // Used in M1
|
||||||
|
HPDSolveMrhs, // Used in Vstart
|
||||||
|
MrhsProjector,
|
||||||
|
MrhsGuesser,
|
||||||
|
CoarseMrhs);
|
||||||
|
|
||||||
|
std::vector<LatticeFermionD> src_mrhs(nrhs,FrbGrid);
|
||||||
|
std::vector<LatticeFermionD> res_mrhs(nrhs,FrbGrid);
|
||||||
|
LatticeFermionD result_accurate(FrbGrid);
|
||||||
|
LatticeFermionD result_sloppy(FrbGrid);
|
||||||
|
LatticeFermionD error(FrbGrid);
|
||||||
|
LatticeFermionD residual(FrbGrid);
|
||||||
|
|
||||||
|
for(int r=0;r<nrhs;r++){
|
||||||
|
random(RNG5,src_mrhs[r]);
|
||||||
|
res_mrhs[r]=Zero();
|
||||||
|
}
|
||||||
|
HDCGmrhs(src_mrhs,res_mrhs);
|
||||||
|
result_accurate = res_mrhs[0];
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
|
||||||
|
std::vector<RealD> bins({1.0e-3,1.0e-2,1.0e-1,1.0,10.0,100.0});
|
||||||
|
std::vector<int> orders({6000 ,4000 ,1000 ,500,500 ,500});
|
||||||
|
PowerSpectrum GraphicEqualizer(bins,orders);
|
||||||
|
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
std::cout << GridLogMessage << " PowerSpectrum of rrr "<<std::endl;
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
GraphicEqualizer(FineHermOp,HDCGmrhs.rrr);
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
std::cout << GridLogMessage << " PowerSpectrum of sss "<<std::endl;
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
GraphicEqualizer(FineHermOp,HDCGmrhs.sss);
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
std::cout << GridLogMessage << " PowerSpectrum of qqq "<<std::endl;
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
GraphicEqualizer(FineHermOp,HDCGmrhs.qqq);
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
std::cout << GridLogMessage << " PowerSpectrum of zzz "<<std::endl;
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
GraphicEqualizer(FineHermOp,HDCGmrhs.zzz);
|
||||||
|
|
||||||
|
std::vector<RealD> tols({1.0e-3,1.0e-4,1.0e-5});
|
||||||
|
|
||||||
|
|
||||||
|
for(auto tol : tols) {
|
||||||
|
|
||||||
|
TwoLevelADEF2mrhs<LatticeFermion,CoarseVector>
|
||||||
|
HDCGmrhsSloppy(tol, 500,
|
||||||
|
FineHermOp,
|
||||||
|
CGsmooth,
|
||||||
|
HPDSolveMrhs, // Used in M1
|
||||||
|
HPDSolveMrhs, // Used in Vstart
|
||||||
|
MrhsProjector,
|
||||||
|
MrhsGuesser,
|
||||||
|
CoarseMrhs);
|
||||||
|
|
||||||
|
// Solve again to 10^-5
|
||||||
|
for(int r=0;r<nrhs;r++){
|
||||||
|
res_mrhs[r]=Zero();
|
||||||
|
}
|
||||||
|
HDCGmrhsSloppy(src_mrhs,res_mrhs);
|
||||||
|
|
||||||
|
result_sloppy = res_mrhs[0];
|
||||||
|
error = result_sloppy - result_accurate;
|
||||||
|
FineHermOp.HermOp(result_sloppy,residual);
|
||||||
|
residual = residual - src_mrhs[0];
|
||||||
|
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
std::cout << GridLogMessage << " Converged to tolerance "<< tol<<std::endl;
|
||||||
|
std::cout << GridLogMessage << " Absolute error "<<norm2(error)<<std::endl;
|
||||||
|
std::cout << GridLogMessage << " Residual "<<norm2(residual)<<std::endl;
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
std::cout << GridLogMessage << " PowerSpectrum of error "<<std::endl;
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
GraphicEqualizer(FineHermOp,error);
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
std::cout << GridLogMessage << " PowerSpectrum of residual "<<std::endl;
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
GraphicEqualizer(FineHermOp,residual);
|
||||||
|
|
||||||
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Standard CG
|
||||||
|
#if 0
|
||||||
|
{
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
std::cout << "Calling red black CG"<<std::endl;
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
|
||||||
|
LatticeFermion result(FrbGrid); result=Zero();
|
||||||
|
LatticeFermion src(FrbGrid); random(RNG5,src);
|
||||||
|
result=Zero();
|
||||||
|
|
||||||
|
ConjugateGradient<LatticeFermionD> CGfine(1.0e-8,30000,false);
|
||||||
|
CGfine(HermOpEO, src, result);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
Grid_finalize();
|
||||||
|
return 0;
|
||||||
|
}
|
355
tests/debug/Test_general_coarse_hdcg_phys48_lanczos_subspace.cc
Normal file
355
tests/debug/Test_general_coarse_hdcg_phys48_lanczos_subspace.cc
Normal file
@ -0,0 +1,355 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./tests/Test_general_coarse_hdcg.cc
|
||||||
|
|
||||||
|
Copyright (C) 2023
|
||||||
|
|
||||||
|
Author: Peter Boyle <pboyle@bnl.gov>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#include <Grid/Grid.h>
|
||||||
|
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
|
||||||
|
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h>
|
||||||
|
#include <Grid/algorithms/iterative/AdefMrhs.h>
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
using namespace Grid;
|
||||||
|
|
||||||
|
template<class aggregation>
|
||||||
|
void SaveFineEvecs(aggregation &Agg,std::string file)
|
||||||
|
{
|
||||||
|
#ifdef HAVE_LIME
|
||||||
|
emptyUserRecord record;
|
||||||
|
ScidacWriter WR(Agg[0].Grid()->IsBoss());
|
||||||
|
WR.open(file);
|
||||||
|
for(int b=0;b<Agg.size();b++){
|
||||||
|
WR.writeScidacFieldRecord(Agg[b],record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
|
||||||
|
}
|
||||||
|
WR.close();
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
template<class aggregation>
|
||||||
|
void SaveBasis(aggregation &Agg,std::string file)
|
||||||
|
{
|
||||||
|
#ifdef HAVE_LIME
|
||||||
|
emptyUserRecord record;
|
||||||
|
ScidacWriter WR(Agg.FineGrid->IsBoss());
|
||||||
|
WR.open(file);
|
||||||
|
for(int b=0;b<Agg.subspace.size();b++){
|
||||||
|
WR.writeScidacFieldRecord(Agg.subspace[b],record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
|
||||||
|
// WR.writeScidacFieldRecord(Agg.subspace[b],record);
|
||||||
|
}
|
||||||
|
WR.close();
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
template<class aggregation>
|
||||||
|
void LoadBasis(aggregation &Agg, std::string file)
|
||||||
|
{
|
||||||
|
#ifdef HAVE_LIME
|
||||||
|
emptyUserRecord record;
|
||||||
|
ScidacReader RD ;
|
||||||
|
RD.open(file);
|
||||||
|
for(int b=0;b<Agg.subspace.size();b++){
|
||||||
|
RD.readScidacFieldRecord(Agg.subspace[b],record,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
|
||||||
|
// RD.readScidacFieldRecord(Agg.subspace[b],record,0);
|
||||||
|
}
|
||||||
|
RD.close();
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
template<class aggregation>
|
||||||
|
void LoadFineEvecs(aggregation &Agg, std::string file,LatticeFermionF & conv_tmp)
|
||||||
|
{
|
||||||
|
#ifdef HAVE_LIME
|
||||||
|
emptyUserRecord record;
|
||||||
|
ScidacReader RD ;
|
||||||
|
RD.open(file);
|
||||||
|
for(int b=0;b<Agg.size();b++){
|
||||||
|
RD.readScidacFieldRecord(conv_tmp,record,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
|
||||||
|
precisionChange(Agg[b],conv_tmp);
|
||||||
|
}
|
||||||
|
RD.close();
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
template<class CoarseVector>
|
||||||
|
void SaveEigenvectors(std::vector<RealD> &eval,
|
||||||
|
std::vector<CoarseVector> &evec,
|
||||||
|
std::string evec_file,
|
||||||
|
std::string eval_file)
|
||||||
|
{
|
||||||
|
#ifdef HAVE_LIME
|
||||||
|
emptyUserRecord record;
|
||||||
|
ScidacWriter WR(evec[0].Grid()->IsBoss());
|
||||||
|
WR.open(evec_file);
|
||||||
|
for(int b=0;b<evec.size();b++){
|
||||||
|
WR.writeScidacFieldRecord(evec[b],record,0,0);
|
||||||
|
}
|
||||||
|
WR.close();
|
||||||
|
XmlWriter WRx(eval_file);
|
||||||
|
write(WRx,"evals",eval);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
template<class CoarseVector>
|
||||||
|
void LoadEigenvectors(std::vector<RealD> &eval,
|
||||||
|
std::vector<CoarseVector> &evec,
|
||||||
|
std::string evec_file,
|
||||||
|
std::string eval_file)
|
||||||
|
{
|
||||||
|
#ifdef HAVE_LIME
|
||||||
|
XmlReader RDx(eval_file);
|
||||||
|
read(RDx,"evals",eval);
|
||||||
|
emptyUserRecord record;
|
||||||
|
|
||||||
|
Grid::ScidacReader RD ;
|
||||||
|
RD.open(evec_file);
|
||||||
|
assert(evec.size()==eval.size());
|
||||||
|
for(int k=0;k<eval.size();k++) {
|
||||||
|
RD.readScidacFieldRecord(evec[k],record);
|
||||||
|
}
|
||||||
|
RD.close();
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
// Want Op in CoarsenOp to call MatPcDagMatPc
|
||||||
|
template<class Field>
|
||||||
|
class HermOpAdaptor : public LinearOperatorBase<Field>
|
||||||
|
{
|
||||||
|
LinearOperatorBase<Field> & wrapped;
|
||||||
|
public:
|
||||||
|
HermOpAdaptor(LinearOperatorBase<Field> &wrapme) : wrapped(wrapme) {};
|
||||||
|
void Op (const Field &in, Field &out) { wrapped.HermOp(in,out); }
|
||||||
|
void HermOp(const Field &in, Field &out) { wrapped.HermOp(in,out); }
|
||||||
|
void AdjOp (const Field &in, Field &out){ wrapped.HermOp(in,out); }
|
||||||
|
void OpDiag (const Field &in, Field &out) { assert(0); }
|
||||||
|
void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
|
||||||
|
void OpDirAll (const Field &in, std::vector<Field> &out) { assert(0); };
|
||||||
|
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
|
||||||
|
};
|
||||||
|
|
||||||
|
template<class Field> class CGSmoother : public LinearFunction<Field>
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
using LinearFunction<Field>::operator();
|
||||||
|
typedef LinearOperatorBase<Field> FineOperator;
|
||||||
|
FineOperator & _SmootherOperator;
|
||||||
|
int iters;
|
||||||
|
CGSmoother(int _iters, FineOperator &SmootherOperator) :
|
||||||
|
_SmootherOperator(SmootherOperator),
|
||||||
|
iters(_iters)
|
||||||
|
{
|
||||||
|
std::cout << GridLogMessage<<" Mirs smoother order "<<iters<<std::endl;
|
||||||
|
};
|
||||||
|
void operator() (const Field &in, Field &out)
|
||||||
|
{
|
||||||
|
ConjugateGradient<Field> CG(0.0,iters,false); // non-converge is just fine in a smoother
|
||||||
|
|
||||||
|
out=Zero();
|
||||||
|
|
||||||
|
CG(_SmootherOperator,in,out);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
int main (int argc, char ** argv)
|
||||||
|
{
|
||||||
|
Grid_init(&argc,&argv);
|
||||||
|
|
||||||
|
const int Ls=24;
|
||||||
|
const int nbasis = 62;
|
||||||
|
const int cb = 0 ;
|
||||||
|
RealD mass=0.00078;
|
||||||
|
RealD M5=1.8;
|
||||||
|
RealD b=1.5;
|
||||||
|
RealD c=0.5;
|
||||||
|
|
||||||
|
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
|
||||||
|
GridDefaultSimd(Nd,vComplex::Nsimd()),
|
||||||
|
GridDefaultMpi());
|
||||||
|
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||||
|
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||||
|
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
||||||
|
|
||||||
|
// Construct a coarsened grid with 4^4 cell
|
||||||
|
Coordinate Block({4,4,6,4});
|
||||||
|
Coordinate clatt = GridDefaultLatt();
|
||||||
|
for(int d=0;d<clatt.size();d++){
|
||||||
|
clatt[d] = clatt[d]/Block[d];
|
||||||
|
}
|
||||||
|
|
||||||
|
//////////////////////////////////////////
|
||||||
|
// Double precision grids
|
||||||
|
//////////////////////////////////////////
|
||||||
|
GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt,
|
||||||
|
GridDefaultSimd(Nd,vComplex::Nsimd()),
|
||||||
|
GridDefaultMpi());;
|
||||||
|
GridCartesian *Coarse5d = SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
|
||||||
|
|
||||||
|
//////////////////////////////////////////
|
||||||
|
// Single precision grids -- lanczos + smoother
|
||||||
|
//////////////////////////////////////////
|
||||||
|
GridCartesian * UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
|
||||||
|
GridDefaultSimd(Nd,vComplexF::Nsimd()),
|
||||||
|
GridDefaultMpi());
|
||||||
|
GridRedBlackCartesian * UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
|
||||||
|
GridCartesian * FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
|
||||||
|
GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);
|
||||||
|
///////////////////////// RNGs /////////////////////////////////
|
||||||
|
std::vector<int> seeds4({1,2,3,4});
|
||||||
|
std::vector<int> seeds5({5,6,7,8});
|
||||||
|
std::vector<int> cseeds({5,6,7,8});
|
||||||
|
|
||||||
|
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
|
||||||
|
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
|
||||||
|
GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);
|
||||||
|
|
||||||
|
///////////////////////// Configuration /////////////////////////////////
|
||||||
|
LatticeGaugeField Umu(UGrid);
|
||||||
|
|
||||||
|
FieldMetaData header;
|
||||||
|
std::string file("ckpoint_lat.1000");
|
||||||
|
NerscIO::readConfiguration(Umu,header,file);
|
||||||
|
|
||||||
|
//////////////////////// Fermion action //////////////////////////////////
|
||||||
|
MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c);
|
||||||
|
|
||||||
|
SchurDiagMooeeOperator<MobiusFermionD, LatticeFermion> HermOpEO(Ddwf);
|
||||||
|
|
||||||
|
const int Fine_Nstop = 200;
|
||||||
|
const int Fine_Nk = 100;
|
||||||
|
const int Fine_Np = 100;
|
||||||
|
const int Fine_Nm = Fine_Nk+Fine_Np;
|
||||||
|
|
||||||
|
typedef LatticeFermion FermionField;
|
||||||
|
std::vector<RealD> Fine_eval;
|
||||||
|
std::vector<FermionField> Fine_evec;
|
||||||
|
|
||||||
|
LatticeFermionF conv_tmp(FrbGridF);
|
||||||
|
Fine_eval.resize(Fine_Nstop);
|
||||||
|
Fine_evec.resize(Fine_Nstop,FrbGrid);
|
||||||
|
std::string evec_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.evecF");
|
||||||
|
LoadFineEvecs(Fine_evec,evec_file,conv_tmp);
|
||||||
|
|
||||||
|
typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix;
|
||||||
|
HermFineMatrix FineHermOp(HermOpEO);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////
|
||||||
|
///////////// Coarse basis and Little Dirac Operator ///////
|
||||||
|
////////////////////////////////////////////////////////////
|
||||||
|
typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
|
||||||
|
typedef LittleDiracOperator::CoarseVector CoarseVector;
|
||||||
|
|
||||||
|
NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d);
|
||||||
|
|
||||||
|
typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
|
||||||
|
Subspace Aggregates(Coarse5d,FrbGrid,cb);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////
|
||||||
|
// Need to check about red-black grid coarsening
|
||||||
|
////////////////////////////////////////////////////////////
|
||||||
|
// std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.mixed.2500.60");
|
||||||
|
// // std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.new.62");
|
||||||
|
// std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.evec");
|
||||||
|
std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Refine.phys48.mixed.2500.60");
|
||||||
|
// std::string ldop_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.phys48.mixed.60");
|
||||||
|
// std::string evec_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/evecs.scidac");
|
||||||
|
// std::string eval_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/eval.xml");
|
||||||
|
bool load_agg=true;
|
||||||
|
bool load_refine=true;
|
||||||
|
|
||||||
|
//////////////////////////////////////////
|
||||||
|
// Block projector for coarse/fine
|
||||||
|
//////////////////////////////////////////
|
||||||
|
MultiRHSBlockProject<LatticeFermionD> MrhsProjector;
|
||||||
|
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////
|
||||||
|
// Mirs smoother
|
||||||
|
/////////////////////////////////////////////////
|
||||||
|
int ord=8;
|
||||||
|
RealD lo=2.0;
|
||||||
|
RealD MirsShift = lo;
|
||||||
|
ShiftedHermOpLinearOperator<LatticeFermionD> ShiftedFineHermOp(HermOpEO,MirsShift);
|
||||||
|
CGSmoother<LatticeFermionD> CGsmooth(ord,ShiftedFineHermOp) ;
|
||||||
|
|
||||||
|
LoadBasis(Aggregates,refine_file);
|
||||||
|
Aggregates.Orthogonalise();
|
||||||
|
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
std::cout << " Using filtered subspace"<<std::endl;
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
MrhsProjector.Allocate(nbasis,FrbGrid,Coarse5d);
|
||||||
|
MrhsProjector.ImportBasis(Aggregates.subspace);
|
||||||
|
|
||||||
|
FermionField Ftmp(FrbGrid);
|
||||||
|
std::vector<FermionField> Fine_ev(1,FrbGrid);
|
||||||
|
std::vector<FermionField> Fine_ev_compressed(1,FrbGrid);
|
||||||
|
std::vector<CoarseVector> c_evec(1,Coarse5d);
|
||||||
|
for(int ev=0;ev<Fine_evec.size();ev++){
|
||||||
|
Fine_ev[0] = Fine_evec[ev];
|
||||||
|
MrhsProjector.blockProject(Fine_ev,c_evec);
|
||||||
|
MrhsProjector.blockPromote(Fine_ev_compressed,c_evec);
|
||||||
|
Ftmp = Fine_ev_compressed[0];
|
||||||
|
RealD div = 1.0/ sqrt(norm2(Ftmp));
|
||||||
|
Ftmp = Ftmp * div;
|
||||||
|
std::cout << GridLogMessage<<" "<<ev<<" uncomp "<< norm2(Fine_ev[0]) <<std::endl;
|
||||||
|
std::cout << GridLogMessage<<" "<<ev<<" comp "<< norm2(Ftmp) <<std::endl;
|
||||||
|
Ftmp = Fine_ev[0] - Ftmp;
|
||||||
|
std::cout << GridLogMessage<<" "<<ev<<" diff "<< norm2(Ftmp) <<std::endl;
|
||||||
|
CGsmooth(Fine_ev_compressed[0],Ftmp);
|
||||||
|
Ftmp = Ftmp *lo;
|
||||||
|
std::cout << GridLogMessage<<" "<<ev<<" smoothed "<< norm2(Ftmp) <<std::endl;
|
||||||
|
div = 1.0/ sqrt(norm2(Ftmp));
|
||||||
|
Ftmp=Ftmp*div;
|
||||||
|
Ftmp = Fine_ev[0]-Ftmp;
|
||||||
|
std::cout << GridLogMessage<<" "<<ev<<" diff "<< norm2(Ftmp) <<std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
std::cout << " Using eigenvector subspace "<<std::endl;
|
||||||
|
std::cout << "**************************************"<<std::endl;
|
||||||
|
for(int i=0;i<Aggregates.subspace.size();i++){
|
||||||
|
Aggregates.subspace[i] = Fine_evec[i];
|
||||||
|
}
|
||||||
|
Aggregates.Orthogonalise();
|
||||||
|
MrhsProjector.ImportBasis(Aggregates.subspace);
|
||||||
|
for(int ev=0;ev<Fine_evec.size();ev++){
|
||||||
|
Fine_ev[0] = Fine_evec[ev];
|
||||||
|
MrhsProjector.blockProject(Fine_ev,c_evec);
|
||||||
|
MrhsProjector.blockPromote(Fine_ev_compressed,c_evec);
|
||||||
|
Ftmp = Fine_ev_compressed[0];
|
||||||
|
RealD div = 1.0/ sqrt(norm2(Ftmp));
|
||||||
|
Ftmp = Ftmp * div;
|
||||||
|
std::cout << GridLogMessage<<" "<<ev<<" uncomp "<< norm2(Fine_ev[0]) <<std::endl;
|
||||||
|
std::cout << GridLogMessage<<" "<<ev<<" comp "<< norm2(Ftmp) <<std::endl;
|
||||||
|
Ftmp = Fine_ev[0] - Ftmp;
|
||||||
|
std::cout << GridLogMessage<<" "<<ev<<" diff "<< norm2(Ftmp) <<std::endl;
|
||||||
|
CGsmooth(Fine_ev_compressed[0],Ftmp);
|
||||||
|
Ftmp = Ftmp *lo;
|
||||||
|
std::cout << GridLogMessage<<" "<<ev<<" smoothed "<< norm2(Ftmp) <<std::endl;
|
||||||
|
div = 1.0/ sqrt(norm2(Ftmp));
|
||||||
|
Ftmp=Ftmp*div;
|
||||||
|
Ftmp = Fine_ev[0]-Ftmp;
|
||||||
|
std::cout << GridLogMessage<<" "<<ev<<" diff "<< norm2(Ftmp) <<std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Standard CG
|
||||||
|
Grid_finalize();
|
||||||
|
return 0;
|
||||||
|
}
|
@ -36,28 +36,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
using namespace std;
|
using namespace std;
|
||||||
using namespace Grid;
|
using namespace Grid;
|
||||||
|
|
||||||
template<class Field>
|
|
||||||
class HermOpAdaptor : public LinearOperatorBase<Field>
|
|
||||||
{
|
|
||||||
LinearOperatorBase<Field> & wrapped;
|
|
||||||
public:
|
|
||||||
HermOpAdaptor(LinearOperatorBase<Field> &wrapme) : wrapped(wrapme) {};
|
|
||||||
void OpDiag (const Field &in, Field &out) { assert(0); }
|
|
||||||
void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
|
|
||||||
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
|
|
||||||
void Op (const Field &in, Field &out){
|
|
||||||
wrapped.HermOp(in,out);
|
|
||||||
}
|
|
||||||
void AdjOp (const Field &in, Field &out){
|
|
||||||
wrapped.HermOp(in,out);
|
|
||||||
}
|
|
||||||
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
|
|
||||||
void HermOp(const Field &in, Field &out){
|
|
||||||
wrapped.HermOp(in,out);
|
|
||||||
}
|
|
||||||
|
|
||||||
};
|
|
||||||
|
|
||||||
template<class Matrix,class Field>
|
template<class Matrix,class Field>
|
||||||
class PVdagMLinearOperator : public LinearOperatorBase<Field> {
|
class PVdagMLinearOperator : public LinearOperatorBase<Field> {
|
||||||
Matrix &_Mat;
|
Matrix &_Mat;
|
||||||
@ -69,78 +47,169 @@ public:
|
|||||||
void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
|
void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
|
||||||
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
|
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
|
||||||
void Op (const Field &in, Field &out){
|
void Op (const Field &in, Field &out){
|
||||||
|
std::cout << "Op: PVdag M "<<std::endl;
|
||||||
Field tmp(in.Grid());
|
Field tmp(in.Grid());
|
||||||
_Mat.M(in,tmp);
|
_Mat.M(in,tmp);
|
||||||
_PV.Mdag(tmp,out);
|
_PV.Mdag(tmp,out);
|
||||||
}
|
}
|
||||||
void AdjOp (const Field &in, Field &out){
|
void AdjOp (const Field &in, Field &out){
|
||||||
|
std::cout << "AdjOp: Mdag PV "<<std::endl;
|
||||||
Field tmp(in.Grid());
|
Field tmp(in.Grid());
|
||||||
_PV.M(tmp,out);
|
_PV.M(in,tmp);
|
||||||
_Mat.Mdag(in,tmp);
|
_Mat.Mdag(tmp,out);
|
||||||
}
|
}
|
||||||
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
|
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
|
||||||
void HermOp(const Field &in, Field &out){
|
void HermOp(const Field &in, Field &out){
|
||||||
std::cout << "HermOp"<<std::endl;
|
std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
|
||||||
|
Field tmp(in.Grid());
|
||||||
|
// _Mat.M(in,tmp);
|
||||||
|
// _PV.Mdag(tmp,out);
|
||||||
|
// _PV.M(out,tmp);
|
||||||
|
// _Mat.Mdag(tmp,out);
|
||||||
|
Op(in,tmp);
|
||||||
|
AdjOp(tmp,out);
|
||||||
|
// std::cout << "HermOp done "<<norm2(out)<<std::endl;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
template<class Matrix,class Field>
|
||||||
|
class ShiftedPVdagMLinearOperator : public LinearOperatorBase<Field> {
|
||||||
|
Matrix &_Mat;
|
||||||
|
Matrix &_PV;
|
||||||
|
RealD shift;
|
||||||
|
public:
|
||||||
|
ShiftedPVdagMLinearOperator(RealD _shift,Matrix &Mat,Matrix &PV): shift(_shift),_Mat(Mat),_PV(PV){};
|
||||||
|
|
||||||
|
void OpDiag (const Field &in, Field &out) { assert(0); }
|
||||||
|
void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
|
||||||
|
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
|
||||||
|
void Op (const Field &in, Field &out){
|
||||||
|
std::cout << "Op: PVdag M "<<std::endl;
|
||||||
Field tmp(in.Grid());
|
Field tmp(in.Grid());
|
||||||
_Mat.M(in,tmp);
|
_Mat.M(in,tmp);
|
||||||
_PV.Mdag(tmp,out);
|
_PV.Mdag(tmp,out);
|
||||||
_PV.M(out,tmp);
|
out = out + shift * in;
|
||||||
_Mat.Mdag(tmp,out);
|
|
||||||
std::cout << "HermOp done "<<norm2(out)<<std::endl;
|
|
||||||
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
template<class Field> class DumbOperator : public LinearOperatorBase<Field> {
|
|
||||||
public:
|
|
||||||
LatticeComplex scale;
|
|
||||||
DumbOperator(GridBase *grid) : scale(grid)
|
|
||||||
{
|
|
||||||
scale = 0.0;
|
|
||||||
LatticeComplex scalesft(grid);
|
|
||||||
LatticeComplex scaletmp(grid);
|
|
||||||
for(int d=0;d<4;d++){
|
|
||||||
Lattice<iScalar<vInteger> > x(grid); LatticeCoordinate(x,d+1);
|
|
||||||
LatticeCoordinate(scaletmp,d+1);
|
|
||||||
scalesft = Cshift(scaletmp,d+1,1);
|
|
||||||
scale = 100.0*scale + where( mod(x ,2)==(Integer)0, scalesft,scaletmp);
|
|
||||||
}
|
|
||||||
std::cout << " scale\n" << scale << std::endl;
|
|
||||||
}
|
|
||||||
// Support for coarsening to a multigrid
|
|
||||||
void OpDiag (const Field &in, Field &out) {};
|
|
||||||
void OpDir (const Field &in, Field &out,int dir,int disp){};
|
|
||||||
void OpDirAll (const Field &in, std::vector<Field> &out) {};
|
|
||||||
|
|
||||||
void Op (const Field &in, Field &out){
|
|
||||||
out = scale * in;
|
|
||||||
}
|
}
|
||||||
void AdjOp (const Field &in, Field &out){
|
void AdjOp (const Field &in, Field &out){
|
||||||
out = scale * in;
|
std::cout << "AdjOp: Mdag PV "<<std::endl;
|
||||||
|
Field tmp(in.Grid());
|
||||||
|
_PV.M(tmp,out);
|
||||||
|
_Mat.Mdag(in,tmp);
|
||||||
|
out = out + shift * in;
|
||||||
}
|
}
|
||||||
|
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
|
||||||
void HermOp(const Field &in, Field &out){
|
void HermOp(const Field &in, Field &out){
|
||||||
double n1, n2;
|
std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
|
||||||
HermOpAndNorm(in,out,n1,n2);
|
Field tmp(in.Grid());
|
||||||
}
|
Op(in,tmp);
|
||||||
void HermOpAndNorm(const Field &in, Field &out,double &n1,double &n2){
|
AdjOp(tmp,out);
|
||||||
ComplexD dot;
|
|
||||||
|
|
||||||
out = scale * in;
|
|
||||||
|
|
||||||
dot= innerProduct(in,out);
|
|
||||||
n1=real(dot);
|
|
||||||
|
|
||||||
dot = innerProduct(out,out);
|
|
||||||
n2=real(dot);
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
template<class Fobj,class CComplex,int nbasis>
|
||||||
|
class MGPreconditioner : public LinearFunction< Lattice<Fobj> > {
|
||||||
|
public:
|
||||||
|
using LinearFunction<Lattice<Fobj> >::operator();
|
||||||
|
|
||||||
|
typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
|
||||||
|
typedef typename Aggregation<Fobj,CComplex,nbasis>::FineField FineField;
|
||||||
|
typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector CoarseVector;
|
||||||
|
typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseMatrix CoarseMatrix;
|
||||||
|
typedef LinearOperatorBase<FineField> FineOperator;
|
||||||
|
typedef LinearFunction <FineField> FineSmoother;
|
||||||
|
typedef LinearOperatorBase<CoarseVector> CoarseOperator;
|
||||||
|
typedef LinearFunction <CoarseVector> CoarseSolver;
|
||||||
|
Aggregates & _Aggregates;
|
||||||
|
FineOperator & _FineOperator;
|
||||||
|
FineSmoother & _PreSmoother;
|
||||||
|
FineSmoother & _PostSmoother;
|
||||||
|
CoarseOperator & _CoarseOperator;
|
||||||
|
CoarseSolver & _CoarseSolve;
|
||||||
|
|
||||||
|
int level; void Level(int lv) {level = lv; };
|
||||||
|
|
||||||
|
MGPreconditioner(Aggregates &Agg,
|
||||||
|
FineOperator &Fine,
|
||||||
|
FineSmoother &PreSmoother,
|
||||||
|
FineSmoother &PostSmoother,
|
||||||
|
CoarseOperator &CoarseOperator_,
|
||||||
|
CoarseSolver &CoarseSolve_)
|
||||||
|
: _Aggregates(Agg),
|
||||||
|
_FineOperator(Fine),
|
||||||
|
_PreSmoother(PreSmoother),
|
||||||
|
_PostSmoother(PostSmoother),
|
||||||
|
_CoarseOperator(CoarseOperator_),
|
||||||
|
_CoarseSolve(CoarseSolve_),
|
||||||
|
level(1) { }
|
||||||
|
|
||||||
|
virtual void operator()(const FineField &in, FineField & out)
|
||||||
|
{
|
||||||
|
GridBase *CoarseGrid = _Aggregates.CoarseGrid;
|
||||||
|
// auto CoarseGrid = _CoarseOperator.Grid();
|
||||||
|
CoarseVector Csrc(CoarseGrid);
|
||||||
|
CoarseVector Csol(CoarseGrid);
|
||||||
|
FineField vec1(in.Grid());
|
||||||
|
FineField vec2(in.Grid());
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "Calling PreSmoother " <<std::endl;
|
||||||
|
|
||||||
|
// std::cout<<GridLogMessage << "Calling PreSmoother input residual "<<norm2(in) <<std::endl;
|
||||||
|
double t;
|
||||||
|
// Fine Smoother
|
||||||
|
// out = in;
|
||||||
|
out = Zero();
|
||||||
|
t=-usecond();
|
||||||
|
_PreSmoother(in,out);
|
||||||
|
t+=usecond();
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "PreSmoother took "<< t/1000.0<< "ms" <<std::endl;
|
||||||
|
|
||||||
|
// Update the residual
|
||||||
|
_FineOperator.Op(out,vec1); sub(vec1, in ,vec1);
|
||||||
|
// std::cout<<GridLogMessage <<"Residual-1 now " <<norm2(vec1)<<std::endl;
|
||||||
|
|
||||||
|
// Fine to Coarse
|
||||||
|
t=-usecond();
|
||||||
|
_Aggregates.ProjectToSubspace (Csrc,vec1);
|
||||||
|
t+=usecond();
|
||||||
|
std::cout<<GridLogMessage << "Project to coarse took "<< t/1000.0<< "ms" <<std::endl;
|
||||||
|
|
||||||
|
// Coarse correction
|
||||||
|
t=-usecond();
|
||||||
|
Csol = Zero();
|
||||||
|
_CoarseSolve(Csrc,Csol);
|
||||||
|
//Csol=Zero();
|
||||||
|
t+=usecond();
|
||||||
|
std::cout<<GridLogMessage << "Coarse solve took "<< t/1000.0<< "ms" <<std::endl;
|
||||||
|
|
||||||
|
// Coarse to Fine
|
||||||
|
t=-usecond();
|
||||||
|
// _CoarseOperator.PromoteFromSubspace(_Aggregates,Csol,vec1);
|
||||||
|
_Aggregates.PromoteFromSubspace(Csol,vec1);
|
||||||
|
add(out,out,vec1);
|
||||||
|
t+=usecond();
|
||||||
|
std::cout<<GridLogMessage << "Promote to this level took "<< t/1000.0<< "ms" <<std::endl;
|
||||||
|
|
||||||
|
// Residual
|
||||||
|
_FineOperator.Op(out,vec1); sub(vec1 ,in , vec1);
|
||||||
|
// std::cout<<GridLogMessage <<"Residual-2 now " <<norm2(vec1)<<std::endl;
|
||||||
|
|
||||||
|
// Fine Smoother
|
||||||
|
t=-usecond();
|
||||||
|
// vec2=vec1;
|
||||||
|
vec2=Zero();
|
||||||
|
_PostSmoother(vec1,vec2);
|
||||||
|
t+=usecond();
|
||||||
|
std::cout<<GridLogMessage << "PostSmoother took "<< t/1000.0<< "ms" <<std::endl;
|
||||||
|
|
||||||
|
add( out,out,vec2);
|
||||||
|
std::cout<<GridLogMessage << "Done " <<std::endl;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
int main (int argc, char ** argv)
|
int main (int argc, char ** argv)
|
||||||
{
|
{
|
||||||
Grid_init(&argc,&argv);
|
Grid_init(&argc,&argv);
|
||||||
|
|
||||||
const int Ls=2;
|
const int Ls=16;
|
||||||
|
|
||||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
||||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||||
@ -151,7 +220,8 @@ int main (int argc, char ** argv)
|
|||||||
// Construct a coarsened grid
|
// Construct a coarsened grid
|
||||||
Coordinate clatt = GridDefaultLatt();
|
Coordinate clatt = GridDefaultLatt();
|
||||||
for(int d=0;d<clatt.size();d++){
|
for(int d=0;d<clatt.size();d++){
|
||||||
clatt[d] = clatt[d]/4;
|
clatt[d] = clatt[d]/2;
|
||||||
|
// clatt[d] = clatt[d]/4;
|
||||||
}
|
}
|
||||||
GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
|
GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
|
||||||
GridCartesian *Coarse5d = SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
|
GridCartesian *Coarse5d = SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
|
||||||
@ -173,15 +243,14 @@ int main (int argc, char ** argv)
|
|||||||
FieldMetaData header;
|
FieldMetaData header;
|
||||||
std::string file("ckpoint_lat.4000");
|
std::string file("ckpoint_lat.4000");
|
||||||
NerscIO::readConfiguration(Umu,header,file);
|
NerscIO::readConfiguration(Umu,header,file);
|
||||||
//Umu = 1.0;
|
|
||||||
|
|
||||||
RealD mass=0.5;
|
RealD mass=0.01;
|
||||||
RealD M5=1.8;
|
RealD M5=1.8;
|
||||||
|
|
||||||
DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
|
DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
|
||||||
DomainWallFermionD Dpv(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,1.0,M5);
|
DomainWallFermionD Dpv(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,1.0,M5);
|
||||||
|
|
||||||
const int nbasis = 1;
|
const int nbasis = 20;
|
||||||
const int cb = 0 ;
|
const int cb = 0 ;
|
||||||
LatticeFermion prom(FGrid);
|
LatticeFermion prom(FGrid);
|
||||||
|
|
||||||
@ -194,24 +263,50 @@ int main (int argc, char ** argv)
|
|||||||
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
|
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
|
||||||
std::cout<<GridLogMessage<<std::endl;
|
std::cout<<GridLogMessage<<std::endl;
|
||||||
|
|
||||||
PVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> PVdagM(Ddwf,Dpv);
|
typedef PVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> PVdagM_t;
|
||||||
HermOpAdaptor<LatticeFermionD> HOA(PVdagM);
|
typedef ShiftedPVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> ShiftedPVdagM_t;
|
||||||
|
PVdagM_t PVdagM(Ddwf,Dpv);
|
||||||
|
// ShiftedPVdagM_t ShiftedPVdagM(2.0,Ddwf,Dpv); // 355
|
||||||
|
// ShiftedPVdagM_t ShiftedPVdagM(1.0,Ddwf,Dpv); // 246
|
||||||
|
// ShiftedPVdagM_t ShiftedPVdagM(0.5,Ddwf,Dpv); // 183
|
||||||
|
// ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 145
|
||||||
|
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 134
|
||||||
|
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 127 -- NULL space via inverse iteration
|
||||||
|
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 57 -- NULL space via inverse iteration; 3 iterations
|
||||||
|
// ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 57 , tighter inversion
|
||||||
|
// ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // nbasis 20 -- 49 iters
|
||||||
|
// ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // nbasis 20 -- 70 iters; asymmetric
|
||||||
|
// ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 58; Loosen coarse, tighten fine
|
||||||
|
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 56 ...
|
||||||
|
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 51 ... with 24 vecs
|
||||||
|
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 31 ... with 24 vecs and 2^4 blocking
|
||||||
|
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 43 ... with 16 vecs and 2^4 blocking, sloppier
|
||||||
|
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 35 ... with 20 vecs and 2^4 blocking
|
||||||
|
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 35 ... with 20 vecs and 2^4 blocking, looser coarse
|
||||||
|
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 64 ... with 20 vecs, Christoph setup, and 2^4 blocking, looser coarse
|
||||||
|
ShiftedPVdagM_t ShiftedPVdagM(0.01,Ddwf,Dpv); //
|
||||||
|
|
||||||
|
|
||||||
// Run power method on HOA??
|
// Run power method on HOA??
|
||||||
PowerMethod<LatticeFermion> PM; PM(HOA,src);
|
// PowerMethod<LatticeFermion> PM; PM(PVdagM,src);
|
||||||
|
|
||||||
// Warning: This routine calls PVdagM.Op, not PVdagM.HermOp
|
// Warning: This routine calls PVdagM.Op, not PVdagM.HermOp
|
||||||
typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
|
typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
|
||||||
Subspace AggregatesPD(Coarse5d,FGrid,cb);
|
Subspace AggregatesPD(Coarse5d,FGrid,cb);
|
||||||
|
/*
|
||||||
AggregatesPD.CreateSubspaceChebyshev(RNG5,
|
AggregatesPD.CreateSubspaceChebyshev(RNG5,
|
||||||
HOA,
|
PVdagM,
|
||||||
nbasis,
|
nbasis,
|
||||||
5000.0,
|
4000.0,
|
||||||
0.02,
|
2.0,
|
||||||
100,
|
200,
|
||||||
50,
|
200,
|
||||||
50,
|
200,
|
||||||
0.0);
|
0.0);
|
||||||
|
*/
|
||||||
|
AggregatesPD.CreateSubspaceGCR(RNG5,
|
||||||
|
PVdagM,
|
||||||
|
nbasis);
|
||||||
|
|
||||||
LittleDiracOperator LittleDiracOpPV(geom,FGrid,Coarse5d);
|
LittleDiracOperator LittleDiracOpPV(geom,FGrid,Coarse5d);
|
||||||
LittleDiracOpPV.CoarsenOperator(PVdagM,AggregatesPD);
|
LittleDiracOpPV.CoarsenOperator(PVdagM,AggregatesPD);
|
||||||
@ -257,6 +352,60 @@ int main (int argc, char ** argv)
|
|||||||
std::cout<<GridLogMessage<<" ldop error: "<<norm2(c_proj)<<std::endl;
|
std::cout<<GridLogMessage<<" ldop error: "<<norm2(c_proj)<<std::endl;
|
||||||
// std::cout<<GridLogMessage<<" error "<< c_proj<<std::endl;
|
// std::cout<<GridLogMessage<<" error "<< c_proj<<std::endl;
|
||||||
|
|
||||||
|
|
||||||
|
/**********
|
||||||
|
* Some solvers
|
||||||
|
**********
|
||||||
|
*/
|
||||||
|
|
||||||
|
///////////////////////////////////////
|
||||||
|
// Coarse grid solver test
|
||||||
|
///////////////////////////////////////
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage<<"******************* "<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<" Coarse Grid Solve -- Level 3 "<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<"******************* "<<std::endl;
|
||||||
|
TrivialPrecon<CoarseVector> simple;
|
||||||
|
NonHermitianLinearOperator<LittleDiracOperator,CoarseVector> LinOpCoarse(LittleDiracOpPV);
|
||||||
|
// PrecGeneralisedConjugateResidualNonHermitian<CoarseVector> L2PGCR(1.0e-4, 100, LinOpCoarse,simple,10,10);
|
||||||
|
PrecGeneralisedConjugateResidualNonHermitian<CoarseVector> L2PGCR(3.0e-2, 100, LinOpCoarse,simple,10,10);
|
||||||
|
L2PGCR.Level(3);
|
||||||
|
c_res=Zero();
|
||||||
|
L2PGCR(c_src,c_res);
|
||||||
|
|
||||||
|
////////////////////////////////////////
|
||||||
|
// Fine grid smoother
|
||||||
|
////////////////////////////////////////
|
||||||
|
std::cout<<GridLogMessage<<"******************* "<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<" Fine Grid Smoother -- Level 2 "<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<"******************* "<<std::endl;
|
||||||
|
TrivialPrecon<LatticeFermionD> simple_fine;
|
||||||
|
// NonHermitianLinearOperator<PVdagM_t,LatticeFermionD> LinOpSmooth(PVdagM);
|
||||||
|
PrecGeneralisedConjugateResidualNonHermitian<LatticeFermionD> SmootherGCR(0.01,1,ShiftedPVdagM,simple_fine,16,16);
|
||||||
|
SmootherGCR.Level(2);
|
||||||
|
|
||||||
|
LatticeFermionD f_src(FGrid);
|
||||||
|
LatticeFermionD f_res(FGrid);
|
||||||
|
|
||||||
|
f_src = one; // 1 in every element for vector 1.
|
||||||
|
f_res=Zero();
|
||||||
|
SmootherGCR(f_src,f_res);
|
||||||
|
|
||||||
|
typedef MGPreconditioner<vSpinColourVector, vTComplex,nbasis> TwoLevelMG;
|
||||||
|
|
||||||
|
TwoLevelMG TwoLevelPrecon(AggregatesPD,
|
||||||
|
PVdagM,
|
||||||
|
simple_fine,
|
||||||
|
SmootherGCR,
|
||||||
|
LinOpCoarse,
|
||||||
|
L2PGCR);
|
||||||
|
|
||||||
|
PrecGeneralisedConjugateResidualNonHermitian<LatticeFermion> L1PGCR(1.0e-8,1000,PVdagM,TwoLevelPrecon,16,16);
|
||||||
|
L1PGCR.Level(1);
|
||||||
|
|
||||||
|
f_res=Zero();
|
||||||
|
L1PGCR(f_src,f_res);
|
||||||
|
|
||||||
std::cout<<GridLogMessage<<std::endl;
|
std::cout<<GridLogMessage<<std::endl;
|
||||||
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
|
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
|
||||||
std::cout<<GridLogMessage<<std::endl;
|
std::cout<<GridLogMessage<<std::endl;
|
||||||
|
Reference in New Issue
Block a user